sitemap to help robots prioritise pages
[sheet.git] / charset.plp
1 <(common.inc.plp)><:
2         our $VERSION = 'v1.0';
3
4 :><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
5  "http://www.w3.org/TR/html4/loose.dtd">
6 <html lang="en">
7
8 <head>
9 <meta http-equiv="content-type" content="<:= $header{content_type} :>">
10 <title>charset cheat sheet</title>
11 <meta name="description" content="Reference sheet with all glyphs in common character encoding tables, and an overview of Unicode ranges and UTF-8 bytes.">
12 <meta name="keywords" content="charset, codepage, unicode, ascii, utf8, latin, glyph, character, encoding, reference, common, overview, table">
13 <:= stylesheet(qw'light') :>
14 </head>
15
16 <body id="charset">
17 <h1>Character encoding</h1>
18
19 <:
20 my $diinfo = do 'digraphs.inc.pl';
21 my %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} }
22         keys %$diinfo;
23
24 use Encode qw(decode resolve_alias);
25 # generate character table(s)
26 # (~16x faster than decoding in loop;
27 #  substr strings is twice as fast as splitting to an array)
28 my %ALIAS = (
29 #       default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
30         default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
31         0 => [qw(cp437 cp863)],
32         1 => [qw(iso-8859-1 cp1252 MacRoman cp850)],
33         2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)],
34         5 => [qw(koi8-f iso-8859-5 cp1251 MacCyrillic cp855 cp866)],
35         7 => [qw(iso-8859-7 cp1253 MacGreek cp737 cp869)],
36         8 => [qw(iso-8859-8 cp1255 MacHebrew cp862)],
37 );
38 my @request = map {
39         if (my $input = $_) {
40                 my %row = (offset => 0);
41                 my $endpoint = 255;
42                 if ($input =~ s/^--//) {
43                         $row{offset} = $endpoint > 160 ? 160 : 48;
44                 }
45                 elsif ($input =~ s/^-//) {
46                         $row{offset} = $endpoint > 128 ? 128 : 32;
47                 }
48                 if ($input =~ s/-$//) {
49                         $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
50                 }
51
52                 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
53                         my $start = hex($1) << ($2 ? 4 : 8);
54                         my $end = $2 ? hex($2) << 4 : $start + 240;
55                         $row{table} = join '', map { chr } $start .. $end+15;
56                         utf8::upgrade($row{table});  # prevent latin1 output
57                         $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
58                 }
59                 elsif ($input eq 'U') {
60                         $row{table} = ' ' x 512;
61                         $row{set} = 'Unicode planes';
62                         $row{cell} = do 'charset-ucplanes.inc.pl';
63                 }
64                 elsif ($row{set} = resolve_alias($input)) {
65                         if ($row{set} eq 'Internal') {
66                                 $row{table} = ' ' x ($endpoint < 255 ? 640 : 4096);
67                                 $row{set} = 'Unicode BMP';
68                                 $row{cell} = do 'charset-unicode.inc.pl';
69                         }
70                         elsif ($row{set} eq 'utf-8-strict') {
71                                 $row{table} = undef;
72                                 $row{set} = 'UTF-8';
73                                 $row{cell} = do 'charset-utf8.inc.pl';
74                         }
75                         else {
76                                 $row{table} = decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
77                         }
78                 }
79                 else {
80                         print "<p>Encoding $input unknown</p>\n";
81                 }
82                 \%row;
83         }
84         else {
85                 ();
86         }
87 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
88         $ENV{PATH_INFO} =~ /\w/ ? split(m{[/+\s]}, $ENV{PATH_INFO}) : 'default';
89 my $NOCHAR = chr 0xFFFD;
90
91 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) {
92         substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
93         substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
94                 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
95                 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
96         );
97 }
98
99 my @nibble = (0..9, 'A'..'F');
100 for my $row (@request) {
101         printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
102         printf '<caption>%s</caption>', $row->{set};
103         print '<col>' x 17;
104         for my $section (qw{thead}) {
105                 print "<$section><tr><th>↱";
106                 print '<th>', $_ for @nibble;
107                 print "\n";
108         }
109         print '<tbody>';
110         for my $msb (0 .. (length($row->{table}) || 256) - 1 >> 4) {
111                 printf '<tr><th>%X', $msb + ($row->{offset} >> 4);
112                 for my $lsb (0 .. $#nibble) {
113                         if ($row->{cell}) {
114                                 print $row->{cell}->(($msb<<4) + $lsb);
115                                 next;
116                         }
117
118                         my $glyph = substr $row->{table}, ($msb<<4) + $lsb, 1;
119                         if ($glyph eq $NOCHAR) {
120                                 print '<td>';
121                                 next;
122                         }
123
124                         my $info = [ord $glyph];
125                         if (defined (my $mnem = $di{ord $glyph})) {
126                                 $info = $diinfo->{$mnem};
127                         }
128                         else {
129                                 require Unicode::UCD;
130                                 my $fullinfo = Unicode::UCD::charinfo(ord $glyph);
131                                 $info = [@$fullinfo{qw/code name category script string/}] if $fullinfo;
132                         }
133                         my ($codepoint, $name, $prop, $script, $string) = @$info;
134
135                         $glyph = EscapeHTML($string || $glyph);
136                         my $desc = sprintf 'U+%04X%s', $codepoint, $name && " ($name)";
137                         my @class = ('X', grep {$_} $prop, $script);
138
139                         $glyph = "<span>$glyph</span>" if $prop eq 'Zs';
140
141                         printf "\n".'<td class="%s" title="%s">%s',
142                                 join(' ', @class), EscapeHTML($desc), $glyph;
143                 }
144                 print "\n";
145         }
146         print "</table></div>\n";
147 }
148
149 :>
150 <hr>
151
152 <div class="legend">
153         <table class="glyphs"><tr>
154         <td class="X Cc">control
155         <td class="X Zs"><span>whitespace</span>
156         <td class="X Mn">diacritic<table class="glyphs"><tr>
157                 <td class="X Sk">letter
158                 </table>
159         <td class="X Po">punctuation<table class="glyphs"><tr>
160                 <td class="X Pf">quote
161                 </table>
162         <td class="X So">symbol<table class="glyphs"><tr>
163                 <td class="X Sm">math
164                 <td class="X Sc">currency
165                 </table>
166         <td class="X No">numeric
167         <td class="X Greek">greek<table class="glyphs"><tr>
168                 <td class="X Latin">latin
169                 <td class="X Cyrillic">cyrillic
170                 </table>
171         <td class="X Aramaic">aramaic<table class="glyphs"><tr>
172                 <td class="X Brahmic">brahmic
173                 <td class="X Arabic">arabic
174                 </table>
175         <td class="X Syllabic">syllabic<table class="glyphs"><tr>
176                 <td class="X African">african
177                 <td class="X Hiragana">japanese
178                 <td class="X Han">cjk
179                 <td class="X Bopomofo">chinese
180                 </table>
181         <td class="X Alpha">alphabetic
182         </table>
183
184         <table class="glyphs"><tr>
185         <td class="X">unicode 5.0
186         <td class="X Xr">proposed
187         <td class="X Xd">deprecated
188         <td class="">unassigned
189         <td class="X Xi">invalid
190         </table>
191 </div>
192
193 <p class="footer">
194         <a href="/" rel="home">sheet.shiar.nl</a>/charset.<a href="/source/charset.plp"
195          rel="code" title="Written in Perl">plp</a>
196         <a href="http://git.shiar.nl/sheet.git/history/HEAD:/charset.plp"
197          rel="vcs-git" title="Git repository"><:= $VERSION :></a>
198         created by <a href="http://shiar.nl/" rel="author">Shiar</a> •
199         <a href="http://www.fsf.org/licensing/licenses/agpl-3.0.html" rel="copyright"
200          title="Licensed under the GNU Affero General Public License, version 3">AGPLv3</a>
201 </p>
202
203 </html>