forward old vi.shiar.net domains
[sheet.git] / charset.plp
1 <(common.inc.plp)><:
2         our $VERSION = 'v1.0';
3
4 :><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
5  "http://www.w3.org/TR/html4/loose.dtd">
6 <html lang="en">
7
8 <head>
9 <meta http-equiv="content-type" content="<:= $header{content_type} :>">
10 <title>charset cheat sheet</title>
11 <meta name="description" content="Reference sheet with all glyphs in common character encoding tables, and an overview of Unicode ranges and UTF-8 bytes.">
12 <meta name="keywords" content="charset, codepage, unicode, ascii, utf8, latin, glyph, character, encoding, reference, common, overview, table">
13 <:= stylesheet(qw'light') :>
14 <link rel="icon" type="image/png" href="/clip.png">
15 </head>
16
17 <body id="charset">
18 <h1>Character encoding</h1>
19
20 <:
21 my $diinfo = do 'digraphs.inc.pl';
22 my %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} }
23         keys %$diinfo;
24
25 use Encode qw(decode resolve_alias);
26 # generate character table(s)
27 # (~16x faster than decoding in loop;
28 #  substr strings is twice as fast as splitting to an array)
29 my %ALIAS = (
30 #       default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
31         default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
32         0 => [qw(cp437 cp863)],
33         1 => [qw(iso-8859-1 cp1252 MacRoman cp850)],
34         2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)],
35         5 => [qw(koi8-f iso-8859-5 cp1251 MacCyrillic cp855 cp866)],
36         7 => [qw(iso-8859-7 cp1253 MacGreek cp737 cp869)],
37         8 => [qw(iso-8859-8 cp1255 MacHebrew cp862)],
38 );
39 my @request = map {
40         if (my $input = $_) {
41                 my %row = (offset => 0);
42                 my $endpoint = 255;
43                 if ($input =~ s/^--//) {
44                         $row{offset} = $endpoint > 160 ? 160 : 48;
45                 }
46                 elsif ($input =~ s/^-//) {
47                         $row{offset} = $endpoint > 128 ? 128 : 32;
48                 }
49                 if ($input =~ s/-$//) {
50                         $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
51                 }
52
53                 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
54                         my $start = hex($1) << ($2 ? 4 : 8);
55                         my $end = $2 ? hex($2) << 4 : $start + 240;
56                         $row{table} = join '', map { chr } $start .. $end+15;
57                         utf8::upgrade($row{table});  # prevent latin1 output
58                         $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
59                 }
60                 elsif ($input eq 'U') {
61                         $row{table} = ' ' x 512;
62                         $row{set} = 'Unicode planes';
63                         $row{cell} = do 'charset-ucplanes.inc.pl';
64                 }
65                 elsif ($row{set} = resolve_alias($input)) {
66                         if ($row{set} eq 'Internal') {
67                                 $row{table} = ' ' x ($endpoint < 255 ? 640 : 4096);
68                                 $row{set} = 'Unicode BMP';
69                                 $row{cell} = do 'charset-unicode.inc.pl';
70                         }
71                         elsif ($row{set} eq 'utf-8-strict') {
72                                 $row{table} = undef;
73                                 $row{set} = 'UTF-8';
74                                 $row{cell} = do 'charset-utf8.inc.pl';
75                         }
76                         else {
77                                 $row{table} = decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
78                         }
79                 }
80                 else {
81                         print "<p>Encoding $input unknown</p>\n";
82                 }
83                 \%row;
84         }
85         else {
86                 ();
87         }
88 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
89         $ENV{PATH_INFO} =~ /\w/ ? split(m{[/+\s]}, $ENV{PATH_INFO}) : 'default';
90 my $NOCHAR = chr 0xFFFD;
91
92 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) {
93         substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
94         substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
95                 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
96                 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
97         );
98 }
99
100 my @nibble = (0..9, 'A'..'F');
101 for my $row (@request) {
102         printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
103         printf '<caption>%s</caption>', $row->{set};
104         print '<col>' x 17;
105         for my $section (qw{thead}) {
106                 print "<$section><tr><th>↱";
107                 print '<th>', $_ for @nibble;
108                 print "\n";
109         }
110         print '<tbody>';
111         for my $msb (0 .. (length($row->{table}) || 256) - 1 >> 4) {
112                 printf '<tr><th>%X', $msb + ($row->{offset} >> 4);
113                 for my $lsb (0 .. $#nibble) {
114                         if ($row->{cell}) {
115                                 print $row->{cell}->(($msb<<4) + $lsb);
116                                 next;
117                         }
118
119                         my $glyph = substr $row->{table}, ($msb<<4) + $lsb, 1;
120                         if ($glyph eq $NOCHAR) {
121                                 print '<td>';
122                                 next;
123                         }
124
125                         my $info = [ord $glyph];
126                         if (defined (my $mnem = $di{ord $glyph})) {
127                                 $info = $diinfo->{$mnem};
128                         }
129                         else {
130                                 require Unicode::UCD;
131                                 my $fullinfo = Unicode::UCD::charinfo(ord $glyph);
132                                 $info = [@$fullinfo{qw/code name category script string/}] if $fullinfo;
133                         }
134                         my ($codepoint, $name, $prop, $script, $string) = @$info;
135
136                         $glyph = EscapeHTML($string || $glyph);
137                         my $desc = sprintf 'U+%04X%s', $codepoint, $name && " ($name)";
138                         my @class = ('X', grep {$_} $prop, $script);
139
140                         $glyph = "<span>$glyph</span>" if $prop eq 'Zs';
141
142                         printf "\n".'<td class="%s" title="%s">%s',
143                                 join(' ', @class), EscapeHTML($desc), $glyph;
144                 }
145                 print "\n";
146         }
147         print "</table></div>\n";
148 }
149
150 :>
151 <hr>
152
153 <div class="legend">
154         <table class="glyphs"><tr>
155         <td class="X Cc">control
156         <td class="X Zs"><span>whitespace</span>
157         <td class="X Mn">diacritic<table class="glyphs"><tr>
158                 <td class="X Sk">letter
159                 </table>
160         <td class="X Po">punctuation<table class="glyphs"><tr>
161                 <td class="X Pf">quote
162                 </table>
163         <td class="X So">symbol<table class="glyphs"><tr>
164                 <td class="X Sm">math
165                 <td class="X Sc">currency
166                 </table>
167         <td class="X No">numeric
168         <td class="X Greek">greek<table class="glyphs"><tr>
169                 <td class="X Latin">latin
170                 <td class="X Cyrillic">cyrillic
171                 </table>
172         <td class="X Aramaic">aramaic<table class="glyphs"><tr>
173                 <td class="X Brahmic">brahmic
174                 <td class="X Arabic">arabic
175                 </table>
176         <td class="X Syllabic">syllabic<table class="glyphs"><tr>
177                 <td class="X African">african
178                 <td class="X Hiragana">japanese
179                 <td class="X Han">cjk
180                 <td class="X Bopomofo">chinese
181                 </table>
182         <td class="X Alpha">alphabetic
183         </table>
184
185         <table class="glyphs"><tr>
186         <td class="X">unicode 5.0
187         <td class="X Xr">proposed
188         <td class="X Xd">deprecated
189         <td class="">unassigned
190         <td class="X Xi">invalid
191         </table>
192 </div>
193
194 <p class="footer">
195         <a href="/" rel="home">sheet.shiar.nl</a>/charset.<a href="/source/charset.plp"
196          rel="code" title="Written in Perl">plp</a>
197         <a href="http://git.shiar.nl/sheet.git/history/HEAD:/charset.plp"
198          rel="vcs-git" title="Git repository"><:= $VERSION :></a>
199         created by <a href="http://shiar.nl/" rel="author">Shiar</a> •
200         <a href="http://www.fsf.org/licensing/licenses/agpl-3.0.html" rel="copyright"
201          title="Licensed under the GNU Affero General Public License, version 3">AGPLv3</a>
202 </p>
203
204 </html>