91de26bd81c002e5cff1e7fd896c76996c25c479
[sheet.git] / charset.plp
1 <:
2 use utf8;
3 use strict;
4 use warnings;
5 use open IO => ':utf8';
6
7 our $VERSION = '1.0';
8
9 $header{content_type} = 'text/html; charset=utf-8';
10
11 :><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
12  "http://www.w3.org/TR/html4/loose.dtd">
13 <html>
14
15 <head>
16 <title>charset cheat sheet</title>
17 <meta http-equiv="content-type" content="utf-8">
18 <link rel="stylesheet" type="text/css" media="all" href="/base.css">
19 </head>
20
21 <body>
22 <h1>Character encoding</h1>
23
24 <:
25 my $diinfo = do 'digraphs.inc.pl';
26 my %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} }
27         keys %$diinfo;
28
29 use Encode qw(decode resolve_alias);
30 # generate character table(s)
31 # (~16x faster than decoding in loop;
32 #  substr strings is twice as fast as splitting to an array)
33 my %ALIAS = (
34 #       default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
35         default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
36         0 => [qw(cp437 cp863)],
37         1 => [qw(iso-8859-1 cp1252 MacRoman cp850)],
38         2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)],
39         5 => [qw(koi8-f iso-8859-5 cp1251 MacCyrillic cp855 cp866)],
40         7 => [qw(iso-8859-7 cp1253 MacGreek cp737 cp869)],
41         8 => [qw(iso-8859-8 cp1255 MacHebrew cp862)],
42 );
43 my @request = map {
44         if (my $input = $_) {
45                 my %row = (offset => 0);
46                 my $endpoint = 255;
47                 if ($input =~ s/^--//) {
48                         $row{offset} = $endpoint > 160 ? 160 : 48;
49                 }
50                 elsif ($input =~ s/^-//) {
51                         $row{offset} = $endpoint > 128 ? 128 : 32;
52                 }
53                 if ($input =~ s/-$//) {
54                         $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
55                 }
56
57                 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
58                         my $start = hex($1) << ($2 ? 4 : 8);
59                         my $end = $2 ? hex($2) << 4 : $start + 240;
60                         $row{table} = join '', map { chr } $start .. $end+15;
61                         utf8::upgrade($row{table});  # prevent latin1 output
62                         $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
63                 }
64                 elsif ($row{set} = resolve_alias($input)) {
65                         if ($row{set} eq 'Internal') {
66                                 $row{table} = ' ' x ($endpoint < 255 ? 640 : 4096);
67                                 $row{set} = 'Unicode BMP';
68                                 $row{cell} = do 'charset-unicode.inc.pl';
69                         }
70                         elsif ($row{set} eq 'utf-8-strict') {
71                                 $row{table} = undef;
72                                 $row{set} = 'UTF-8';
73                                 $row{cell} = do 'charset-utf8.inc.pl';
74                         }
75                         else {
76                                 $row{table} = decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
77                         }
78                 }
79                 else {
80                         print "<p>Encoding $input unknown</p>\n";
81                 }
82                 \%row;
83         }
84         else {
85                 ();
86         }
87 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
88         $ENV{PATH_INFO} =~ /\w/ ? split(m{[/+\s]}, $ENV{PATH_INFO}) : 'default';
89 my $NOCHAR = chr 0xFFFD;
90
91 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) {
92         substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
93         substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
94                 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
95                 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
96         );
97 }
98
99 sub quote {
100         local $_ = shift;
101         s/"/&quot;/g;
102         s/</&lt;/g;
103         s/>/&gt;/g;
104         return $_;
105 }
106
107 print "<ul>\n";
108
109 my @nibble = (0..9, 'A'..'F');
110 for my $row (@request) {
111         print '<li><table class="glyphs">';
112         printf '<caption>%s</caption>', $row->{set};
113         print '<col>';
114         for my $section (qw{thead}) {
115                 print "<$section><tr><th>↱";
116                 print '<th>', $_ for @nibble;
117                 print "\n";
118         }
119         print '<tbody>';
120         for my $msb (0 .. (length($row->{table}) || 256) - 1 >> 4) {
121                 printf '<tr><th>%X', $msb + ($row->{offset} >> 4);
122                 for my $lsb (0 .. $#nibble) {
123                         if ($row->{cell}) {
124                                 print $row->{cell}->(($msb<<4) + $lsb);
125                                 next;
126                         }
127
128                         my $glyph = substr $row->{table}, ($msb<<4) + $lsb, 1;
129                         if ($glyph eq $NOCHAR) {
130                                 print '<td>';
131                                 next;
132                         }
133
134                         my $info = [ord $glyph];
135                         if (defined (my $mnem = $di{ord $glyph})) {
136                                 $info = $diinfo->{$mnem};
137                         }
138                         my ($codepoint, $name, $prop, $script, $string) = @$info;
139
140                         $glyph = quote($string || $glyph);
141                         my $desc = sprintf 'U+%04X%s', $codepoint, $name && " ($name)";
142                         my @class = ('X', grep {$_} $prop, $script);
143
144                         $glyph = "<span>$glyph</span>" if $prop eq 'Zs';
145
146                         printf "\n".'<td class="%s" title="%s">%s',
147                                 join(' ', @class), quote($desc), $glyph;
148                 }
149                 print "\n";
150         }
151         print "</table>\n";
152 }
153
154 print "</ul>\n";
155
156 :>
157 <hr>
158
159 <div class="legend">
160         <table class="glyphs"><tr>
161         <td class="X Cc">control
162         <td class="X Zs"><span>whitespace</span>
163         <td class="X Mn">diacritic<table class="glyphs"><tr>
164                 <td class="X Sk">letter
165                 </table>
166         <td class="X Po">punctuation<table class="glyphs"><tr>
167                 <td class="X Pf">quote
168                 </table>
169         <td class="X So">symbol<table class="glyphs"><tr>
170                 <td class="X Sm">math
171                 <td class="X Sc">currency
172                 </table>
173         <td class="X No">numeric
174         <td class="X Greek">greek<table class="glyphs"><tr>
175                 <td class="X Latin">latin
176                 <td class="X Cyrillic">cyrillic
177                 </table>
178         <td class="X Aramaic">aramaic<table class="glyphs"><tr>
179                 <td class="X Brahmic">brahmic
180                 <td class="X Arabic">arabic
181                 </table>
182         <td class="X Syllabic">syllabic<table class="glyphs"><tr>
183                 <td class="X African">african
184                 <td class="X Hiragana">japanese
185                 <td class="X Han">cjk
186                 <td class="X Bopomofo">chinese
187                 </table>
188         <td class="X Alpha">alphabetic
189         </table>
190
191         <table class="glyphs"><tr>
192         <td class="X">unicode 5.0
193         <td class="X Xr">proposed
194         <td class="X Xd">deprecated
195         <td class="">unassigned
196         <td class="X Xi">invalid
197         </table>
198 </div>
199
200 <p class="footer">
201         <a href="http://sheet.shiar.nl/" rel="home">sheet.shiar.nl</a>/charset
202         <a href="git://git.shiar.nl/sheet" rel="vcs-git" title="Git repository"><:= "v$VERSION" :></a>
203         created by <a href="http://shiar.nl/" rel="author">Shiar</a> •
204         <a title="Licensed under the GNU Affero General Public License, version 3" rel="copyright"
205            href="http://www.fsf.org/licensing/licenses/agpl-3.0.html">AGPLv3</a> •
206         last update <:
207                 use Time::Format qw(time_format);
208                 print time_format('yyyy-mm-dd', (stat $ENV{SCRIPT_FILENAME})[9]);
209         :>
210 </p>
211
212 </html>