41af494365bdfb096ccde286d68a236228872a8c
[sheet.git] / charset.plp
1 <:
2 use utf8;
3 use strict;
4 use warnings;
5 use open IO => ':utf8';
6
7 our $VERSION = '1.0';
8
9 $header{content_type} = 'text/html; charset=utf-8';
10
11 :><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
12  "http://www.w3.org/TR/html4/loose.dtd">
13 <html>
14
15 <head>
16 <title>charset cheat sheet</title>
17 <meta http-equiv="content-type" content="utf-8">
18 <link rel="stylesheet" type="text/css" media="all" href="/base.css">
19 </head>
20
21 <body>
22 <h1>Character encoding</h1>
23
24 <:
25 my $diinfo = do 'digraphs.inc.pl';
26 my %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} }
27         keys %$diinfo;
28
29 use Encode qw(decode resolve_alias);
30 # generate character table(s)
31 # (~16x faster than decoding in loop;
32 #  substr strings is twice as fast as splitting to an array)
33 my %ALIAS = (
34 #       default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
35         default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
36         0 => [qw(cp437 cp863)],
37         1 => [qw(iso-8859-1 cp1252 MacRoman cp850)],
38         2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)],
39         5 => [qw(koi8-f iso-8859-5 cp1251 MacCyrillic cp855 cp866)],
40         7 => [qw(iso-8859-7 cp1253 MacGreek cp737 cp869)],
41         8 => [qw(iso-8859-8 cp1255 MacHebrew cp862)],
42 );
43 my @request = map {
44         if (my $input = $_) {
45                 my %row = (offset => 0);
46                 my $endpoint = 255;
47                 if ($input =~ s/^--//) {
48                         $row{offset} = $endpoint > 160 ? 160 : 48;
49                 }
50                 elsif ($input =~ s/^-//) {
51                         $row{offset} = $endpoint > 128 ? 128 : 32;
52                 }
53                 if ($input =~ s/-$//) {
54                         $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
55                 }
56                 if ($row{set} = resolve_alias($input)) {
57                         if ($row{set} eq 'Internal') {
58                                 $row{table} = ' ' x ($endpoint < 255 ? 640 : 4096);
59                                 $row{set} = 'Unicode BMP';
60                                 $row{cell} = do 'charset-unicode.inc.pl';
61                         }
62                         elsif ($row{set} eq 'utf-8-strict') {
63                                 $row{table} = undef;
64                                 $row{set} = 'UTF-8';
65                                 $row{cell} = do 'charset-utf8.inc.pl';
66                         }
67                         else {
68                                 $row{table} = decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
69                         }
70                 }
71                 else {
72                         print "<p>Encoding $input unknown</p>\n";
73                 }
74                 \%row;
75         }
76         else {
77                 ();
78         }
79 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
80         $ENV{PATH_INFO} =~ /\w/ ? split(m{[/+\s]}, $ENV{PATH_INFO}) : 'default';
81 my $NOCHAR = chr 0xFFFD;
82
83 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) {
84         substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
85         substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
86                 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
87                 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
88         );
89 }
90
91 sub quote {
92         local $_ = shift;
93         s/"/&quot;/g;
94         s/</&lt;/g;
95         s/>/&gt;/g;
96         return $_;
97 }
98
99 print "<ul>\n";
100
101 my @nibble = (0..9, 'A'..'F');
102 for my $row (@request) {
103         print '<li><table class="glyphs">';
104         printf '<caption>%s</caption>', $row->{set};
105         print '<col>';
106         for my $section (qw{thead}) {
107                 print "<$section><tr><th>↱";
108                 print '<th>', $_ for @nibble;
109                 print "\n";
110         }
111         print '<tbody>';
112         for my $msb (0 .. (length($row->{table}) || 256) - 1 >> 4) {
113                 printf '<tr><th>%X', $msb + ($row->{offset} >> 4);
114                 for my $lsb (0 .. $#nibble) {
115                         if ($row->{cell}) {
116                                 print $row->{cell}->(($msb<<4) + $lsb);
117                                 next;
118                         }
119
120                         my $glyph = substr $row->{table}, ($msb<<4) + $lsb, 1;
121                         if ($glyph eq $NOCHAR) {
122                                 print '<td>';
123                                 next;
124                         }
125
126                         my $info = [ord $glyph];
127                         if (defined (my $mnem = $di{ord $glyph})) {
128                                 $info = $diinfo->{$mnem};
129                         }
130                         my ($codepoint, $name, $prop, $script, $string) = @$info;
131
132                         $glyph = quote($string || $glyph);
133                         my $desc = sprintf 'U+%04X%s', $codepoint, $name && " ($name)";
134                         my @class = ('X', grep {$_} $prop, $script);
135
136                         $glyph = "<span>$glyph</span>" if $prop eq 'Zs';
137
138                         printf "\n".'<td class="%s" title="%s">%s',
139                                 join(' ', @class), quote($desc), $glyph;
140                 }
141                 print "\n";
142         }
143         print "</table>\n";
144 }
145
146 print "</ul>\n";
147
148 :>
149 <hr>
150
151 <div class="legend">
152         <table class="glyphs"><tr>
153         <td class="X Cc">control
154         <td class="X Zs"><span>whitespace</span>
155         <td class="X Mn">diacritic<table class="glyphs"><tr>
156                 <td class="X Sk">letter
157                 </table>
158         <td class="X Po">punctuation<table class="glyphs"><tr>
159                 <td class="X Pf">quote
160                 </table>
161         <td class="X So">symbol<table class="glyphs"><tr>
162                 <td class="X Sm">math
163                 <td class="X Sc">currency
164                 </table>
165         <td class="X No">numeric
166         <td class="X Greek">greek<table class="glyphs"><tr>
167                 <td class="X Latin">latin
168                 <td class="X Cyrillic">cyrillic
169                 </table>
170         <td class="X Aramaic">aramaic<table class="glyphs"><tr>
171                 <td class="X Brahmic">brahmic
172                 <td class="X Arabic">arabic
173                 </table>
174         <td class="X Syllabic">syllabic<table class="glyphs"><tr>
175                 <td class="X African">african
176                 <td class="X Hiragana">japanese
177                 <td class="X Han">cjk
178                 <td class="X Bopomofo">chinese
179                 </table>
180         <td class="X Alpha">alphabetic
181         </table>
182
183         <table class="glyphs"><tr>
184         <td class="X">unicode 5.0
185         <td class="X Xr">proposed
186         <td class="X Xd">deprecated
187         <td class="">unassigned
188         <td class="X Xi">invalid
189         </table>
190 </div>
191
192 <p class="footer">
193         <a href="http://sheet.shiar.nl/" rel="home">sheet.shiar.nl</a>/charset
194         <a href="git://git.shiar.nl/sheet" rel="vcs-git" title="Git repository"><:= "v$VERSION" :></a>
195         created by <a href="http://shiar.nl/" rel="author">Shiar</a> •
196         <a title="Licensed under the GNU Affero General Public License, version 3" rel="copyright"
197            href="http://www.fsf.org/licensing/licenses/agpl-3.0.html">AGPLv3</a> •
198         last update <:
199                 use Time::Format qw(time_format);
200                 print time_format('yyyy-mm-dd', (stat $ENV{SCRIPT_FILENAME})[9]);
201         :>
202 </p>
203
204 </html>