# Make :utf8 the default PerlIO layer for input and output handles opened in this scope.
5 use open IO => ':utf8';
# Declare the response as UTF-8 HTML.
# NOTE(review): %header is presumably the templating layer's HTTP header hash - confirm.
9 $header{content_type} = 'text/html; charset=utf-8';
11 :><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
12 "http://www.w3.org/TR/html4/loose.dtd">
16 <title>charset cheat sheet</title>
17 <meta http-equiv="content-type" content="text/html; charset=utf-8">
18 <link rel="stylesheet" type="text/css" media="all" href="/base.css">
22 <h1>Character encoding</h1>
# Load the character/digraph info table by evaluating digraphs.inc.pl; the
# result is used below as a hash reference.
# NOTE(review): the return value of do() is not checked here - confirm that a
# missing or broken include is acceptable.
25 my $diinfo = do 'digraphs.inc.pl';
# Reverse index: first element of each entry (looked up later by the code
# point of a glyph) => the entry's key in $diinfo. Only entries that hold a
# reference are considered.
26 my %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} }
29 use Encode qw(decode resolve_alias);
30 # generate character table(s)
31 # (~16x faster than decoding in a loop;
32 # taking substrings is twice as fast as splitting into an array)
# Request shortcuts (presumably the %ALIAS table consulted when request names
# are expanded): the name on the left stands for the list of character sets on
# the right. Leading "-" / "--" and a trailing "-" on a set name are stripped
# by the request parser and select which byte range of that set is shown.
34 # default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
35 default => [qw(unicode utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
36 0 => [qw(cp437 cp863)],
37 1 => [qw(iso-8859-1 cp1252 MacRoman cp850)],
38 2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)],
39 5 => [qw(koi8-f iso-8859-5 cp1251 MacCyrillic cp855 cp866)],
40 7 => [qw(iso-8859-7 cp1253 MacGreek cp737 cp869)],
41 8 => [qw(iso-8859-8 cp1255 MacHebrew cp862)],
# Describe one requested character set as a %row: its resolved name (set),
# the first byte value shown (offset) and the decoded characters (table).
45 my %row = (offset => 0);
# A leading "--" starts the table at 160 (or at 48 when the end point is not above 160).
47 if ($input =~ s/^--//) {
48 $row{offset} = $endpoint > 160 ? 160 : 48;
# A single leading "-" starts it at 128 (or at 32 when the end point is not above 128).
50 elsif ($input =~ s/^-//) {
51 $row{offset} = $endpoint > 128 ? 128 : 32;
# A trailing "-" cuts the table short: it ends at 127 without an offset,
# otherwise at 159 (offset below 160) or 191.
53 if ($input =~ s/-$//) {
54 $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
# Canonicalise the remaining name through Encode; a false result means the
# encoding is not known.
56 if ($row{set} = resolve_alias($input)) {
# Perl's internal encoding stands for the Unicode overview: it gets a
# placeholder table of 640 spaces and is relabelled "Unicode BMP".
57 if ($row{set} eq 'Internal') {
58 $row{table} = ' 'x640;
59 $row{set} = 'Unicode BMP';
61 elsif ($row{set} eq 'utf-8-strict') {
# Decode all byte values from offset to end point in one call; for single-byte
# encodings table position N then holds the character for byte offset+N.
66 $row{table} = decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
# $input is derived from the request path (see the PATH_INFO split below), so
# HTML metacharacters are turned into numeric entities before it is echoed.
(my $shown = $input) =~ s/([&<>"'])/sprintf '&#%d;', ord $1/ge;
70 print "<p>Encoding $shown unknown</p>\n";
# Request names: PATH_INFO split on "/", "+" or whitespace when it contains a
# word character, otherwise the single name 'default'; names found in %ALIAS
# are replaced by the list they stand for.
77 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
78 $ENV{PATH_INFO} =~ /\w/ ? split(m{[/+\s]}, $ENV{PATH_INFO}) : 'default';
# U+FFFD REPLACEMENT CHARACTER; glyphs are compared against it when the tables are rendered.
79 my $NOCHAR = chr 0xFFFD;
# Patch the table of every requested row whose set is cp437, in place.
# NOTE(review): the positions used below (237 and 0..31) are absolute and look
# like they assume a table starting at byte 0; rows requested with a "-" or
# "--" prefix get a nonzero offset from the request parser - verify that such
# rows are handled correctly.
81 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) {
# Table position 237 is replaced by U+03D5.
82 substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
# The first 32 table positions are replaced by the code points listed in hex below.
83 substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
84 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
85 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
# printcell_unicode: print the opening <td> of one overview cell of the
# "Unicode BMP" table. The chain below compares $value against fixed start
# positions; judging by the labels, each position stands for a column of 16
# code points ($value == code point >> 4, e.g. 0x40 => U+0400 cyrillic) -
# TODO confirm. A matching branch prints a labelled cell whose colspan covers
# the following positions of that script or block.
# NOTE(review): how $value is obtained and what happens for positions that do
# not start a span is not visible in this excerpt.
97 sub printcell_unicode {
100 print "\n".'<td class="X">?';
102 elsif ($value == 0) {
103 print '<td colspan="2" class="X Cc">control';
105 elsif ($value == 2) {
106 print '<td colspan="6" class="X Ll Latin">latin';
108 elsif ($value == 8) {
109 print '<td colspan="2" class="X Cc">control';
111 elsif ($value == 10) {
112 print '<td colspan="6" class="X Ll Latin">latin supplement';
114 elsif ($value == 0x10) {
115 print '<td colspan="8" class="X Ll Latin">latin ext-A';
117 elsif ($value == 0x18) {
118 print '<td colspan="8" class="X Ll Latin">latin ext-B';
# latin ext-B carries on past the 16-column row boundary, hence a second cell.
120 elsif ($value == 0x20) {
121 print '<td colspan="5" class="X Ll Latin">latin ext-B';
123 elsif ($value == 0x25) {
124 print '<td colspan="6" class="X Ll Latin">IPA';
126 elsif ($value == 0x2B) {
127 print '<td colspan="5" class="X Sk">spacing modifier';
129 elsif ($value == 0x30) {
130 print '<td colspan="8" class="X Mn">diacritics';
132 elsif ($value == 0x38) {
133 print '<td colspan="8" class="X Ll Greek">greek';
135 elsif ($value == 0x40) {
136 print '<td colspan="16" class="X Ll Cyrillic">cyrillic';
138 elsif ($value == 0x50) {
139 print '<td colspan="3" class="X Ll Cyrillic">cyrillic+';
141 elsif ($value == 0x53) {
142 print '<td colspan="5" class="X Ll Armenian">armenian';
144 elsif ($value == 0x58) {
145 print '<td colspan="8" class="X Ll Hebrew">hebrew';
147 elsif ($value == 0x60) {
148 print '<td colspan="16" class="X Ll Arabic">arabic';
150 elsif ($value == 0x70) {
151 print '<td colspan="5" class="X Ll Aramaic">syriac';
153 elsif ($value == 0x75) {
154 print '<td colspan="3" class="X Ll Arabic">arabic+';
156 elsif ($value == 0x78) {
157 print '<td colspan="4" class="X Ll African">thaana';
159 elsif ($value == 0x7C) {
160 print '<td colspan="4" class="X Ll African">nko';
162 elsif ($value == 0x80) {
163 print '<td colspan="4" class="X di-rare">samaritan';
165 elsif ($value == 0x84) {
166 print '<td colspan="2" class="X di-rare Ll Aramaic">manda';
168 elsif ($value == 0x86) {
169 print '<td colspan="12" class="di-invalid">reserved';
171 elsif ($value == 0x90) {
172 print '<td colspan="8" class="X Ll Brahmic">devanagari';
174 elsif ($value == 0x98) {
175 print '<td colspan="8" class="X Ll Brahmic">bengali';
177 elsif ($value == 0xA0) {
178 print '<td colspan="8" class="X Ll Brahmic">gurmukhi';
180 elsif ($value == 0xA8) {
181 print '<td colspan="8" class="X Ll Brahmic">gujarati';
183 elsif ($value == 0xB0) {
184 print '<td colspan="8" class="X Ll Brahmic">oriya';
186 elsif ($value == 0xB8) {
187 print '<td colspan="8" class="X Ll Brahmic">tamil';
189 elsif ($value == 0xC0) {
190 print '<td colspan="8" class="X Ll Brahmic">telugu';
192 elsif ($value == 0xC8) {
193 print '<td colspan="8" class="X Ll Brahmic">kannada';
195 elsif ($value == 0xD0) {
196 print '<td colspan="8" class="X Ll Brahmic">malayalam';
198 elsif ($value == 0xD8) {
199 print '<td colspan="8" class="X Ll Brahmic">sinhala';
201 elsif ($value == 0xE0) {
202 print '<td colspan="8" class="X Ll Brahmic Khmer">thai';
204 elsif ($value == 0xE8) {
205 print '<td colspan="8" class="X Ll Brahmic Khmer">lao';
207 elsif ($value == 0xF0) {
208 print '<td colspan="16" class="X Ll Brahmic">tibetan';
210 elsif ($value == 0x100) {
211 print '<td colspan="10" class="X Ll Brahmic">myanmar';
213 elsif ($value == 0x10A) {
214 print '<td colspan="6" class="X Ll Aramaic">georgian';
216 elsif ($value == 0x110) {
217 print '<td colspan="16" class="X Ll Hangul">hangeul jamo';
219 elsif ($value == 0x120) {
220 print '<td colspan="16" class="X Ll African">ethiopic';
# ethiopic carries on into the next row, hence a second cell.
222 elsif ($value == 0x130) {
223 print '<td colspan="8" class="X Ll African">ethiopic';
225 elsif ($value == 0x138) {
226 print '<td colspan="2" class="X Ll African">eth+';
228 elsif ($value == 0x13A) {
229 print '<td colspan="6" class="X Ll Syllabic">cherokee';
# rowspan="2": this cell also fills the complete following row, which is why
# no branch for 0x150 appears below.
231 elsif ($value == 0x140) {
232 print '<td colspan="16" rowspan="2" class="X Ll Syllabic">unified canadian aboriginal syllabics';
234 elsif ($value == 0x160) {
235 print '<td colspan="8" class="X Ll Syllabic">unified canadian syllabics';
237 elsif ($value == 0x168) {
238 print '<td colspan="2" class="X Ll X">ogham';
240 elsif ($value == 0x16A) {
241 print '<td colspan="6" class="X Ll X">runic';
243 elsif ($value == 0x170) {
244 print '<td colspan="2" class="X Ll Brahmic">tagalog';
246 elsif ($value == 0x172) {
247 print '<td colspan="2" class="X Ll Brahmic">hanun';
249 elsif ($value == 0x174) {
250 print '<td colspan="2" class="X Ll Brahmic">buhid';
252 elsif ($value == 0x176) {
253 print '<td colspan="2" class="X Ll Brahmic" title="tagbanwa">tagb';
255 elsif ($value == 0x178) {
256 print '<td colspan="8" class="X Ll Brahmic Khmer">khmer';
258 elsif ($value == 0x180) {
259 print '<td colspan="11" class="X Ll Aramaic">mongolian';
261 elsif ($value == 0x18B) {
262 print '<td colspan="5" class="X Ll Syllabic di-rare">canadian+';
264 elsif ($value == 0x190) {
265 print '<td colspan="5" class="X Ll Brahmic">limbu';
267 elsif ($value == 0x195) {
268 print '<td colspan="4" class="X Ll Brahmic">tai le';
270 elsif ($value == 0x198) {
271 print '<td colspan="6" class="X Ll Brahmic">new tai lue';
273 elsif ($value == 0x19E) {
274 print '<td colspan="2" class="X Ll Brahmic Khmer" title="khmer symbols">km';
276 elsif ($value == 0x1A0) {
277 print '<td colspan="2" class="X Ll Brahmic">lontara';
279 elsif ($value == 0x1A2) {
280 print '<td colspan="9" class="X Ll Brahmic di-rare">tai tham';
282 elsif ($value == 0x1AB) {
283 print '<td colspan="5" class="di-invalid">reserved';
285 elsif ($value == 0x1B0) {
286 print '<td colspan="8" class="X Ll ">balinese';
288 elsif ($value == 0x1B8) {
289 print '<td colspan="4" class="X Ll ">sundanese';
291 elsif ($value == 0x1BC) {
292 print '<td colspan="4" class="X Ll di-rare">batak';
294 elsif ($value == 0x1C0) {
295 print '<td colspan="5" class="X Ll ">lepcha';
297 elsif ($value == 0x1C5) {
298 print '<td colspan="3" class="X Ll ">ol chiki';
300 elsif ($value == 0x1C8) {
301 print '<td colspan="5" class="di-invalid">reserved';
303 elsif ($value == 0x1CD) {
304 print '<td colspan="3" class="X Ll di-rare">vedic';
306 elsif ($value == 0x1D0) {
307 print '<td colspan="8" class="X Ll Latin">phonetic';
309 elsif ($value == 0x1D8) {
310 print '<td colspan="4" class="X Ll Latin">phonetic+';
312 elsif ($value == 0x1DC) {
313 print '<td colspan="4" class="X Mn">combining';
315 elsif ($value == 0x1E0) {
316 print '<td colspan="16" class="X Ll Latin">latin extended additional';
318 elsif ($value == 0x1F0) {
319 print '<td colspan="16" class="X Ll Greek">greek+';
321 elsif ($value == 0x200) {
322 print '<td colspan="7" class="X Pd">general punctuation';
324 elsif ($value == 0x207) {
325 print '<td colspan="3" class="X Latin">su[bp]script';
327 elsif ($value == 0x20A) {
328 print '<td colspan="3" class="X Sc">currency';
330 elsif ($value == 0x20D) {
331 print '<td colspan="3" class="X Mn">overlay';
333 elsif ($value == 0x210) {
334 print '<td colspan="5" class="X So">letterlike';
336 elsif ($value == 0x215) {
337 print '<td colspan="4" class="X Latin">number';
339 elsif ($value == 0x219) {
340 print '<td colspan="7" class="X So">arrows';
342 elsif ($value == 0x220) {
343 print '<td colspan="16" class="X Sm">mathematical symbols';
345 elsif ($value == 0x230) {
346 print '<td colspan="16" class="X So">miscellaneous technical';
348 elsif ($value == 0x240) {
349 print '<td colspan="4" class="X So">control';
351 elsif ($value == 0x244) {
352 print '<td colspan="2" class="X So">OCR';
354 elsif ($value == 0x246) {
355 print '<td colspan="10" class="X Latin">enclosed alphanumerics';
357 elsif ($value == 0x250) {
358 print '<td colspan="8" class="X So">box drawing';
360 elsif ($value == 0x258) {
361 print '<td colspan="2" class="X So">blocks';
363 elsif ($value == 0x25A) {
364 print '<td colspan="6" class="X So">geometric shapes';
366 elsif ($value == 0x260) {
367 print '<td colspan="16" class="X So">miscellaneous symbols';
369 elsif ($value == 0x270) {
370 print '<td colspan="12" class="X So">dingbats';
372 elsif ($value == 0x27C) {
373 print '<td colspan="3" class="X So">maths-A';
375 elsif ($value == 0x27F) {
376 print '<td colspan="1" class="X So" title="supplemental arrows-A">arr';
# Cell printer for the "UTF-8" table: $value is compared against byte ranges
# up to 0xFF, and the matching branch prints the <td> describing what a byte in
# that range means in UTF-8. The title attributes give the code point range
# reachable from that class of lead byte, written with a middle dot before the
# last four hex digits.
# NOTE(review): the condition that limits each print to the first cell of its
# range is not visible in this excerpt.
382 if ($value <= 0x7F) {
383 print '<td rowspan="8" colspan="16" class="X di-a"',
384 ' title="U+0000 – U+007F">Single byte ASCII'
387 elsif ($value <= 0xBF) {
388 print '<td rowspan="4" colspan="16" class="X di-d"',
389 '>Multi-byte continuation'
# C0 and C1 could only start overlong 2-byte forms of U+0000 - U+007F.
392 elsif ($value <= 0xC1) {
393 print '<td colspan="2" class="X di-b" style="border-right:none; border-bottom:none"',
394 ' title="U+0000 – U+007F">(Overl.)'
397 elsif ($value <= 0xDF) {
398 print '<td rowspan="2" colspan="14" class="X di-prop" style="border-left:none"',
399 ' title="U+0080 – U+03FF">2-byte sequence start'
401 print '<td rowspan="1" colspan="16" class="X di-prop" style="border-top:none"',
402 ' title="U+0400 – U+07FF">'
405 elsif ($value <= 0xEF) {
406 print '<td colspan="16" class="X di-prop"',
407 ' title="U+0800 – U+FFFF">3-byte sequence start'
410 elsif ($value <= 0xF4) {
411 print '<td colspan="5" class="X di-prop" style="border-right:none"',
412 ' title="U+1·0000 – U+10·FFFF">4-byte sequence'
# A 4-byte sequence carries 3 + 3*6 = 21 bits, so it tops out at U+1FFFFF.
415 elsif ($value <= 0xF7) {
416 print '<td colspan="3" class="X di-b" style="border-left:none"',
417 ' title="U+11·0000 – U+1F·FFFF">(Overflow)'
# A 5-byte sequence carries 2 + 4*6 = 26 bits: U+200000 - U+3FFFFFF.
420 elsif ($value <= 0xFB) {
421 print '<td colspan="4" class="X di-b"',
422 ' title="U+20·0000 – U+3FF·FFFF">5-byte'
# A 6-byte sequence carries 1 + 5*6 = 31 bits: U+4000000 - U+7FFFFFFF.
425 elsif ($value <= 0xFD) {
426 print '<td colspan="2" class="X di-b"',
427 ' title="U+400·0000 – U+7FFF·FFFF">6-byte'
430 elsif ($value <= 0xFF) {
431 print '<td colspan="2" class="di-invalid">Invalid'
435 print "\n".'<td class="X">?';
# Render one HTML table per requested row: a header line with the 16 low
# nibbles, then one table row per group of 16 table positions.
441 my @nibble = (0..9, 'A'..'F');
442 for my $row (@request) {
443 print '<li><table class="glyphs">';
444 printf '<caption>%s</caption>', $row->{set};
446 for my $section (qw{thead}) {
447 print "<$section><tr><th>↱";
448 print '<th>', $_ for @nibble;
# Number of rows follows the table length (256 positions when the table is
# empty or missing); the row heading is the high nibble including the offset.
452 for my $msb (0 .. (length($row->{table}) || 256) - 1 >> 4) {
453 printf '<tr><th>%X', $msb + ($row->{offset} >> 4);
454 for my $lsb (0 .. $#nibble) {
# The two overview tables are drawn by dedicated cell printers.
455 if ($row->{set} eq 'UTF-8') {
456 printcell_utf8(($msb<<4) + $lsb);
459 elsif ($row->{set} eq 'Unicode BMP') {
460 printcell_unicode(($msb<<4) + $lsb);
# Otherwise show the decoded character at this table position.
464 my $glyph = substr $row->{table}, ($msb<<4) + $lsb, 1;
465 if ($glyph eq $NOCHAR) {
# Default info holds just the code point; it is replaced by the full digraph
# entry when the reverse index knows this character.
469 my $info = [ord $glyph];
470 if (defined (my $mnem = $di{ord $glyph})) {
471 $info = $diinfo->{$mnem};
473 my ($codepoint, $name, $prop, $script, $string) = @$info;
475 $glyph = quote($string || $glyph);
# $name and $prop are undefined for characters without a digraph entry, so
# both are guarded to avoid uninitialized-value warnings.
476 my $desc = sprintf 'U+%04X%s', $codepoint, $name ? " ($name)" : '';
477 my @class = ('X', grep {$_} $prop, $script);
479 $glyph = "<span>$glyph</span>" if defined $prop and $prop eq 'Zs';
481 printf "\n".'<td class="%s" title="%s">%s',
482 join(' ', @class), quote($desc), $glyph;
495 <a href="http://sheet.shiar.nl/" rel="home">sheet.shiar.nl</a>/charset
496 <a href="git://git.shiar.nl/sheet" rel="vcs-git" title="Git repository"><:= "v$VERSION" :></a>
497 created by <a href="http://shiar.nl/" rel="author">Shiar</a> •
498 <a title="Licensed under the GNU Affero General Public License, version 3" rel="copyright"
499 href="http://www.fsf.org/licensing/licenses/agpl-3.0.html">AGPLv3</a> •
501 use Time::Format qw(time_format);
# Print the modification time of the running script file (element 9 of stat)
# formatted as yyyy-mm-dd.
502 print time_format('yyyy-mm-dd', (stat $ENV{SCRIPT_FILENAME})[9]);