5 use open IO => ':utf8';
9 $header{content_type} = 'text/html; charset=utf-8';
11 :><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
12 "http://www.w3.org/TR/html4/loose.dtd">
16 <title>charset cheat sheet</title>
17 <meta http-equiv="content-type" content="utf-8">
18 <link rel="stylesheet" type="text/css" media="all" href="/base.css">
22 <h1>Character encoding</h1>
25 my $diinfo = do 'digraphs.inc.pl';
26 my %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} }
29 use Encode qw(decode resolve_alias);
30 # generate character table(s)
31 # (~16x faster than decoding in loop;
32 # substr strings is twice as fast as splitting to an array)
34 default => [qw(utf-8 iso-8859-1 cp437)],
35 0 => [qw(cp437 cp863)],
36 1 => [qw(iso-8859-1 cp1252 MacRoman cp850)],
37 2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)],
38 5 => [qw(koi8-f iso-8859-5 cp1251 MacCyrillic cp855 cp866)],
39 7 => [qw(iso-8859-7 cp1253 MacGreek cp737 cp869)],
40 8 => [qw(iso-8859-8 cp1255 MacHebrew cp862)],
42 my @request = grep { defined } map {
43 $_ ? (resolve_alias($_) or print("Encoding $_ unknown") && ()) : ();
44 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
45 $ENV{PATH_INFO} =~ /\w/ ? split(m{[/+\s]}, $ENV{PATH_INFO}) : 'default';
46 my @tables = map { $_ eq 'utf-8-strict' ? undef : $_ eq 'Internal' ? ' 'x512 : decode($_, pack 'C*', 0..255) } @request;
47 my $NOCHAR = chr 0xFFFD;
49 for my $cp437 (grep {$request[$_] eq 'cp437'} 0 .. $#request) {
50 substr($tables[$cp437], 237, 1) = pack 'U*', 0x3D5; # phi sign
51 substr($tables[$cp437], 0, 32) = pack 'U*', map {hex} qw(
52 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
53 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
65 sub printcell_unicode {
68 print "\n".'<td class="X">?';
71 print '<td colspan="2" class="X Cc">control';
74 print '<td colspan="6" class="X Ll Latin">latin';
77 print '<td colspan="2" class="X Cc">control';
79 elsif ($value == 10) {
80 print '<td colspan="6" class="X Ll Latin">latin supplement';
82 elsif ($value == 0x10) {
83 print '<td colspan="8" class="X Ll Latin">latin ext-A';
85 elsif ($value == 0x18) {
86 print '<td colspan="8" class="X Ll Latin">latin ext-B';
88 elsif ($value == 0x20) {
89 print '<td colspan="5" class="X Ll Latin">latin ext-B';
91 elsif ($value == 0x25) {
92 print '<td colspan="6" class="X Ll Latin">IPA';
94 elsif ($value == 0x2B) {
95 print '<td colspan="5" class="X Sk">spacing modifier';
97 elsif ($value == 0x30) {
98 print '<td colspan="8" class="X Mn">diacritics';
100 elsif ($value == 0x38) {
101 print '<td colspan="8" class="X Ll Greek">greek';
103 elsif ($value == 0x40) {
104 print '<td colspan="16" class="X Ll Cyrillic">cyrillic';
106 elsif ($value == 0x50) {
107 print '<td colspan="3" class="X Ll Cyrillic">cyrillic+';
109 elsif ($value == 0x53) {
110 print '<td colspan="5" class="X Ll Armenian">armenian';
112 elsif ($value == 0x58) {
113 print '<td colspan="8" class="X Ll Hebrew">hebrew';
115 elsif ($value == 0x60) {
116 print '<td colspan="16" class="X Ll Arabic">arabic';
118 elsif ($value == 0x70) {
119 print '<td colspan="5" class="X Ll Aramaic">syriac';
121 elsif ($value == 0x75) {
122 print '<td colspan="3" class="X Ll Arabic">arabic+';
124 elsif ($value == 0x78) {
125 print '<td colspan="4" class="X Ll African">thaana';
127 elsif ($value == 0x7C) {
128 print '<td colspan="4" class="X Ll African">nko';
130 elsif ($value == 0x80) {
131 print '<td colspan="4" class="X di-rare">samaritan';
133 elsif ($value == 0x84) {
134 print '<td colspan="2" class="X di-rare Ll Aramaic">manda';
136 elsif ($value == 0x86) {
137 print '<td colspan="12" class="di-invalid">reserved';
139 elsif ($value == 0x90) {
140 print '<td colspan="8" class="X Ll Brahmic">devanagari';
142 elsif ($value == 0x98) {
143 print '<td colspan="8" class="X Ll Brahmic">bengali';
145 elsif ($value == 0xA0) {
146 print '<td colspan="8" class="X Ll Brahmic">gurmukhi';
148 elsif ($value == 0xA8) {
149 print '<td colspan="8" class="X Ll Brahmic">gujarati';
151 elsif ($value == 0xB0) {
152 print '<td colspan="8" class="X Ll Brahmic">oriya';
154 elsif ($value == 0xB8) {
155 print '<td colspan="8" class="X Ll Brahmic">tamil';
157 elsif ($value == 0xC0) {
158 print '<td colspan="8" class="X Ll Brahmic">telugu';
160 elsif ($value == 0xC8) {
161 print '<td colspan="8" class="X Ll Brahmic">kannada';
163 elsif ($value == 0xD0) {
164 print '<td colspan="8" class="X Ll Brahmic">malayalam';
166 elsif ($value == 0xD8) {
167 print '<td colspan="8" class="X Ll Brahmic">sinhala';
169 elsif ($value == 0xE0) {
170 print '<td colspan="8" class="X Ll Brahmic Khmer">thai';
172 elsif ($value == 0xE8) {
173 print '<td colspan="8" class="X Ll Brahmic Khmer">lao';
175 elsif ($value == 0xF0) {
176 print '<td colspan="16" class="X Ll Brahmic">tibetan';
178 elsif ($value == 0x100) {
179 print '<td colspan="10" class="X Ll Brahmic">myanmar';
181 elsif ($value == 0x10A) {
182 print '<td colspan="6" class="X Ll Aramaic">georgian';
184 elsif ($value == 0x110) {
185 print '<td colspan="16" class="X Ll Hangul">hangeul jamo';
187 elsif ($value == 0x120) {
188 print '<td colspan="16" class="X Ll African">ethiopic';
190 elsif ($value == 0x130) {
191 print '<td colspan="8" class="X Ll African">ethiopic';
193 elsif ($value == 0x138) {
194 print '<td colspan="2" class="X Ll African">eth+';
196 elsif ($value == 0x13A) {
197 print '<td colspan="6" class="X Ll X">cherokee';
199 elsif ($value == 0x140) {
200 print '<td colspan="16" rowspan="2" class="X Ll Syllabic">unified canadian aboriginal syllabics';
202 elsif ($value == 0x160) {
203 print '<td colspan="8" class="X Ll Syllabic">unified canadian syllabics';
205 elsif ($value == 0x168) {
206 print '<td colspan="2" class="X Ll X">ogham';
208 elsif ($value == 0x16A) {
209 print '<td colspan="6" class="X Ll X">runic';
211 elsif ($value == 0x170) {
212 print '<td colspan="2" class="X Ll Brahmic">tagalog';
214 elsif ($value == 0x172) {
215 print '<td colspan="2" class="X Ll Brahmic">hanun';
217 elsif ($value == 0x174) {
218 print '<td colspan="2" class="X Ll Brahmic">buhid';
220 elsif ($value == 0x176) {
221 print '<td colspan="2" class="X Ll Brahmic" title="tagbanwa">tagb';
223 elsif ($value == 0x178) {
224 print '<td colspan="8" class="X Ll Brahmic Khmer">khmer';
226 elsif ($value == 0x180) {
227 print '<td colspan="11" class="X Ll Aramaic">mongolian';
229 elsif ($value == 0x18B) {
230 print '<td colspan="5" class="X Ll Syllabic di-rare">canadian+';
232 elsif ($value == 0x190) {
233 print '<td colspan="5" class="X Ll Brahmic">limbu';
235 elsif ($value == 0x195) {
236 print '<td colspan="4" class="X Ll Brahmic">tai le';
238 elsif ($value == 0x198) {
239 print '<td colspan="6" class="X Ll Brahmic">new tai lue';
241 elsif ($value == 0x19E) {
242 print '<td colspan="2" class="X Ll Brahmic Khmer" title="khmer symbols">km';
244 elsif ($value == 0x1A0) {
245 print '<td colspan="2" class="X Ll Brahmic">lontara';
247 elsif ($value == 0x1A2) {
248 print '<td colspan="9" class="X Ll Brahmic di-rare">tai tham';
250 elsif ($value == 0x1AB) {
251 print '<td colspan="5" class="di-invalid">reserved';
253 elsif ($value == 0x1B0) {
254 print '<td colspan="8" class="X Ll ">balinese';
256 elsif ($value == 0x1B8) {
257 print '<td colspan="4" class="X Ll ">sundanese';
259 elsif ($value == 0x1BC) {
260 print '<td colspan="4" class="X Ll di-rare">batak';
262 elsif ($value == 0x1C0) {
263 print '<td colspan="5" class="X Ll ">lepcha';
265 elsif ($value == 0x1C5) {
266 print '<td colspan="3" class="X Ll ">ol chiki';
268 elsif ($value == 0x1C8) {
269 print '<td colspan="5" class="di-invalid">reserved';
271 elsif ($value == 0x1CD) {
272 print '<td colspan="3" class="X Ll di-rare">vedic';
274 elsif ($value == 0x1D0) {
275 print '<td colspan="8" class="X Ll Latin">phonetic';
277 elsif ($value == 0x1D8) {
278 print '<td colspan="4" class="X Ll Latin">phonetic+';
280 elsif ($value == 0x1DC) {
281 print '<td colspan="4" class="X Mn">combining';
283 elsif ($value == 0x1E0) {
284 print '<td colspan="16" class="X Ll Latin">latin extended additional';
286 elsif ($value == 0x1F0) {
287 print '<td colspan="16" class="X Ll Greek">greek+';
293 if ($value <= 0x7F) {
294 print '<td rowspan="8" colspan="16" class="X di-a"',
295 ' title="U+0000 – U+007F">Single byte ASCII'
298 elsif ($value <= 0xBF) {
299 print '<td rowspan="4" colspan="16" class="X di-d"',
300 '>Multi-byte continuation'
303 elsif ($value <= 0xC1) {
304 print '<td colspan="2" class="X di-b" style="border-right:none; border-bottom:none"',
305 ' title="U+0000 – U+007F">(Overl.)'
308 elsif ($value <= 0xDF) {
309 print '<td rowspan="2" colspan="14" class="X di-prop" style="border-left:none"',
310 ' title="U+0080 – U+03FF">2-byte sequence start'
312 print '<td rowspan="1" colspan="16" class="X di-prop" style="border-top:none"',
313 ' title="U+0400 – U+07FF">'
316 elsif ($value <= 0xEF) {
317 print '<td colspan="16" class="X di-prop"',
318 ' title="U+0800 – U+FFFF">3-byte sequence start'
321 elsif ($value <= 0xF4) {
322 print '<td colspan="5" class="X di-prop" style="border-right:none"',
323 ' title="U+1·0000 – U+10·FFFF">4-byte sequence'
326 elsif ($value <= 0xF7) {
327 print '<td colspan="3" class="X di-b" style="border-left:none"',
328 ' title="U+11·0000 – U+1FF·FFFF">(Overflow)'
331 elsif ($value <= 0xFB) {
332 print '<td colspan="4" class="X di-b"',
333 ' title="U+200·0000 – U+3FFF·FFFF">5-byte'
336 elsif ($value <= 0xFD) {
337 print '<td colspan="2" class="X di-b"',
338 ' title="U+4000·0000 – 7FFFF·FFFF">6-byte'
341 elsif ($value <= 0xFF) {
342 print '<td colspan="2" class="di-invalid">Invalid'
346 print "\n".'<td class="X">?';
352 my @nibble = (0..9, 'A'..'F');
353 for my $tablenum (0 .. $#tables) {
354 print '<li><table class="glyphs">';
355 printf '<caption>%s</caption>', $request[$tablenum] eq 'utf-8-strict' ? 'UTF-8' : $request[$tablenum];
357 for my $section (qw{thead}) {
358 print "<$section><tr><th>↱";
359 print '<th>', $_ for @nibble;
363 for my $msb (0 .. (length($tables[$tablenum]) || 256) - 1 >> 4) {
364 printf '<tr><th>%X', $msb;
365 for my $lsb (0 .. $#nibble) {
366 if ($request[$tablenum] eq 'utf-8-strict') {
367 printcell_utf8(($msb<<4) + $lsb);
370 elsif ($request[$tablenum] eq 'Internal') {
371 printcell_unicode(($msb<<4) + $lsb);
375 my $glyph = substr $tables[$tablenum], ($msb<<4) + $lsb, 1;
376 if ($glyph eq $NOCHAR) {
380 my $info = [ord $glyph];
381 if (defined (my $mnem = $di{ord $glyph})) {
382 $info = $diinfo->{$mnem};
384 my ($codepoint, $name, $prop, $script, $string) = @$info;
386 $glyph = quote($string || $glyph);
387 my $desc = sprintf 'U+%04X%s', $codepoint, $name && " ($name)";
388 my @class = ('X', grep {$_} $prop, $script);
390 $glyph = "<span>$glyph</span>" if $prop eq 'Zs';
392 printf "\n".'<td class="%s" title="%s">%s',
393 join(' ', @class), quote($desc), $glyph;
406 <a href="http://sheet.shiar.nl/" rel="home">sheet.shiar.nl</a>/charset
407 <a href="git://git.shiar.nl/sheet" rel="vcs-git" title="Git repository"><:= "v$VERSION" :></a>
408 created by <a href="http://shiar.nl/" rel="author">Shiar</a> •
409 <a title="Licensed under the GNU Affero General Public License, version 3" rel="copyright"
410 href="http://www.fsf.org/licensing/licenses/agpl-3.0.html">AGPLv3</a> •
412 use Time::Format qw(time_format);
413 print time_format('yyyy-mm-dd', (stat $ENV{SCRIPT_FILENAME})[9]);