4 title => 'charset cheat sheet',
7 "Reference sheet with all glyphs in common character encoding tables,",
8 "and an overview of Unicode ranges and UTF-8 bytes.",
11 charset codepage unicode ascii utf8 latin glyph character encoding
12 reference common overview table
14 stylesheet => [qw'light'],
15 data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
# Split the request path on (runs of) slashes into the names of the tables
# to show; an empty or false request selects the 'default' set.
18 my @tablist = split m{/+}, $Request || 'default';
# Pair helpers from List::Util; pairmap and pairfirst are used further down.
20 use List::Util qw( first pairmap pairfirst pairs );
23 <h1>Character encodings</h1>
# Introductory text: the default page gets an overview sentence followed by
# the lead-in for the list of alternate charsets.
27 if ($tablist[0] eq 'default') {
28 say "Overview of Unicode allocation and common latin code pages.";
29 say "Compare alternate charsets:";
# NOTE(review): presumably the alternative branch for specific requests;
# its branch header is not visible here.
32 say "Charset comparison:";
# Arguments: link text, target URL, and a flag telling whether this option
# is the one currently selected.
36 my ($title, $href, $selected) = @_;
# Selected options are rendered as bold text, others as a link.  The link
# format takes the href through the explicit index %2$s, so the plain %s
# is filled with the first argument (the escaped title) in both variants.
38 $selected ? '<strong>%s</strong>' : '<a href="%2$s">%s</a>',
39 EscapeHTML($title), $href
# One option per name/label pair, joined by middle dots.  A false label
# falls back to the capitalised name; links point to the comparison view of
# that name, and the entry equal to the current request is marked selected.
45 join " ·\n", pairmap {
46 optionlink($b || ucfirst $a, "/charset/$a?compare", $a eq $Request);
58 norteur => 'North European',
# Shared glyph formatter; its glyph_html and glyph_cell methods render the
# individual character cells in the table output below.
70 use Shiar_Sheet::FormatChar;
71 my $glyphs = Shiar_Sheet::FormatChar->new;
75 # generate character table(s)
# Takes a single table or group name; does nothing without one.
76 my $input = shift or return;
# Group names mapped to the tables they expand to.  Members are either
# encoding names, names with +range parameters, U-prefixed Unicode blocks,
# or other group names (norteur lists baltic and nordic).
# First the default set and groups by platform or standard.
79 default => [qw( unicode+0-639 utf-8 iso-8859-1 iso-8859-15 cp1252 cp437 cp850 )],
80 us => [qw( cp437 cp863 gsm0338 AdobeStandardEncoding )],
81 ebcdic => [qw( cp37 cp500 cp1047 posix-bc cp1026 cp875 )],
82 iso => [map {"iso-8859-$_"} 1 .. 11, 13 .. 16],
83 dos => [qw( cp437 cp865 cp861 cp860 cp863 cp850 cp857 cp852 cp775
84 cp737 cp869 cp866 cp855 cp862 cp864 )],
85 aix => [qw( cp1006 )],
86 win => [qw( cp1252 cp1250 cp1254 cp1257 cp1258 cp1253 cp1251 cp1255 cp1256 cp874 )],
87 mac => [qw( MacRoman MacRomanian MacRumanian MacCroatian MacCentralEurRoman MacTurkish MacIcelandic MacSami
88 MacGreek MacCyrillic MacHebrew MacArabic MacFarsi MacThai )],
# Groups by region or script, combining encodings from several platforms.
91 westeur => [qw( iso-8859-1 iso-8859-15 cp1252 iso-8859-14 cp850 hp-roman8 nextstep MacRoman )],
92 centeur => [qw( iso-8859-2 iso-8859-16 cp1250 cp852 MacRomanian MacCroatian MacCentralEurRoman )], # MacRumanian only for DB
93 turkish => [qw( iso-8859-9 iso-8859-3 cp1254 cp857 MacTurkish )],
94 norteur => [qw( baltic nordic )],
95 baltic => [qw( iso-8859-4 iso-8859-13 cp1257 cp775 )],
96 nordic => [qw( iso-8859-10 cp865 cp861 MacIcelandic MacSami )],
# Script groups end with the matching Unicode blocks in U<hex> notation.
97 cyrillic => [qw( koi8-r koi8-u koi8-f iso-8859-5 cp1251 MacCyrillic cp866 cp855
98 U4 U2DE-2DF UA64-A69 U50-52 )], # MacUkrainian is broken
99 arabic => [qw( iso-8859-6 cp1256 MacArabic cp864 cp1006 MacFarsi
100 U6 U8A-8F+0-31+64 U75-77 )],
101 greek => [qw( iso-8859-7 cp1253 MacGreek cp737 cp869 U37-3F U1F )],
102 hebrew => [qw( iso-8859-8 cp1255 MacHebrew cp862 U59-5F )],
103 thai => [qw( iso-8859-11 cp874 MacThai )],
104 vietnamese => [qw( viscii cp1258 MacVietnamese )],
105 symbol => [qw( symbol dingbats MacDingbats AdobeZdingbat AdobeSymbol )],
# A group name is expanded by recursing into each of its members; a plain
# scalar alias is followed as a single name.
120 if (my $follow = $ALIAS->{$input}) {
121 tabinput($_) for ref $follow ? @{$follow} : $follow;
# $visible persists between calls.  Further down it is set to 1 for every
# set that gets a table (plus 'ascii'), and to 0 for a table that is only
# referenced as a parent, so "defined" and "true" mean different things.
125 state $visible = {}; # all present tables
# Row layout defaults: start at code 0, 16 columns per row.
126 my %row = (offset => 0, cols => 16);
# Anything after the first '+' in the name is split off as explicit range
# parameters; the name itself is shortened accordingly.
128 my $params = $input =~ s/[+](.*)\z// ? $1 : undef;
# Without explicit parameters, derive the range from related tables.
130 if (not defined $params) {
# Per table: a flat list of (parent => range) pairs, searched in order for a
# parent that is already known.  Ranges use the same parameter syntax that
# is parsed below (start, optional -stop, optional +restart).
132 'cp437' => ['cp850' => 0, 'ascii' => '0-31+128'], # ascii range overridden later
133 'gsm0338' => ['ascii' => '0-127'],
135 'iso-8859-2' => ['iso-8859-1' => '160'],
136 'iso-8859-3' => ['iso-8859-1' => '160'], #TODO: also apply to iso-8859-9
137 'iso-8859-4' => ['iso-8859-2' => '160'],
138 'iso-8859-5' => ['iso-8859-1' => '160'],
139 'iso-8859-6' => ['cp1256' => '128', 'iso-8859-1' => '160'],
140 'iso-8859-7' => ['iso-8859-1' => '160'],
141 'iso-8859-8' => ['iso-8859-1' => '160'],
142 'iso-8859-9' => ['iso-8859-1' => '208-223+240'],
143 'iso-8859-10' => ['iso-8859-4' => '160'],
144 'iso-8859-11' => ['iso-8859-1' => '160'],
145 'iso-8859-13' => ['iso-8859-4' => '160'],
146 'iso-8859-14' => ['iso-8859-1' => '160'],
147 'iso-8859-15' => ['iso-8859-1' => '160-191'],
148 'iso-8859-16' => ['iso-8859-2' => '160'],
149 'hp-roman8' => ['iso-8859-1' => '160'],
151 'cp1252' => ['iso-8859-1' => '128-159'],
152 'cp1250' => ['iso-8859-2' => '128-191', 'cp1252' => '128'],
153 'cp1254' => ['iso-8859-9' => '128-159', 'cp1252' => '128-159+208'],
154 'cp874' => ['iso-8859-11' => '128-159', 'cp1252' => '128'], # windows-874 actually cp1162
155 'cp1257' => ['iso-8859-13' => '128-159+255', 'cp1252' => '128'],
156 'cp1251' => ['cp1252' => '128'],
157 'cp1253' => ['cp1252' => '128'],
158 'cp1255' => ['iso-8859-8' => '128-223', 'cp1252' => '128'],
159 'cp1256' => ['cp1252' => '128'],
160 'cp1258' => ['cp1252' => '128-159+192'],
162 'cp850' => ['cp437' => '144'],
163 'cp860' => ['cp437' => '128-175'],
164 'cp861' => ['cp865' => '128-175'],
165 'cp863' => ['cp437' => '128-175'],
166 'cp865' => ['cp437' => '144-175'],
167 'cp852' => ['cp850' => '128', 'cp437' => '128'],
168 'cp857' => ['cp850' => '128-175+208-239', 'cp437' => '128'],
169 'cp775' => ['cp850' => '128'], # partial cp437
170 'cp866' => ['cp437' => '128-175+224'],
171 'cp855' => ['cp437' => '128'],
172 'cp1006' => ['iso-8859-6' => '160', 'cp437' => '128'],
173 'cp737' => ['cp437' => '128-175+224'],
174 'cp869' => ['cp437' => '128'],
175 'cp862' => ['cp437' => '128-159'],
176 'cp864' => ['MacArabic' => '128', 'iso-8859-6' => '128', 'cp437' => '128'], #TODO: compare form variants
178 'koi8-u' => ['koi8-r' => 128],
179 'koi8-f' => ['koi8-u' => 128],
181 'MacRomanian' => ['MacRoman' => '160-191+208-223'],
182 'MacRumanian' => ['MacRomanian' => '160-191+208-223', 'MacRoman' => '160-191+208-223'],
183 'MacCroatian' => ['MacRoman' => '160'],
184 'MacCentralEurRoman' => ['MacRoman' => '128'],
185 'MacIcelandic'=> ['MacRoman' => '160-175+208-239'], #TODO: gaps at C/E
186 'MacTurkish' => ['MacRoman' => '208-223'], # F5 is unassigned
187 'MacSami' => ['MacIcelandic' => '144', 'MacRoman' => '144'],
188 'MacGreek' => ['MacRoman' => '128'],
189 'MacCyrillic' => ['MacRoman' => '128'],
190 'MacHebrew' => ['iso-8859-8' => '128', 'MacRoman' => '128-143+160'], # partial ascii
191 'MacArabic' => ['iso-8859-6' => '128', 'cp864' => '128', 'MacRoman' => '128'], #TODO: multiple parents
192 'MacFarsi' => ['MacArabic' => '176-191', 'MacRoman' => '128'],
194 'cp37' => ['posix-bc' => '0'],
195 'posix-bc' => ['cp1047' => '64'],
196 'cp500' => ['cp37' => '64-95+176-191'],
197 'cp1047' => ['cp37' => '16-95+160-191'], #TODO: gap at 3/4
198 'cp1026' => ['cp37' => '64'],
199 'cp875' => ['cp37' => '48'],
# Candidate parents for this table; empty when it has no entry.
203 my @parents = @{ $INHERIT->{$input} || [] };
# Prefer the first listed parent that is already known, shown or not.
205 if (my ($parent, $part) = pairfirst { defined $visible->{$a} } @parents) {
206 $row{parent} = $parent;
# Fall back to the upper half (128) unless the parent was really output;
# MacCroatian is exempt from this once MacRomanian is known.
208 $params = 128 unless $visible->{$parent}
209 or ($input eq 'MacCroatian' and defined $visible->{MacRomanian});
# No known parent, but some table was shown (every shown table also sets
# 'ascii'): take the primary parent's range, starting at 128 at the latest.
211 elsif (defined $visible->{ascii}) {
212 $row{parent} = $parents[0];
213 $params = $parents[1] // 128;
214 $params = 128 if $params >= 128; # ascii offset at most
# NOTE(review): presumably the final else branch (nothing shown yet); its
# header is not visible here.  Only a range that numifies to 0 is taken.
217 $row{parent} = $parents[0];
218 $params = $parents[1] if $parents[1] == 0; # apply ascii end
# Register the chosen parent as known (0) without marking it as shown.
220 $visible->{$_} //= 0 for $row{parent} || ();
223 # manual option to double table width
# A trailing '+' on the parameters is removed and doubles the column count.
224 $row{cols} *= 2 if $params =~ s/[+]\z//;
# Parse remaining (explicit or inherited) range parameters, if any.
227 if (length $params) {
230 (?: (?: [-] (?<stop> \d+) )? (?: [+] (?<restart> \d+) ) )?
231 (?: [-] (?<endpoint> \d+) )? \z
233 "Unknown range parameters for $input",
234 "<q>$params</q> is not in format start(-stop)(+restart(-end))",
# offset is the first code shown; a true endpoint capture replaces the
# default last code.
237 $row{offset} = $+{offset};
238 $endpoint = $+{endpoint} if $+{endpoint};
# With a restart value, every whole row after the one containing stop (or
# the start offset when no stop is given) and before the restart row is
# flagged to be skipped.  Skip keys are positions relative to the offset.
239 if (my $restart = $+{restart}) {
240 my $skip = int(($+{stop} || $row{offset}) / $row{cols});
241 for ($skip + 1 .. ($restart / $row{cols}) - 1) {
242 $row{skip}->{ $_ * $row{cols} - $row{offset} }++;
# Unicode block notation: "U<hex>" shows the 256 code points starting at
# <hex>00, while "U<hex>-<hex>" gives the first and last row in units of 16
# code points (the last row is included up to its final column).
# The pattern is anchored at the end so that encoding names which merely
# begin with U and a hex digit (such as UCS-2) are not mistaken for a block
# and still reach the Encode lookup; the end row is tested with defined so
# that a literal 0 counts as given.
247 if ($input =~ /^U([0-9a-fA-F]+)(?:-([0-9a-fA-F]+))?\z/) {
248 my $start = hex($1) << (defined $2 ? 4 : 8);
249 my $end = defined $2 ? (hex($2) << 4) + $row{cols} - 1 : $start + 255;
# The table is simply the string of all code points in the range.
250 $row{table} = join '', map { chr } $start .. $end;
251 utf8::upgrade($row{table}); # prevent latin1 output
# endpoint is relative to the table start; the title names the 256-code
# page of the first code point and offset is the position within that page.
252 $row{endpoint} = $end - $start;
253 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
254 $row{offset} = $start % 256;
# A bare "U" shows the Unicode planes, with range data read from an include
# file; a false result from the include is reported through Alert.
256 elsif ($input eq 'U') {
257 $row{set} = 'Unicode planes';
258 $row{cell} = do 'charset-ucplanes.inc.pl'
259 or Alert('Table data could not be read', $@ || $!);
261 $row{endpoint} = 1023 * $row{cell}->{colsize};
# Anything else has to be an encoding name that Encode can resolve; the
# canonical name becomes the table title.
263 elsif ($row{set} = Encode::resolve_alias($input)) {
# Names resolving to Encode's 'Internal' get the Unicode BMP overview from
# range data instead of a decoded glyph table.
264 if ($row{set} eq 'Internal') {
265 $row{set} = 'Unicode BMP';
266 $row{cell} = do 'charset-unicode.inc.pl'
267 or Alert('Table data could not be read', $@ || $!);
268 $row{endpoint} = ($endpoint || 8191) * $row{cell}->{colsize};
# UTF-8 is likewise described by range data, covering byte values up to 255.
270 elsif ($row{set} eq 'utf-8-strict') {
272 $row{cell} = do 'charset-utf8.inc.pl'
273 or Alert('Table data could not be read', $@ || $!);
274 $row{endpoint} = 255;
# Regular encodings: decode the byte values from offset to endpoint.
277 if ($row{set} eq 'MacHebrew' or $row{set} eq 'MacThai') {
278 # array of possibly multiple characters per code point
280 map { Encode::decode($row{set}, pack 'C*', $_) } $row{offset} .. $endpoint
284 # ~16x faster than decoding in loop;
285 # substr strings is twice as fast as splitting to an array
286 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
# From here on the endpoint is relative to the table start.
288 $row{endpoint} = $endpoint - $row{offset};
# cp437 shown from code 0: position 237 is replaced by U+03D5 and the
# first 32 positions by the symbol code points listed below.
290 if ($row{set} eq 'cp437' and !$row{offset}) {
291 substr($row{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
292 substr($row{table}, 0, 32) = pack 'U*', map {hex} qw(
293 2007 263A 263B 2665 2666 2663 2660 2022
294 25D8 25CB 25D9 2642 2640 266A 266B 263C
295 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8
296 2191 2193 2192 2190 221F 2194 25B2 25BC
# Mark this set (and ascii along with it) as shown for later tables.
300 $visible->{ascii} = # assume common base
301 $visible->{ $row{set} } = 1;
# Name could not be resolved by any of the branches above.
305 Alert("Encoding <q>$input</q> unknown");
# Queue the finished table description for output.
308 push @request, \%row;
# Build a table description for every requested name.
310 tabinput($_) for @tablist;
# U+FFFD replacement character; each glyph is compared against it in the
# output loop below.
312 my $NOCHAR = chr 0xFFFD;
# Arguments: the table description and the current position in it.
# Returns the opening <td> with its label for a range defined at exactly
# this position, or nothing if the table has no range data or no entry here.
315 my ($info, $offset) = @_;
316 my $table = $info->{cell} or return;
317 my $def = $table->{$offset} or return;
# Entry layout: length in cells, class string, cell label, tooltip text.
318 my ($len, $class, $name, $title) = @{$def};
320 my $cols = $info->{cols};
# Positions advance by colsize per column (1 if the data does not say).
321 my $colsize = $table->{colsize} || 1;
# Unlabelled ranges are marked reserved, abbreviated when at most 2 cells.
324 $name //= $len <= 2 ? 'res' : 'reserved';
# Range starts partway into a row ($part is the non-zero column index).
326 if (my $part = $offset/$colsize % $cols) {
328 my $rest = $cols - $part; # remaining
329 $rest = $len if $len < $rest; #TODO: optimise
# The continuation is stored as a new entry at the position following the
# cells used on this row, unless an entry already exists there.
331 # continued on new row
332 my @next = ($len * $colsize, "$class joinu");
335 push @next, $name, $title;
340 # minority on next row
341 push @next, '"', $title || $name;
343 $table->{$offset + $colsize*$rest} //= \@next;
# Range starts at the first column and covers at least one whole row.
348 elsif (my $rows = int($len / $cols)) {
350 my $rowsize = $colsize * $cols;
# $len is reduced to what is left after the whole rows.
351 if ($len -= $rows * $cols) {
352 # partial row remains
353 $table->{$offset + $rowsize * $rows} //= [$len*$colsize, "$class joinu", '', $title];
# Following rows are flagged in the skip map.  The output loop jumps over
# rows with a true flag and tests a defined flag (such as 0) separately.
357 # coalesce multiple rows
359 $info->{skip}->{$offset += $rowsize}++;
363 $info->{skip}->{$offset += $rowsize} = 0;
366 $attr .= sprintf ' rowspan=%d', $rows;
# Assemble the remaining cell attributes.  An attribute embedded in the
# class string (a space followed by name="value") is moved out of the class
# and added to the tag as is.
370 $attr .= sprintf ' colspan=%d', $len unless $len == 1;
371 $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//;
372 $attr .= sprintf ' class="%s"', $class if $class;
373 $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title;
374 return "<td$attr>$name";
# Output one HTML table for every queued table description.
377 for my $row (@request) {
378 my $cols = $row->{cols};
# Positions per column: taken from the range data if present, otherwise 1.
379 my $colsize = $row->{cell} && $row->{cell}->{colsize} || 1;
380 my $coldigits = ceil(log($colsize * $cols) / log(16)); # uniform length of hexadecimal header
381 my $rowdiv = 16 ** $coldigits; # row divide for column digits
382 $rowdiv = 1 if $rowdiv != $cols * $colsize; # divide only if all columns are matched
# Tables without range data get the additional charmap class.
385 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
# Caption: the set name, plus the parent it is shown relative to unless
# that parent is iso-8859-1.
386 my $title = $row->{set};
387 $title .= " <aside>(over $_)</aside>"
388 for grep { $_ ne 'iso-8859-1' } $row->{parent} // ();
389 printf '<caption>%s</caption>', $title;
390 print '<col>' x ($cols + 1);
# Header row: a corner cell followed by the hexadecimal column offsets,
# zero-padded to $coldigits digits.
391 for my $section (qw{thead}) {
392 print "<$section><tr><th>", $rowdiv == 1 ? '+' : '↱';
393 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1;
# Walk the table positions up to the (relative) endpoint.
397 while ($offset < $row->{endpoint}) {
# Rows with a true skip flag are passed over entirely.
398 if ($row->{skip}->{$offset}) {
399 $offset += $cols * $colsize;
# A flag that is merely defined is handled separately.
404 if (defined $row->{skip}->{$offset}) {
# Row header: the remainder within a row division is shown as a small
# +offset, otherwise the absolute position divided by the row division.
408 if (my $rowmod = $offset % $rowdiv) {
409 # offset in column units
410 printf '<small>+%X</small>', $rowmod;
414 printf '%X', ($offset + $row->{offset}) / $rowdiv;
# Range cell from the table's range data (prints nothing if none applies).
419 print range_cell($row, $offset);
# Glyph tables are either an array of strings or a single string with one
# character per position.
423 my $glyph = ref $row->{table} eq 'ARRAY' ? $row->{table}->[$offset] :
424 substr $row->{table}, $offset, 1;
425 if ($glyph eq $NOCHAR) {
# Comparison mode classifies each glyph: l4 when its code point equals its
# position in the table, l3 when the parent encoding decodes the same byte
# to the same glyph, l2 when the glyph was already printed before.
# Every printed glyph is counted afterwards.
# NOTE(review): $visible is declared with state in the table setup code;
# confirm that the same variable is in scope here.
430 if (exists $get{compare}) {
432 my $cp = $offset + $row->{offset};
433 printf '<td class="%s" title="%3$s">%2$s',
434 $cp == ord $glyph ? 'l4' :
435 $row->{parent} && $glyph eq
436 Encode::decode($row->{parent}, pack 'C', $cp) ? 'l3' :
437 $visible->{$glyph} ? 'l2' :
439 $glyphs->glyph_html($glyph);
440 $visible->{$glyph}++;
# Outside comparison mode the formatter renders the complete cell.
444 print "\n".$glyphs->glyph_cell($glyph);
451 say '</table></div>';
458 <table class="glyphs"><tr><: if (exists $get{compare}) { :>
459 <td class="X l4">unicode
460 <td class="X l3">inherited
461 <td class="X l2">existing
462 <td class="X l1">original
463 <td class="">unassigned
465 <td class="X Cc">control
466 <td class="X Zs"><span>whitespace</span>
467 <td class="X Mn">diacritic<table class="glyphs"><tr>
468 <td class="X Sk">letter
470 <td class="X Po">punctuation<table class="glyphs"><tr>
471 <td class="X Pf">quote
473 <td class="X So">symbol<table class="glyphs"><tr>
474 <td class="X Sm">math
475 <td class="X Sc">currency
477 <td class="X No">numeric
478 <td class="X Greek">greek<table class="glyphs"><tr>
479 <td class="X Latin">latin
480 <td class="X Cyrillic">cyrillic
482 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
483 <td class="X Brahmic">brahmic
484 <td class="X Arabic">arabic
486 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
487 <td class="X African">african
488 <td class="X Hiragana">japanese
489 <td class="X Han">cjk
490 <td class="X Bopomofo">chinese
492 <td class="X Alpha">alphabetic
495 <table class="glyphs"><tr>
496 <td class="X">unicode 7.0
497 <td class="X Xr">proposed
498 <td class="X Xd">deprecated
499 <td class="">unassigned
500 <td class="X Xi">invalid