4 title => 'charset cheat sheet',
7 "Reference sheet with all glyphs in common character encoding tables,",
8 "and an overview of Unicode ranges and UTF-8 bytes.",
11 charset codepage unicode ascii utf8 latin glyph character encoding
12 reference common overview table
14 stylesheet => [qw'light'],
15 data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
18 my @tablist = split m{/+}, $Request || 'default'; # names separated by one or more slashes; a false request yields the single name default
21 <h1>Character encoding</h1>
25 use Shiar_Sheet::FormatChar;
26 my $glyphs = Shiar_Sheet::FormatChar->new;
30 # generate character table(s)
31 my $input = shift or return;
34 default => [qw( unicode+0-639 utf-8 iso-8859-1 iso-8859-15 cp1252 cp437 cp850 )],
35 us => [qw( cp437 cp863 gsm0338 AdobeStandardEncoding )],
36 ebcdic => [qw( cp37 cp500 cp1047 posix-bc cp1026 cp875 )],
37 iso => [map {"iso-8859-$_"} 1 .. 11, 13 .. 16],
38 dos => [qw( cp437 cp865 cp861 cp860 cp863 cp850 cp857 cp852 cp775
39 cp737 cp869 cp866 cp855 cp862 cp864 )],
40 aix => [qw( cp1006 )],
41 win => [qw( cp1252 cp1250 cp1254 cp1257 cp1258 cp1253 cp1251 cp1255 cp1256 cp874 )],
42 mac => [qw( MacRoman MacRomanian MacRumanian MacCroatian MacCentralEurRoman MacTurkish MacIcelandic MacSami
43 MacGreek MacCyrillic MacHebrew MacArabic MacFarsi MacThai )],
46 westeur => [qw( iso-8859-1 iso-8859-15 cp1252 iso-8859-14 cp850 hp-roman8 nextstep MacRoman )],
47 centeur => [qw( iso-8859-2 iso-8859-16 cp1250 cp852 MacRomanian MacCroatian MacCentralEurRoman )], # MacRumanian only for DB
48 turkish => [qw( iso-8859-9 iso-8859-3 cp1254 cp857 MacTurkish )],
49 baltic => [qw( iso-8859-4 iso-8859-13 cp1257 cp775 )],
50 nordic => [qw( iso-8859-10 cp865 cp861 MacIcelandic MacSami )],
51 cyrillic => [qw( koi8-r koi8-u koi8-f iso-8859-5 cp1251 MacCyrillic cp866 cp855
52 U4 U2DE-2DF UA64-A69 U50-52 )], # MacUkrainian is broken
53 arabic => [qw( iso-8859-6 cp1256 MacArabic cp864 cp1006 MacFarsi
54 U6 U8A-8F+0-31+64 U75-77 )],
55 greek => [qw( iso-8859-7 cp1253 MacGreek cp737 cp869 U37-3F U1F )],
56 hebrew => [qw( iso-8859-8 cp1255 MacHebrew cp862 U59-5F )],
57 thai => [qw( iso-8859-11 cp874 MacThai )],
58 vietnamese => [qw( viscii cp1258 MacVietnamese )],
59 symbol => [qw( symbol dingbats MacDingbats AdobeZdingbat AdobeSymbol )],
74 if (my $follow = $ALIAS->{$input}) { # input matches an alias key
75 tabinput($_) for ref $follow ? @{$follow} : $follow; # recurse per target; the alias value is either a list reference or one name
79 state $visible = {}; # all present tables
80 my %row = (offset => 0, cols => 16); # defaults for the table being built
82 my $params = $input =~ s/[+](.*)\z// ? $1 : undef; # cut everything from the first plus sign off the name and keep the remainder as range parameters
84 if (not defined $params) {
85 use List::Util qw( first pairfirst pairs );
88 'cp437' => ['cp850' => 0], # ascii range overridden later
89 'gsm0338' => ['ascii' => '0-127'],
91 'iso-8859-2' => ['iso-8859-1' => '160'],
92 'iso-8859-3' => ['iso-8859-1' => '160'], #TODO: also apply to iso-8859-9
93 'iso-8859-4' => ['iso-8859-2' => '160'],
94 'iso-8859-5' => ['iso-8859-1' => '160'],
95 'iso-8859-6' => ['cp1256' => '128', 'iso-8859-1' => '160'],
96 'iso-8859-7' => ['iso-8859-1' => '160'],
97 'iso-8859-8' => ['iso-8859-1' => '160'],
98 'iso-8859-9' => ['iso-8859-1' => '208-223+240'],
99 'iso-8859-10' => ['iso-8859-4' => '160'],
100 'iso-8859-11' => ['iso-8859-1' => '160'],
101 'iso-8859-13' => ['iso-8859-4' => '160'],
102 'iso-8859-14' => ['iso-8859-1' => '160'],
103 'iso-8859-15' => ['iso-8859-1' => '160-191'],
104 'iso-8859-16' => ['iso-8859-2' => '160'],
105 'hp-roman8' => ['iso-8859-1' => '160'],
107 'cp1252' => ['iso-8859-1' => '128-159'],
108 'cp1250' => ['iso-8859-2' => '128-191', 'cp1252' => '128'],
109 'cp1254' => ['iso-8859-9' => '128-159', 'cp1252' => '128-159+208'],
110 'cp874' => ['iso-8859-11' => '128-159', 'cp1252' => '128'], # windows-874 actually cp1162
111 'cp1257' => ['iso-8859-13' => '128-159+255', 'cp1252' => '128'],
112 'cp1251' => ['cp1252' => '128'],
113 'cp1253' => ['cp1252' => '128'],
114 'cp1255' => ['iso-8859-8' => '128-223', 'cp1252' => '128'],
115 'cp1256' => ['cp1252' => '128'],
116 'cp1258' => ['cp1252' => '128-159+192'],
118 'cp850' => ['cp437' => '144'],
119 'cp860' => ['cp437' => '128-175'],
120 'cp861' => ['cp865' => '128-175'],
121 'cp863' => ['cp437' => '128-175'],
122 'cp865' => ['cp437' => '144-175'],
123 'cp852' => ['cp850' => '128', 'cp437' => '128'],
124 'cp857' => ['cp850' => '128-175+208-239', 'cp437' => '128'],
125 'cp775' => ['cp850' => '128'], # partial cp437
126 'cp866' => ['cp437' => '128-175+224'],
127 'cp855' => ['cp437' => '128'],
128 'cp1006' => ['iso-8859-6' => '160', 'cp437' => '128'],
129 'cp737' => ['cp437' => '128-175+224'],
130 'cp869' => ['cp437' => '128'],
131 'cp862' => ['cp437' => '128-159'],
132 'cp864' => ['MacArabic' => '128', 'iso-8859-6' => '128', 'cp437' => '128'], #TODO: compare form variants
134 'koi8-u' => ['koi8-r' => 128],
135 'koi8-f' => ['koi8-u' => 128],
137 'MacRomanian' => ['MacRoman' => '160-191+208-223'],
138 'MacRumanian' => ['MacRomanian' => '160-191+208-223', 'MacRoman' => '160-191+208-223'],
139 'MacCroatian' => ['MacRoman' => '160'],
140 'MacCentralEurRoman' => ['MacRoman' => '128'],
141 'MacIcelandic'=> ['MacRoman' => '160-175+208-239'], #TODO: gaps at C/E
142 'MacTurkish' => ['MacRoman' => '208-223'], # F5 is unassigned
143 'MacSami' => ['MacIcelandic' => '144', 'MacRoman' => '144'],
144 'MacGreek' => ['MacRoman' => '128'],
145 'MacCyrillic' => ['MacRoman' => '128'],
146 'MacHebrew' => ['iso-8859-8' => '128', 'MacRoman' => '128-143+160'], # partial ascii
147 'MacArabic' => ['iso-8859-6' => '128', 'cp864' => '128', 'MacRoman' => '128'], #TODO: multiple parents
148 'MacFarsi' => ['MacArabic' => '176-191', 'MacRoman' => '128'],
150 'cp37' => ['posix-bc' => '0'],
151 'posix-bc' => ['cp1047' => '64'],
152 'cp500' => ['cp37' => '64-95+176-191'],
153 'cp1047' => ['cp37' => '16-95+160-191'], #TODO: gap at 3/4
154 'cp1026' => ['cp37' => '64'],
155 'cp875' => ['cp37' => '48'],
159 my @parents = @{ $INHERIT->{$input} || [] }; # flat list of parent => range pairs, empty when none are configured
161 if (my ($parent, $part) = pairfirst { defined $visible->{$a} } @parents) { # first parent that already has a visible entry
162 $row{parent} = $parent;
164 $params = 128 unless $visible->{$parent}
165 or ($input eq 'MacCroatian' and defined $visible->{MacRomanian}); # parent entry is false: start at 128, except for MacCroatian when MacRomanian has an entry
167 elsif (defined $visible->{ascii}) { # no parent has an entry but ascii does
168 $row{parent} = $parents[0];
169 $params = $parents[1] || 128; # range of the first configured parent, or 128 if that is false
170 $params = 128 if $params >= 128; # ascii offset at most
173 $row{parent} = $parents[0];
175 $visible->{$_} //= 0 for $row{parent}; # give the chosen parent a defined but false entry unless it already has one
178 if (defined $params) {
181 (?: (?: [-] (?<stop> \d+) )? (?: [+] (?<restart> \d+) ) )?
182 (?: [-] (?<endpoint> \d+) )? \z
184 "Unknown range parameters for $input",
185 "<q>$params</q> is not in format start(-stop)(+restart(-end))",
188 $row{offset} = $+{offset}; # named captures come from the range parameter match
189 $endpoint = $+{endpoint} if $+{endpoint}; # keep the earlier endpoint when none (or 0) was given
190 if (my $restart = $+{restart}) {
191 my $skip = int(($+{stop} || $row{offset}) / $row{cols}); # row index of stop, or of offset when stop is missing or 0
192 for ($skip + 1 .. ($restart / $row{cols}) - 1) { # rows strictly between the stop row and the restart row
193 $row{skip}->{ $_ * $row{cols} - $row{offset} }++; # keyed by the row start relative to offset
198 if ($input =~ /^U([0-9a-fA-F]+)(?:-([0-9a-fA-F]+))?/) { # name is U plus hex digits, optionally followed by a dash and more hex digits
199 my $start = hex($1) << ($2 ? 4 : 8); # shift by 4 bits when a second value was given, otherwise by 8
200 my $end = $2 ? (hex($2) << 4) + $row{cols} - 1 : $start + 255; # ranged: second value shifted by 4 plus one row of columns; single: 256 code points from start
201 $row{table} = join '', map { chr } $start .. $end; # one string holding every code point from start to end
202 utf8::upgrade($row{table}); # prevent latin1 output
203 $row{endpoint} = $end - $start; # last index relative to the start of the table
204 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8; # caption shows start without its low 8 bits
205 $row{offset} = $start % 256; # low 8 bits of start
207 elsif ($input eq 'U') {
208 $row{set} = 'Unicode planes';
209 $row{cell} = do 'charset-ucplanes.inc.pl'
210 or Alert('Table data could not be read', $@ || $!);
212 $row{endpoint} = 1023 * $row{cell}->{colsize};
214 elsif ($row{set} = Encode::resolve_alias($input)) {
215 if ($row{set} eq 'Internal') {
216 $row{set} = 'Unicode BMP';
217 $row{cell} = do 'charset-unicode.inc.pl'
218 or Alert('Table data could not be read', $@ || $!);
219 $row{endpoint} = ($endpoint || 8191) * $row{cell}->{colsize};
221 elsif ($row{set} eq 'utf-8-strict') {
223 $row{cell} = do 'charset-utf8.inc.pl'
224 or Alert('Table data could not be read', $@ || $!);
225 $row{endpoint} = 255;
228 if ($row{set} eq 'MacHebrew' or $row{set} eq 'MacThai') {
229 # array of possibly multiple characters per code point
231 map { Encode::decode($row{set}, pack 'C*', $_) } $row{offset} .. $endpoint # one decode call per byte value
235 # ~16x faster than decoding in loop;
236 # substr strings is twice as fast as splitting to an array
237 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $endpoint); # single decode call over all byte values from offset to endpoint
239 $row{endpoint} = $endpoint - $row{offset}; # last index relative to the start of the table
241 if ($row{set} eq 'cp437' and !$row{offset}) { # only when the table starts at byte 0
242 substr($row{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
243 substr($row{table}, 0, 32) = pack 'U*', map {hex} qw(
244 2007 263A 263B 2665 2666 2663 2660 2022
245 25D8 25CB 25D9 2642 2640 266A 266B 263C
246 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8
247 2191 2193 2192 2190 221F 2194 25B2 25BC
251 $visible->{ascii} = # assume common base
252 $visible->{ $row{set} } = 1; # flag this set as present
256 Alert("Encoding <q>$input</q> unknown");
259 push @request, \%row;
261 tabinput($_) for @tablist;
263 my $NOCHAR = chr 0xFFFD; # code point U+FFFD, compared against table glyphs when rendering
266 my ($info, $offset) = @_; # table description and position inside it
267 my $table = $info->{cell} or return; # nothing to do without cell data
268 my $def = $table->{$offset} or return; # no definition stored for this offset
269 my ($len, $class, $name, $title) = @{$def};
271 my $cols = $info->{cols};
272 my $colsize = $table->{colsize} || 1; # a missing or zero colsize counts as 1
275 $name //= $len <= 2 ? 'res' : 'reserved'; # label for unnamed definitions, abbreviated when len is at most 2
277 if (my $part = $offset/$colsize % $cols) { # nonzero column index: the definition starts partway into a row
279 my $rest = $cols - $part; # remaining
280 $rest = $len if $len < $rest; #TODO: optimise
282 # continued on new row
283 my @next = ($len * $colsize, "$class joinu"); # same field order as a definition: length, then class
286 push @next, $name, $title;
291 # minority on next row
292 push @next, '"', $title || $name;
294 $table->{$offset + $colsize*$rest} //= \@next; # store the continuation unless that offset already has a definition
299 elsif (my $rows = int($len / $cols)) { # starts at column 0 and covers at least one whole row
301 if ($len -= $rows * $cols) { # len becomes what is left after the whole rows
302 # partial row remains
303 $table->{$offset + $colsize*$rows * $cols} //= [$len*$colsize, "$class joinu", '', $title]; # unnamed continuation placed after the whole rows
306 $attr .= sprintf ' rowspan=%d', $rows;
310 $attr .= sprintf ' colspan=%d', $len unless $len == 1; # omitted for a single column
311 $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//;
312 $attr .= sprintf ' class="%s"', $class if $class;
313 $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title;
314 return "<td$attr>$name";
317 for my $row (@request) { # every entry collected in the request list
318 my $cols = $row->{cols};
319 my $colsize = $row->{cell} && $row->{cell}->{colsize} || 1; # colsize of the cell data when present and nonzero, otherwise 1
320 my $coldigits = ceil(log($colsize * $cols) / log(16)); # uniform length of hexadecimal header
321 my $rowdiv = 16 ** $coldigits; # row divide for column digits
322 $rowdiv = 1 if $rowdiv != $cols * $colsize; # divide only if all columns are matched
325 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap'; # charmap class is added only when the entry has no cell data
326 my $title = $row->{set};
327 $title .= " <aside>(over $_)</aside>"
328 for grep { $_ ne 'iso-8859-1' } $row->{parent} // (); # mention the parent unless it is undefined or iso-8859-1
329 printf '<caption>%s</caption>', $title;
330 print '<col>' x ($cols + 1); # one col element more than the number of data columns
331 for my $section (qw{thead}) {
332 print "<$section><tr><th>", $rowdiv == 1 ? '+' : '↱'; # corner cell: plus sign when rowdiv is 1, arrow otherwise
333 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1; # column headers in hexadecimal, zero-padded to coldigits
337 while ($offset < $row->{endpoint}) {
338 if ($row->{skip}->{$offset}) { # this position was marked to be skipped
339 $offset += $cols * $colsize; # advance by one full row
345 if (my $rowmod = $offset % $rowdiv) { # position is not a multiple of rowdiv
346 # offset in column units
347 printf '<small>+%X</small>', $rowmod;
351 printf '%X', ($offset + $row->{offset}) / $rowdiv; # absolute position divided by rowdiv, in hexadecimal
356 print range_cell($row, $offset);
360 my $glyph = ref $row->{table} eq 'ARRAY' ? $row->{table}->[$offset] : # array tables are indexed by element
361 substr $row->{table}, $offset, 1; # string tables are indexed by character
362 if ($glyph eq $NOCHAR) {
367 if (exists $get{compare}) { # compare mode is switched on by the mere presence of the key
369 my $cp = $offset + $row->{offset}; # absolute position in the encoding
370 printf '<td class="%s" title="%3$s">%2$s',
371 $cp == ord $glyph ? 'l4' : # position equals the code point of the glyph
372 $row->{parent} && $glyph eq
373 Encode::decode($row->{parent}, pack 'C', $cp) ? 'l3' : # parent encoding decodes this byte to the same glyph
374 $visible->{$glyph} ? 'l2' : # glyph was already counted earlier
376 $glyphs->glyph_html($glyph);
377 $visible->{$glyph}++; # count this glyph as seen
381 print "\n".$glyphs->glyph_cell($glyph);
388 say '</table></div>';
395 <table class="glyphs"><tr><: if (exists $get{compare}) { :>
396 <td class="X l4">unicode
397 <td class="X l3">inherited
398 <td class="X l2">existing
399 <td class="X l1">original
400 <td class="">unassigned
402 <td class="X Cc">control
403 <td class="X Zs"><span>whitespace</span>
404 <td class="X Mn">diacritic<table class="glyphs"><tr>
405 <td class="X Sk">letter
407 <td class="X Po">punctuation<table class="glyphs"><tr>
408 <td class="X Pf">quote
410 <td class="X So">symbol<table class="glyphs"><tr>
411 <td class="X Sm">math
412 <td class="X Sc">currency
414 <td class="X No">numeric
415 <td class="X Greek">greek<table class="glyphs"><tr>
416 <td class="X Latin">latin
417 <td class="X Cyrillic">cyrillic
419 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
420 <td class="X Brahmic">brahmic
421 <td class="X Arabic">arabic
423 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
424 <td class="X African">african
425 <td class="X Hiragana">japanese
426 <td class="X Han">cjk
427 <td class="X Bopomofo">chinese
429 <td class="X Alpha">alphabetic
432 <table class="glyphs"><tr>
433 <td class="X">unicode 7.0
434 <td class="X Xr">proposed
435 <td class="X Xd">deprecated
436 <td class="">unassigned
437 <td class="X Xi">invalid