4 title => 'charset cheat sheet',
7 "Reference sheet with all glyphs in common character encoding tables,",
8 "and an overview of Unicode ranges and UTF-8 bytes.",
11 charset codepage unicode ascii utf8 latin glyph character encoding
12 reference common overview table
14 stylesheet => [qw'light'],
15 data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
18 my @tablist = split m{/+}, $Request || 'default';
21 <h1>Character encoding</h1>
25 use Shiar_Sheet::FormatChar;
26 my $glyphs = Shiar_Sheet::FormatChar->new;
30 # generate character table(s)
31 my $input = shift or return;
34 default => [qw( unicode+0-639 utf-8 iso-8859-1 iso-8859-15 cp1252 cp437 cp850 )],
35 us => [qw( cp437 cp863 AdobeStandardEncoding gsm0338 )],
36 ebcdic => [qw( cp37 cp500 cp875 cp1026 cp1047 posix-bc )],
37 iso => [map {"iso-8859-$_"} 1 .. 11, 13 .. 16],
38 dos => [qw( cp437 cp865 cp861 cp860 cp863 cp850 cp857 cp852 cp775
39 cp737 cp869 cp866 cp855 cp862 cp864 )],
40 aix => [qw( cp1006 )],
41 win => [qw( cp1252 cp1250 cp1254 cp1257 cp1258 cp1253 cp1251 cp1255 cp1256 cp874 )],
43 westeur => [qw( iso-8859-1 iso-8859-15 cp1252 iso-8859-14 cp850 hp-roman8 nextstep MacRoman )],
44 centeur => [qw( iso-8859-2 iso-8859-16 cp1250 cp852 MacRomanian MacCroatian MacCentralEurRoman )], # MacRumanian only for DB
45 turkish => [qw( iso-8859-9 iso-8859-3 cp1254 cp857 MacTurkish )],
46 baltic => [qw( iso-8859-4 iso-8859-13 cp1257 cp775 )],
47 nordic => [qw( iso-8859-10 cp865 cp861 MacIcelandic MacSami )],
48 cyrillic => [qw( koi8-r koi8-u koi8-f iso-8859-5 cp1251 MacCyrillic cp866 cp855 )], # MacUkrainian is broken
49 arabic => [qw( iso-8859-6 cp1006 cp864 cp1256 MacArabic )], # MacFarsi same as MacArabic?
50 greek => [qw( iso-8859-7 cp1253 MacGreek cp737 cp869 )],
51 hebrew => [qw( iso-8859-8 cp1255 MacHebrew cp862 )],
52 thai => [qw( iso-8859-11 cp874 MacThai )],
53 vietnamese => [qw( viscii cp1258 MacVietnamese )],
54 symbol => [qw( symbol dingbats MacDingbats AdobeZdingbat AdobeSymbol )],
68 if (my $follow = $ALIAS->{$input}) {
69 tabinput($_) for ref $follow ? @{$follow} : $follow;
73 state $visible = {}; # all present tables
74 my %row = (offset => 0, cols => 16);
76 my $params = $input =~ s/[+](.*)\z// ? $1 : undef;
78 if (not defined $params) {
79 use List::Util qw( first pairfirst pairs );
82 'cp437' => ['cp850' => 0], # ascii range overridden later
83 'gsm0338' => ['ascii' => '0-127'],
85 'iso-8859-2' => ['iso-8859-1' => '160'],
86 'iso-8859-3' => ['iso-8859-1' => '160'],
87 'iso-8859-4' => ['iso-8859-2' => '160'],
88 'iso-8859-5' => ['iso-8859-1' => '160'],
89 'iso-8859-6' => ['iso-8859-1' => '160'],
90 'iso-8859-7' => ['iso-8859-1' => '160'],
91 'iso-8859-8' => ['iso-8859-1' => '160'],
92 'iso-8859-9' => ['iso-8859-1' => '208-223+240'],
93 'iso-8859-10' => ['iso-8859-4' => '160'],
94 'iso-8859-11' => ['iso-8859-1' => '160'],
95 'iso-8859-13' => ['iso-8859-4' => '160'],
96 'iso-8859-14' => ['iso-8859-1' => '160'],
97 'iso-8859-15' => ['iso-8859-1' => '160-191'],
98 'iso-8859-16' => ['iso-8859-2' => '160'],
99 'hp-roman8' => ['iso-8859-1' => '160'],
101 'cp1252' => ['iso-8859-1' => '128-159'],
102 'cp1250' => ['iso-8859-2' => '128-191', 'cp1252' => '128'],
103 'cp1254' => ['iso-8859-9' => '128-159', 'cp1252' => '128-159+208'],
104 'cp874' => ['iso-8859-11' => '128-159', 'cp1252' => '128'], # windows-874 actually cp1162
105 'cp1257' => ['iso-8859-13' => '128-159+255', 'cp1252' => '128'],
106 'cp1251' => ['cp1252' => '128'],
107 'cp1253' => ['cp1252' => '128'],
108 'cp1255' => ['iso-8859-8' => '128-223', 'cp1252' => '128'],
109 'cp1256' => ['cp1252' => '128'],
110 'cp1258' => ['cp1252' => '128-159+192'],
112 'cp850' => ['cp437' => '144'],
113 'cp860' => ['cp437' => '128-175'],
114 'cp861' => ['cp865' => '128-175'],
115 'cp863' => ['cp437' => '128-175'],
116 'cp865' => ['cp437' => '144-175'],
117 'cp852' => ['cp850' => '128', 'cp437' => '128'],
118 'cp857' => ['cp850' => '128-175+208-239', 'cp437' => '128'],
119 'cp775' => ['cp850' => '128'], # partial cp437
120 'cp866' => ['cp437' => '128-175+224'],
121 'cp855' => ['cp437' => '128'],
122 'cp1006' => ['cp437' => '128'],
123 'cp737' => ['cp437' => '128-175+224'],
124 'cp869' => ['cp437' => '128'],
125 'cp862' => ['cp437' => '128-159'],
126 'cp864' => ['cp437' => '128'],
128 'koi8-u' => ['koi8-r' => 128],
129 'koi8-f' => ['koi8-u' => 128],
131 'MacRomanian' => ['MacRoman' => '160'],
132 'MacCroatian' => ['MacRoman' => '160'],
133 'MacCentralEurRoman' => ['MacRoman' => '128'],
134 'MacTurkish' => ['MacRoman' => '208-223'], # F5 is unassigned
135 'MacCyrillic' => ['MacRoman' => '128'],
136 'MacHebrew' => ['MacRoman' => '128'],
139 my @parents = @{ $INHERIT->{$input} || [] };
141 if (my ($parent, $part) = pairfirst { defined $visible->{$a} } @parents) {
142 $row{parent} = $parent;
144 $params = 128 unless $visible->{$parent}
145 or ($input eq 'MacCroatian' and defined $visible->{MacRomanian});
147 elsif (defined $visible->{ascii}) {
148 $row{parent} = $parents[0];
149 $params = $parents[1] || 128;
150 $params = 128 if $params >= 128; # ascii offset at most
153 $row{parent} = $parents[0];
155 $visible->{$_} //= 0 for $row{parent};
158 if (defined $params) {
161 (?: (?: [-] (?<stop> \d+) )? (?: [+] (?<restart> \d+) ) )?
162 (?: [-] (?<endpoint> \d+) )? \z
164 "Unknown range parameters for $input",
165 "<q>$params</q> is not in format start(-stop)(+restart(-end))",
168 $row{offset} = $+{offset};
169 $endpoint = $+{endpoint} if $+{endpoint};
170 if (my $restart = $+{restart}) {
171 my $skip = int(($+{stop} || $row{offset}) / $row{cols});
172 for ($skip + 1 .. ($restart / $row{cols}) - 1) {
173 $row{skip}->{ $_ * $row{cols} - $row{offset} }++;
178 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
179 my $start = hex($1) << ($2 ? 4 : 8);
180 my $end = $2 ? hex($2) << 4 : $start + 240;
181 $row{table} = join '', map { chr } $start .. $end+15;
182 utf8::upgrade($row{table}); # prevent latin1 output
183 $row{endpoint} = $end + 14 - $start;
184 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
186 elsif ($input eq 'U') {
187 $row{set} = 'Unicode planes';
188 $row{cell} = do 'charset-ucplanes.inc.pl'
189 or Alert('Table data could not be read', $@ || $!);
191 $row{endpoint} = 1023 * $row{cell}->{colsize};
193 elsif ($row{set} = Encode::resolve_alias($input)) {
194 if ($row{set} eq 'Internal') {
195 $row{set} = 'Unicode BMP';
196 $row{cell} = do 'charset-unicode.inc.pl'
197 or Alert('Table data could not be read', $@ || $!);
198 $row{endpoint} = ($endpoint || 8191) * $row{cell}->{colsize};
200 elsif ($row{set} eq 'utf-8-strict') {
202 $row{cell} = do 'charset-utf8.inc.pl'
203 or Alert('Table data could not be read', $@ || $!);
204 $row{endpoint} = 255;
207 if ($row{set} eq 'MacHebrew') {
208 # array of possibly multiple characters per code point
210 map { Encode::decode($row{set}, pack 'C*', $_) } $row{offset} .. $endpoint
214 # ~16x faster than decoding in loop;
215 # substr strings is twice as fast as splitting to an array
216 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
218 $row{endpoint} = $endpoint - $row{offset};
220 if ($row{set} eq 'cp437' and !$row{offset}) {
221 substr($row{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
222 substr($row{table}, 0, 32) = pack 'U*', map {hex} qw(
223 2007 263A 263B 2665 2666 2663 2660 2022
224 25D8 25CB 25D9 2642 2640 266A 266B 263C
225 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8
226 2191 2193 2192 2190 221F 2194 25B2 25BC
230 $visible->{ascii} = # assume common base
231 $visible->{ $row{set} } = 1;
235 Alert("Encoding <q>$input</q> unknown");
238 push @request, \%row;
240 tabinput($_) for @tablist;
242 my $NOCHAR = chr 0xFFFD;
245 my ($info, $offset) = @_;
246 my $table = $info->{cell} or return;
247 my $def = $table->{$offset} or return;
248 my ($len, $class, $name, $title) = @{$def};
250 my $cols = $info->{cols};
251 my $colsize = $table->{colsize} || 1;
254 $name //= $len <= 2 ? 'res' : 'reserved';
256 if (my $part = $offset/$colsize % $cols) {
258 my $rest = $cols - $part; # remaining
259 $rest = $len if $len < $rest; #TODO: optimise
261 # continued on new row
262 my @next = ($len * $colsize, "$class joinu");
265 push @next, $name, $title;
270 # minority on next row
271 push @next, '"', $title || $name;
273 $table->{$offset + $colsize*$rest} //= \@next;
278 elsif (my $rows = int($len / $cols)) {
280 if ($len -= $rows * $cols) {
281 # partial row remains
282 $table->{$offset + $colsize*$rows * $cols} //= [$len*$colsize, "$class joinu", '', $title];
285 $attr .= sprintf ' rowspan=%d', $rows;
289 $attr .= sprintf ' colspan=%d', $len unless $len == 1;
290 $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//;
291 $attr .= sprintf ' class="%s"', $class if $class;
292 $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title;
293 return "<td$attr>$name";
296 for my $row (@request) {
297 my $cols = $row->{cols};
298 my $colsize = $row->{cell} && $row->{cell}->{colsize} || 1;
299 my $coldigits = ceil(log($colsize * $cols) / log(16)); # uniform length of hexadecimal header
300 my $rowdiv = 16 ** $coldigits; # row divide for column digits
301 $rowdiv = 1 if $rowdiv != $cols * $colsize; # divide only if all columns are matched
304 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
305 my $title = $row->{set};
306 $title .= " <aside>(over $_)</aside>"
307 for grep { $_ ne 'iso-8859-1' } $row->{parent} // ();
308 printf '<caption>%s</caption>', $title;
309 print '<col>' x ($cols + 1);
310 for my $section (qw{thead}) {
311 print "<$section><tr><th>", $rowdiv == 1 ? '+' : '↱';
312 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1;
316 while ($offset < $row->{endpoint}) {
317 if ($row->{skip}->{$offset}) {
318 $offset += $cols * $colsize;
324 if (my $rowmod = $offset % $rowdiv) {
325 # offset in column units
326 printf '<small>+%X</small>', $rowmod;
330 printf '%X', ($offset + $row->{offset}) / $rowdiv;
335 print range_cell($row, $offset);
339 my $glyph = ref $row->{table} eq 'ARRAY' ? $row->{table}->[$offset] :
340 substr $row->{table}, $offset, 1;
341 if ($glyph eq $NOCHAR) {
346 if (exists $get{compare}) {
348 my $cp = $offset + $row->{offset};
349 printf '<td class="%s" title="%3$s">%2$s',
350 $cp == ord $glyph ? 'l4' :
351 $row->{parent} && $glyph eq
352 Encode::decode($row->{parent}, pack 'C', $cp) ? 'l3' :
353 $visible->{$glyph} ? 'l2' :
355 $glyphs->glyph_html($glyph);
356 $visible->{$glyph}++;
360 print "\n".$glyphs->glyph_cell($glyph);
367 say '</table></div>';
374 <table class="glyphs"><tr><: if (exists $get{compare}) { :>
375 <td class="X l4">unicode
376 <td class="X l3">inherited
377 <td class="X l2">existing
378 <td class="X l1">original
379 <td class="">unassigned
381 <td class="X Cc">control
382 <td class="X Zs"><span>whitespace</span>
383 <td class="X Mn">diacritic<table class="glyphs"><tr>
384 <td class="X Sk">letter
386 <td class="X Po">punctuation<table class="glyphs"><tr>
387 <td class="X Pf">quote
389 <td class="X So">symbol<table class="glyphs"><tr>
390 <td class="X Sm">math
391 <td class="X Sc">currency
393 <td class="X No">numeric
394 <td class="X Greek">greek<table class="glyphs"><tr>
395 <td class="X Latin">latin
396 <td class="X Cyrillic">cyrillic
398 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
399 <td class="X Brahmic">brahmic
400 <td class="X Arabic">arabic
402 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
403 <td class="X African">african
404 <td class="X Hiragana">japanese
405 <td class="X Han">cjk
406 <td class="X Bopomofo">chinese
408 <td class="X Alpha">alphabetic
411 <table class="glyphs"><tr>
412 <td class="X">unicode 7.0
413 <td class="X Xr">proposed
414 <td class="X Xd">deprecated
415 <td class="">unassigned
416 <td class="X Xi">invalid