4 title => 'charset cheat sheet',
7 "Reference sheet with all glyphs in common character encoding tables,",
8 "and an overview of Unicode ranges and UTF-8 bytes.",
11 charset codepage unicode ascii utf8 latin glyph character encoding
12 reference common overview table
14 stylesheet => [qw'light'],
15 data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
18 my @tablist = split /[^\w-]+/, $Request || 'default';
21 <h1>Character encoding</h1>
25 use Shiar_Sheet::FormatChar;
26 my $glyphs = Shiar_Sheet::FormatChar->new;
30 # generate character table(s)
31 my $input = shift or return;
34 default => [qw( unicode+0-192 utf-8 iso-8859-1 cp1252+128-159 iso-8859-15+160-191 cp437 cp850+128 )],
35 us => [qw( cp437 cp863+128 AdobeStandardEncoding gsm0338+0-127 )],
36 ebcdic => [qw( cp37 cp500 cp875 cp1026 cp1047 posix-bc )],
37 westeur => [qw( iso-8859-1 iso-8859-15+160-191 cp1252+128-159 iso-8859-14+160 cp850+128 MacRoman+128 nextstep+128 hp-roman8+160 )],
38 centeur => [qw( iso-8859-2 iso-8859-16+160 cp1250+128 cp852+128 MacCentralEurRoman+128 MacCroatian+128 MacRomanian+128 )], # MacRumanian only for DB
39 turkish => [qw( iso-8859-3 iso-8859-9+128 cp857+128 cp1254+128 MacTurkish+128 )],
40 baltic => [qw( iso-8859-4 iso-8859-13+160 cp775+128 cp1257+128 )],
41 nordic => [qw( iso-8859-10 cp865+128 cp861+160-191 MacIcelandic+128 MacSami+160 )],
42 cyrillic => [qw( koi8-f koi8-r+128-192 koi8-u+160-192 iso-8859-5+128 cp1251+128 MacCyrillic+128 cp855+128 cp866+128 )], # MacUkrainian is broken
43 arabic => [qw( iso-8859-6 cp1006+160 cp864+128 cp1256+128 MacArabic )], # MacFarsi same as MacArabic?
44 greek => [qw( iso-8859-7 cp1253+128 MacGreek+128 cp737+128 cp869+128 )],
45 hebrew => [qw( iso-8859-8 cp1255+128 MacHebrew+128 cp862+128 )],
46 thai => [qw( iso-8859-11 cp874+128-159 MacThai+128 )],
47 vietnamese => [qw( viscii cp1258 MacVietnamese )],
48 symbol => [qw( symbol dingbats MacDingbats AdobeZdingbat AdobeSymbol )],
62 if (my $follow = $ALIAS->{$input}) {
63 tabinput($_) for ref $follow ? @{$follow} : $follow;
67 my %row = (offset => 0, cols => 16);
69 my $params = $input =~ s/[+](.*)\z// ? $1 : undef;
71 if (defined $params and $params =~ m/^ (\d+) (-\d+)? /x) {
73 $endpoint = -$2 if $2;
76 $row{setnote} = 'over cp437' if $input eq 'cp850';
77 $row{setnote} = 'over iso-8859-1' if $input =~ /^iso-8859-|^cp125/;
80 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
81 my $start = hex($1) << ($2 ? 4 : 8);
82 my $end = $2 ? hex($2) << 4 : $start + 240;
83 $row{table} = join '', map { chr } $start .. $end+15;
84 utf8::upgrade($row{table}); # prevent latin1 output
85 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
87 elsif ($input eq 'U') {
88 $row{table} = ' ' x 1024;
89 $row{set} = 'Unicode planes';
90 $row{cell} = do 'charset-ucplanes.inc.pl'
91 or Alert('Table data could not be read', $@ || $!);
94 elsif ($row{set} = Encode::resolve_alias($input)) {
95 if ($row{set} eq 'Internal') {
96 $row{table} = ' ' x ($endpoint < 255 ? 640 : 8192);
97 $row{set} = 'Unicode BMP';
98 $row{cell} = do 'charset-unicode.inc.pl'
99 or Alert('Table data could not be read', $@ || $!);
101 elsif ($row{set} eq 'utf-8-strict') {
104 $row{cell} = do 'charset-utf8.inc.pl'
105 or Alert('Table data could not be read', $@ || $!);
108 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
109 # (~16x faster than decoding in loop;
110 # substr strings is twice as fast as splitting to an array)
112 if ($row{set} eq 'cp437' and !$row{offset}) {
113 substr($row{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
114 substr($row{table}, 0, 32) = pack 'U*', map {hex} qw(
115 2007 263A 263B 2665 2666 2663 2660 2022
116 25D8 25CB 25D9 2642 2640 266A 266B 263C
117 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8
118 2191 2193 2192 2190 221F 2194 25B2 25BC
124 Alert("Encoding <q>$input</q> unknown");
127 push @request, \%row;
129 tabinput($_) for @tablist;
131 my $NOCHAR = chr 0xFFFD;
134 my ($info, $offset) = @_;
135 my $table = $info->{cell} or return;
136 my $def = $table->{$offset} or return;
137 my ($len, $class, $name, $title) = @{$def};
139 my $cols = $info->{cols};
140 my $colsize = $table->{colsize} || 1;
143 $name //= $len <= 2 ? 'res' : 'reserved';
145 if (my $part = $offset/$colsize % $cols) {
147 my $rest = $cols - $part; # remaining
148 $rest = $len if $len < $rest; #TODO: optimise
150 # continued on new row
151 my @next = ($len * $colsize, "$class joinu");
154 push @next, $name, $title;
159 # minority on next row
160 push @next, '"', $title || $name;
162 $table->{$offset + $colsize*$rest} //= \@next;
167 elsif (my $rows = int($len / $cols)) {
169 if ($len -= $rows * $cols) {
170 # partial row remains
171 $table->{$offset + $colsize*$rows * $cols} //= [$len*$colsize, "$class joinu", '', $title];
174 $attr .= sprintf ' rowspan=%d', $rows;
178 $attr .= sprintf ' colspan=%d', $len unless $len == 1;
179 $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//;
180 $attr .= sprintf ' class="%s"', $class if $class;
181 $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title;
182 return "<td$attr>$name";
185 for my $row (@request) {
186 my $cols = $row->{cols};
187 my $colsize = $row->{cell} && $row->{cell}->{colsize} || 1;
188 my $coldigits = ceil(log($colsize * $cols) / log(16)); # uniform length of hexadecimal header
189 my $rowdiv = 16 ** $coldigits; # row divide for column digits
190 $rowdiv = 1 if $rowdiv != $cols * $colsize; # divide only if all columns are matched
192 my $endpoint = $offset + (length($row->{table}) || 256) * $colsize;
194 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
195 my $title = $row->{set};
196 $title .= " <aside>($_)</aside>" for $row->{setnote} // ();
197 printf '<caption>%s</caption>', $title;
198 print '<col>' x ($cols + 1);
199 for my $section (qw{thead}) {
200 print "<$section><tr><th>", $rowdiv == 1 ? '+' : '↱';
201 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1;
205 while ($offset < $endpoint - 1) {
208 if (my $rowmod = $offset % $rowdiv) {
209 # offset in column units
210 printf '<small>+%X</small>', $rowmod;
214 printf '%X', ($offset + $row->{offset}) / $rowdiv;
219 print range_cell($row, $offset);
223 my $glyph = substr $row->{table}, $offset, 1;
224 if ($glyph eq $NOCHAR) {
229 print "\n".$glyphs->glyph_cell($glyph);
236 say '</table></div>';
243 <table class="glyphs"><tr>
244 <td class="X Cc">control
245 <td class="X Zs"><span>whitespace</span>
246 <td class="X Mn">diacritic<table class="glyphs"><tr>
247 <td class="X Sk">letter
249 <td class="X Po">punctuation<table class="glyphs"><tr>
250 <td class="X Pf">quote
252 <td class="X So">symbol<table class="glyphs"><tr>
253 <td class="X Sm">math
254 <td class="X Sc">currency
256 <td class="X No">numeric
257 <td class="X Greek">greek<table class="glyphs"><tr>
258 <td class="X Latin">latin
259 <td class="X Cyrillic">cyrillic
261 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
262 <td class="X Brahmic">brahmic
263 <td class="X Arabic">arabic
265 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
266 <td class="X African">african
267 <td class="X Hiragana">japanese
268 <td class="X Han">cjk
269 <td class="X Bopomofo">chinese
271 <td class="X Alpha">alphabetic
274 <table class="glyphs"><tr>
275 <td class="X">unicode 7.0
276 <td class="X Xr">proposed
277 <td class="X Xd">deprecated
278 <td class="">unassigned
279 <td class="X Xi">invalid