# Page metadata for the sheet: title, what look like description strings and
# keyword words, and the stylesheet and data file lists.
# NOTE(review): the enclosing call and the keys owning the description strings
# and keyword words are not visible in this excerpt — confirm in the full file.
4 title => 'charset cheat sheet',
7 "Reference sheet with all glyphs in common character encoding tables,",
8 "and an overview of Unicode ranges and UTF-8 bytes.",
11 charset codepage unicode ascii utf8 latin glyph character encoding
12 reference common overview table
14 stylesheet => [qw'light'],
15 data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
# Requested table names: $Request (or 'default' when it is false), split on
# runs of characters that are neither word characters nor hyphens.
18 my @tablist = split /[^\w-]+/, $Request || 'default';
21 <h1>Character encoding</h1>
# Character formatter object; the output of its glyph_cell method is printed
# for each character position in the table output loop further down.
25 use Shiar_Sheet::FormatChar;
26 my $glyphs = Shiar_Sheet::FormatChar->new;
29 # generate character table(s)
# Turns one requested name into table row description(s). Returns nothing
# when called without a true argument. Alias names are expanded by calling
# tabinput() again on each member.
# NOTE(review): the sub header, the $ALIAS and $endpoint declarations and the
# final return are not visible in this excerpt; callers use the results as
# hash references (keys such as cols, cell, table, set, setnote, offset).
30 my $input = shift or return;
# Alias groups (presumably the entries of $ALIAS — confirm): each name maps
# to a list of table specs. A leading '-' or '--' and a trailing '-' are
# markers that get stripped from the spec further down.
33 default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
34 us => [qw(cp437 -cp863)],
35 westeur => [qw(iso-8859-1 -cp1252 -MacRoman -cp850)],
36 centeur => [qw(iso-8859-2 -cp1250 -cp852 -MacCentralEurRoman -MacCroatian -MacRumanian)],
37 cyrillic => [qw(koi8-f -iso-8859-5 -cp1251 -MacCyrillic -cp855 -cp866)],
38 greek => [qw(iso-8859-7 -cp1253 -MacGreek -cp737 -cp869)],
39 hebrew => [qw(iso-8859-8 -cp1255 -MacHebrew -cp862)],
# Follow an alias: a reference is expanded element by element, anything else
# is passed on as a single name.
47 if (my $follow = $ALIAS->{$input}) {
48 return map { tabinput($_) } ref $follow ? @{$follow} : $follow;
# Row defaults: start at position 0 and show 16 columns per row.
51 my %row = (offset => 0, cols => 16);
# '--' prefix: start at 160 when $endpoint is above 160, otherwise at 48.
53 if ($input =~ s/^--//) {
54 $row{offset} = $endpoint > 160 ? 160 : 48;
# '-' prefix: start at 128 when $endpoint is above 128, otherwise at 32.
56 elsif ($input =~ s/^-//) {
57 $row{offset} = $endpoint > 128 ? 128 : 32;
# '-' suffix: cut the table short — end at 127 without a start offset, at 159
# for offsets below 160, and at 191 for higher offsets.
59 if ($input =~ s/-$//) {
60 $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
# Caption notes; appended after the set name when the table is printed.
63 $row{setnote} = 'over cp437' if $input eq 'cp850';
64 $row{setnote} = 'over iso-8859-1' if $input =~ /^iso-8859-|^cp125/;
# 'U' followed by hexadecimal digits, optionally '-' and a second hex number:
# a run of Unicode code points. A single number is shifted left by 8 bits and
# yields 256 characters ($start .. $start+255); with a second number both are
# shifted left by 4 bits and the run extends to $end+15.
# NOTE(review): the pattern has no /i, so only lowercase a-f match, and the
# truth tests on $2 treat an end value of "0" like a missing one.
67 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
68 my $start = hex($1) << ($2 ? 4 : 8);
69 my $end = $2 ? hex($2) << 4 : $start + 240;
70 $row{table} = join '', map { chr } $start .. $end+15;
71 utf8::upgrade($row{table}); # prevent latin1 output
72 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
# Bare 'U': the 'Unicode planes' table. The table string is a 1024-space
# placeholder; cell definitions are loaded from the included data file, and
# Alert() is called when that load yields a false value.
74 elsif ($input eq 'U') {
75 $row{table} = ' ' x 1024;
76 $row{set} = 'Unicode planes';
77 $row{cell} = do 'charset-ucplanes.inc.pl'
78 or Alert('Table data could not be read', $@ || $!);
# Otherwise try to resolve the name as an Encode encoding; the resolved name
# becomes the set name.
81 elsif ($row{set} = Encode::resolve_alias($input)) {
# 'Internal': shown as 'Unicode BMP' with 640 placeholder characters when
# $endpoint is below 255 and 8192 otherwise; cell definitions come from
# charset-unicode.inc.pl.
82 if ($row{set} eq 'Internal') {
83 $row{table} = ' ' x ($endpoint < 255 ? 640 : 8192);
84 $row{set} = 'Unicode BMP';
85 $row{cell} = do 'charset-unicode.inc.pl'
86 or Alert('Table data could not be read', $@ || $!);
# 'utf-8-strict': cell definitions come from charset-utf8.inc.pl.
88 elsif ($row{set} eq 'utf-8-strict') {
91 $row{cell} = do 'charset-utf8.inc.pl'
92 or Alert('Table data could not be read', $@ || $!);
# Decode the byte values $row{offset} .. $endpoint in a single call.
# NOTE(review): presumably the fallback branch for all other encodings — the
# surrounding else is not visible here.
95 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
96 # (~16x faster than decoding in loop;
97 # substr strings is twice as fast as splitting to an array)
# cp437: replace position 237 with U+03D5 and the first 32 positions with
# the code points listed below.
# NOTE(review): these are indexes into the decoded table, which seems to
# assume the table starts at byte 0 — confirm for requests with an offset.
99 if ($row{set} eq 'cp437') {
100 substr($row{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
101 substr($row{table}, 0, 32) = pack 'U*', map {hex} qw(
102 2007 263A 263B 2665 2666 2663 2660 2022
103 25D8 25CB 25D9 2642 2640 266A 266B 263C
104 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8
105 2191 2193 2192 2190 221F 2194 25B2 25BC
111 Alert("Encoding <q>$input</q> unknown");
# Expand every requested name into its table row description(s).
116 my @request = map { tabinput($_) } @tablist;

# U+FFFD; table positions holding this character are checked for separately
# in the output loop.
118 my $NOCHAR = chr 0xFFFD;
# range_cell($info, $offset): table cell markup for the range definition
# stored at $offset in $info->{cell}. Returns nothing when the row has no
# cell data or no definition is stored at this offset.
# NOTE(review): the sub header, the $attr declaration and several branch
# lines are not visible in this excerpt.
121 my ($info, $offset) = @_;
122 my $table = $info->{cell} or return;
123 my $def = $table->{$offset} or return;
# A definition is a list of length, class, name and title.
124 my ($len, $class, $name, $title) = @{$def};
126 my $cols = $info->{cols};
127 my $colsize = $table->{colsize} || 1;
# Ranges without a name get a placeholder label, abbreviated for short ranges.
130 $name //= $len <= 2 ? 'res' : 'reserved';
# Non-zero column position: the range starts partway through a row.
132 if (my $part = $offset/$colsize % $cols) {
134 my $rest = $cols - $part; # remaining
135 $rest = $len if $len < $rest; #TODO: optimise
137 # continued on new row
138 my @next = ($len * $colsize, "$class joinu");
141 push @next, $name, $title;
146 # minority on next row
147 push @next, '"', $title || $name;
# Store the continuation at the offset following this part, unless a
# definition already exists there.
149 $table->{$offset + $colsize*$rest} //= \@next;
# Range starts in the first column and covers at least one full row.
154 elsif (my $rows = int($len / $cols)) {
156 if ($len -= $rows * $cols) {
157 # partial row remains
158 $table->{$offset + $colsize*$rows * $cols} //= [$len*$colsize, "$class joinu", '', $title];
161 $attr .= sprintf ' rowspan=%d', $rows;
# Assemble the remaining cell attributes.
165 $attr .= sprintf ' colspan=%d', $len unless $len == 1;
# An attribute written inside the class string (' name="value"') is removed
# from the class and emitted as a tag attribute instead.
166 $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//;
167 $attr .= sprintf ' class="%s"', $class if $class;
168 $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title;
# Only the title passes through EscapeHTML; $name is emitted as given.
169 return "<td$attr>$name";
# Print one HTML table per requested row description.
# NOTE(review): several lines of this loop (including the $offset declaration
# and the row/cell markup) are not visible in this excerpt.
172 for my $row (@request) {
173 my $cols = $row->{cols};
174 my $colsize = $row->{cell} && $row->{cell}->{colsize} || 1;
175 my $coldigits = ceil(log($colsize * $cols) / log(16)); # uniform length of hexadecimal header
176 my $rowdiv = 16 ** $coldigits; # row divide for column digits
177 $rowdiv = 1 if $rowdiv != $cols * $colsize; # divide only if all columns are matched
# End position: the table length (256 when the table string is empty or
# missing) multiplied by $colsize, counted from $offset.
179 my $endpoint = $offset + (length($row->{table}) || 256) * $colsize;
# Tables without cell definitions get the additional class "charmap".
181 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
182 my $title = $row->{set};
# A set note, when defined, is appended to the caption in an <aside>.
183 $title .= " <aside>($_)</aside>" for $row->{setnote} // ();
184 printf '<caption>%s</caption>', $title;
# One <col> per column plus one more.
185 print '<col>' x ($cols + 1);
186 for my $section (qw{thead}) {
# Corner header: '+' when $rowdiv is 1, an arrow otherwise.
187 print "<$section><tr><th>", $rowdiv == 1 ? '+' : '↱';
# Column headers: hexadecimal column offsets, zero-padded to $coldigits.
188 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1;
192 while ($offset < $endpoint - 1) {
# Row header: a non-zero remainder modulo $rowdiv is printed as '+N'.
195 if (my $rowmod = $offset % $rowdiv) {
196 # offset in column units
197 printf '<small>+%X</small>', $rowmod;
# NOTE(review): presumably the alternative branch — position including the
# row's start offset, divided by $rowdiv.
201 printf '%X', ($offset + $row->{offset}) / $rowdiv;
# Cell markup from the range definitions (empty when none applies).
206 print range_cell($row, $offset);
# Character at this table position; U+FFFD is tested for before the
# formatted glyph cell is printed.
210 my $glyph = substr $row->{table}, $offset, 1;
211 if ($glyph eq $NOCHAR) {
216 print "\n".$glyphs->glyph_cell($glyph);
223 say '</table></div>';
230 <table class="glyphs"><tr>
231 <td class="X Cc">control
232 <td class="X Zs"><span>whitespace</span>
233 <td class="X Mn">diacritic<table class="glyphs"><tr>
234 <td class="X Sk">letter
236 <td class="X Po">punctuation<table class="glyphs"><tr>
237 <td class="X Pf">quote
239 <td class="X So">symbol<table class="glyphs"><tr>
240 <td class="X Sm">math
241 <td class="X Sc">currency
243 <td class="X No">numeric
244 <td class="X Greek">greek<table class="glyphs"><tr>
245 <td class="X Latin">latin
246 <td class="X Cyrillic">cyrillic
248 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
249 <td class="X Brahmic">brahmic
250 <td class="X Arabic">arabic
252 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
253 <td class="X African">african
254 <td class="X Hiragana">japanese
255 <td class="X Han">cjk
256 <td class="X Bopomofo">chinese
258 <td class="X Alpha">alphabetic
261 <table class="glyphs"><tr>
262 <td class="X">unicode 7.0
263 <td class="X Xr">proposed
264 <td class="X Xd">deprecated
265 <td class="">unassigned
266 <td class="X Xi">invalid