4 title => 'charset cheat sheet',
7 "Reference sheet with all glyphs in common character encoding tables,",
8 "and an overview of Unicode ranges and UTF-8 bytes.",
11 charset codepage unicode ascii utf8 latin glyph character encoding
12 reference common overview table
14 stylesheet => [qw'light'],
15 data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
19 <h1>Character encoding</h1>
23 use Shiar_Sheet::FormatChar;
24 my $glyphs = Shiar_Sheet::FormatChar->new; # formatter whose glyph_cell() output is printed for each character
25 my $cols = 16; # number of glyph columns per table row
27 # generate character table(s)
28 # (decoding a whole range in one call is ~16x faster than decoding in a loop;
29 # indexing the resulting string with substr is twice as fast as splitting it into an array)
31 # default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
32 default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)], # used when the request names no charset
33 0 => [qw(cp437 -cp863)], # presumably: leading '-'/'--' and trailing '-' are range markers stripped when each entry is parsed
34 1 => [qw(iso-8859-1 -cp1252 -MacRoman -cp850)],
35 2 => [qw(iso-8859-2 -cp1250 -cp852 -MacCentralEurRoman -MacCroatian -MacRumanian)],
36 5 => [qw(koi8-f -iso-8859-5 -cp1251 -MacCyrillic -cp855 -cp866)],
37 7 => [qw(iso-8859-7 -cp1253 -MacGreek -cp737 -cp869)],
38 8 => [qw(iso-8859-8 -cp1255 -MacHebrew -cp862)],
42 my %row = (offset => 0); # result record for one requested set; offset = first byte value to include
44 if ($input =~ s/^--//) { # '--' prefix (stripped from the name)
45 $row{offset} = $endpoint > 160 ? 160 : 48; # start at 0xA0, or at 0x30 when the endpoint is lower
47 elsif ($input =~ s/^-//) { # '-' prefix (stripped from the name)
48 $row{offset} = $endpoint > 128 ? 128 : 32; # start at 0x80, or at 0x20 when the endpoint is lower
50 if ($input =~ s/-$//) { # '-' suffix (stripped from the name): shorten the range
51 $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127; # end at 0x7F from offset 0, at 0x9F from an offset below 0xA0, else at 0xBF
54 $row{setnote} = 'over cp437' if $input eq 'cp850'; # note later appended to the table caption
55 $row{setnote} = 'over iso-8859-1' if $input =~ /^iso-8859-|^cp125/; # same note for any iso-8859-* or cp125* name
58 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) { # U<hex> or U<hex>-<hex>: explicit Unicode range
59 my $start = hex($1) << ($2 ? 4 : 8); # a lone number counts blocks of 256 code points; with an end given both count rows of 16
60 my $end = $2 ? hex($2) << 4 : $start + 240; # first code point of the last row
61 $row{table} = join '', map { chr } $start .. $end+15; # one character per code point, through the end of the last row
62 utf8::upgrade($row{table}); # prevent latin1 output
63 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8; # caption shows the start with its low byte dropped
65 elsif ($input eq 'U') { # bare 'U': Unicode planes overview
66 $row{table} = ' ' x 1024; # blank filler; the renderer derives its row count from the table length
67 $row{set} = 'Unicode planes';
68 $row{cell} = do 'charset-ucplanes.inc.pl' # cell definitions are loaded from a data file
69 or Alert('Table data could not be read', $@ || $!);
72 elsif ($row{set} = Encode::resolve_alias($input)) { # otherwise the input must be a name Encode can resolve
73 if ($row{set} eq 'Internal') { # NOTE(review): presumably what the 'unicode' request resolves to - confirm
74 $row{table} = ' ' x ($endpoint < 255 ? 640 : 8192); # blank filler, shorter when the endpoint is below 255
75 $row{set} = 'Unicode BMP';
76 $row{cell} = do 'charset-unicode.inc.pl' # cell definitions are loaded from a data file
77 or Alert('Table data could not be read', $@ || $!);
79 elsif ($row{set} eq 'utf-8-strict') { # name Encode reports for strict UTF-8
82 $row{cell} = do 'charset-utf8.inc.pl' # cell definitions are loaded from a data file
83 or Alert('Table data could not be read', $@ || $!);
86 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $endpoint); # decode every byte value from offset to endpoint in a single call
90 Alert('Encoding <q>' . EscapeHTML($input) . '</q> unknown'); # $input comes from the request: escape it before embedding it in the HTML message
92 $row{set} ? \%row : (); # keep the row only when a set name was determined
97 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ } # expand alias names into their list of charsets
98 $Request =~ /\w/ ? split(m{[/+\s]}, $Request) : 'default'; # request parts are separated by '/', '+' or whitespace; without any word character use 'default'
99 my $NOCHAR = chr 0xFFFD; # U+FFFD, compared against each table glyph during rendering
101 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) { # patch every requested cp437 table: position 237 and the first 32 positions
102 substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign (U+03D5) at position 0xED
103 substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
104 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
105 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
110 my ($table, $offset) = @_; # $table: cell definitions keyed by offset; $offset: position to render
111 my $def = $table->{$offset} or return; # no definition at this offset: nothing to output
112 my ($len, $class, $name, $title) = @{$def}; # a definition is [length, class, name, title]
114 my $colsize = $table->{colsize} || 1; # offset units per column (defaults to 1)
117 $name //= $len <= 2 ? 'res' : 'reserved'; # label for unnamed ranges, abbreviated when $len is at most 2
119 if (my $part = $offset/$colsize % $cols) { # cell does not start in the first column
121 my $rest = $cols - $part; # columns remaining in this row
122 $rest = $len if $len < $rest; #TODO: optimise
124 # continued on new row
125 my @next = ($len * $colsize, "$class joinu"); # continuation definition: same class plus 'joinu'
128 push @next, $name, $title;
133 # minority on next row
134 push @next, '"', $title || $name; # ditto mark as label; title falls back to the name
136 $table->{$offset + $colsize*$rest} //= \@next; # register the continuation unless that offset is already defined
141 elsif (my $rows = int($len / $cols)) { # starts in the first column and fills at least one whole row
143 if ($len -= $rows * $cols) { # $len becomes the columns left over after the whole rows
144 # partial row remains
145 $table->{$offset + $colsize*$rows * $cols} //= [$len*$colsize, "$class joinu", '', $title]; # leftover gets its own cell with an empty label
148 $attr .= sprintf ' rowspan=%d', $rows;
152 $attr .= sprintf ' colspan=%d', $len unless $len == 1; # colspan only for cells wider than one column
153 $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//; # move an embedded attr="value" out of the class string into the tag
154 $attr .= sprintf ' class="%s"', $class if $class;
155 $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title; # title is escaped before use as an attribute value
156 return "<td$attr>$name"; # opening tag plus label; no closing tag is emitted
159 for my $row (@request) { # render one table per requested set
160 my $colsize = $row->{cell} && $row->{cell}->{colsize} || 1; # offset units per column, taken from the cell data when present
161 my $coldigits = ceil(log($colsize * $cols) / log(16)); # uniform length of hexadecimal header
162 my $rowdiv = 16 ** $coldigits; # divider of row headers
164 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap'; # tables without cell data get the extra 'charmap' class
165 my $title = $row->{set};
166 $title .= " <aside>($_)</aside>" for $row->{setnote} // (); # append the set note, if any, to the caption
167 printf '<caption>%s</caption>', $title;
168 print '<col>' x ($cols + 1); # one column per glyph plus one for the row header
169 for my $section (qw{thead}) {
170 print "<$section><tr><th>↱";
171 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1; # column headers: zero-padded hexadecimal offsets
175 for my $msb (0 .. ((length($row->{table}) || 256) - 1) / $cols) { # one row per $cols table positions; 256 positions when the table has no length
178 my $rowlabel = ($msb + int($row->{offset} / $cols)) * $cols * $colsize; # value of the row's first cell, including the start offset
179 if (my $rowmod = $rowlabel % $rowdiv) { # row does not start on a multiple of $rowdiv
180 # offset in column units
181 printf '<small>+%X</small>', $rowmod;
185 printf '%X', $rowlabel / $rowdiv; # row header in units of $rowdiv
188 for my $lsb (0 .. $cols - 1) {
189 my $val = ( ($msb * $cols) + $lsb ) * $colsize; # position of this cell relative to the table start
191 print range_cell($row->{cell}, $val); # NOTE(review): presumably only reached when $row->{cell} is set - the guard is not visible here
195 my $glyph = substr $row->{table}, $val, 1; # character at this position
196 if ($glyph eq $NOCHAR) { # position holds the U+FFFD marker
201 print "\n".$glyphs->glyph_cell($glyph);
205 say '</table></div>';
212 <table class="glyphs"><tr>
213 <td class="X Cc">control
214 <td class="X Zs"><span>whitespace</span>
215 <td class="X Mn">diacritic<table class="glyphs"><tr>
216 <td class="X Sk">letter
218 <td class="X Po">punctuation<table class="glyphs"><tr>
219 <td class="X Pf">quote
221 <td class="X So">symbol<table class="glyphs"><tr>
222 <td class="X Sm">math
223 <td class="X Sc">currency
225 <td class="X No">numeric
226 <td class="X Greek">greek<table class="glyphs"><tr>
227 <td class="X Latin">latin
228 <td class="X Cyrillic">cyrillic
230 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
231 <td class="X Brahmic">brahmic
232 <td class="X Arabic">arabic
234 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
235 <td class="X African">african
236 <td class="X Hiragana">japanese
237 <td class="X Han">cjk
238 <td class="X Bopomofo">chinese
240 <td class="X Alpha">alphabetic
243 <table class="glyphs"><tr>
244 <td class="X">unicode 7.0
245 <td class="X Xr">proposed
246 <td class="X Xd">deprecated
247 <td class="">unassigned
248 <td class="X Xi">invalid