4 title => 'charset cheat sheet',
7 "Reference sheet with all glyphs in common character encoding tables,",
8 "and an overview of Unicode ranges and UTF-8 bytes.",
11 charset codepage unicode ascii utf8 latin glyph character encoding
12 reference common overview table
14 stylesheet => [qw'light'],
15 data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
19 <h1>Character encoding</h1>
# Glyph formatter; its glyph_cell method renders the individual character cells.
22 use Shiar_Sheet::FormatChar;
23 my $glyphs = Shiar_Sheet::FormatChar->new;
# Table geometry: number of cells per row, and the value step between
# adjacent cells (both are used for headers and for cell positions below).
24 my $cols = 16; # columns
25 my $colsize = 1; # increment per column
27 # Generate the character table(s): each table is decoded in a single call
28 # (~16x faster than decoding character by character in a loop), and cells are
29 # read back with substr (twice as fast as splitting the string into an array).
# Named groups of encodings that one request word expands to. The request
# parser looks words up through $ALIAS{...}; presumably these entries are that
# hash's contents (its opening line is not visible here -- TODO confirm).
# Modifiers on a name are stripped by the parser: a leading '-' or '--' makes
# the table start at a later offset, and a trailing '-' lowers its end point.
31 # default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
32 default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
33 0 => [qw(cp437 -cp863)],
34 1 => [qw(iso-8859-1 -cp1252 -MacRoman -cp850)],
35 2 => [qw(iso-8859-2 -cp1250 -cp852 -MacCentralEurRoman -MacCroatian -MacRumanian)],
36 5 => [qw(koi8-f -iso-8859-5 -cp1251 -MacCyrillic -cp855 -cp866)],
37 7 => [qw(iso-8859-7 -cp1253 -MacGreek -cp737 -cp869)],
38 8 => [qw(iso-8859-8 -cp1255 -MacHebrew -cp862)],
# Describe one requested table. offset is the first value shown; it is also
# the start of the byte range that gets decoded for ordinary encodings.
42 my %row = (offset => 0);
# A leading '--' or '-' on the name starts the table at a later offset,
# chosen relative to the current end point.
44 if ($input =~ s/^--//) {
45 $row{offset} = $endpoint > 160 ? 160 : 48;
47 elsif ($input =~ s/^-//) {
48 $row{offset} = $endpoint > 128 ? 128 : 32;
# A trailing '-' lowers the end point, depending on the offset picked above.
50 if ($input =~ s/-$//) {
51 $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
# Optional note naming the set this one is shown over; the rendering loop
# appends it to the table caption.
54 $row{setnote} = 'over cp437' if $input eq 'cp850';
55 $row{setnote} = 'over iso-8859-1' if $input =~ /^iso-8859-|^cp125/;
# "U<hex>" selects one block of 256 code points (the digits are the block
# number); "U<hex>-<hex>" selects 16-character rows from the first row number
# through the second, inclusive.
58 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
59 my $start = hex($1) << ($2 ? 4 : 8);
60 my $end = $2 ? hex($2) << 4 : $start + 240;
61 $row{table} = join '', map { chr } $start .. $end+15;
62 utf8::upgrade($row{table}); # prevent latin1 output
63 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
# A bare "U" gives the planes overview: the table is only a placeholder of
# spaces, the actual cells come from the included data file.
65 elsif ($input eq 'U') {
66 $row{table} = ' ' x 1024;
67 $row{set} = 'Unicode planes';
68 $row{cell} = do 'charset-ucplanes.inc.pl'
69 or Alert('Table data could not be read', $@ || $!);
# Anything else must be an encoding name known to Encode.
73 elsif ($row{set} = Encode::resolve_alias($input)) {
# Perl's internal encoding stands for Unicode itself: show the range
# overview from the data file instead of decoded bytes.
74 if ($row{set} eq 'Internal') {
75 $row{table} = ' ' x ($endpoint < 255 ? 640 : 8192);
76 $row{set} = 'Unicode BMP';
77 $row{cell} = do 'charset-unicode.inc.pl'
78 or Alert('Table data could not be read', $@ || $!);
# UTF-8 likewise takes its cells from a data file.
80 elsif ($row{set} eq 'utf-8-strict') {
83 $row{cell} = do 'charset-utf8.inc.pl'
84 or Alert('Table data could not be read', $@ || $!);
# Decode the whole byte range offset..endpoint in a single call.
87 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
# $input comes from the request, so escape it before embedding it in the
# HTML message (the message itself is markup, as the <q> element shows).
91 Alert("Encoding <q>" . EscapeHTML($input) . "</q> unknown");
# Keep the row only when a set name was determined; otherwise contribute
# nothing to the list.
93 $row{set} ? \%row : ();
# Input words: the request split on '/', '+' or whitespace, or the word
# 'default' when the request contains no word character; a word that is an
# alias key expands to that alias's list of names.
98 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
99 $Request =~ /\w/ ? split(m{[/+\s]}, $Request) : 'default';
# U+FFFD; the rendering loop compares each table character against this value.
100 my $NOCHAR = chr 0xFFFD;
# Patch every requested table whose set name is exactly 'cp437'.
102 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) {
# Replace the character at index 237 with U+03D5.
103 substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
# Replace the first 32 characters of the table with the code points listed
# below (given in hexadecimal).
104 substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
105 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
106 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
# Body of the cell renderer for range tables (called as range_cell($cells, $val)
# by the rendering loop; the sub's opening line is not visible here).
# Arguments: a hash of range definitions keyed by start offset, and the
# offset of the current cell. Returns the opening <td> markup with the range
# name, or nothing when no range starts at this offset.
111 my ($table, $offset) = @_;
112 my $def = $table->{$offset} or return;
# A definition is a list: length in cells, CSS class, label, tooltip text.
113 my ($len, $class, $name, $title) = @{$def};
# Unnamed ranges get a placeholder label, abbreviated when the range is short.
117 $name //= $len <= 2 ? 'res' : 'reserved';
# Non-zero column position: the range starts somewhere inside a row.
119 if (my $part = $offset/$colsize % $cols) {
121 my $rest = $cols - $part; # remaining
122 $rest = $len if $len < $rest; #TODO: optimise
124 # continued on new row
# Definition for the continuation, marked with the extra class "joinu".
125 my @next = ($len * $colsize, "$class joinu");
128 push @next, $name, $title;
133 # minority on next row
134 push @next, '"', $title || $name;
# Register the continuation just past the cells covered here, unless a
# definition already exists at that offset.
136 $table->{$offset + $colsize*$rest} //= \@next;
# Range starting at the first column and covering at least one full row.
141 elsif (my $rows = int($len / $cols)) {
143 if ($len -= $rows * $cols) {
144 # partial row remains
145 $table->{$offset + $colsize*$rows * $cols} //= [$len*$colsize, "$class joinu", '', $title];
148 $attr .= sprintf ' rowspan=%d', $rows;
152 $attr .= sprintf ' colspan=%d', $len unless $len == 1;
# A class value may carry one embedded  key="value"  attribute; move it out
# of the class string and into the tag's attributes.
153 $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//;
154 $attr .= sprintf ' class="%s"', $class if $class;
# Only the tooltip is HTML-escaped; the label is emitted as given.
155 $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title;
156 return "<td$attr>$name";
# Emit one HTML table per requested row description.
159 for my $row (@request) {
# Tables without range data additionally get the "charmap" class.
160 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
# Caption: the set name, followed by the optional note in an <aside>.
161 my $title = $row->{set};
162 $title .= " <aside>($_)</aside>" for $row->{setnote} // ();
163 printf '<caption>%s</caption>', $title;
# One <col> for the row header plus one per data column.
164 print '<col>' x ($cols + 1);
165 my $coldigits = $colsize * $cols <= 16 ? 1 : 2; # uniform length of hexadecimal header
# Header row: a corner cell, then the hexadecimal value of each column.
166 for my $section (qw{thead}) {
167 print "<$section><tr><th>↱";
168 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1;
# One table row per $cols characters of the table string (256 characters
# are assumed when the string is empty).
172 for my $msb (0 .. ((length($row->{table}) || 256) - 1) / $cols) {
# Row header: hexadecimal value of the row's first cell, shifted by the
# row's start offset.
173 printf '<tr><th>%X', ($msb + int($row->{offset} / $cols)) * $cols * $colsize;
174 for my $lsb (0 .. $cols - 1) {
# Position of this cell, used both as range key and as string index.
175 my $val = ( ($msb * $cols) + $lsb ) * $colsize;
# Range tables: emit the range cell that starts here, if any (the
# surrounding condition is not visible in this excerpt).
177 print range_cell($row->{cell}, $val);
# Character tables: fetch the character at this position; U+FFFD gets
# separate handling, everything else is rendered by the glyph formatter.
181 my $glyph = substr $row->{table}, $val, 1;
182 if ($glyph eq $NOCHAR) {
187 print "\n".$glyphs->glyph_cell($glyph);
191 say '</table></div>';
198 <table class="glyphs"><tr>
199 <td class="X Cc">control
200 <td class="X Zs"><span>whitespace</span>
201 <td class="X Mn">diacritic<table class="glyphs"><tr>
202 <td class="X Sk">letter
204 <td class="X Po">punctuation<table class="glyphs"><tr>
205 <td class="X Pf">quote
207 <td class="X So">symbol<table class="glyphs"><tr>
208 <td class="X Sm">math
209 <td class="X Sc">currency
211 <td class="X No">numeric
212 <td class="X Greek">greek<table class="glyphs"><tr>
213 <td class="X Latin">latin
214 <td class="X Cyrillic">cyrillic
216 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
217 <td class="X Brahmic">brahmic
218 <td class="X Arabic">arabic
220 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
221 <td class="X African">african
222 <td class="X Hiragana">japanese
223 <td class="X Han">cjk
224 <td class="X Bopomofo">chinese
226 <td class="X Alpha">alphabetic
229 <table class="glyphs"><tr>
230 <td class="X">unicode 7.0
231 <td class="X Xr">proposed
232 <td class="X Xd">deprecated
233 <td class="">unassigned
234 <td class="X Xi">invalid