4 title => 'charset cheat sheet',
7 "Reference sheet with all glyphs in common character encoding tables,",
8 "and an overview of Unicode ranges and UTF-8 bytes.",
11 charset codepage unicode ascii utf8 latin glyph character encoding
12 reference common overview table
14 stylesheet => [qw'light'],
15 data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
19 <h1>Character encoding</h1>
# Formatter object; its glyph_cell method renders each character cell in the
# table output loop further down.
22 use Shiar_Sheet::FormatChar;
23 my $glyphs = Shiar_Sheet::FormatChar->new;
24 my $cols = 16; # number of cells per table row
26 # generate character table(s)
27 # (~16x faster than decoding in a loop;
28 # taking substr of a string is twice as fast as splitting into an array)
# Alias table (presumably %ALIAS, which the request parser consults): each key
# expands to a list of table specifications. 'default' is what the parser
# falls back to when the request contains no word character.
# Name decorations are stripped by the parser and select a partial table:
#   leading '-'  : start at offset 128 (or 32 when the endpoint is not above 128)
#   leading '--' : start at offset 160 (or 48 when the endpoint is not above 160)
#   trailing '-' : shorten the endpoint (127, 159 or 191 depending on the offset)
# NOTE(review): the numeric keys look like ISO 8859 part numbers grouping
# related code pages (0 for the DOS cp437 family) -- confirm.
30 # default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
31 default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
32 0 => [qw(cp437 -cp863)],
33 1 => [qw(iso-8859-1 -cp1252 -MacRoman -cp850)],
34 2 => [qw(iso-8859-2 -cp1250 -cp852 -MacCentralEurRoman -MacCroatian -MacRumanian)],
35 5 => [qw(koi8-f -iso-8859-5 -cp1251 -MacCyrillic -cp855 -cp866)],
36 7 => [qw(iso-8859-7 -cp1253 -MacGreek -cp737 -cp869)],
37 8 => [qw(iso-8859-8 -cp1255 -MacHebrew -cp862)],
# Turn one requested table name into a row description (%row), returned as a
# hash reference, or as an empty list when no character set was resolved.
# NOTE(review): $input and $endpoint are initialised on lines not visible in
# this view, as are the enclosing block header and several closing braces.
41 my %row = (offset => 0);
# A leading '--' or '-' makes the table start partway into the code page;
# the lower offsets (48 / 32) apply when $endpoint does not exceed 160 / 128.
43 if ($input =~ s/^--//) {
44 $row{offset} = $endpoint > 160 ? 160 : 48;
46 elsif ($input =~ s/^-//) {
47 $row{offset} = $endpoint > 128 ? 128 : 32;
# A trailing '-' cuts the table short: up to 127 for offset 0, up to 159 for
# offsets below 160, otherwise up to 191.
49 if ($input =~ s/-$//) {
50 $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
# Caption notes naming the code page this one is shown relative to.
# NOTE(review): these may sit inside a condition on a line not visible here.
53 $row{setnote} = 'over cp437' if $input eq 'cp850';
54 $row{setnote} = 'over iso-8859-1' if $input =~ /^iso-8859-|^cp125/;
# "U<hex>" or "U<hex>-<hex>": a range of Unicode code points. A single value
# is shifted left 8 bits and spans 256 characters; with an end value both
# numbers are shifted left 4 bits. The pattern only accepts lowercase hex.
57 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
58 my $start = hex($1) << ($2 ? 4 : 8);
59 my $end = $2 ? hex($2) << 4 : $start + 240;
60 $row{table} = join '', map { chr } $start .. $end+15;
61 utf8::upgrade($row{table}); # prevent latin1 output
62 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
# Bare "U": overview of Unicode planes. The table is only a 1024-space
# placeholder; the cells come from the range data file loaded with do().
64 elsif ($input eq 'U') {
65 $row{table} = ' ' x 1024;
66 $row{set} = 'Unicode planes';
67 $row{cell} = do 'charset-ucplanes.inc.pl'
68 or Alert('Table data could not be read', $@ || $!);
# Anything else must be an encoding name known to the Encode module.
71 elsif ($row{set} = Encode::resolve_alias($input)) {
# 'Internal' is rendered as a Unicode BMP overview from range data, with a
# placeholder of 640 spaces when $endpoint is below 255 and 8192 otherwise.
72 if ($row{set} eq 'Internal') {
73 $row{table} = ' ' x ($endpoint < 255 ? 640 : 8192);
74 $row{set} = 'Unicode BMP';
75 $row{cell} = do 'charset-unicode.inc.pl'
76 or Alert('Table data could not be read', $@ || $!);
# UTF-8 is likewise rendered from range data rather than decoded characters.
78 elsif ($row{set} eq 'utf-8-strict') {
81 $row{cell} = do 'charset-utf8.inc.pl'
82 or Alert('Table data could not be read', $@ || $!);
# Decode all byte values from offset to endpoint in a single call, giving one
# character per table cell (presumably the fallback branch -- confirm).
85 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
89 Alert("Encoding <q>$input</q> unknown");
# Only rows with a resolved set name are kept.
91 $row{set} ? \%row : ();
# Input list: the request split on '/', '+' or whitespace, or the single name
# 'default' when it contains no word character; aliases are expanded first.
96 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
97 $Request =~ /\w/ ? split(m{[/+\s]}, $Request) : 'default';
# U+FFFD; glyphs equal to this character get separate handling in the table
# output loop below.
98 my $NOCHAR = chr 0xFFFD;
# For every requested row whose set is cp437, overwrite parts of the decoded
# table: position 237 becomes U+03D5, and the first 32 positions are replaced
# by the 32 code points listed.
# NOTE(review): presumably the graphical symbols cp437 displays for the
# control-code range -- confirm.
100 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) {
101 substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
102 substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
103 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
104 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
# range_cell($table, $offset): render the opening <td> for the range that
# starts at $offset in a range table, or return nothing when no range is
# defined there.
#   $table  - hash reference mapping offsets to [length, class, name, title]
#             definitions, with an optional 'colsize' entry (defaults to 1)
#   $offset - key of the cell being rendered
# Ranges that do not fit the current row are split: the remainder is stored
# back into $table at a later offset so a subsequent call renders it.
# NOTE(review): the sub header, the declaration of $attr and several branch
# bodies are on lines not visible in this view.
109 my ($table, $offset) = @_;
110 my $def = $table->{$offset} or return;
111 my ($len, $class, $name, $title) = @{$def};
113 my $colsize = $table->{colsize} || 1;
# Unnamed ranges are labelled as reserved, abbreviated when at most 2 cells.
116 $name //= $len <= 2 ? 'res' : 'reserved';
# $part is the column position within the row; non-zero means the range
# starts partway through a row.
118 if (my $part = $offset/$colsize % $cols) {
120 my $rest = $cols - $part; # remaining
121 $rest = $len if $len < $rest; #TODO: optimise
123 # continued on new row
# The continuation entry carries the extra class 'joinu'.
124 my @next = ($len * $colsize, "$class joinu");
127 push @next, $name, $title;
132 # minority on next row
# The continuation shows a ditto mark; its title falls back to the name.
133 push @next, '"', $title || $name;
# //= keeps any definition already present at the continuation offset.
135 $table->{$offset + $colsize*$rest} //= \@next;
# Range starts on a row boundary and covers at least one full row.
140 elsif (my $rows = int($len / $cols)) {
142 if ($len -= $rows * $cols) {
143 # partial row remains
144 $table->{$offset + $colsize*$rows * $cols} //= [$len*$colsize, "$class joinu", '', $title];
147 $attr .= sprintf ' rowspan=%d', $rows;
151 $attr .= sprintf ' colspan=%d', $len unless $len == 1;
# A ' name="value"' fragment embedded in the class string is moved out of the
# class and emitted as a separate attribute.
152 $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//;
153 $attr .= sprintf ' class="%s"', $class if $class;
154 $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title;
155 return "<td$attr>$name";
# Print one HTML table per requested row description.
# NOTE(review): several conditions and closing braces of this loop are on
# lines not visible in this view.
158 for my $row (@request) {
# Range tables may define how many code points each cell represents.
159 my $colsize = $row->{cell} && $row->{cell}->{colsize} || 1;
# Tables without range data get the additional class 'charmap'.
160 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
# Caption: the set name, followed by the optional note in an <aside>.
161 my $title = $row->{set};
162 $title .= " <aside>($_)</aside>" for $row->{setnote} // ();
163 printf '<caption>%s</caption>', $title;
# One <col> per cell column plus one for the row header.
164 print '<col>' x ($cols + 1);
165 my $coldigits = $colsize * $cols <= 16 ? 1 : 2; # uniform length of hexadecimal header
# Header row: a corner cell followed by the hexadecimal offset of each column.
166 for my $section (qw{thead}) {
167 print "<$section><tr><th>↱";
168 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1;
# One table row per $cols characters; a length of 256 is assumed when the
# row has no (or an empty) table string.
172 for my $msb (0 .. ((length($row->{table}) || 256) - 1) / $cols) {
# Row header: hexadecimal value of the row's first cell, including the offset.
173 printf '<tr><th>%X', ($msb + int($row->{offset} / $cols)) * $cols * $colsize;
174 for my $lsb (0 .. $cols - 1) {
175 my $val = ( ($msb * $cols) + $lsb ) * $colsize;
# Range tables are rendered through range_cell (presumably only when
# $row->{cell} is set -- the condition is not visible here).
177 print range_cell($row->{cell}, $val);
# Otherwise the cell shows the character at this position of the table string.
181 my $glyph = substr $row->{table}, $val, 1;
# U+FFFD is handled in a branch whose body is not visible here.
182 if ($glyph eq $NOCHAR) {
187 print "\n".$glyphs->glyph_cell($glyph);
191 say '</table></div>';
198 <table class="glyphs"><tr>
199 <td class="X Cc">control
200 <td class="X Zs"><span>whitespace</span>
201 <td class="X Mn">diacritic<table class="glyphs"><tr>
202 <td class="X Sk">letter
204 <td class="X Po">punctuation<table class="glyphs"><tr>
205 <td class="X Pf">quote
207 <td class="X So">symbol<table class="glyphs"><tr>
208 <td class="X Sm">math
209 <td class="X Sc">currency
211 <td class="X No">numeric
212 <td class="X Greek">greek<table class="glyphs"><tr>
213 <td class="X Latin">latin
214 <td class="X Cyrillic">cyrillic
216 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
217 <td class="X Brahmic">brahmic
218 <td class="X Arabic">arabic
220 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
221 <td class="X African">african
222 <td class="X Hiragana">japanese
223 <td class="X Han">cjk
224 <td class="X Bopomofo">chinese
226 <td class="X Alpha">alphabetic
229 <table class="glyphs"><tr>
230 <td class="X">unicode 7.0
231 <td class="X Xr">proposed
232 <td class="X Xd">deprecated
233 <td class="">unassigned
234 <td class="X Xi">invalid