4 :><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
5 "http://www.w3.org/TR/html4/loose.dtd">
9 <meta http-equiv="content-type" content="<:= $header{content_type} :>">
10 <title>charset cheat sheet</title>
11 <:= stylesheet(qw'light') :>
15 <h1>Character encoding</h1>
18 my $diinfo = do 'digraphs.inc.pl';
19 my %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} }
22 use Encode qw(decode resolve_alias);
23 # generate character table(s)
24 # (~16x faster than decoding in loop;
25 # substr strings is twice as fast as splitting to an array)
27 # default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
28 default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
29 0 => [qw(cp437 cp863)],
30 1 => [qw(iso-8859-1 cp1252 MacRoman cp850)],
31 2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)],
32 5 => [qw(koi8-f iso-8859-5 cp1251 MacCyrillic cp855 cp866)],
33 7 => [qw(iso-8859-7 cp1253 MacGreek cp737 cp869)],
34 8 => [qw(iso-8859-8 cp1255 MacHebrew cp862)],
38 my %row = (offset => 0);
40 if ($input =~ s/^--//) {
41 $row{offset} = $endpoint > 160 ? 160 : 48;
43 elsif ($input =~ s/^-//) {
44 $row{offset} = $endpoint > 128 ? 128 : 32;
46 if ($input =~ s/-$//) {
47 $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127;
50 if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) {
51 my $start = hex($1) << ($2 ? 4 : 8);
52 my $end = $2 ? hex($2) << 4 : $start + 240;
53 $row{table} = join '', map { chr } $start .. $end+15;
54 utf8::upgrade($row{table}); # prevent latin1 output
55 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
57 elsif ($input eq 'U') {
58 $row{table} = ' ' x 512;
59 $row{set} = 'Unicode planes';
60 $row{cell} = do 'charset-ucplanes.inc.pl';
62 elsif ($row{set} = resolve_alias($input)) {
63 if ($row{set} eq 'Internal') {
64 $row{table} = ' ' x ($endpoint < 255 ? 640 : 4096);
65 $row{set} = 'Unicode BMP';
66 $row{cell} = do 'charset-unicode.inc.pl';
68 elsif ($row{set} eq 'utf-8-strict') {
71 $row{cell} = do 'charset-utf8.inc.pl';
74 $row{table} = decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
78 print "<p>Encoding $input unknown</p>\n";
85 } map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
86 $ENV{PATH_INFO} =~ /\w/ ? split(m{[/+\s]}, $ENV{PATH_INFO}) : 'default';
87 my $NOCHAR = chr 0xFFFD;
89 for my $cp437 (grep {$request[$_]->{set} eq 'cp437'} 0 .. $#request) {
90 substr($request[$cp437]->{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
91 substr($request[$cp437]->{table}, 0, 32) = pack 'U*', map {hex} qw(
92 2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
93 25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
105 my @nibble = (0..9, 'A'..'F');
106 for my $row (@request) {
107 printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
108 printf '<caption>%s</caption>', $row->{set};
110 for my $section (qw{thead}) {
111 print "<$section><tr><th>↱";
112 print '<th>', $_ for @nibble;
116 for my $msb (0 .. (length($row->{table}) || 256) - 1 >> 4) {
117 printf '<tr><th>%X', $msb + ($row->{offset} >> 4);
118 for my $lsb (0 .. $#nibble) {
120 print $row->{cell}->(($msb<<4) + $lsb);
124 my $glyph = substr $row->{table}, ($msb<<4) + $lsb, 1;
125 if ($glyph eq $NOCHAR) {
130 my $info = [ord $glyph];
131 if (defined (my $mnem = $di{ord $glyph})) {
132 $info = $diinfo->{$mnem};
135 require Unicode::UCD;
136 my $fullinfo = Unicode::UCD::charinfo(ord $glyph);
137 $info = [@$fullinfo{qw/code name category script string/}] if $fullinfo;
139 my ($codepoint, $name, $prop, $script, $string) = @$info;
141 $glyph = quote($string || $glyph);
142 my $desc = sprintf 'U+%04X%s', $codepoint, $name && " ($name)";
143 my @class = ('X', grep {$_} $prop, $script);
145 $glyph = "<span>$glyph</span>" if $prop eq 'Zs';
147 printf "\n".'<td class="%s" title="%s">%s',
148 join(' ', @class), quote($desc), $glyph;
152 print "</table></div>\n";
159 <table class="glyphs"><tr>
160 <td class="X Cc">control
161 <td class="X Zs"><span>whitespace</span>
162 <td class="X Mn">diacritic<table class="glyphs"><tr>
163 <td class="X Sk">letter
165 <td class="X Po">punctuation<table class="glyphs"><tr>
166 <td class="X Pf">quote
168 <td class="X So">symbol<table class="glyphs"><tr>
169 <td class="X Sm">math
170 <td class="X Sc">currency
172 <td class="X No">numeric
173 <td class="X Greek">greek<table class="glyphs"><tr>
174 <td class="X Latin">latin
175 <td class="X Cyrillic">cyrillic
177 <td class="X Aramaic">aramaic<table class="glyphs"><tr>
178 <td class="X Brahmic">brahmic
179 <td class="X Arabic">arabic
181 <td class="X Syllabic">syllabic<table class="glyphs"><tr>
182 <td class="X African">african
183 <td class="X Hiragana">japanese
184 <td class="X Han">cjk
185 <td class="X Bopomofo">chinese
187 <td class="X Alpha">alphabetic
190 <table class="glyphs"><tr>
191 <td class="X">unicode 5.0
192 <td class="X Xr">proposed
193 <td class="X Xd">deprecated
194 <td class="">unassigned
195 <td class="X Xi">invalid
200 <a href="/" rel="home">sheet.shiar.nl</a>/charset.<a href="/source/charset.plp"
201 rel="code" title="Written in Perl">plp</a>
202 <a href="http://git.shiar.nl/sheet.git/history/HEAD:/charset.plp"
203 rel="vcs-git" title="Git repository"><:= $VERSION :></a>
204 created by <a href="http://shiar.nl/" rel="author">Shiar</a> •
205 <a href="http://www.fsf.org/licensing/licenses/agpl-3.0.html" rel="copyright"
206 title="Licensed under the GNU Affero General Public License, version 3">AGPLv3</a>