charset.plp

   1 <(common.inc.plp)><:
   2
   3 Html({
   4         title => 'charset cheat sheet',
   5         version => '1.0',
   6         description => [
   7                 "Reference sheet with all glyphs in common character encoding tables,",
   8                 "and an overview of Unicode ranges and UTF-8 bytes.",
   9         ],
  10         keywords => [qw'
  11                 charset codepage unicode ascii utf8 latin glyph character encoding
  12                 reference common overview table
  13         '],
  14         stylesheet => [qw'light'],
  15         data => [qw'charset-unicode.inc.pl charset-utf8.inc.pl'],
  16 });
  17
  18 my @tablist = split m{/+}, $Request || 'default';
  19
  20 use List::Util qw( first pairmap pairfirst pairs );
  21
  22 :>
  23 <h1>Character encodings</h1>
  24
  25 <p>
  26 <:
  27 if ($tablist[0] eq 'default') {
  28         say "Overview of Unicode allocation and common latin code pages.";
  29         say "Compare alternate charsets:";
  30 }
  31 else {
  32         say "Charset comparison:";
  33 }
  34
  35 sub optionlink {
  36         my ($title, $href, $selected) = @_;
  37         return sprintf(
  38                 $selected ? '<strong>%s</strong>' : '<a href="%2$s">%s</a>',
  39                 EscapeHTML($title), $href
  40         );
  41 }
  42
  43 print join " •\n", (
  44         map {
  45                 join " ·\n", pairmap {
  46                         optionlink($b || ucfirst $a, "/charset/$a?compare", $a eq $Request);
  47                 } @{$_}
  48         }
  49         [
  50                 iso      => 'ISO',
  51                 dos      => 'DOS',
  52                 mac      => 'Apple',
  53                 ebcdic   => 'EBCDIC',
  54         ],
  55         [
  56                 westeur  => 'West',
  57                 centeur  => 'Central',
  58                 norteur  => 'North European',
  59                 turkish  => 0,
  60                 greek    => 0,
  61                 cyrillic => 0,
  62                 hebrew   => 0,
  63         ],
  64         [ uc => 'Unicode' ],
  65 );
  66 :>.
  67 </p>
  68
  69 <:
  70 use POSIX qw( ceil );
  71 use Shiar_Sheet::FormatChar;
  72 my $glyphs = Shiar_Sheet::FormatChar->new;
  73 my @request;
  74
  75 sub tabinput {
  76         # generate character table(s)
  77         my $input = shift or return;
  78
  79         state $ALIAS = {
  80                 default    => [qw( unicode+0-639 utf-8 iso-8859-1 iso-8859-15 cp1252 cp437 cp850 )],
  81                 uc         => [qw( U+cols=32 unicode+0-4095 unicode+4096-6319  unicode+6320-8191 )],
  82                 us         => [qw( cp437 cp863 gsm0338 AdobeStandardEncoding )],
  83                 ebcdic     => [qw( cp37 cp500 cp1047 posix-bc cp1026 cp875 )],
  84                 iso        => [map {"iso-8859-$_"} 1 .. 11, 13 .. 16],
  85                 dos        => [qw( cp437 cp865 cp861 cp860 cp863 cp850 cp857 cp852 cp775
  86                                    cp737 cp869 cp866 cp855 cp862 cp864 )],
  87                 aix        => [qw( cp1006 )],
  88                 win        => [qw( cp1252 cp1250 cp1254 cp1257 cp1258 cp1253 cp1251 cp1255 cp1256 cp874 )],
  89                 mac        => [qw( MacRoman MacRomanian MacRumanian MacCroatian MacCentralEurRoman MacTurkish MacIcelandic MacSami
  90                                    MacGreek MacCyrillic MacHebrew MacArabic MacFarsi MacThai )],
  91
  92                 # languages
  93                 westeur    => [qw( iso-8859-1 iso-8859-15 cp1252 iso-8859-14 cp850 hp-roman8 nextstep MacRoman )],
  94                 centeur    => [qw( iso-8859-2 iso-8859-16 cp1250 cp852 MacRomanian MacCroatian MacCentralEurRoman )], # MacRumanian only for DB
  95                 turkish    => [qw( iso-8859-9 iso-8859-3 cp1254 cp857 MacTurkish )],
  96                 norteur    => [qw( baltic nordic )],
  97                 baltic     => [qw( iso-8859-4 iso-8859-13 cp1257 cp775 )],
  98                 nordic     => [qw( iso-8859-10 cp865 cp861 MacIcelandic MacSami )],
  99                 cyrillic   => [qw( koi8-r koi8-u koi8-f iso-8859-5 cp1251 MacCyrillic cp866 cp855
 100                                    U4 U2DE-2DF UA64-A69 U50-52 )], # MacUkrainian is broken
 101                 arabic     => [qw( iso-8859-6 cp1256 MacArabic cp864 cp1006 MacFarsi
 102                                    U6 U8A-8F+0-31+64 U75-77 )],
 103                 greek      => [qw( iso-8859-7 cp1253 MacGreek cp737 cp869  U37-3F U1F )],
 104                 hebrew     => [qw( iso-8859-8 cp1255 MacHebrew cp862  U59-5F )],
 105                 thai       => [qw( iso-8859-11 cp874 MacThai )],
 106                 vietnamese => [qw( viscii cp1258 MacVietnamese )],
 107                 symbol     => [qw( symbol dingbats MacDingbats AdobeZdingbat AdobeSymbol )],
 108
 109                 # iso-code shorthand
 110                 1 => 'westeur',
 111                 2 => 'centeur',
 112                 3 => 'turkish',
 113                 4 => 'baltic',
 114                 5 => 'cyrillic',
 115                 6 => 'arabic',
 116                 7 => 'greek',
 117                 8 => 'hebrew',
 118                 9 => 'turkish',
 119                 10 => 'nordic',
 120                 11 => 'thai',
 121         };
 122         if (my $follow = $ALIAS->{$input}) {
 123                 tabinput($_) for ref $follow ? @{$follow} : $follow;
 124                 return;
 125         }
 126
 127         state $visible = {};  # all present tables
 128         my %row = (offset => 0, cols => 16);
 129         my $params = $input =~ s/[+](.*)\z// ? $1 : undef;
 130
 131         if (not defined $params) {
 132                 state $INHERIT = {
 133                         'cp437'       => ['cp850' => 0, 'ascii' => '0-31+128'], # ascii range overridden later
 134                         'gsm0338'     => ['ascii' => '0-127'],
 135
 136                         'iso-8859-2'  => ['iso-8859-1' => '160'],
 137                         'iso-8859-3'  => ['iso-8859-1' => '160'], #TODO: also apply to iso-8859-9
 138                         'iso-8859-4'  => ['iso-8859-2' => '160'],
 139                         'iso-8859-5'  => ['iso-8859-1' => '160'],
 140                         'iso-8859-6'  => ['cp1256' => '128', 'iso-8859-1' => '160'],
 141                         'iso-8859-7'  => ['iso-8859-1' => '160'],
 142                         'iso-8859-8'  => ['iso-8859-1' => '160'],
 143                         'iso-8859-9'  => ['iso-8859-1' => '208-223+240'],
 144                         'iso-8859-10' => ['iso-8859-4' => '160'],
 145                         'iso-8859-11' => ['iso-8859-1' => '160'],
 146                         'iso-8859-13' => ['iso-8859-4' => '160'],
 147                         'iso-8859-14' => ['iso-8859-1' => '160'],
 148                         'iso-8859-15' => ['iso-8859-1' => '160-191'],
 149                         'iso-8859-16' => ['iso-8859-2' => '160'],
 150                         'hp-roman8'   => ['iso-8859-1' => '160'],
 151
 152                         'cp1252'      => ['iso-8859-1' => '128-159'],
 153                         'cp1250'      => ['iso-8859-2' => '128-191', 'cp1252' => '128'],
 154                         'cp1254'      => ['iso-8859-9' => '128-159', 'cp1252' => '128-159+208'],
 155                         'cp874'       => ['iso-8859-11' => '128-159', 'cp1252' => '128'], # windows-874 actually cp1162
 156                         'cp1257'      => ['iso-8859-13' => '128-159+255', 'cp1252' => '128'],
 157                         'cp1251'      => ['cp1252' => '128'],
 158                         'cp1253'      => ['cp1252' => '128'],
 159                         'cp1255'      => ['iso-8859-8' => '128-223', 'cp1252' => '128'],
 160                         'cp1256'      => ['cp1252' => '128'],
 161                         'cp1258'      => ['cp1252' => '128-159+192'],
 162
 163                         'cp850'       => ['cp437' => '144'],
 164                         'cp860'       => ['cp437' => '128-175'],
 165                         'cp861'       => ['cp865' => '128-175'],
 166                         'cp863'       => ['cp437' => '128-175'],
 167                         'cp865'       => ['cp437' => '144-175'],
 168                         'cp852'       => ['cp850' => '128', 'cp437' => '128'],
 169                         'cp857'       => ['cp850' => '128-175+208-239', 'cp437' => '128'],
 170                         'cp775'       => ['cp850' => '128'],  # partial cp437
 171                         'cp866'       => ['cp437' => '128-175+224'],
 172                         'cp855'       => ['cp437' => '128'],
 173                         'cp1006'      => ['iso-8859-6' => '160', 'cp437' => '128'],
 174                         'cp737'       => ['cp437' => '128-175+224'],
 175                         'cp869'       => ['cp437' => '128'],
 176                         'cp862'       => ['cp437' => '128-159'],
 177                         'cp864'       => ['MacArabic' => '128', 'iso-8859-6' => '128', 'cp437' => '128'], #TODO: compare form variants
 178
 179                         'koi8-u'      => ['koi8-r' => '144-191'],
 180                         'koi8-f'      => ['koi8-u' => '144-191'],
 181
 182                         'MacRomanian' => ['MacRoman' => '160-191+208-223'],
 183                         'MacRumanian' => ['MacRomanian' => '160-191+208-223', 'MacRoman' => '160-191+208-223'],
 184                         'MacCroatian' => ['MacRoman' => '160'],
 185                         'MacCentralEurRoman' => ['MacRoman' => '128'],
 186                         'MacIcelandic'=> ['MacRoman' => '160-175+208-239'],
 187                         'MacTurkish'  => ['MacRoman' => '208-223'], # F5 is unassigned
 188                         'MacSami'     => ['MacIcelandic' => '144-191+208-223+240', 'MacRoman' => '144'],
 189                         'MacGreek'    => ['MacRoman' => '128'],
 190                         'MacCyrillic' => ['MacRoman' => '128'],
 191                         'MacHebrew'   => ['iso-8859-8' => '128', 'MacRoman' => '128-143+160'], # partial ascii
 192                         'MacArabic'   => ['iso-8859-6' => '128', 'cp864' => '128', 'MacRoman' => '128'], #TODO: multiple parents
 193                         'MacFarsi'    => ['MacArabic' => '176-191', 'MacRoman' => '128'],
 194
 195                         'cp37'        => ['posix-bc' => '0'],
 196                         'posix-bc'    => ['cp1047' => '64'],
 197                         'cp500'       => ['cp37' => '64-95+176-191'],
 198                         'cp1047'      => ['cp37' => '16-47+80-95+160-191'],
 199                         'cp1026'      => ['cp37' => '64'],
 200                         'cp875'       => ['cp37' => '48'],
 201
 202                 };
 203
 204                 my @parents = @{ $INHERIT->{$input} || [] };
 205
 206                 if (my ($parent, $part) = pairfirst { defined $visible->{$a} } @parents) {
 207                         $row{parent} = $parent;
 208                         $params = $part;
 209                         $params = 128 unless $visible->{$parent}
 210                                 or ($input eq 'MacCroatian' and defined $visible->{MacRomanian});
 211                 }
 212                 elsif (defined $visible->{ascii}) {
 213                         $row{parent} = $parents[0];
 214                         $params = $parents[1] // 128;
 215                         $params = 128 if $params >= 128;  # ascii offset at most
 216                 }
 217                 elsif (@parents) {
 218                         $row{parent} = $parents[0];
 219                         $params = $parents[1] if $parents[1] == 0;  # apply ascii end
 220                 }
 221                 $visible->{$_} //= 0 for $row{parent} || ();
 222         }
 223
 224         for my $param (split /[+]+/, $params // '') {
 225                 if ($param =~ m{ \A cols = (\d+) \z }x) {
 226                         $row{cols} = $1;
 227                 }
 228                 elsif ($param =~ m{ \A (?<offset> \d+) (?: [-] (?<endpoint> \d+) )? \z }x) {
 229                         if ($row{offset}) {
 230                                 my $skip = int(($row{endpoint} || $row{offset}) / $row{cols});
 231                                 for ($skip + 1 .. ($+{offset} / $row{cols}) - 1) {
 232                                         $row{skip}->{ $_ * $row{cols} - $row{offset} }++;
 233                                 }
 234                         }
 235                         else {
 236                                 $row{offset} = $+{offset};
 237                         }
 238                         $row{endpoint} = $+{endpoint};
 239                 }
 240                 else {
 241                         Alert("Unknown option <q>$param</q> for charset $input");
 242                 }
 243         }
 244
 245         if ($input =~ /^U([0-9a-fA-F]+)(?:-([0-9a-fA-F]+))?/) {
 246                 my $start = hex($1) << ($2 ? 4 : 8);
 247                 my $end = $2 ? (hex($2) << 4) + $row{cols} - 1 : $start + 255;
 248                 $row{table} = join '', map { chr } $start .. $end;
 249                 utf8::upgrade($row{table});  # prevent latin1 output
 250                 $row{endpoint} = $end - $start;
 251                 $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8;
 252                 $row{offset} = $start % 256;
 253         }
 254         elsif ($input eq 'U') {
 255                 $row{set} = 'Unicode planes';
 256                 $row{cell} = do 'charset-ucplanes.inc.pl'
 257                         or Alert('Table data could not be read', $@ || $!);
 258                 $row{endpoint} = 1023 * $row{cell}->{colsize};
 259         }
 260         elsif ($row{set} = Encode::resolve_alias($input)) {
 261                 if ($row{set} eq 'Internal') {
 262                         $row{cell} = do 'charset-unicode.inc.pl'
 263                                 or Alert('Table data could not be read', $@ || $!);
 264
 265                         $row{endpoint} ||= 8191;
 266                         $row{endpoint}  *= $row{cell}->{colsize};
 267                         $row{startpoint} = $row{cell}->{colsize} * $row{offset};
 268                         $row{offset} = 0;
 269                         $row{set} = 'Unicode ' . (
 270                                 $row{startpoint} <  0x10000 && $row{endpoint} < 0x10000 ? 'BMP' :
 271                                 $row{startpoint} >= 0x10000 && $row{endpoint} < 0x20000 ? 'SMP' :
 272                                 'allocations'
 273                         );
 274                 }
 275                 elsif ($row{set} eq 'utf-8-strict') {
 276                         $row{set} = 'UTF-8';
 277                         $row{cell} = do 'charset-utf8.inc.pl'
 278                                 or Alert('Table data could not be read', $@ || $!);
 279                         $row{endpoint} = 255;
 280                 }
 281                 else {
 282                         $row{endpoint} ||= 255;
 283                         if ($row{set} eq 'MacHebrew' or $row{set} eq 'MacThai') {
 284                                 # array of possibly multiple characters per code point
 285                                 $row{table} = [
 286                                         map { Encode::decode($row{set}, pack 'C*', $_) } $row{offset} .. $row{endpoint}
 287                                 ];
 288                         }
 289                         else {
 290                                 # ~16x faster than decoding in loop;
 291                                 # substr strings is twice as fast as splitting to an array
 292                                 $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $row{endpoint});
 293                         }
 294                         $row{endpoint} -= $row{offset};
 295
 296                         if ($row{set} eq 'cp437' and !$row{offset}) {
 297                                 substr($row{table}, 237, 1) = pack 'U*', 0x3D5; # phi sign
 298                                 substr($row{table}, 0, 32) = pack 'U*', map {hex} qw(
 299                                         2007 263A 263B 2665 2666 2663 2660 2022
 300                                         25D8 25CB 25D9 2642 2640 266A 266B 263C
 301                                         25BA 25C4 2195 203C 00B6 00A7 25AC 21A8
 302                                         2191 2193 2192 2190 221F 2194 25B2 25BC
 303                                 );
 304                         }
 305
 306                         $visible->{ascii} =  # assume common base
 307                         $visible->{ $row{set} } = 1;
 308                 }
 309         }
 310         else {
 311                 Alert("Encoding <q>$input</q> unknown");
 312                 return;
 313         }
 314         push @request, \%row;
 315 }
 316 tabinput($_) for @tablist;
 317
 318 my $NOCHAR = chr 0xFFFD;
 319
 320 sub range_cell {
 321         my ($info, $offset) = @_;
 322         my $table = $info->{cell} or return;
 323         my $def = $table->{$offset} or return;
 324         my ($len, $class, $name, $title) = @{$def};
 325
 326         my $cols = $info->{cols};
 327         my $colsize = $table->{colsize} || 1;
 328         my $attr = '';
 329         $len /= $colsize;
 330         $name //= $len <= 2 ? 'res' : 'reserved';
 331
 332         if (my $part = $offset/$colsize % $cols) {
 333                 # continued row
 334                 my $rest = $cols - $part;  # remaining
 335                 $rest = $len if $len < $rest; #TODO: optimise
 336                 if ($len -= $rest) {
 337                         # continued on new row
 338                         my @next = ($len * $colsize, "$class joinu");
 339                         if ($len > $rest) {
 340                                 # minority remains
 341                                 push @next, $name, $title;
 342                                 $title ||= $name;
 343                                 $name = '';
 344                         }
 345                         else {
 346                                 # minority on next row
 347                                 push @next, '"', $title || $name;
 348                         }
 349                         $table->{$offset + $colsize*$rest} //= \@next;
 350                         $class .= ' joind';
 351                 }
 352                 $len = $rest;
 353         }
 354         elsif (my $rows = int($len / $cols)) {
 355                 # multiple full rows
 356                 my $rowsize = $colsize * $cols;
 357                 if ($len -= $rows * $cols) {
 358                         # partial row remains
 359                         $table->{$offset + $rowsize * $rows} //= [$len*$colsize, "$class joinu", '', $title];
 360                         $class .= ' joind';
 361                 }
 362
 363                 # coalesce multiple rows
 364                 while ($rows > 3) {
 365                         $info->{skip}->{$offset += $rowsize}++;
 366                         $rows--;
 367                 }
 368                 if ($rows > 2) {
 369                         $info->{skip}->{$offset += $rowsize} = 0;
 370                 }
 371
 372                 $attr .= sprintf ' rowspan=%d', $rows;
 373                 $len = $cols;
 374         }
 375
 376         $attr .= sprintf ' colspan=%d', $len unless $len == 1;
 377         $attr .= $1 if $class and $class =~ s/( \w+="[^"]*")//;
 378         $attr .= sprintf ' class="%s"', $class if $class;
 379         $attr .= sprintf ' title="%s"', EscapeHTML($title) if $title;
 380         return "<td$attr>$name";
 381 }
 382
 383 for my $row (@request) {
 384         my $cols = $row->{cols};
 385         my $colsize = $row->{cell} && $row->{cell}->{colsize} || 1;
 386         my $coldigits = ceil(log($colsize * $cols) / log(16));  # uniform length of hexadecimal header
 387         my $rowdiv = 16 ** $coldigits;  # row divide for column digits
 388         $rowdiv = 1 if $rowdiv != $cols * $colsize;  # divide only if all columns are matched
 389         my $offset = $row->{startpoint} || 0;
 390
 391         printf '<div class="section"><table class="glyphs%s">', !$row->{cell} && ' charmap';
 392         my $title = $row->{set};
 393         $title .= " <aside>(over $_)</aside>"
 394                 for grep { $_ ne 'iso-8859-1' } $row->{parent} // ();
 395         printf '<caption>%s</caption>', $title;
 396         print '<col>' x ($cols + 1);
 397         for my $section (qw{thead}) {
 398                 print "<$section><tr><th>", $rowdiv == 1 ? '+' : '↱';
 399                 printf '<th>%0*X', $coldigits, $_ * $colsize for 0 .. $cols - 1;
 400                 print "\n";
 401         }
 402         print '<tbody>';
 403         while ($offset < $row->{endpoint}) {
 404                 if ($row->{skip}->{$offset}) {
 405                         $offset += $cols * $colsize;
 406                         next;
 407                 }
 408
 409                 print '<tr><th>';
 410                 if (defined $row->{skip}->{$offset}) {
 411                         print '⋮';
 412                 }
 413                 else {
 414                         if (my $rowmod = $offset % $rowdiv) {
 415                                 # offset in column units
 416                                 printf '<small>+%X</small>', $rowmod;
 417                         }
 418                         else {
 419                                 # divided row offset
 420                                 printf '%X', ($offset + $row->{offset}) / $rowdiv;
 421                         }
 422                 }
 423                 for (1 .. $cols) {
 424                         if ($row->{cell}) {
 425                                 print range_cell($row, $offset);
 426                                 next;
 427                         }
 428
 429                         my $glyph = ref $row->{table} eq 'ARRAY' ? $row->{table}->[$offset] :
 430                                 substr $row->{table}, $offset, 1;
 431                         if ($glyph eq $NOCHAR) {
 432                                 print '<td>';
 433                                 next;
 434                         }
 435
 436                         if (exists $get{compare}) {
 437                                 state $visible = {};
 438                                 my $cp = $offset + $row->{offset};
 439                                 printf '<td class="%s" title="%3$s">%2$s',
 440                                         $cp == ord $glyph ? 'l4' :
 441                                         $row->{parent} && $glyph eq
 442                                                 Encode::decode($row->{parent}, pack 'C', $cp) ? 'l3' :
 443                                         $visible->{$glyph} ? 'l2' :
 444                                         'l1',
 445                                         $glyphs->glyph_html($glyph);
 446                                 $visible->{$glyph}++;
 447                                 next;
 448                         }
 449
 450                         print "\n".$glyphs->glyph_cell($glyph);
 451                 }
 452                 continue {
 453                         $offset += $colsize;
 454                 }
 455                 print "\n";
 456         }
 457         say '</table></div>';
 458 }
 459
 460 :>
 461 <hr>
 462
 463 <div class="legend">
 464         <table class="glyphs"><tr><: if (exists $get{compare}) { :>
 465         <td class="X l4">unicode
 466         <td class="X l3">inherited
 467         <td class="X l2">existing
 468         <td class="X l1">original
 469         <td class="">unassigned
 470 <: } else { :>
 471         <td class="X Cc">control
 472         <td class="X Zs"><span>whitespace</span>
 473         <td class="X Mn">diacritic<table class="glyphs"><tr>
 474                 <td class="X Sk">letter
 475                 </table>
 476         <td class="X Po">punctuation<table class="glyphs"><tr>
 477                 <td class="X Pf">quote
 478                 </table>
 479         <td class="X So">symbol<table class="glyphs"><tr>
 480                 <td class="X Sm">math
 481                 <td class="X Sc">currency
 482                 </table>
 483         <td class="X No">numeric
 484         <td class="X Greek">greek<table class="glyphs"><tr>
 485                 <td class="X Latin">latin
 486                 <td class="X Cyrillic">cyrillic
 487                 </table>
 488         <td class="X Aramaic">aramaic<table class="glyphs"><tr>
 489                 <td class="X Brahmic">brahmic
 490                 <td class="X Arabic">arabic
 491                 </table>
 492         <td class="X Syllabic">syllabic<table class="glyphs"><tr>
 493                 <td class="X African">african
 494                 <td class="X Hiragana">japanese
 495                 <td class="X Han">cjk
 496                 <td class="X Bopomofo">chinese
 497                 </table>
 498         <td class="X Alpha">alphabetic
 499         </table>
 500
 501         <table class="glyphs"><tr>
 502         <td class="X">unicode 7.0
 503         <td class="X Xr">proposed
 504         <td class="X Xd">deprecated
 505         <td class="">unassigned
 506         <td class="X Xi">invalid
 507 <: } :> </table>
 508 </div>
 509