# X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/92240b0d1e4434e0981b3326ca4d7ad8673dbf71..42302c75aa79bb1f14328d6b249379169f5a4fa1:/charset.plp
#
# charset.plp (excerpt), shown as it stands after the change
# 92240b0..42302c7 (blob d3ddea7 -> 2dc6964).  The excerpt was recovered from
# a flattened unified diff with hunks at new lines 26 and 94; the two lines
# that change removed (the old `use Encode qw(decode)` and the fixed
# iso-8859-1 @tables) are not repeated here.
#
# NOTE(review): string literals are reproduced exactly as found in the
# excerpt; any markup they may once have contained is not visible here.

my $diinfo = do 'digraphs.inc.pl';
my %di = map { $diinfo->{$_}->[0] => $_ }
    grep { ref $diinfo->{$_} } keys %$diinfo;

use Encode qw(decode resolve_alias);

# generate character table(s)
# (~16x faster than decoding in loop;
# substr strings is twice as fast as splitting to an array)

# Shorthand request names, each expanding to a list of charset requests.
# A request is an encoding name with optional range markers:
#   leading  "-"   start the table at byte 128 instead of 0
#   leading  "--"  start the table at byte 160
#   trailing "-"   end the table early: at 127, 159 or 191 for a start
#                  of 0, 128 or 160 respectively (otherwise it ends at 255)
my %ALIAS = (
#   default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)],
    default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)],
    0 => [qw(cp437 cp863)],
    1 => [qw(iso-8859-1 cp1252 MacRoman cp850)],
    2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)],
    5 => [qw(koi8-f iso-8859-5 cp1251 MacCyrillic cp855 cp866)],
    7 => [qw(iso-8859-7 cp1253 MacGreek cp737 cp869)],
    8 => [qw(iso-8859-8 cp1255 MacHebrew cp862)],
);

# PATH_INFO may be absent altogether; treat that like an empty path so the
# match below does not operate on an undefined value.
my $path_info = defined $ENV{PATH_INFO} ? $ENV{PATH_INFO} : '';

# One hash per requested charset:
#   offset  byte value of the first table cell (0, 128 or 160)
#   set     canonical encoding name from resolve_alias(), relabelled as
#           'Unicode BMP' / 'UTF-8' for the two special cases; undefined
#           when the encoding is unknown
#   table   decoded characters for bytes offset..endpoint; a string of
#           spaces for the Unicode overview; undef for UTF-8
my @request = map {
    if (my $input = $_) {
        my %row = (offset => 0);
        my $endpoint = 255;

        # The original chose between two start values depending on
        # $endpoint, but $endpoint is always 255 at this point, so only
        # 160 and 128 were ever used.
        if ($input =~ s/^--//) {
            $row{offset} = 160;
        }
        elsif ($input =~ s/^-//) {
            $row{offset} = 128;
        }
        if ($input =~ s/-$//) {
            $endpoint = !$row{offset}      ? 127
                      : $row{offset} < 160 ? 159
                      :                      191;
        }

        if ($row{set} = resolve_alias($input)) {
            if ($row{set} eq 'Internal') {
                $row{table} = ' ' x ($endpoint < 255 ? 640 : 4096);
                $row{set} = 'Unicode BMP';
            }
            elsif ($row{set} eq 'utf-8-strict') {
                $row{table} = undef;
                $row{set} = 'UTF-8';
            }
            else {
                $row{table} = decode($row{set}, pack 'C*', $row{offset} .. $endpoint);
            }
        }
        else {
            # $input comes straight from PATH_INFO, so neutralise markup
            # characters before echoing it back.
            # NOTE(review): assumes the page output is HTML, as is usual
            # for a .plp document -- confirm.
            (my $shown = $input) =~ s/([&<>"])/'&#' . ord($1) . ';'/ge;
            print "\n\nEncoding $shown unknown\n\n\n";
        }
        \%row;
    }
    else {
        ();
    }
} map { defined $ALIAS{$_} ? @{ $ALIAS{$_} } : $_ }
    $path_info =~ /\w/ ? split(m{[/+\s]}, $path_info) : 'default';

my $NOCHAR = chr 0xFFFD;

# Cosmetic overrides for cp437: a phi sign at byte 237 and graphical symbols
# for bytes 0..31.  Positions are byte values, so they are translated
# through the row's offset and skipped when the requested range does not
# contain them; an lvalue substr() entirely outside the string is fatal.
# Rows for unknown encodings have no {set} and are ignored.
for my $row (grep { defined $_->{set} and $_->{set} eq 'cp437' } @request) {
    my $first = $row->{offset};
    my $last  = $first + length($row->{table}) - 1;

    if ($first <= 237 and 237 <= $last) {
        substr($row->{table}, 237 - $first, 1) = pack 'U*', 0x3D5;  # phi sign
    }
    if ($first == 0 and $last >= 31) {
        substr($row->{table}, 0, 32) = pack 'U*', map {hex} qw(
            2007 263A 263B 2665 2666 2663 2660 2022 25D8 25CB 25D9 2642 2640 266A 266B 263C
            25BA 25C4 2195 203C 00B6 00A7 25AC 21A8 2191 2193 2192 2190 221F 2194 25B2 25BC
        );
    }
}

# NOTE(review): only part of quote() is visible in this excerpt; statements
# between the substitution and the return are cut off by a hunk boundary.
# The visible statements are reproduced unchanged.
sub quote {
    local $_ = shift;
    s/"/"/g;
    # ... (statements not visible in this excerpt) ...
    return $_;
}

# Labels printed by printcell_unicode(), keyed by cell value.
# NOTE(review): the keys look like block start code points divided by 16
# (e.g. 0x4E0 for 'cjk unified ideographs') -- confirm against the caller.
# Filled in a BEGIN block so the table is ready whenever the sub is called.
my %UNICODE_BLOCK_LABEL;
BEGIN {
    %UNICODE_BLOCK_LABEL = (
        0     => 'control',
        2     => 'comn',
        4     => 'basic latin',
        8     => 'control',
        10    => 'comn',
        12    => 'latin1',
        0x10  => 'latin extended-A',
        0x18  => 'latin extended-B',
        0x20  => 'latin ext-B',
        0x25  => 'IPA',
        0x2B  => 'spacing modifier',
        0x30  => 'diacritics',
        0x38  => 'greek',
        0x40  => 'cyrillic',
        0x50  => 'cyrillic+',
        0x53  => 'armenian',
        0x58  => 'hebrew',
        0x60  => 'arabic',
        0x70  => 'syriac',
        0x75  => 'arabic+',
        0x78  => 'thaana',
        0x7C  => 'n\'ko',
        0x80  => 'samaritan',
        0x84  => 'manda',
        0x86  => 'reserved',
        0x90  => 'devanagari',
        0x98  => 'bengali',
        0xA0  => 'gurmukhi',
        0xA8  => 'gujarati',
        0xB0  => 'oriya',
        0xB8  => 'tamil',
        0xC0  => 'telugu',
        0xC8  => 'kannada',
        0xD0  => 'malayalam',
        0xD8  => 'sinhala',
        0xE0  => 'thai',
        0xE8  => 'lao',
        0xF0  => 'tibetan',
        0x100 => 'myanmar',
        0x10A => 'georgian',
        0x110 => 'hangeul jamo',
        0x120 => 'ethiopic',
        0x130 => 'ethiopic',
        0x138 => 'eth+',
        0x13A => 'cherokee',
        0x140 => 'unified canadian aboriginal syllabics',
        0x160 => 'unified canadian syllabics',
        0x168 => 'ogham',
        0x16A => 'runic',
        0x170 => 'tagalog',
        0x172 => 'hanun',
        0x174 => 'buhid',
        0x176 => 'tagb',
        0x178 => 'khmer',
        0x180 => 'mongolian',
        0x18B => 'canadian+',
        0x190 => 'limbu',
        0x195 => 'tai le',
        0x198 => 'new tai lue',
        0x19E => 'km',
        0x1A0 => 'lontara',
        0x1A2 => 'tai tham',
        0x1AB => 'reserved',
        0x1B0 => 'balinese',
        0x1B8 => 'sundanese',
        0x1BC => 'batak',
        0x1C0 => 'lepcha',
        0x1C5 => 'ol chiki',
        0x1C8 => 'reserved',
        0x1CD => 'vedic',
        0x1D0 => 'phonetic',
        0x1D8 => 'phonetic+',
        0x1DC => 'combining',
        0x1E0 => 'latin extended additional',
        0x1F0 => 'greek+',
        0x200 => 'general punctuation',
        0x207 => 'suþscript',  # suth now means "sub and/or sup"
        0x20A => 'currency',
        0x20D => 'overlay',
        0x210 => 'letterlike',
        0x215 => 'number',
        0x219 => 'arrows',
        0x220 => 'mathematical symbols',
        0x230 => 'miscellaneous technical',
        0x240 => 'control',
        0x244 => 'OCR',
        0x246 => 'enclosed alphanumerics',
        0x250 => 'box drawing',
        0x258 => 'blocks',
        0x25A => 'geometric shapes',
        0x260 => 'miscellaneous symbols',
        0x270 => 'dingbats',
        0x27C => 'maths-A',
        0x27F => 'arr',
        0x280 => 'braille',
        0x290 => 'supplemental arrows-B',
        0x298 => 'mathematical symbols-B',
        0x2A0 => 'supplemental mathematical operators',
        0x2B0 => 'miscellaneous symbols and arrows',
        0x2C0 => 'glagolitic',
        0x2C6 => 'latin-C',
        0x2C8 => 'coptic',
        0x2D0 => 'georgian+',
        0x2D3 => 'tifinagh',  #TODO: proto-canaanite
        0x2D8 => 'ethiopic+',
        0x2DE => 'cyrl-A',
        0x2E0 => 'punctuation+',
        0x2E8 => 'cjk radicals',
        0x2F0 => 'kangxi radicals',
        0x2FE => '',
        0x2FF => 'idc',
        0x300 => 'cjk misc',
        0x304 => 'hiragana',
        0x30A => 'katakana',
        0x310 => 'bopomofo',
        0x313 => 'hangeul compat',
        0x319 => 'kbn',
        0x31A => 'bpmf',
        0x31C => 'strokes',
        0x31F => 'k+',
        0x320 => 'enclosed cjk characters',
        0x330 => 'cjk compatibility',
        0x340 => 'cjk unified ideographs extension A',
        0x4D0 => 'cjk unified ideographs extension A',
        0x4DC => 'hexagrams',
        0x4E0 => 'cjk unified ideographs',
        0xA00 => 'yi',
        0xA40 => 'yi',
        0xA49 => 'yi radicals',
        0xA4D => 'lisu',
        0xA50 => 'vai',
        0xA60 => 'vai',
        0xA64 => 'cyrillic extended-B',
        0xA6A => 'bamum',
        0xA70 => 'tones',
        0xA72 => 'latin extended-D',
        0xA80 => 'sylheti',
        0xA83 => 'in',
        0xA84 => 'phags-pa',
        0xA88 => 'saurashtra',
        0xA8E => 'deva+',
        0xA90 => 'kayah li',
        0xA93 => 'rejang',
        0xA96 => 'jamo-A',
        0xA98 => 'javanese',
        0xA9E => 'res',
        0xAA0 => 'cham',
        0xAA6 => 'mym-A',
        0xAA8 => 'tai viet',
        0xAAE => 'mtei+',
        0xAB0 => 'reserved',
        0xABC => 'manipuri',
        0xAC0 => 'hangeul syllables',
        0xD70 => 'hangeul syllables',
        0xD7B => 'hangeul jamo-B',  # was misspelled 'haungeul'
        0xD80 => 'high surrogates',
        0xDC0 => 'low surrogates',
        0xE00 => 'private use',
        0xF90 => 'cjk compatibility ideographs',
        0xFB0 => 'presentation',
        0xFB5 => '',
        0xFC0 => 'arabic presentation forms A',
        0xFD0 => '',
        0xFDD => '?',
        0xFDF => '',
        0xFE0 => 'var',
        0xFE1 => 'ver',
        0xFE2 => '½',
        0xFE3 => 'comp',
        0xFE5 => 'small',
        0xFE7 => 'arabic presentation B',
        0xFF0 => 'halfwidth & fullwidth forms',
        0xFFF => 'sp',
    );
}

# Print the label for one cell of the Unicode overview table.
#   $value  numeric cell value; values above 0xFFF print a newline and '?',
#           values listed in %UNICODE_BLOCK_LABEL print their label, and
#           anything else prints nothing.
# Returns nothing; the earlier if/elsif chain had no deliberate return value.
sub printcell_unicode {
    my ($value) = @_;

    if ($value > 0xFFF) {
        print "\n".'?';
        return;
    }

    # "+ 0" keeps the numeric comparison semantics of the former == tests.
    my $label = $UNICODE_BLOCK_LABEL{ $value + 0 };
    print $label if defined $label;
    return;
}

# Labels printed by printcell_utf8(), keyed by the byte value at which each
# label is emitted.  The trailing comments give the byte range the original
# range tests associated with each label.
my %UTF8_BYTE_LABEL;
BEGIN {
    %UTF8_BYTE_LABEL = (
        0x00 => 'single byte ASCII',        # 0x00..0x7F
        0x80 => 'multi-byte continuation',  # 0x80..0xBF
        0xC0 => '(overl.)',                 # 0xC0..0xC1
        0xC2 => '2-byte sequence start',    # 0xC2..0xDF
        0xD0 => '',                         # (within 0xC2..0xDF)
        0xE0 => '3-byte sequence start',    # 0xE0..0xEF
        0xF0 => '4-byte sequence',          # 0xF0..0xF4
        0xF5 => '(overflow)',               # 0xF5..0xF7
        0xF8 => '5-byte',                   # 0xF8..0xFB
        0xFC => '6-byte',                   # 0xFC..0xFD
        0xFE => 'invalid',                  # 0xFE..0xFF
    );
}

# Print the label for one cell of the UTF-8 byte table.
#   $value  numeric byte value; values above 0xFF print a newline and '?',
#           values listed in %UTF8_BYTE_LABEL print their label, and
#           anything else prints nothing.
# Returns nothing.
sub printcell_utf8 {
    my ($value) = @_;

    if ($value > 0xFF) {
        print "\n".'?';
        return;
    }

    my $label = $UTF8_BYTE_LABEL{ $value + 0 };
    print $label if defined $label;
    return;
}

# The statement below continues beyond the end of this excerpt and is kept
# exactly as found.
print "