X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/138b47f5f5ac5364dbfc15075e21b0990ab919d9..5ff4b1c1b790314cd654d32ca6d72368d1748283:/charset.plp diff --git a/charset.plp b/charset.plp index 777e0ee..501e26c 100644 --- a/charset.plp +++ b/charset.plp @@ -4,7 +4,7 @@ use strict; use warnings; use open IO => ':utf8'; -our $VERSION = '1.0'; +our $VERSION = 'v1.0'; $header{content_type} = 'text/html; charset=utf-8'; @@ -13,8 +13,8 @@ $header{content_type} = 'text/html; charset=utf-8'; + charset cheat sheet - @@ -32,7 +32,7 @@ use Encode qw(decode resolve_alias); # substr strings is twice as fast as splitting to an array) my %ALIAS = ( # default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)], - default => [qw(unicode utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)], + default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)], 0 => [qw(cp437 cp863)], 1 => [qw(iso-8859-1 cp1252 MacRoman cp850)], 2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)], @@ -53,14 +53,29 @@ my @request = map { if ($input =~ s/-$//) { $endpoint = $row{offset} ? $row{offset} < 160 ? 159 : 191 : 127; } - if ($row{set} = resolve_alias($input)) { + + if ($input =~ /^U([0-9a-f]+)(?:-([0-9a-f]+))?/) { + my $start = hex($1) << ($2 ? 4 : 8); + my $end = $2 ? hex($2) << 4 : $start + 240; + $row{table} = join '', map { chr } $start .. $end+15; + utf8::upgrade($row{table}); # prevent latin1 output + $row{set} = sprintf 'Unicode block U+%02Xxx', $start >> 8; + } + elsif ($input eq 'U') { + $row{table} = ' ' x 512; + $row{set} = 'Unicode planes'; + $row{cell} = do 'charset-ucplanes.inc.pl'; + } + elsif ($row{set} = resolve_alias($input)) { if ($row{set} eq 'Internal') { - $row{table} = ' 'x640; + $row{table} = ' ' x ($endpoint < 255 ? 640 : 4096); $row{set} = 'Unicode BMP'; + $row{cell} = do 'charset-unicode.inc.pl'; } elsif ($row{set} eq 'utf-8-strict') { $row{table} = undef; $row{set} = 'UTF-8'; + $row{cell} = do 'charset-utf8.inc.pl'; } else { $row{table} = decode($row{set}, pack 'C*', $row{offset} .. $endpoint); @@ -94,355 +109,13 @@ sub quote { return $_; } -sub printcell_unicode { - my ($value) = @_; - if ($value > 0x27F) { - print "\n".'?'; - } - elsif ($value == 0) { - print 'control'; - } - elsif ($value == 2) { - print 'latin'; - } - elsif ($value == 8) { - print 'control'; - } - elsif ($value == 10) { - print 'latin supplement'; - } - elsif ($value == 0x10) { - print 'latin ext-A'; - } - elsif ($value == 0x18) { - print 'latin ext-B'; - } - elsif ($value == 0x20) { - print 'latin ext-B'; - } - elsif ($value == 0x25) { - print 'IPA'; - } - elsif ($value == 0x2B) { - print 'spacing modifier'; - } - elsif ($value == 0x30) { - print 'diacritics'; - } - elsif ($value == 0x38) { - print 'greek'; - } - elsif ($value == 0x40) { - print 'cyrillic'; - } - elsif ($value == 0x50) { - print 'cyrillic+'; - } - elsif ($value == 0x53) { - print 'armenian'; - } - elsif ($value == 0x58) { - print 'hebrew'; - } - elsif ($value == 0x60) { - print 'arabic'; - } - elsif ($value == 0x70) { - print 'syriac'; - } - elsif ($value == 0x75) { - print 'arabic+'; - } - elsif ($value == 0x78) { - print 'thaana'; - } - elsif ($value == 0x7C) { - print 'nko'; - } - elsif ($value == 0x80) { - print 'samaritan'; - } - elsif ($value == 0x84) { - print 'manda'; - } - elsif ($value == 0x86) { - print 'reserved'; - } - elsif ($value == 0x90) { - print 'devanagari'; - } - elsif ($value == 0x98) { - print 'bengali'; - } - elsif ($value == 0xA0) { - print 'gurmukhi'; - } - elsif ($value == 0xA8) { - print 'gujarati'; - } - elsif ($value == 0xB0) { - print 'oriya'; - } - elsif ($value == 0xB8) { - print 'tamil'; - } - elsif ($value == 0xC0) { - print 'telugu'; - } - elsif ($value == 0xC8) { - print 'kannada'; - } - elsif ($value == 0xD0) { - print 'malayalam'; - } - elsif ($value == 0xD8) { - print 'sinhala'; - } - elsif ($value == 0xE0) { - print 'thai'; - } - elsif ($value == 0xE8) { - print 'lao'; - } - elsif ($value == 0xF0) { - print 'tibetan'; - } - elsif ($value == 0x100) { - print 'myanmar'; - } - elsif ($value == 0x10A) { - print 'georgian'; - } - elsif ($value == 0x110) { - print 'hangeul jamo'; - } - elsif ($value == 0x120) { - print 'ethiopic'; - } - elsif ($value == 0x130) { - print 'ethiopic'; - } - elsif ($value == 0x138) { - print 'eth+'; - } - elsif ($value == 0x13A) { - print 'cherokee'; - } - elsif ($value == 0x140) { - print 'unified canadian aboriginal syllabics'; - } - elsif ($value == 0x160) { - print 'unified canadian syllabics'; - } - elsif ($value == 0x168) { - print 'ogham'; - } - elsif ($value == 0x16A) { - print 'runic'; - } - elsif ($value == 0x170) { - print 'tagalog'; - } - elsif ($value == 0x172) { - print 'hanun'; - } - elsif ($value == 0x174) { - print 'buhid'; - } - elsif ($value == 0x176) { - print 'tagb'; - } - elsif ($value == 0x178) { - print 'khmer'; - } - elsif ($value == 0x180) { - print 'mongolian'; - } - elsif ($value == 0x18B) { - print 'canadian+'; - } - elsif ($value == 0x190) { - print 'limbu'; - } - elsif ($value == 0x195) { - print 'tai le'; - } - elsif ($value == 0x198) { - print 'new tai lue'; - } - elsif ($value == 0x19E) { - print 'km'; - } - elsif ($value == 0x1A0) { - print 'lontara'; - } - elsif ($value == 0x1A2) { - print 'tai tham'; - } - elsif ($value == 0x1AB) { - print 'reserved'; - } - elsif ($value == 0x1B0) { - print 'balinese'; - } - elsif ($value == 0x1B8) { - print 'sundanese'; - } - elsif ($value == 0x1BC) { - print 'batak'; - } - elsif ($value == 0x1C0) { - print 'lepcha'; - } - elsif ($value == 0x1C5) { - print 'ol chiki'; - } - elsif ($value == 0x1C8) { - print 'reserved'; - } - elsif ($value == 0x1CD) { - print 'vedic'; - } - elsif ($value == 0x1D0) { - print 'phonetic'; - } - elsif ($value == 0x1D8) { - print 'phonetic+'; - } - elsif ($value == 0x1DC) { - print 'combining'; - } - elsif ($value == 0x1E0) { - print 'latin extended additional'; - } - elsif ($value == 0x1F0) { - print 'greek+'; - } - elsif ($value == 0x200) { - print 'general punctuation'; - } - elsif ($value == 0x207) { - print 'su[bp]script'; - } - elsif ($value == 0x20A) { - print 'currency'; - } - elsif ($value == 0x20D) { - print 'overlay'; - } - elsif ($value == 0x210) { - print 'letterlike'; - } - elsif ($value == 0x215) { - print 'number'; - } - elsif ($value == 0x219) { - print 'arrows'; - } - elsif ($value == 0x220) { - print 'mathematical symbols'; - } - elsif ($value == 0x230) { - print 'miscellaneous technical'; - } - elsif ($value == 0x240) { - print 'control'; - } - elsif ($value == 0x244) { - print 'OCR'; - } - elsif ($value == 0x246) { - print 'enclosed alphanumerics'; - } - elsif ($value == 0x250) { - print 'box drawing'; - } - elsif ($value == 0x258) { - print 'blocks'; - } - elsif ($value == 0x25A) { - print 'geometric shapes'; - } - elsif ($value == 0x260) { - print 'miscellaneous symbols'; - } - elsif ($value == 0x270) { - print 'dingbats'; - } - elsif ($value == 0x27C) { - print 'maths-A'; - } - elsif ($value == 0x27F) { - print 'arr'; - } -} - -sub printcell_utf8 { - my ($value) = @_; - if ($value <= 0x7F) { - print 'Single byte ASCII' - if $value == 0; - } - elsif ($value <= 0xBF) { - print 'Multi-byte continuation' - if $value == 0x80; - } - elsif ($value <= 0xC1) { - print '(Overl.)' - if $value == 0xC0; - } - elsif ($value <= 0xDF) { - print '2-byte sequence start' - if $value == 0xC2; - print '' - if $value == 0xD0; - } - elsif ($value <= 0xEF) { - print '3-byte sequence start' - if $value == 0xE0; - } - elsif ($value <= 0xF4) { - print '4-byte sequence' - if $value == 0xF0; - } - elsif ($value <= 0xF7) { - print '(Overflow)' - if $value == 0xF5; - } - elsif ($value <= 0xFB) { - print '5-byte' - if $value == 0xF8; - } - elsif ($value <= 0xFD) { - print '6-byte' - if $value == 0xFC; - } - elsif ($value <= 0xFF) { - print 'Invalid' - if $value == 0xFE; - } - else { - print "\n".'?'; - } -} - print "