# Origin: commit bb412b576d883e11a4ec09d01c34a933d894af6c by Mischa POSLAWSKY
# (2009-04-20), "charset: move unicode specifications to separate includes".
#
# Both include files below are loaded by charset.plp with `do FILE`, and the
# value of the file's last statement (a code reference) is stored in
# $row{cell}:
#   set 'Internal'     (shown as 'Unicode BMP') -> do 'charset-unicode.inc.pl'
#   set 'utf-8-strict' (shown as 'UTF-8')       -> do 'charset-utf8.inc.pl'

# ---------------------------------------------------------------------------
# charset-unicode.inc.pl
# ---------------------------------------------------------------------------
use strict;
use warnings;

# Label shown at the table cell where a Unicode block starts.
# NOTE(review): keys look like the code point divided by 16 (e.g. 0x040 for
# U+0400 cyrillic, 0xAC0 for U+AC00 hangeul syllables) -- confirm against the
# caller in charset.plp.
# Empty-string labels are deliberate entries: they are defined, so the lookup
# returns them instead of "no label".
my %uniblock = (
    0x000 => 'control',
    0x002 => 'comn',
    0x004 => 'basic latin',
    0x008 => 'control',
    0x00A => 'comn',
    0x00C => 'latin1',
    0x010 => 'latin extended-A',
    0x018 => 'latin extended-B',
    0x020 => 'latin ext-B',
    0x025 => 'IPA',
    0x02B => 'spacing modifier',
    0x030 => 'diacritics',
    0x038 => 'greek',
    0x040 => 'cyrillic',
    0x050 => 'cyrillic+',
    0x053 => 'armenian',
    0x058 => 'hebrew',
    0x060 => 'arabic',
    0x070 => 'syriac',
    0x075 => 'arabic+',
    0x078 => 'thaana',
    0x07C => 'n\'ko',
    0x080 => 'samaritan',
    0x084 => 'manda',
    0x086 => 'reserved',
    0x090 => 'devanagari',
    0x098 => 'bengali',
    0x0A0 => 'gurmukhi',
    0x0A8 => 'gujarati',
    0x0B0 => 'oriya',
    0x0B8 => 'tamil',
    0x0C0 => 'telugu',
    0x0C8 => 'kannada',
    0x0D0 => 'malayalam',
    0x0D8 => 'sinhala',
    0x0E0 => 'thai',
    0x0E8 => 'lao',
    0x0F0 => 'tibetan',
    0x100 => 'myanmar',
    0x10A => 'georgian',
    0x110 => 'hangeul jamo',
    0x120 => 'ethiopic',
    0x130 => 'ethiopic',
    0x138 => 'eth+',
    0x13A => 'cherokee',
    0x140 => 'unified canadian aboriginal syllabics',
    0x160 => 'unified canadian syllabics',
    0x168 => 'ogham',
    0x16A => 'runic',
    0x170 => 'tagalog',
    0x172 => 'hanun',
    0x174 => 'buhid',
    0x176 => 'tagb',
    0x178 => 'khmer',
    0x180 => 'mongolian',
    0x18B => 'canadian+',
    0x190 => 'limbu',
    0x195 => 'tai le',
    0x198 => 'new tai lue',
    0x19E => 'km',
    0x1A0 => 'lontara',
    0x1A2 => 'tai tham',
    0x1AB => 'reserved',
    0x1B0 => 'balinese',
    0x1B8 => 'sundanese',
    0x1BC => 'batak',
    0x1C0 => 'lepcha',
    0x1C5 => 'ol chiki',
    0x1C8 => 'reserved',
    0x1CD => 'vedic',
    0x1D0 => 'phonetic',
    0x1D8 => 'phonetic+',
    0x1DC => 'combining',
    0x1E0 => 'latin extended additional',
    0x1F0 => 'greek+',
    0x200 => 'general punctuation',
    0x207 => 'suþscript', # suth now means "sub and/or sup"
    0x20A => 'currency',
    0x20D => 'overlay',
    0x210 => 'letterlike',
    0x215 => 'number',
    0x219 => 'arrows',
    0x220 => 'mathematical symbols',
    0x230 => 'miscellaneous technical',
    0x240 => 'control',
    0x244 => 'OCR',
    0x246 => 'enclosed alphanumerics',
    0x250 => 'box drawing',
    0x258 => 'blocks',
    0x25A => 'geometric shapes',
    0x260 => 'miscellaneous symbols',
    0x270 => 'dingbats',
    0x27C => 'maths-A',
    0x27F => 'arr',
    0x280 => 'braille',
    0x290 => 'supplemental arrows-B',
    0x298 => 'mathematical symbols-B',
    0x2A0 => 'supplemental mathematical operators',
    0x2B0 => 'miscellaneous symbols and arrows',
    0x2C0 => 'glagolitic',
    0x2C6 => 'latin-C',
    0x2C8 => 'coptic',
    0x2D0 => 'georgian+',
    0x2D3 => 'tifinagh', #TODO: proto-canaanite
    0x2D8 => 'ethiopic+',
    0x2DE => 'cyrl-A',
    0x2E0 => 'punctuation+',
    0x2E8 => 'cjk radicals',
    0x2F0 => 'kangxi radicals',
    0x2FE => '',
    0x2FF => 'idc',
    0x300 => 'cjk misc',
    0x304 => 'hiragana',
    0x30A => 'katakana',
    0x310 => 'bopomofo',
    0x313 => 'hangeul compat',
    0x319 => 'kbn',
    0x31A => 'bpmf',
    0x31C => 'strokes',
    0x31F => 'k+',
    0x320 => 'enclosed cjk characters',
    0x330 => 'cjk compatibility',
    0x340 => 'cjk unified ideographs extension A',
    0x4D0 => 'cjk unified ideographs extension A',
    0x4DC => 'hexagrams',
    0x4E0 => 'cjk unified ideographs',
    0xA00 => 'yi',
    0xA40 => 'yi',
    0xA49 => 'yi radicals',
    0xA4D => 'lisu',
    0xA50 => 'vai',
    0xA60 => 'vai',
    0xA64 => 'cyrillic extended-B',
    0xA6A => 'bamum',
    0xA70 => 'tones',
    0xA72 => 'latin extended-D',
    0xA80 => 'sylheti',
    0xA83 => 'in',
    0xA84 => 'phags-pa',
    0xA88 => 'saurashtra',
    0xA8E => 'deva+',
    0xA90 => 'kayah li',
    0xA93 => 'rejang',
    0xA96 => 'jamo-A',
    0xA98 => 'javanese',
    0xA9E => 'res',
    0xAA0 => 'cham',
    0xAA6 => 'mym-A',
    0xAA8 => 'tai viet',
    0xAAE => 'mtei+',
    0xAB0 => 'reserved',
    0xABC => 'manipuri',
    0xAC0 => 'hangeul syllables',
    0xD70 => 'hangeul syllables',
    0xD7B => 'hangeul jamo-B', # was misspelled 'haungeul'
    0xD80 => 'high surrogates',
    0xDC0 => 'low surrogates',
    0xE00 => 'private use',
    0xF90 => 'cjk compatibility ideographs',
    0xFB0 => 'presentation',
    0xFB5 => '',
    0xFC0 => 'arabic presentation forms A',
    0xFD0 => '',
    0xFDD => '?',
    0xFDF => '',
    0xFE0 => 'var',
    0xFE1 => 'ver',
    0xFE2 => '½',
    0xFE3 => 'comp',
    0xFE5 => 'small',
    0xFE7 => 'arabic presentation B',
    0xFF0 => 'halfwidth & fullwidth forms',
    0xFFF => 'sp',
);

# Value of this file: a lookup callback.
#   $_[0]   numeric cell index
#   returns the label starting at that cell, or the empty list (undef in
#           scalar context) when no label starts there.
sub {
    my ($cell) = @_;
    return unless defined $cell;    # no index, no label (and no warning)

    my $label = $uniblock{$cell};
    return $label if defined $label;
    return;
};

# ---------------------------------------------------------------------------
# charset-utf8.inc.pl
# ---------------------------------------------------------------------------
use strict;
use warnings;

# Label shown at the byte value where a class of UTF-8 lead/continuation
# bytes starts. The empty label at 0xD0 is a deliberate (defined) entry.
my %utf8byte = (
    0x00 => 'single byte ASCII',
    0x80 => 'multi-byte continuation',
    0xC0 => '(overl.)',
    0xC2 => '2-byte sequence start',
    0xD0 => '',
    0xE0 => '3-byte sequence start',
    0xF0 => '4-byte sequence',
    0xF5 => '(overflow)',
    0xF8 => '5-byte',
    0xFC => '6-byte',
    0xFE => 'invalid',
);

# Value of this file: a lookup callback.
#   $_[0]   numeric byte value
#   returns the label starting at that byte, or the empty list (undef in
#           scalar context) when no label starts there.
sub {
    my ($byte) = @_;
    return unless defined $byte;    # no index, no label (and no warning)

    my $label = $utf8byte{$byte};
    return $label if defined $label;
    return;
};
# The label tables are private to the two printcell_* subs. Everything sits in
# a BEGIN block so the tables are filled at compile time and are ready no
# matter where in the file the subs are first called.
BEGIN {
    # Exact cell value => label printed by printcell_unicode().
    my %unicode_label_for = (
        0x000 => 'control',
        0x002 => 'comn',
        0x004 => 'basic latin',
        0x008 => 'control',
        0x00A => 'comn',
        0x00C => 'latin1',
        0x010 => 'latin extended-A',
        0x018 => 'latin extended-B',
        0x020 => 'latin ext-B',
        0x025 => 'IPA',
        0x02B => 'spacing modifier',
        0x030 => 'diacritics',
        0x038 => 'greek',
        0x040 => 'cyrillic',
        0x050 => 'cyrillic+',
        0x053 => 'armenian',
        0x058 => 'hebrew',
        0x060 => 'arabic',
        0x070 => 'syriac',
        0x075 => 'arabic+',
        0x078 => 'thaana',
        0x07C => 'n\'ko',
        0x080 => 'samaritan',
        0x084 => 'manda',
        0x086 => 'reserved',
        0x090 => 'devanagari',
        0x098 => 'bengali',
        0x0A0 => 'gurmukhi',
        0x0A8 => 'gujarati',
        0x0B0 => 'oriya',
        0x0B8 => 'tamil',
        0x0C0 => 'telugu',
        0x0C8 => 'kannada',
        0x0D0 => 'malayalam',
        0x0D8 => 'sinhala',
        0x0E0 => 'thai',
        0x0E8 => 'lao',
        0x0F0 => 'tibetan',
        0x100 => 'myanmar',
        0x10A => 'georgian',
        0x110 => 'hangeul jamo',
        0x120 => 'ethiopic',
        0x130 => 'ethiopic',
        0x138 => 'eth+',
        0x13A => 'cherokee',
        0x140 => 'unified canadian aboriginal syllabics',
        0x160 => 'unified canadian syllabics',
        0x168 => 'ogham',
        0x16A => 'runic',
        0x170 => 'tagalog',
        0x172 => 'hanun',
        0x174 => 'buhid',
        0x176 => 'tagb',
        0x178 => 'khmer',
        0x180 => 'mongolian',
        0x18B => 'canadian+',
        0x190 => 'limbu',
        0x195 => 'tai le',
        0x198 => 'new tai lue',
        0x19E => 'km',
        0x1A0 => 'lontara',
        0x1A2 => 'tai tham',
        0x1AB => 'reserved',
        0x1B0 => 'balinese',
        0x1B8 => 'sundanese',
        0x1BC => 'batak',
        0x1C0 => 'lepcha',
        0x1C5 => 'ol chiki',
        0x1C8 => 'reserved',
        0x1CD => 'vedic',
        0x1D0 => 'phonetic',
        0x1D8 => 'phonetic+',
        0x1DC => 'combining',
        0x1E0 => 'latin extended additional',
        0x1F0 => 'greek+',
        0x200 => 'general punctuation',
        0x207 => 'suþscript', # suth now means "sub and/or sup"
        0x20A => 'currency',
        0x20D => 'overlay',
        0x210 => 'letterlike',
        0x215 => 'number',
        0x219 => 'arrows',
        0x220 => 'mathematical symbols',
        0x230 => 'miscellaneous technical',
        0x240 => 'control',
        0x244 => 'OCR',
        0x246 => 'enclosed alphanumerics',
        0x250 => 'box drawing',
        0x258 => 'blocks',
        0x25A => 'geometric shapes',
        0x260 => 'miscellaneous symbols',
        0x270 => 'dingbats',
        0x27C => 'maths-A',
        0x27F => 'arr',
        0x280 => 'braille',
        0x290 => 'supplemental arrows-B',
        0x298 => 'mathematical symbols-B',
        0x2A0 => 'supplemental mathematical operators',
        0x2B0 => 'miscellaneous symbols and arrows',
        0x2C0 => 'glagolitic',
        0x2C6 => 'latin-C',
        0x2C8 => 'coptic',
        0x2D0 => 'georgian+',
        0x2D3 => 'tifinagh', #TODO: proto-canaanite
        0x2D8 => 'ethiopic+',
        0x2DE => 'cyrl-A',
        0x2E0 => 'punctuation+',
        0x2E8 => 'cjk radicals',
        0x2F0 => 'kangxi radicals',
        0x2FE => '',
        0x2FF => 'idc',
        0x300 => 'cjk misc',
        0x304 => 'hiragana',
        0x30A => 'katakana',
        0x310 => 'bopomofo',
        0x313 => 'hangeul compat',
        0x319 => 'kbn',
        0x31A => 'bpmf',
        0x31C => 'strokes',
        0x31F => 'k+',
        0x320 => 'enclosed cjk characters',
        0x330 => 'cjk compatibility',
        0x340 => 'cjk unified ideographs extension A',
        0x4D0 => 'cjk unified ideographs extension A',
        0x4DC => 'hexagrams',
        0x4E0 => 'cjk unified ideographs',
        0xA00 => 'yi',
        0xA40 => 'yi',
        0xA49 => 'yi radicals',
        0xA4D => 'lisu',
        0xA50 => 'vai',
        0xA60 => 'vai',
        0xA64 => 'cyrillic extended-B',
        0xA6A => 'bamum',
        0xA70 => 'tones',
        0xA72 => 'latin extended-D',
        0xA80 => 'sylheti',
        0xA83 => 'in',
        0xA84 => 'phags-pa',
        0xA88 => 'saurashtra',
        0xA8E => 'deva+',
        0xA90 => 'kayah li',
        0xA93 => 'rejang',
        0xA96 => 'jamo-A',
        0xA98 => 'javanese',
        0xA9E => 'res',
        0xAA0 => 'cham',
        0xAA6 => 'mym-A',
        0xAA8 => 'tai viet',
        0xAAE => 'mtei+',
        0xAB0 => 'reserved',
        0xABC => 'manipuri',
        0xAC0 => 'hangeul syllables',
        0xD70 => 'hangeul syllables',
        0xD7B => 'hangeul jamo-B', # was misspelled 'haungeul'
        0xD80 => 'high surrogates',
        0xDC0 => 'low surrogates',
        0xE00 => 'private use',
        0xF90 => 'cjk compatibility ideographs',
        0xFB0 => 'presentation',
        0xFB5 => '',
        0xFC0 => 'arabic presentation forms A',
        0xFD0 => '',
        0xFDD => '?',
        0xFDF => '',
        0xFE0 => 'var',
        0xFE1 => 'ver',
        0xFE2 => '½',
        0xFE3 => 'comp',
        0xFE5 => 'small',
        0xFE7 => 'arabic presentation B',
        0xFF0 => 'halfwidth & fullwidth forms',
        0xFFF => 'sp',
    );

    # Exact byte value => label printed by printcell_utf8().
    my %utf8_label_for = (
        0x00 => 'single byte ASCII',
        0x80 => 'multi-byte continuation',
        0xC0 => '(overl.)',
        0xC2 => '2-byte sequence start',
        0xD0 => '',
        0xE0 => '3-byte sequence start',
        0xF0 => '4-byte sequence',
        0xF5 => '(overflow)',
        0xF8 => '5-byte',
        0xFC => '6-byte',
        0xFE => 'invalid',
    );

    # Print the label that starts at Unicode table cell $value.
    #   $value  numeric cell index
    # Prints the label on an exact match, a newline plus '?' when $value is
    # above 0xFFF, and nothing otherwise. Output goes to the currently
    # selected filehandle.
    sub printcell_unicode {
        my ($value) = @_;
        if ($value > 0xFFF) {
            print "\n".'?';
        }
        # "+ 0" numifies the key so the lookup matches like the former
        # numeric == comparisons did (e.g. "012" still finds cell 12).
        elsif (defined(my $label = $unicode_label_for{ $value + 0 })) {
            print $label;
        }
    }

    # Print the label that starts at UTF-8 byte value $value.
    #   $value  numeric byte value
    # Prints the label on an exact match, a newline plus '?' when $value is
    # above 0xFF, and nothing otherwise. Output goes to the currently
    # selected filehandle.
    sub printcell_utf8 {
        my ($value) = @_;
        if ($value > 0xFF) {
            print "\n".'?';
        }
        # Numified for the same reason as in printcell_unicode().
        elsif (defined(my $label = $utf8_label_for{ $value + 0 })) {
            print $label;
        }
    }
}