From 98699865db67c9772d2a56571e8ddcaee5a5e7bd Mon Sep 17 00:00:00 2001 From: Mischa POSLAWSKY Date: Mon, 20 Apr 2009 02:27:35 +0000 Subject: [PATCH] charset: complete unicode BMP descriptions --- base.css | 36 +++-- charset.plp | 403 +++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 346 insertions(+), 93 deletions(-) diff --git a/base.css b/base.css index b535be3..30b3846 100644 --- a/base.css +++ b/base.css @@ -245,23 +245,21 @@ table.glyphs.dimap { /* character properties */ td.X {background: #FFF} /* unidentified */ -td.Xr {background: #EEE} /* reverse */ -td.Xa {color: #0A0} /* ascii */ -td.Xl {color: #070} /* latin1 */ - .Xz {color: #D00} /* proposed */ +#digraphs td.Xa {color: #0A0} /* ascii */ +#digraphs td.Xl {color: #070} /* latin1 */ +#digraphs .Xz {color: #D00} /* proposed */ -td.Lm, td.Mc, td.Me, td.Zl, td.Zp, td.Cs {background: #F00} /* unstyled */ +td.Lm, td.Mc, td.Me, td.Zl, td.Zp {background: #F00} /* unstyled */ /* letter scripts */ td.Armenian, -td.Greek {background: #FFE0CF} +td.Greek {background: #FFE8CF} td.Cyrillic {background: #FFDDA8} td.Latin {background: #FFB} td.Aramaic, td.Hebrew {background: #FFD} td.Arabic {background: #EFE} td.African {background: #DED} -td.XXXXXX {background: #ACB} /* same as space */ td.Brahmic {background: #FBB} /* same as number */ td.Khmer {background: #FBA} td.Hangul, @@ -269,9 +267,11 @@ td.Syllabic {background: #DEA} td.Katakana {background: #DFA} td.Hiragana {background: #DFC} td.Bopomofo {background: #BFC} +td.Han {background: #CFD} +td.Alpha {background: #ADA} /* other scripts */ /* other categories */ -td.Nd, td.Nl, td.No {background: #FBB} /* number */ +td.Nd, td.Nl, td.No {background: #FDD} /* number */ td.Sc {background: #FCD} /* currency */ td.Sm {background: #ECE} /* math */ td.So {background: #DCF} /* symbol */ @@ -283,6 +283,10 @@ td.Cc {color: #666; background: #BBB} /* control */ td.Zs {background: #ACB} /* space */ td.Zs span {background: #EEE} td.Co {background: #A99} /* private */ +td.Xi, td.Cs {background: #CCC} /* invalid */ +td.Xd {color: #844} /* deprecated */ +td.Xr {color: #888} /* reserved (digraph reverse or proposal) */ +.dimap td.Xr {background: #EEE} /* reversed digraph */ /* implementation-based alternatives */ td.di-b {background: #FDD} /* bmp */ @@ -295,16 +299,21 @@ td.di-invalid {background: #BBB} /* impossible */ /* hover effects */ td.di-d, td.X:hover {cursor: help} -td.Greek:hover {background: #FA9} +td.Greek:hover, td.Armenian:hover {background: #FA8} td.Cyrillic:hover {background: #FB7} -td.Latin:hover {background: #FF6} -td.Hebrew:hover {background: #FFA} +td.Latin:hover {background: #EE4} +td.Hebrew:hover, td.Aramaic:hover {background: #FFA} td.Arabic:hover {background: #CFD} -td.Hangul:hover {background: #CE6} +td.African:hover {background: #BDB} +td.Syllabic:hover, td.Hangul:hover {background: #CE6} td.Katakana:hover {background: #BF7} td.Hiragana:hover {background: #AF8} td.Bopomofo:hover {background: #8FA} -td.Nd:hover, td.Nl:hover, td.No:hover {background: #F88} /* number */ +td.Brahmic:hover {background: #F77} +td.Khmer:hover {background: #F87} +td.Han:hover {background: #5EB} +td.Alpha:hover {background: #5C5} +td.Nd:hover, td.Nl:hover, td.No:hover {background: #F99} /* number */ td.Sc:hover {background: #F8C} /* currency */ td.Sm:hover {background: #F8F} /* math */ td.So:hover {background: #A8F} /* symbol */ @@ -315,6 +324,7 @@ td.Mn:hover {background: #CDE} /* modifie td.Zs:hover {background: #CED} /* space */ td.Cc:hover {background: #DDD} /* control */ td.Co:hover {background: #A77} /* private */ +td.Xr:hover {background: #FFF} /* reserved */ td.Xa:hover {outline: 1px solid #0F0} /* ascii */ td.Xl:hover {outline: 1px solid #0C0} /* latin1 */ td.Xz:hover {outline: 1px solid #F00} /* proposed */ diff --git a/charset.plp b/charset.plp index 777e0ee..2dc6964 100644 --- a/charset.plp +++ b/charset.plp @@ -32,7 +32,7 @@ use Encode qw(decode resolve_alias); # substr strings is twice as fast as splitting to an array) my %ALIAS = ( # default => [qw(unicode utf-8 iso-8859-1 cp437 -cp1252- --iso-8859-15- -koi8-f)], - default => [qw(unicode utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)], + default => [qw(unicode- utf-8 iso-8859-1 -cp1252- --iso-8859-15- cp437 -cp850)], 0 => [qw(cp437 cp863)], 1 => [qw(iso-8859-1 cp1252 MacRoman cp850)], 2 => [qw(iso-8859-2 cp1250 cp852 MacCentralEurRoman MacCroatian MacRumanian)], @@ -55,7 +55,7 @@ my @request = map { } if ($row{set} = resolve_alias($input)) { if ($row{set} eq 'Internal') { - $row{table} = ' 'x640; + $row{table} = ' ' x ($endpoint < 255 ? 640 : 4096); $row{set} = 'Unicode BMP'; } elsif ($row{set} eq 'utf-8-strict') { @@ -96,32 +96,38 @@ sub quote { sub printcell_unicode { my ($value) = @_; - if ($value > 0x27F) { + if ($value > 0xFFF) { print "\n".'?'; } elsif ($value == 0) { - print 'control'; + print 'control'; } elsif ($value == 2) { - print 'latin'; + print 'comn'; + } + elsif ($value == 4) { + print 'basic latin'; } elsif ($value == 8) { - print 'control'; + print 'control'; } elsif ($value == 10) { - print 'latin supplement'; + print 'comn'; + } + elsif ($value == 12) { + print 'latin1'; } elsif ($value == 0x10) { - print 'latin ext-A'; + print 'latin extended-A'; } elsif ($value == 0x18) { - print 'latin ext-B'; + print 'latin extended-B'; } elsif ($value == 0x20) { - print 'latin ext-B'; + print 'latin ext-B'; } elsif ($value == 0x25) { - print 'IPA'; + print 'IPA'; } elsif ($value == 0x2B) { print 'spacing modifier'; @@ -130,199 +136,199 @@ sub printcell_unicode { print 'diacritics'; } elsif ($value == 0x38) { - print 'greek'; + print 'greek'; } elsif ($value == 0x40) { - print 'cyrillic'; + print 'cyrillic'; } elsif ($value == 0x50) { - print 'cyrillic+'; + print 'cyrillic+'; } elsif ($value == 0x53) { - print 'armenian'; + print 'armenian'; } elsif ($value == 0x58) { - print 'hebrew'; + print 'hebrew'; } elsif ($value == 0x60) { - print 'arabic'; + print 'arabic'; } elsif ($value == 0x70) { - print 'syriac'; + print 'syriac'; } elsif ($value == 0x75) { - print 'arabic+'; + print 'arabic+'; } elsif ($value == 0x78) { - print 'thaana'; + print 'thaana'; } elsif ($value == 0x7C) { - print 'nko'; + print 'n\'ko'; } elsif ($value == 0x80) { - print 'samaritan'; + print 'samaritan'; } elsif ($value == 0x84) { - print 'manda'; + print 'manda'; } elsif ($value == 0x86) { - print 'reserved'; + print 'reserved'; } elsif ($value == 0x90) { - print 'devanagari'; + print 'devanagari'; } elsif ($value == 0x98) { - print 'bengali'; + print 'bengali'; } elsif ($value == 0xA0) { - print 'gurmukhi'; + print 'gurmukhi'; } elsif ($value == 0xA8) { - print 'gujarati'; + print 'gujarati'; } elsif ($value == 0xB0) { - print 'oriya'; + print 'oriya'; } elsif ($value == 0xB8) { - print 'tamil'; + print 'tamil'; } elsif ($value == 0xC0) { - print 'telugu'; + print 'telugu'; } elsif ($value == 0xC8) { - print 'kannada'; + print 'kannada'; } elsif ($value == 0xD0) { - print 'malayalam'; + print 'malayalam'; } elsif ($value == 0xD8) { - print 'sinhala'; + print 'sinhala'; } elsif ($value == 0xE0) { - print 'thai'; + print 'thai'; } elsif ($value == 0xE8) { - print 'lao'; + print 'lao'; } elsif ($value == 0xF0) { - print 'tibetan'; + print 'tibetan'; } elsif ($value == 0x100) { - print 'myanmar'; + print 'myanmar'; } elsif ($value == 0x10A) { - print 'georgian'; + print 'georgian'; } elsif ($value == 0x110) { - print 'hangeul jamo'; + print 'hangeul jamo'; } elsif ($value == 0x120) { - print 'ethiopic'; + print 'ethiopic'; } elsif ($value == 0x130) { - print 'ethiopic'; + print 'ethiopic'; } elsif ($value == 0x138) { - print 'eth+'; + print 'eth+'; } elsif ($value == 0x13A) { - print 'cherokee'; + print 'cherokee'; } elsif ($value == 0x140) { - print 'unified canadian aboriginal syllabics'; + print 'unified canadian aboriginal syllabics'; } elsif ($value == 0x160) { - print 'unified canadian syllabics'; + print 'unified canadian syllabics'; } elsif ($value == 0x168) { - print 'ogham'; + print 'ogham'; } elsif ($value == 0x16A) { - print 'runic'; + print 'runic'; } elsif ($value == 0x170) { - print 'tagalog'; + print 'tagalog'; } elsif ($value == 0x172) { - print 'hanun'; + print 'hanun'; } elsif ($value == 0x174) { - print 'buhid'; + print 'buhid'; } elsif ($value == 0x176) { - print 'tagb'; + print 'tagb'; } elsif ($value == 0x178) { - print 'khmer'; + print 'khmer'; } elsif ($value == 0x180) { - print 'mongolian'; + print 'mongolian'; } elsif ($value == 0x18B) { - print 'canadian+'; + print 'canadian+'; } elsif ($value == 0x190) { - print 'limbu'; + print 'limbu'; } elsif ($value == 0x195) { - print 'tai le'; + print 'tai le'; } elsif ($value == 0x198) { - print 'new tai lue'; + print 'new tai lue'; } elsif ($value == 0x19E) { - print 'km'; + print 'km'; } elsif ($value == 0x1A0) { - print 'lontara'; + print 'lontara'; } elsif ($value == 0x1A2) { - print 'tai tham'; + print 'tai tham'; } elsif ($value == 0x1AB) { - print 'reserved'; + print 'reserved'; } elsif ($value == 0x1B0) { - print 'balinese'; + print 'balinese'; } elsif ($value == 0x1B8) { - print 'sundanese'; + print 'sundanese'; } elsif ($value == 0x1BC) { - print 'batak'; + print 'batak'; } elsif ($value == 0x1C0) { - print 'lepcha'; + print 'lepcha'; } elsif ($value == 0x1C5) { - print 'ol chiki'; + print 'ol chiki'; } elsif ($value == 0x1C8) { - print 'reserved'; + print 'reserved'; } elsif ($value == 0x1CD) { - print 'vedic'; + print 'vedic'; } elsif ($value == 0x1D0) { - print 'phonetic'; + print 'phonetic'; } elsif ($value == 0x1D8) { - print 'phonetic+'; + print 'phonetic+'; } elsif ($value == 0x1DC) { print 'combining'; } elsif ($value == 0x1E0) { - print 'latin extended additional'; + print 'latin extended additional'; } elsif ($value == 0x1F0) { - print 'greek+'; + print 'greek+'; } elsif ($value == 0x200) { - print 'general punctuation'; + print 'general punctuation'; } elsif ($value == 0x207) { - print 'su[bp]script'; + print 'suþscript'; # suth now means "sub and/or sup" } elsif ($value == 0x20A) { print 'currency'; @@ -375,23 +381,260 @@ sub printcell_unicode { elsif ($value == 0x27F) { print 'arr'; } + elsif ($value == 0x280) { + print 'braille'; + } + elsif ($value == 0x290) { + print 'supplemental arrows-B'; + } + elsif ($value == 0x298) { + print 'mathematical symbols-B'; + } + elsif ($value == 0x2A0) { + print 'supplemental mathematical operators'; + } + elsif ($value == 0x2B0) { + print 'miscellaneous symbols and arrows'; + } + elsif ($value == 0x2C0) { + print 'glagolitic'; + } + elsif ($value == 0x2C6) { + print 'latin-C'; + } + elsif ($value == 0x2C8) { + print 'coptic'; + } + elsif ($value == 0x2D0) { + print 'georgian+'; + } + elsif ($value == 0x2D3) { + print 'tifinagh'; #TODO: proto-canaanite + } + elsif ($value == 0x2D8) { + print 'ethiopic+'; + } + elsif ($value == 0x2DE) { + print 'cyrl-A'; + } + elsif ($value == 0x2E0) { + print 'punctuation+'; + } + elsif ($value == 0x2E8) { + print 'cjk radicals'; + } + elsif ($value == 0x2F0) { + print 'kangxi radicals'; + } + elsif ($value == 0x2FE) { + print ''; + } + elsif ($value == 0x2FF) { + print 'idc'; + } + elsif ($value == 0x300) { + print 'cjk misc'; + } + elsif ($value == 0x304) { + print 'hiragana'; + } + elsif ($value == 0x30A) { + print 'katakana'; + } + elsif ($value == 0x310) { + print 'bopomofo'; + } + elsif ($value == 0x313) { + print 'hangeul compat'; + } + elsif ($value == 0x319) { + print 'kbn'; + } + elsif ($value == 0x31A) { + print 'bpmf'; + } + elsif ($value == 0x31C) { + print 'strokes'; + } + elsif ($value == 0x31F) { + print 'k+'; + } + elsif ($value == 0x320) { + print 'enclosed cjk characters'; + } + elsif ($value == 0x330) { + print 'cjk compatibility'; + } + elsif ($value == 0x340) { + print 'cjk unified ideographs extension A'; + } + elsif ($value == 0x4D0) { + print 'cjk unified ideographs extension A'; + } + elsif ($value == 0x4DC) { + print 'hexagrams'; + } + elsif ($value == 0x4E0) { + print 'cjk unified ideographs'; + } + elsif ($value == 0xA00) { + print 'yi'; + } + elsif ($value == 0xA40) { + print 'yi'; + } + elsif ($value == 0xA49) { + print 'yi radicals'; + } + elsif ($value == 0xA4D) { + print 'lisu'; + } + elsif ($value == 0xA50) { + print 'vai'; + } + elsif ($value == 0xA60) { + print 'vai'; + } + elsif ($value == 0xA64) { + print 'cyrillic extended-B'; + } + elsif ($value == 0xA6A) { + print 'bamum'; + } + elsif ($value == 0xA70) { + print 'tones'; + } + elsif ($value == 0xA72) { + print 'latin extended-D'; + } + elsif ($value == 0xA80) { + print 'sylheti'; + } + elsif ($value == 0xA83) { + print 'in'; + } + elsif ($value == 0xA84) { + print 'phags-pa'; + } + elsif ($value == 0xA88) { + print 'saurashtra'; + } + elsif ($value == 0xA8E) { + print 'deva+'; + } + elsif ($value == 0xA90) { + print 'kayah li'; + } + elsif ($value == 0xA93) { + print 'rejang'; + } + elsif ($value == 0xA96) { + print 'jamo-A'; + } + elsif ($value == 0xA98) { + print 'javanese'; + } + elsif ($value == 0xA9E) { + print 'res'; + } + elsif ($value == 0xAA0) { + print 'cham'; + } + elsif ($value == 0xAA6) { + print 'mym-A'; + } + elsif ($value == 0xAA8) { + print 'tai viet'; + } + elsif ($value == 0xAAE) { + print 'mtei+'; + } + elsif ($value == 0xAB0) { + print 'reserved'; + } + elsif ($value == 0xABC) { + print 'manipuri'; + } + elsif ($value == 0xAC0) { + print 'hangeul syllables'; + } + elsif ($value == 0xD70) { + print 'hangeul syllables'; + } + elsif ($value == 0xD7B) { + print 'haungeul jamo-B'; + } + elsif ($value == 0xD80) { + print 'high surrogates'; + } + elsif ($value == 0xDC0) { + print 'low surrogates'; + } + elsif ($value == 0xE00) { + print 'private use'; + } + elsif ($value == 0xF90) { + print 'cjk compatibility ideographs'; + } + elsif ($value == 0xFB0) { + print 'presentation'; + } + elsif ($value == 0xFB5) { + print ''; + } + elsif ($value == 0xFC0) { + print 'arabic presentation forms A'; + } + elsif ($value == 0xFD0) { + print ''; + } + elsif ($value == 0xFDD) { + print '?'; + } + elsif ($value == 0xFDF) { + print ''; + } + elsif ($value == 0xFE0) { + print 'var'; + } + elsif ($value == 0xFE1) { + print 'ver'; + } + elsif ($value == 0xFE2) { + print '½'; + } + elsif ($value == 0xFE3) { + print 'comp'; + } + elsif ($value == 0xFE5) { + print 'small'; + } + elsif ($value == 0xFE7) { + print 'arabic presentation B'; + } + elsif ($value == 0xFF0) { + print 'halfwidth & fullwidth forms'; + } + elsif ($value == 0xFFF) { + print 'sp'; + } } sub printcell_utf8 { my ($value) = @_; if ($value <= 0x7F) { print 'Single byte ASCII' + ' title="U+0000 – U+007F">single byte ASCII' if $value == 0; } elsif ($value <= 0xBF) { print 'Multi-byte continuation' + '>multi-byte continuation' if $value == 0x80; } elsif ($value <= 0xC1) { print '(Overl.)' + ' title="U+0000 – U+007F">(overl.)' if $value == 0xC0; } elsif ($value <= 0xDF) { @@ -414,7 +657,7 @@ sub printcell_utf8 { } elsif ($value <= 0xF7) { print '(Overflow)' + ' title="U+11·0000 – U+1FF·FFFF">(overflow)' if $value == 0xF5; } elsif ($value <= 0xFB) { @@ -428,7 +671,7 @@ sub printcell_utf8 { if $value == 0xFC; } elsif ($value <= 0xFF) { - print 'Invalid' + print 'invalid' if $value == 0xFE; } else { -- 2.30.0