charset: legacy map ansel (and extensions)
[sheet.git] / charset-encoding.inc.pl
index 92330ab0f876092ee425d0412a28d989d1e4ecaa..f4a9f26d1f9007a8153d320f467b40665a2a44ff 100644 (file)
@@ -9,7 +9,7 @@ use utf8;
        ebcdic     => [qw( cp37 cp500 cp1047 posix-bc cp1026 cp875 )],
        iso        => [map {"iso-8859-$_"} 1 .. 11, 13 .. 16],
        dos        => [qw( cp437 cp865 cp861 cp860 cp863 cp850 cp857 cp852 cp775
-                          cp737 cp869 cp866 cp855 cp862 cp864 )],
+                          cp737 cp869 cp866 MIK cp855 cp862 cp864 )],
        aix        => [qw( cp1006 )],
        win        => [qw( cp1252 cp1250 cp1254 cp1257 cp1258 cp1253 cp1251 cp1255 cp1256 cp874 )],
        mac        => [qw( MacRoman MacRomanian MacRumanian MacCroatian MacCentralEurRoman MacTurkish MacIcelandic MacSami
@@ -22,7 +22,7 @@ use utf8;
        norteur    => [qw( baltic nordic )],
        baltic     => [qw( iso-8859-4 iso-8859-13 cp1257 cp775 )],
        nordic     => [qw( iso-8859-10 cp865 cp861 MacIcelandic MacSami )],
-       cyrillic   => [qw( koi8-r koi8-u koi8-f iso-8859-5 cp1251 MacCyrillic cp866 cp855
+       cyrillic   => [qw( koi8-r koi8-u koi8-f iso-8859-5 cp1251 MacCyrillic cp866 MIK cp855
                           +400 +2DE0 +A640-A69F +500-52F )], # MacUkrainian is broken
        arabic     => [qw( iso-8859-6 cp1256 MacArabic cp864 cp1006 MacFarsi
                           +600 +8A0-8BF+8E0 +750-77F )],
@@ -161,6 +161,18 @@ use utf8;
 
        'koi8-u'       => {inherit => ['koi8-r' => '90-BF']},
        'koi8-f'       => {inherit => ['koi8-u' => '90-BF']},
+       'mik'          => {inherit => ['cp437' => '80-D8', 'cp866' => 'B0'], setup => sub { # NOTE(review): inherit ranges presumably relate this table to cp437/cp866 - confirm semantics
+               $_[0]->{table} = [(map {chr} 0 .. 0x7F), qw(
+                       А Б В Г Д Е Ж З И Й К Л М Н О П
+                       Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я
+                       а б в г д е ж з и й к л м н о п
+                       р с т у ф х ц ч ш щ ъ ы ь э ю я
+                       └ ┴ ┬ ├ ─ ┼ ╣ ║ ╚ ╔ ╩ ╦ ╠ ═ ╬ ┐
+                       ░ ▒ ▓ │ ┤ № § ╗ ╝ ┘ ┌ █ ▄ ▌ ▐ ▀
+                       α ß Γ π Σ σ µ τ Φ Θ Ω δ ∞ φ ε ∩
+                       ≡ ± ≥ ≤ ⌠ ⌡ ÷ ≈ ° ∙ · √ ⁿ ² ■
+               ), "\xA0"]; # 0x00-0x7F map to themselves, the 127 glyphs above fill 0x80-0xFE, 0xFF is U+00A0
+       }},
 
        'macromanian'  => {inherit => ['MacRoman' => 'A0-BF+D0-DF']},
        'macrumanian'  => {inherit => ['MacRomanian' => 'A0-BF+D0-DF', 'MacRoman' => 'A0-BF+D0-DF']},
@@ -206,6 +218,81 @@ use utf8;
        'cp1026'       => {inherit => ['cp37' => '40']},
        'cp875'        => {inherit => ['cp37' => '30']},
 
+       legacy     => [qw( cp437 ATASCII PETSCII MSX ZX-Spectrum ANSEL )],
+       'petscii'      => {inherit => ['' => '40-7F+A0-BF'], setup => sub { # setup stores the glyph list in {table}
+               $_[0]->{table} = [(map {chr} 0 .. 0x3F), qw(
+                       @ a b c d e f g h i j k l m n o p q r s t u v w x y z [ £ ] ↑ ←
+                       🭹 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ┼ 🮌 │ 🮖 🮘
+                       . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
+                         ▌ ▄ ▔ ▁ ▏ ▒ ▕ 🮏 🮙 🮇 ├ ▗ └ ┐ ▂ ┌ ┴ ┬ ┤ ▎ ▍ 🮈 🮂 🮃 ▃ ✓ ▖ ▝ ┘ ▘ ▚
+               )]; # 0x00-0x3F map to themselves, rows of 32 follow from 0x40; the '.' row (0x80-0x9F) is outside the inherit range, presumably a placeholder; NOTE(review): last row must yield 32 words (first presumably U+00A0) - confirm
+       }},
+       'atascii'      => {inherit => ['' => '0-1F+60-7F'], setup => sub { # setup stores a 128-entry glyph list in {table}
+               $_[0]->{table} = [qw(
+                       ♥ ├ 🮇 ┘ ┤ ┐ ╱ ╲ ◢ ▗ ◣ ▝ ▘ 🮂 ▂ ▖ ♣ ┌ ─ ┼ • ▄ ▎ ┬ ┴ ▌ └ ␛ ↑ ↓ ← →
+                       _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
+                       _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
+                       ♦ a b c d e f g h i j k l m n o p q r s t u v w x y z ♠ | 🢰 ◀ ▶
+               )]; # rows are 0x00-0x1F, 0x20-0x3F, 0x40-0x5F, 0x60-0x7F; the '_' rows lie outside the inherit range, presumably placeholders - confirm
+       }},
+       'zx-spectrum'  => {
+               inherit => ['' => '50-8F'],
+               set => 'ascii', # NOTE(review): presumably selects ASCII as the base table - confirm against the code reading 'set'
+               replace => { # keys are byte values, values the glyphs shown instead
+                       ord('^') => '↑', # 0x5E
+                       ord('`') => '£', # 0x60
+                       0x7F => '© ▝▘▀▗▐▚▜▖▞▌▛▄▟▙█', # 17 characters; presumably one per byte from 0x7F through 0x8F (0x80 being the blank) - confirm
+               },
+       },
+       'msx'          => {inherit => ['cp437' => '80-FF'], setup => sub { # setup stores the glyph list in {table}
+               $_[0]->{table} = [(map {chr} 0 .. 0x7F), qw(
+                       Ç ü é â ä à å ç ê ë è ï î ì Ä Å É æ Æ ô ö ò û ù ÿ Ö Ü ¢ £ ¥ ₧ ƒ
+                       á í ó ú ñ Ñ ª º ¿ ⌐ ¬ ½ ¼ ¡ « » Ã ã Ĩ ĩ Õ õ Ũ ũ IJ ij ¾ ∽ ◊ ‰ ¶ §
+                       ▂ ▚ ▆ 🮂 ▬ 🮅 ▎ ▞ ▊ 🮇 🮊 🮙 🮘 🭭 🭯 🭬 🭮 🮚 🮛 ▘ ▗ ▝ ▖ 🮖 Δ ‡ ω █ ▄ ▌ ▐ ▀
+                       α ß Γ π Σ σ µ τ Φ Θ Ω δ ∞ ⌀ ∈ ∩ ≡ ± ≥ ≤ ⌠ ⌡ ÷ ≈ ° ∙ · √ ⁿ ² ■
+               )]; # 0x00-0x7F map to themselves, the 127 glyphs above fill 0x80-0xFE; 0xFF gets no entry
+       }},
+       'brascii'      => { # NOTE(review): not in the 'legacy' list visible nearby; may be grouped elsewhere - confirm
+               inherit => ['' => 'D0-DF+F0-FF'],
+               setup => sub {
+                       $_[0]->{table} = [(map {chr} 0 .. 0xFF)]; # every byte 0x00-0xFF maps to the same code point
+               },
+               replace => { # presumably applied on top of the table from setup - confirm
+                       0xD7 => 'Œ', # the identity table would give U+00D7 here
+                       0xF7 => 'œ', # the identity table would give U+00F7 here
+               },
+       },
+       'ansel'        => {
+               note => '+GEDCOM',
+               inherit => ['' => 'A0-CF+E0-FE'],
+               setup => sub {
+                       $_[0]->{table} = [ # 256 slots; undef marks a byte without a glyph
+                               (undef) x 0xA0, # 0x00-0x9F
+                               undef, qw( Ł Ø Đ Þ Æ Œ ʹ · ♭ ®    ±          Ơ Ư ʾ ), undef, # 0xA0-0xAF
+                               qw( ʿ      ł ø đ þ æ œ ʺ ı £ ð ), undef, qw( ơ ư ), undef, undef, # 0xB0-0xBF
+                               qw( °      ℓ ℗ © ♯ ¿ ¡ ), (undef) x 0x19, # 0xC0-0xC6, then 0xC7-0xDF empty
+                               (map {$_ && chr} # code points to characters; undef entries stay undef
+                                       0x309, 0x300, 0x0301, 0x0302, 0x0303, 0x304, 0x306, 0x307, # 0xE0-0xE7
+                                       0x308, 0x30C, 0x030A, 0xFE20, 0xFE21, 0x315, 0x30B, 0x310, # 0xE8-0xEF
+                                       0x327, 0x328, 0x0323, 0x0324, 0x0325, 0x333, 0x332, 0x326, # 0xF0-0xF7
+                                       0x31C, 0x32E, 0xFE22, 0xFE23, undef,  undef, 0x313, undef, # 0xF8-0xFF
+                               ),
+                       ];
+               },
+               replace => { # every key below is a slot the table above leaves undef
+                       # GEDCOM extensions
+                       0xBE => '□',
+                       0xBF => '■',
+                       0xCD => 'e', # endowment?
+                       0xCE => 'o', # ordinance?
+                       0xCF => 'ß',
+                       0xFC => "\x{338}", # U+0338 combining long solidus overlay
+                       # MARC21 extensions
+                       0xC7 => 'ß',
+                       0xC8 => '€',
+               },
+       },
+
        ''             => {setup => sub {
                my $row = shift;
                $row->{offset} = delete $row->{startpoint};
@@ -214,7 +301,8 @@ use utf8;
                $row->{endpoint} ||= ($block + 1 << 8) - 1;
                $block == ($row->{endpoint} >> 8) or undef $block;
 
-               $row->{table} = join '', map { chr } $row->{offset} .. $row->{endpoint};
+               $row->{table} = join '', map { chr =~ s/\A\p{Unassigned}\z/�/r }
+                       $row->{offset} .. $row->{endpoint};
                utf8::upgrade($row->{table});  # prevent latin1 output
 
                $row->{endpoint} -= $row->{offset};