charset: generic glyph replacement from metadata
authorMischa POSLAWSKY <perl@shiar.org>
Sat, 22 Apr 2017 10:32:37 +0000 (12:32 +0200)
committerMischa POSLAWSKY <perl@shiar.org>
Thu, 25 May 2017 20:10:56 +0000 (22:10 +0200)
Common format with full support for partial overlaps.

charset-encoding.inc.pl
charset.plp

index eee5aa2..71ba2ba 100644 (file)
@@ -1,5 +1,6 @@
 use 5.014;
 use warnings;
+use utf8;
 
 +{
        default    => [qw( u+0-27F utf8+realsize iso-8859-1 iso-8859-15 cp1252 cp437 cp850 )],
@@ -45,12 +46,30 @@ use warnings;
        11 => 'thai',
 
        'ascii'        => {inherit => ['' => '00-7F']},
-       'cp437'        => {inherit => ['cp850' => 0, 'ascii' => '00-1F+80']}, # ascii range overridden later
+       'cp437'        => {
+               inherit => ['cp850' => 0, 'ascii' => '00-1F+80'], # ascii range overridden later
+               replace => {
+                       0    => ' ☺☻♥♦♣♠•◘○◙♂♀♪♫☼►◄↕‼¶§▬↨↑↓→←∟↔▲▼', # visible variants of control characters
+                       0xED => 'ϕ', # non-greek usage and appearance
+               },
+       },
        'gsm0338'      => {inherit => ['ascii' => '00-7F']},
        'dingbats'     => {inherit => ['' => '20-7F+A0']},
        'macdingbats'  => {inherit => ['dingbats' => '80-9F']},
        'adobezdingbat'=> {inherit => ['MacDingbats' => '80-9F']}, # should be identical but maps to private use
-       'symbol'       => {inherit => ['' => '20-7F+A0']},
+       'symbol'       => {
+               inherit => ['' => '20-7F+A0'],
+               replace => {
+                       0x60 => '│', # replace radical extender by closest unicode equivalent
+                       0xBD => '⏐⎯', # arrow extenders
+                       0xD2 => '®©™', # serif variants
+                       0xE0 => '◊', # replace lookalike, should match AdobeSymbol
+                       0xE2 => '®©™', # sans-serif variants
+                       0xE6 => '⎛⎜⎝⎡⎢⎣⎧⎨⎩⎪',
+                       0xF0 => '€',
+                       0xF4 => '⎮⌡⎞⎟⎠⎤⎥⎦⎫⎬⎭',
+               },
+       },
        'adobesymbol'  => {inherit => ['symbol' => '20-7F+A0', '' => '20-7F+A0']}, # minor differences, irrelevant except for different '€'
        'wingdings'    => {inherit => ['' => '20'], setup => sub {require Encode::Wingdings}},
        'wingdings2'   => {inherit => ['' => '20'], setup => sub {require Encode::Wingdings2}},
index ed495fe..7f19bc1 100644 (file)
@@ -164,33 +164,6 @@ sub tabinput {
                        $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $row{endpoint});
                }
 
-               if ($row{set} eq 'cp437') {
-                       if ($row{offset} <= 0xED and $row{endpoint} >= 0xED) {
-                               # replace phi glyph
-                               substr($row{table}, 0xED - $row{offset}, 1) = 'ϕ';
-                       }
-                       if ($row{offset} < 0x20) {
-                               # replace control characters by visible variants
-                               my $sub = substr ' ☺☻♥♦♣♠•◘○◙♂♀♪♫☼►◄↕‼¶§▬↨↑↓→←∟↔▲▼', $row{offset};
-                               substr($row{table}, 0, length $sub) = $sub;
-                       }
-               }
-               elsif ($row{set} eq 'symbol') {
-                       if ($row{offset} <= 0x60 and $row{endpoint} >= 0x60) {
-                               # replace radical extender by closest unicode equivalent
-                               substr($row{table}, 0x60 - $row{offset},  1) = '│';
-                       }
-                       if ($row{offset} <= 0xBD and $row{endpoint} >= 0xFF) {
-                               substr($row{table}, 0xBD - $row{offset},  2) = '⏐⎯'; # arrow extenders
-                               substr($row{table}, 0xD2 - $row{offset},  3) = '®©™'; # serif variants
-                               substr($row{table}, 0xE0 - $row{offset},  1) = '◊'; # replace lookalike, should match AdobeSymbol
-                               substr($row{table}, 0xE2 - $row{offset},  3) = '®©™'; # sans-serif variants
-                               substr($row{table}, 0xE6 - $row{offset}, 10) = '⎛⎜⎝⎡⎢⎣⎧⎨⎩⎪';
-                               substr($row{table}, 0xF0 - $row{offset},  1) = '€';
-                               substr($row{table}, 0xF4 - $row{offset}, 11) = '⎮⌡⎞⎟⎠⎤⎥⎦⎫⎬⎭';
-                       }
-               }
-
                $row{endpoint} -= $row{offset};
 
                $visible->{ascii} =  # assume common base
@@ -200,6 +173,32 @@ sub tabinput {
                Alert("Encoding <q>$input</q> unknown");
                return;
        }
+
+       if (my $replace = $charset->{replace}) {
+               while (my ($offset, $sub) = each %{$replace}) {
+                       $offset -= $row{offset};
+
+                       my $length = length $sub;
+
+                       if ($offset < 0) {
+                               $offset > -$length or next; # at least one character after start
+                               # trim leftmost part to start at offset
+                               substr($sub, 0, -$offset) = '';
+                               $length += $offset;
+                               $offset = 0;
+                       }
+
+                       if ((my $excess = $row{endpoint} - $offset - $length + 1) < 0) {
+                               $excess > -$length or next;
+                               # trim rightmost part to prevent overflow
+                               substr($sub, $excess) = '';
+                               $length += $excess;
+                       }
+
+                       substr($row{table}, $offset, $length) = $sub;
+               }
+       }
+
        push @request, \%row;
 }
 tabinput($_) for @tablist;