charset: generic glyph replacement from metadata
[sheet.git] / charset.plp
index 4f60765cd7c94d346dae6f013891ee39509cb95e..7f19bc183e6fb02abd1d91622041cf7f75479905 100644 (file)
@@ -143,50 +143,12 @@ sub tabinput {
                }
        }
 
-       if ($input =~ m{ \A (?:wing|web)dings \d* \z }ix) {
-               eval "require Encode::\u$input";
+       if ($charset->{setup}) {
+               eval { $charset->{setup}->(\%row) }
+                       or Alert("Incomplete setup of $input", $@);
        }
 
-       if ($input eq '') {
-               $row{offset} = delete $row{startpoint};
-               $row{set} = 'Unicode characters';
-               my $block = $row{offset} >> 8;
-               $row{endpoint} ||= ($block + 1 << 8) - 1;
-               $block == ($row{endpoint} >> 8) or undef $block;
-
-               $row{table} = join '', map { chr } $row{offset} .. $row{endpoint};
-               utf8::upgrade($row{table});  # prevent latin1 output
-
-               $row{endpoint} -= $row{offset};
-
-               if (defined $block) {
-                       $row{set} = sprintf 'Unicode block U+%02Xxx', $block;
-                       $row{offset} %= 0x100;
-               }
-       }
-       elsif (lc $input eq 'uu') {
-               $row{cell} = do 'charset-ucplanes.inc.pl'
-                       or Alert('Table data could not be read', $@ || $!);
-               $row{endpoint} ||= 0x3FF;
-               $row{set} = 'Unicode planes';
-       }
-       elsif (lc $input eq 'u') {
-               $row{cell} = do 'charset-unicode.inc.pl'
-                       or Alert('Table data could not be read', $@ || $!);
-
-               $row{endpoint} ||= 0x1FFF;
-               $row{set} = 'Unicode ' . (
-                       $row{startpoint} <  0x1000 && $row{endpoint} < 0x1000 ? 'BMP' :
-                       $row{startpoint} >= 0x1000 && $row{endpoint} < 0x2000 ? 'SMP' :
-                       'allocations'
-               );
-       }
-       elsif ($input =~ m/^utf-*8$/i) {
-               $row{set} = 'UTF-8';
-               $row{cell} = do 'charset-utf8.inc.pl'
-                       or Alert('Table data could not be read', $@ || $!);
-               $row{endpoint} = 0xFF;
-       }
+       if ($row{set}) {}
        elsif ($row{set} = Encode::resolve_alias($input)) {
                $row{offset} = delete $row{startpoint};
                $row{endpoint} ||= 0xFF;
@@ -202,33 +164,6 @@ sub tabinput {
                        $row{table} = Encode::decode($row{set}, pack 'C*', $row{offset} .. $row{endpoint});
                }
 
-               if ($row{set} eq 'cp437') {
-                       if ($row{offset} <= 0xED and $row{endpoint} >= 0xED) {
-                               # replace phi glyph
-                               substr($row{table}, 0xED - $row{offset}, 1) = 'ϕ';
-                       }
-                       if ($row{offset} < 0x20) {
-                               # replace control characters by visible variants
-                               my $sub = substr ' ☺☻♥♦♣♠•◘○◙♂♀♪♫☼►◄↕‼¶§▬↨↑↓→←∟↔▲▼', $row{offset};
-                               substr($row{table}, 0, length $sub) = $sub;
-                       }
-               }
-               elsif ($row{set} eq 'symbol') {
-                       if ($row{offset} <= 0x60 and $row{endpoint} >= 0x60) {
-                               # replace radical extender by closest unicode equivalent
-                               substr($row{table}, 0x60 - $row{offset},  1) = '│';
-                       }
-                       if ($row{offset} <= 0xBD and $row{endpoint} >= 0xFF) {
-                               substr($row{table}, 0xBD - $row{offset},  2) = '⏐⎯'; # arrow extenders
-                               substr($row{table}, 0xD2 - $row{offset},  3) = '®©™'; # serif variants
-                               substr($row{table}, 0xE0 - $row{offset},  1) = '◊'; # replace lookalike, should match AdobeSymbol
-                               substr($row{table}, 0xE2 - $row{offset},  3) = '®©™'; # sans-serif variants
-                               substr($row{table}, 0xE6 - $row{offset}, 10) = '⎛⎜⎝⎡⎢⎣⎧⎨⎩⎪';
-                               substr($row{table}, 0xF0 - $row{offset},  1) = '€';
-                               substr($row{table}, 0xF4 - $row{offset}, 11) = '⎮⌡⎞⎟⎠⎤⎥⎦⎫⎬⎭';
-                       }
-               }
-
                $row{endpoint} -= $row{offset};
 
                $visible->{ascii} =  # assume common base
@@ -238,6 +173,32 @@ sub tabinput {
                Alert("Encoding <q>$input</q> unknown");
                return;
        }
+
+       if (my $replace = $charset->{replace}) {
+               while (my ($offset, $sub) = each %{$replace}) {
+                       $offset -= $row{offset};
+
+                       my $length = length $sub;
+
+                       if ($offset < 0) {
+                               $offset > -$length or next; # at least one character after start
+                               # trim leftmost part to start at offset
+                               substr($sub, 0, -$offset) = '';
+                               $length += $offset;
+                               $offset = 0;
+                       }
+
+                       if ((my $excess = $row{endpoint} - $offset - $length + 1) < 0) {
+                               $excess > -$length or next;
+                               # trim rightmost part to prevent overflow
+                               substr($sub, $excess) = '';
+                               $length += $excess;
+                       }
+
+                       substr($row{table}, $offset, $length) = $sub;
+               }
+       }
+
        push @request, \%row;
 }
 tabinput($_) for @tablist;