tools/mkdigraphlist

   1 #!/usr/bin/env perl
   2
   3 use strict;
   4 use warnings;
   5 use utf8;
   6
   7 use open OUT => ':utf8', ':std';
   8 use Data::Dumper;
   9
  10 our $VERSION = '1.02';
  11
  12 # expect input data source at command line
  13 @ARGV or die "Specify input source file or - for STDIN\n";
  14
  15 # skip everything until a character indented by 1 space (table start)
  16 do {
  17         $_ = readline;
  18         defined or die "Premature input end";
  19 } until /^\s\S/;
  20
  21 my @t = $_;  # add first line (already read, assume it's ok)
  22
  23 # read the rest of the character table
  24 while ($_ = readline) {
  25         # check for table end (chapter 4)
  26         last if /^\d/;
  27
  28         # parse table lines (ignore (unindented) page break)
  29         next unless s/^ //;
  30         chomp;
  31
  32         # add the line to @t
  33         if (s/^ {15}/ /) {
  34                 # continuation line (add to last entry)
  35                 $t[-1] .= $_;
  36         }
  37         else {
  38                 # add a new entry
  39                 push @t, $_;
  40         }
  41 }
  42
  43 # create a hash of desired input
  44 my %di;
  45 for (@t) {
  46         my ($mnem, $char, $name) = split / +/, $_, 3;
  47         next if length $mnem != 2;
  48         $di{$mnem} = hex $char;
  49 }
  50
  51 # XXX
  52 my %trans = (
  53         0xE001 => 0,  # join lines: not accepted
  54         0xE004 => 0,  # umlaut is no different from diaeresis 0x0308
  55         0xE005 => 0x0344, # discouraged
  56         0xE006 => 0x0300,
  57         0xE007 => 0x0301,
  58         0xE008 => 0x0302,
  59         0xE009 => 0x0303,
  60         0xE00A => 0x0304,
  61         0xE00B => 0x0306,
  62         0xE00C => 0x0307,
  63         0xE00D => 0x0308,
  64         0xE00E => 0x030A,
  65         0xE00F => 0x030B,
  66         0xE010 => 0x030C,
  67         0xE011 => 0x0327,
  68         0xE012 => 0x0328,
  69         0xE013 => 0x0332,
  70         0xE014 => 0x0333,
  71         0xE015 => 0x0338,
  72         0xE016 => 0x0345,
  73         0xE017 => 0x0314,
  74         0xE018 => 0x0313,
  75         0xE019 => 0x1FFE,
  76         0xE01A => 0x1FBF,
  77         0xE01B => 0x03D0,  # middle beta = curled beta?
  78         0xE01C => 0x25CB,
  79         0xE01D => 0x0192,
  80         0xE01E => 0x0292,
  81         0xE01F => 0x33C2,  # am, compatibility char
  82         0xE020 => 0x33D8,  # pm, compatibility char
  83         0xE021 => 0x2121,
  84         0xE022 => 0xFE8E,
  85         0xE023 => 0,  # dutch guilder 0192 is already encoded, and not very useful anyway
  86         0xE024 => 0x0393,
  87         0xE025 => 0x20D7,  # also 20D1; non-spacing
  88         0xE026 => 0x1FEF,
  89         0xE027 => 0x1FC0,
  90         0xE028 => 0x01F0, #but uppercase
  91 );
  92 for (values %di) {
  93         $_ >= 0xE000 or next;
  94         $_ = $trans{$_} if defined $trans{$_};
  95 }
  96
  97 # personal addendums
  98 my @extra;
  99 if (-r 'shiar.inc.txt') {
 100         open my $include, '<:utf8', 'shiar.inc.txt';
 101         for (readline $include) {
 102                 m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next;
 103                 warn("$1 already defined"), next if defined $di{$1};
 104                 $di{$1} = ord $2;
 105                 push @extra, $1;
 106         }
 107 }
 108 warn $@ if $@;
 109
 110 $di{chr $_} = $_ for 32 .. 126;
 111 $di{'\\'.$_} = delete $di{$_} for '{', '}', '\\';
 112
 113 # optionally get unicode character information
 114 my %info = eval {
 115         require Unicode::UCD;
 116         map {
 117                 $_ => Unicode::UCD::charinfo($di{$_})
 118                         || { block => '?', category => 'Xn', name => '', script => '' }
 119         } keys %di;
 120 };
 121
 122 # add custom categories for certain blocks
 123 for (values %info) {
 124         $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
 125         $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
 126 }
 127
 128 # mark unofficial extras as such
 129 $info{$_}->{category} .= ' Xz' for @extra;
 130
 131 for (keys %di) {
 132         $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
 133         # find control characters (first 32 chars from 0 and 128)
 134         next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
 135         # rename to something more descriptive
 136         $info{$_}->{name} = $info{$_}->{unicode10}
 137                 ? '<'.$info{$_}->{unicode10}.'>'  # the old name was much more useful
 138                 : sprintf('<control U+%04X>', $di{$_});  # at least identify by value
 139         # show descriptive symbols instead of control chars themselves
 140         $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
 141 }
 142 # presentational string for some control(lish) entries
 143 $info{$_}->{string} = '-' for grep { $di{$_} == 0x00AD } keys %di;
 144 $info{$_}->{string} = '␣' for grep { $di{$_} == 0x200B } keys %di;
 145 $info{$_}->{string} = '|' for grep { $di{$_} == 0x200C } keys %di;
 146 $info{$_}->{string} = '⁀' for grep { $di{$_} == 0x200D } keys %di;
 147 $info{$_}->{string} = '→' for grep { $di{$_} == 0x200E } keys %di;
 148 $info{$_}->{string} = '←' for grep { $di{$_} == 0x200F } keys %di;
 149
 150 # convert info hashes into arrays of strings to output in display order
 151 for my $row (values %info) {
 152         $row = [ map { $row->{$_} } qw/name category script string/ ];
 153         # strip off trailing missing values (especially string may be unknown)
 154         defined $row->[-1] ? last : pop @$row for 1 .. @$row;
 155 }
 156
 157 # output perl code of hash
 158 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
 159 print "# automatically generated by $0\n";
 160 print "use utf8;\n";
 161 print "+{\n";
 162 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
 163         map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
 164 );
 165 printf "q{%s}=>[%s],\n", $_, join(',',
 166         $di{$_},   # original code point
 167         $info{$_}  # optional additional arguments
 168                 ? map {"'$_'"} @{ $info{$_} }
 169                 : ()
 170 ) for sort keys %di;
 171 print "}\n";
 172
 173 __END__
 174
 175 =head1 NAME
 176
 177 mkdigraphlist - Output digraph data from RFC-1345
 178
 179 =head1 SYNOPSIS
 180
 181 Extract digraphs from text specifications as a perl hash:
 182
 183     mkdigraphlist rfc1345.txt custom.txt > digraphs.inc.pl
 184
 185 Input can be the literal RFC (or similar) document:
 186
 187     curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphlist -
 188
 189 Test by printing the character for DO (should be a dollar sign):
 190
 191     perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
 192
 193 =head1 DESCRIPTION
 194
 195 Parses the official RFC-1345 document, searching the
 196 'character mnemonic table' for all digraph definitions.
 197 If successful, Perl code is output resulting in a hash
 198 with character data keyed by digraph.
 199 Any errors and warnings are given at STDERR.
 200
 201 The value can either be a scalar string containing another
 202 digraph which can be considered identical (usually inverted),
 203 or an array ref containing at least the resulting character's
 204 Unicode code point value.  If available, the following UCD data
 205 is appended: character name, category, script, and output string.
 206 For example:
 207
 208  +{
 209    AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
 210    EA => 'AE',
 211   }
 212
 213 =head1 AUTHOR
 214
 215 Mischa POSLAWSKY <perl@shiar.org>
 216
 217 =head1 LICENSE
 218
 219 Licensed under the GNU Affero General Public License version 3.
 220