rfc1345convert

   1 #!/usr/bin/env perl
   2
   3 use strict;
   4 use warnings;
   5 use utf8;
   6
   7 use open OUT => ':utf8', ':std';
   8 use Data::Dumper;
   9
  10 our $VERSION = '1.01';
  11
  12 # determine input data source
  13 my $input;
  14 if (@ARGV) {
  15         # manual contents specified (either piped or filename(s) given)
  16         $input = \*ARGV;
  17 }
  18 else {
  19         # automatic download from official website
  20         require LWP::Simple;
  21         my $contents = LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt')
  22                 or die "Couldn't download RFC-1345 from ietf.org";
  23         open $input, '<', \$contents;  # emulate file handle
  24 }
  25
  26 # skip everything until a character indented by 1 space (table start)
  27 do {
  28         $_ = readline $input;
  29         defined or die "Premature input end";
  30 } until /^\s\S/;
  31
  32 my @t = $_;  # add first line (already read, assume it's ok)
  33
  34 # read the rest of the character table
  35 while ($_ = readline $input) {
  36         # check for table end (chapter 4)
  37         last if /^\d/;
  38
  39         # parse table lines (ignore (unindented) page break)
  40         next unless s/^ //;
  41         chomp;
  42
  43         # add the line to @t
  44         if (s/^ {15}/ /) {
  45                 # continuation line (add to last entry)
  46                 $t[-1] .= $_;
  47         }
  48         else {
  49                 # add a new entry
  50                 push @t, $_;
  51         }
  52 }
  53
  54 # create a hash of desired input
  55 my %di;
  56 for (@t) {
  57         my ($mnem, $char, $name) = split / +/, $_, 3;
  58         next if length $mnem != 2;
  59         $di{$mnem} = hex $char;
  60 }
  61
  62 # XXX
  63 my %trans = (
  64         0xE001 => 0,  # join lines: not accepted
  65         0xE004 => 0,  # umlaut is no different from diaeresis 0x0308
  66         0xE005 => 0x0344, # discouraged
  67         0xE006 => 0x0300,
  68         0xE007 => 0x0301,
  69         0xE008 => 0x0302,
  70         0xE009 => 0x0303,
  71         0xE00A => 0x0304,
  72         0xE00B => 0x0306,
  73         0xE00C => 0x0307,
  74         0xE00D => 0x0308,
  75         0xE00E => 0x030A,
  76         0xE00F => 0x030B,
  77         0xE010 => 0x030C,
  78         0xE011 => 0x0327,
  79         0xE012 => 0x0328,
  80         0xE013 => 0x0332,
  81         0xE014 => 0x0333,
  82         0xE015 => 0x0338,
  83         0xE016 => 0x0345,
  84         0xE017 => 0x0314,
  85         0xE018 => 0x0313,
  86         0xE019 => 0x1FFE,
  87         0xE01A => 0x1FBF,
  88         0xE01B => 0x03D0,  # middle beta = curled beta?
  89         0xE01C => 0x25CB,
  90         0xE01D => 0x0192,
  91         0xE01E => 0x0292,
  92         0xE01F => 0x33C2,  # am, compatibility char
  93         0xE020 => 0x33D8,  # pm, compatibility char
  94         0xE021 => 0x2121,
  95         0xE022 => 0xFE8E,
  96         0xE023 => 0,  # dutch guilder 0192 is already encoded, and not very useful anyway
  97         0xE024 => 0x0393,
  98         0xE025 => 0x20D7,  # also 20D1; non-spacing
  99         0xE026 => 0x1FEF,
 100         0xE027 => 0x1FC0,
 101         0xE028 => 0x01F0, #but uppercase
 102 );
 103 for (values %di) {
 104         $_ >= 0xE000 or next;
 105         $_ = $trans{$_} if defined $trans{$_};
 106 }
 107
 108 # personal addendums
 109 my @extra;
 110 if (-r 'shiar.inc.txt') {
 111         open my $include, '<:utf8', 'shiar.inc.txt';
 112         for (readline $include) {
 113                 m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next;
 114                 warn("$1 already defined"), next if defined $di{$1};
 115                 $di{$1} = ord $2;
 116                 push @extra, $1;
 117         }
 118 }
 119 warn $@ if $@;
 120
 121 $di{chr $_} = $_ for 32 .. 126;
 122 $di{'\\'.$_} = delete $di{$_} for '{', '}', '\\';
 123
 124 # optionally get unicode character information
 125 my %info = eval {
 126         require Unicode::UCD;
 127         map {
 128                 $_ => Unicode::UCD::charinfo($di{$_})
 129                         || { block => '?', category => 'Xn', name => '', script => '' }
 130         } keys %di;
 131 };
 132
 133 # add custom categories for certain blocks
 134 for (values %info) {
 135         $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
 136         $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
 137 }
 138
 139 # mark unofficial extras as such
 140 $info{$_}->{category} .= ' Xz' for @extra;
 141
 142 for (keys %di) {
 143         $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
 144         # find control characters (first 32 chars from 0 and 128)
 145         next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
 146         # rename to something more descriptive
 147         $info{$_}->{name} = $info{$_}->{unicode10}
 148                 ? '<'.$info{$_}->{unicode10}.'>'  # the old name was much more useful
 149                 : sprintf('<control U+%04X>', $di{$_});  # at least identify by value
 150         # show descriptive symbols instead of control chars themselves
 151         $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
 152 }
 153 # presentational string for some control(lish) entries
 154 $info{$_}->{string} = '-' for grep { $di{$_} == 0x00AD } keys %di;
 155 $info{$_}->{string} = '→' for grep { $di{$_} == 0x200E } keys %di;
 156 $info{$_}->{string} = '←' for grep { $di{$_} == 0x200F } keys %di;
 157
 158 # convert info hashes into arrays of strings to output in display order
 159 for my $row (values %info) {
 160         $row = [ map { $row->{$_} } qw/name category script string/ ];
 161         # strip off trailing missing values (especially string may be unknown)
 162         defined $row->[-1] ? last : pop @$row for 1 .. @$row;
 163 }
 164
 165 # output perl code of hash
 166 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
 167 print "+{\n";
 168 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
 169         map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
 170 );
 171 printf "q{%s}=>[%s],\n", $_, join(',',
 172         $di{$_},   # original code point
 173         $info{$_}  # optional additional arguments
 174                 ? map {"'$_'"} @{ $info{$_} }
 175                 : ()
 176 ) for sort keys %di;
 177 print "}\n";
 178
 179 __END__
 180
 181 =head1 NAME
 182
 183 rfc1345convert - Output digraph data from RFC-1345
 184
 185 =head1 SYNOPSIS
 186
 187 Download and convert the digraph specification from ietf.org:
 188
 189     rfc1345convert > digraphs.inc.pl
 190
 191 Test by printing the character for DO (should be a dollar sign):
 192
 193     perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
 194
 195 Manual specification of source retrieval:
 196
 197     rfc1345convert rfc1345.txt
 198     curl $url | rfc1345convert -
 199
 200 =head1 DESCRIPTION
 201
 202 Parses the official RFC-1345 document, searching the
 203 'character mnemonic table' for all digraph definitions.
 204 If successful, Perl code is output resulting in a hash
 205 with character data keyed by digraph.
 206 Any errors and warnings are given at STDERR.
 207
 208 The value can either be a scalar string containing another
 209 digraph which can be considered identical (usually inverted),
 210 or an array ref containing at least the resulting character's
 211 Unicode code point value.  If available, the following UCD data
 212 is appended: character name, category, script, and output string.
 213 For example:
 214
 215  +{
 216    AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
 217    EA => 'AE',
 218   }
 219
 220 =head1 AUTHOR
 221
 222 Mischa POSLAWSKY <perl@shiar.org>
 223
 224 =head1 LICENSE
 225
 226 Licensed under the GNU Affero General Public License version 3.
 227