6 use open OUT => ':utf8', ':std';
11 # translation table for deprecated code points
13 0xE001 => 0, # join lines: not accepted
14 0xE004 => 0, # umlaut is no different from diaeresis 0x0308
15 0xE005 => "\x{0344}", # discouraged
37 0xE01B => "\x{03D0}", # middle beta = curled beta?
41 0xE01F => "\x{33C2}", # am, compatibility char
42 0xE020 => "\x{33D8}", # pm, compatibility char
45 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway
47 0xE025 => "\x{20D7}", # also 20D1; non-spacing
50 0xE028 => "J̌", # uppercase U+01F0, no single character
53 # expect input data source at command line
54 @ARGV or die "Specify input source file or - for STDIN\n";
56 # skip everything until a character indented by 1 space (table start)
59 defined or die "Premature input end";
60 } until s/^\s(?=\S)//;
63 my @line = $_; # add first line (already read, assume it's ok)
65 # read the rest of the character table
66 while ($_ = readline) {
67 # check for table end (chapter 4)
70 # parse table lines (ignore (unindented) page break)
74 # append line contents
76 # continuation line (add to last entry)
85 # output perl code of hash
86 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
87 say "# automatically generated by $0";
91 my ($mnem, $chrhex, $name) = split / +/, $_, 3;
92 next if length $mnem != 2;
93 my $chrnum = hex $chrhex;
94 my $chr = $replace{$chrnum} // chr $chrnum or next;
96 say "q{$mnem} => $chrstr, # $name";
104 mkdigraphs-rfc - Output digraph data from RFC-1345
108 Extract digraphs from text specifications as a perl hash:
110 mkdigraphs-rfc rfc1345.txt >digraphs-rfc.inc.pl
112 Input can be the literal RFC (or similar) document:
114 curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphs-rfc -
116 Test by printing the character for DO (should be a dollar sign):
118 perl -e'$di = do "./digraphs-rfc.inc.pl"; print chr $di->{DO}'
122 Parses the official RFC-1345 document, searching the
123 'character mnemonic table' for all digraph definitions.
124 On success, Perl code is printed that yields a hash
125 of Unicode code points keyed by digraph.
126 Obsolete values (references to private use area)
127 are converted to modern alternatives.
128 Any errors and warnings are given at STDERR.
132 Mischa POSLAWSKY <perl@shiar.org>
136 Licensed under the GNU Affero General Public License version 3.