#!/usr/bin/env perl
use strict;
use warnings;
use utf8;

use open OUT => ':utf8', ':std';

our $VERSION = '1.00';

# expect input data source at command line
@ARGV or die "Specify input source file or - for STDIN\n";

# skip everything until a character indented by 1 space (table start)
my $line;
do {
    $line = readline;
    defined $line or die "Premature input end";
} until $line =~ /^\s\S/;

# The first table row still carries its indent and newline.  Normalise it
# the same way as every later row; otherwise the split below produces an
# empty leading field and this row would be silently dropped.
$line =~ s/^\s//;
chomp $line;
my @rows = ($line);

# read the rest of the character table
while (defined($line = readline)) {
    # check for table end (chapter 4)
    last if $line =~ /^\d/;

    # parse table lines (ignore (unindented) page break)
    next unless $line =~ s/^ //;
    chomp $line;

    if ($line =~ s/^ {15}/ /) {
        # continuation line (add to last entry)
        $rows[-1] .= $line;
    }
    else {
        # add a new entry
        push @rows, $line;
    }
}

# create a hash of desired input: two-character mnemonic => code point
my %di;
for my $row (@rows) {
    # columns are: mnemonic, hexadecimal code, character name (unused here)
    my ($mnem, $char) = split / +/, $row, 3;

    # only digraphs are wanted; blank rows yield no fields at all
    next unless defined $mnem && length($mnem) == 2;

    # never store a made-up code point for a row lacking a proper hex column
    unless (defined $char && $char =~ /\A[0-9A-Fa-f]+\z/) {
        warn "Ignoring $mnem: invalid code point\n";
        next;
    }

    $di{$mnem} = hex $char;
}

# XXX: replacements for code points from 0xE000 upwards.
# Entries mapped to 0 are stored as 0 (see the per-entry remarks).
my %trans = (
    0xE001 => 0,       # join lines: not accepted
    0xE004 => 0,       # umlaut is no different from diaeresis 0x0308
    0xE005 => 0x0344,  # discouraged
    0xE006 => 0x0300,
    0xE007 => 0x0301,
    0xE008 => 0x0302,
    0xE009 => 0x0303,
    0xE00A => 0x0304,
    0xE00B => 0x0306,
    0xE00C => 0x0307,
    0xE00D => 0x0308,
    0xE00E => 0x030A,
    0xE00F => 0x030B,
    0xE010 => 0x030C,
    0xE011 => 0x0327,
    0xE012 => 0x0328,
    0xE013 => 0x0332,
    0xE014 => 0x0333,
    0xE015 => 0x0338,
    0xE016 => 0x0345,
    0xE017 => 0x0314,
    0xE018 => 0x0313,
    0xE019 => 0x1FFE,
    0xE01A => 0x1FBF,
    0xE01B => 0x03D0,  # middle beta = curled beta?
    0xE01C => 0x25CB,
    0xE01D => 0x0192,
    0xE01E => 0x0292,
    0xE01F => 0x33C2,  # am, compatibility char
    0xE020 => 0x33D8,  # pm, compatibility char
    0xE021 => 0x2121,
    0xE022 => 0xFE8E,
    0xE023 => 0,       # dutch guilder 0192 is already encoded, and not very useful anyway
    0xE024 => 0x0393,
    0xE025 => 0x20D7,  # also 20D1; non-spacing
    0xE026 => 0x1FEF,
    0xE027 => 0x1FC0,
    0xE028 => 0x01F0,  # but uppercase
);

# values() aliases the hash elements, so assigning updates %di in place
for my $code (values %di) {
    next if $code < 0xE000;
    $code = $trans{$code} if exists $trans{$code};
}

# output perl code of hash
# (assume no backslashes or curlies, so we can just q{} w/o escaping)
print "# automatically generated by $0\n";
print "use utf8;\n";
print "+{\n";
printf "q{%s}=>%s,\n", $_, $di{$_} for sort keys %di;
print "}\n";

__END__

=head1 NAME

mkdigraphs-rfc - Output digraph data from RFC-1345

=head1 SYNOPSIS

Extract digraphs from text specifications as a perl hash:

    mkdigraphs-rfc rfc1345.txt >digraphs-rfc.inc.pl

Input can be the literal RFC (or similar) document:

    curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphs-rfc -

Test by printing the character for DO (should be a dollar sign):

    perl -e'$di = do "./digraphs-rfc.inc.pl"; print chr $di->{DO}'

=head1 DESCRIPTION

Parses the official RFC-1345 document, searching the
'character mnemonic table' for all digraph definitions.
If successful, Perl code is output resulting in a hash
with Unicode code points keyed by digraph.
Obsolete values (references to private use area)
are converted to modern alternatives.

Any errors and warnings are given at STDERR.

=head1 AUTHOR

Mischa POSLAWSKY

=head1 LICENSE

Licensed under the GNU Affero General Public License version 3.