#!/usr/bin/env perl

# mkdigraphs-rfc - extract the RFC 1345 digraph table as a Perl hash.
# Origin: tools/mkdigraphs-rfc in http://git.shiar.nl/sheet.git
#
# Reads the document named on the command line (or STDIN for "-"),
# collects every two-character mnemonic from the character mnemonic table
# together with its hexadecimal code point, replaces private use area
# values by the substitutes listed in %trans, and prints Perl source for
# an anonymous hash on STDOUT.

use strict;
use warnings;
use utf8;
use open OUT => ':utf8', ':std';

our $VERSION = '1.00';

# expect input data source at command line
@ARGV or die "Specify input source file or - for STDIN\n";

# skip everything until a character indented by 1 space (table start)
my $first;
do {
    $first = readline(*ARGV);
    defined $first or die "Premature input end";
} until $first =~ /^\s\S/;

# Give the first table line the same clean-up as every later line.
# Left as read, its leading whitespace makes split() return an empty
# first field, so the entry would be silently discarded further down.
chomp $first;
$first =~ s/^\s//;

my @entries = ($first);

# read the rest of the character table
while (defined(my $line = readline(*ARGV))) {
    # check for table end (chapter 4)
    last if $line =~ /^\d/;

    # parse table lines (ignore (unindented) page break)
    next unless $line =~ s/^ //;
    chomp $line;

    if ($line =~ s/^ {15}/ /) {
        # continuation line (add to last entry)
        $entries[-1] .= $line;
    }
    else {
        # add a new entry
        push @entries, $line;
    }
}

# create a hash of desired input: mnemonic => code point
my %di;
for my $entry (@entries) {
    # columns are mnemonic, hexadecimal code, name; the name is not needed
    my ($mnem, $char) = split / +/, $entry, 3;

    # blank or single-column lines leave these undefined
    next unless defined $mnem and defined $char;

    # only two-character mnemonics are digraphs
    next if length $mnem != 2;

    $di{$mnem} = hex $char;
}

# Substitutes for code points in the private use area (0xE000 and up).
# A value of 0 marks a mnemonic for which no equivalent is accepted.
# XXX: several of these mappings are approximations, see the notes.
my %trans = (
    0xE001 => 0,      # join lines: not accepted
    0xE004 => 0,      # umlaut is no different from diaeresis 0x0308
    0xE005 => 0x0344, # discouraged
    0xE006 => 0x0300,
    0xE007 => 0x0301,
    0xE008 => 0x0302,
    0xE009 => 0x0303,
    0xE00A => 0x0304,
    0xE00B => 0x0306,
    0xE00C => 0x0307,
    0xE00D => 0x0308,
    0xE00E => 0x030A,
    0xE00F => 0x030B,
    0xE010 => 0x030C,
    0xE011 => 0x0327,
    0xE012 => 0x0328,
    0xE013 => 0x0332,
    0xE014 => 0x0333,
    0xE015 => 0x0338,
    0xE016 => 0x0345,
    0xE017 => 0x0314,
    0xE018 => 0x0313,
    0xE019 => 0x1FFE,
    0xE01A => 0x1FBF,
    0xE01B => 0x03D0, # middle beta = curled beta?
    0xE01C => 0x25CB,
    0xE01D => 0x0192,
    0xE01E => 0x0292,
    0xE01F => 0x33C2, # am, compatibility char
    0xE020 => 0x33D8, # pm, compatibility char
    0xE021 => 0x2121,
    0xE022 => 0xFE8E,
    0xE023 => 0,      # dutch guilder 0192 is already encoded, and not very useful anyway
    0xE024 => 0x0393,
    0xE025 => 0x20D7, # also 20D1; non-spacing
    0xE026 => 0x1FEF,
    0xE027 => 0x1FC0,
    0xE028 => 0x01F0, # but uppercase
);

# The loop variable aliases the hash values, so assigning to it updates
# %di in place.  Private use values without a substitute are left alone.
for my $codepoint (values %di) {
    next if $codepoint < 0xE000;
    $codepoint = $trans{$codepoint} if defined $trans{$codepoint};
}

# output perl code of hash
# (assume no backslashes or curlies, so we can just q{} w/o escaping)
print "# automatically generated by $0\n";
print "use utf8;\n";
print "+{\n";
printf "q{%s}=>%s,\n", $_, $di{$_} for sort keys %di;
print "}\n";

__END__

=head1 NAME

mkdigraphs-rfc - Output digraph data from RFC-1345

=head1 SYNOPSIS

Extract digraphs from text specifications as a perl hash:

    mkdigraphs-rfc rfc1345.txt >digraphs-rfc.inc.pl

Input can be the literal RFC (or similar) document:

    curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphs-rfc -

Test by printing the character for DO (should be a dollar sign):

    perl -e'$di = do "./digraphs-rfc.inc.pl"; print chr $di->{DO}'

=head1 DESCRIPTION

Parses the official RFC-1345 document, searching the
'character mnemonic table' for all digraph definitions.
If successful, Perl code is output resulting in a hash
with Unicode code points keyed by digraph.
Obsolete values (references to private use area)
are converted to modern alternatives.
Any errors and warnings are given at STDERR.

=head1 AUTHOR

Mischa POSLAWSKY

=head1 LICENSE

Licensed under the GNU Affero General Public License version 3.