#!/usr/bin/env perl use 5.014; use warnings; use utf8; use open OUT => ':encoding(utf-8)', ':std'; use charnames ':full'; use Data::Dump 'pp'; our $VERSION = '1.01'; # translation table for deprecated code points my %replace = ( (map {$_ => 0} 0xE000 .. 0xE03F), # omit by default if unspecified 0xE001 => 0, # /c join lines: not accepted 0xE002 => "\N{EM QUAD}", # UA unit space A #> ISO-IR 008-1: unit spaces A in position 4/0 and B in #> position 6/0 are fixed length spaces whereby UA ≥ UB. #> […] The lengths of UA and UB are determined by the user. 0xE003 => "\N{EN QUAD}", # UB unit space B # 0xE004 => "\N{COMBINING DIAERESIS}", # "3 combining umlaut, unified with diaeresis # 0xE005 => "\N{COMBINING GREEK DIALYTIKA TONOS}", # "1 combining diaeresis+accent, unspecified 0xE006 => "\N{COMBINING GRAVE ACCENT}", # "! combining grave 0xE007 => "\N{COMBINING ACUTE ACCENT}", # "' combining acute 0xE008 => "\N{COMBINING CIRCUMFLEX ACCENT}", # "> combining circumflex 0xE009 => "\N{COMBINING TILDE}", # "? combining tilde 0xE00A => "\N{COMBINING MACRON}", # "- combining macron 0xE00B => "\N{COMBINING BREVE}", # "( combining breve 0xE00C => "\N{COMBINING DOT ABOVE}", # ". combining dot above 0xE00D => "\N{COMBINING DIAERESIS}", # ": combining diaeresis 0xE00E => "\N{COMBINING RING ABOVE}", # "0 combining ring above 0xE00F => "\N{COMBINING DOUBLE ACUTE ACCENT}", # "" combining double accute (sic) 0xE010 => "\N{COMBINING CARON}", # "< combining caron 0xE011 => "\N{COMBINING CEDILLA}", # ", combining cedilla 0xE012 => "\N{COMBINING OGONEK}", # "; combining ogonek 0xE013 => "\N{COMBINING LOW LINE}", # "_ combining low line 0xE014 => "\N{COMBINING DOUBLE LOW LINE}", # "= combining double low line 0xE015 => "\N{COMBINING LONG SOLIDUS OVERLAY}", # "/ combining long solidus 0xE016 => "\N{COMBINING GREEK YPOGEGRAMMENI}", # "i combining greek iota below 0xE017 => "\N{COMBINING REVERSED COMMA ABOVE}", # "d combining greek dasia pneumata 0xE018 => "\N{COMBINING COMMA ABOVE}", # "p combining greek psili pneumata 0xE019 => "\N{GREEK DASIA}", # ;; greek dasia pneumata 0xE01A => "\N{GREEK PSILI}", # ,, greek psili pneumata 0xE01B => "\N{GREEK BETA SYMBOL}", # b3 middle beta = curled beta? 0xE01C => "\N{WHITE CIRCLE}", # Ci circle 0xE01D => "\N{LATIN SMALL LETTER F WITH HOOK}", # f( function sign 0xE01E => "\N{LATIN SMALL LETTER EZH}", # ed ezh 0xE01F => "\N{SQUARE AM}", # am, compatibility char 0xE020 => "\N{SQUARE PM}", # pm, compatibility char 0xE021 => "\N{TELEPHONE SIGN}", # Tel, compatibility char 0xE022 => "\N{ARABIC LETTER ALEF FINAL FORM}", # a+: final alef compatibility # 0xE023 => "\N{LATIN SMALL LETTER F WITH HOOK}", # Fl dutch guilder, unified with function sign, obsolete # 0xE024 => "\N{GREEK CAPITAL LETTER GAMMA}", # GF gamma function sign, unified with letter 0xE025 => "\N{COMBINING RIGHT ARROW ABOVE}", # >V combining(?) rightwards vector above; also U+20D1 0xE026 => "\N{GREEK VARIA}", # !* greek varia 0xE027 => "\N{GREEK PERISPOMENI}", # ?* greek perispomeni 0xE028 => "J\N{COMBINING CARON}", # J< J-caron = uppercase U+01F0, no single character ); # expect input data source at command line @ARGV or die "Specify input source file or - for STDIN\n"; # skip everything until a character indented by 1 space (table start) do { $_ = readline; defined or die "Premature input end"; } until s/^\s(?=\S)//; chomp; my @line = $_; # add first line (already read, assume it's ok) # read the rest of the character table while ($_ = readline) { # check for table end (chapter 4) last if /^\d/; # parse table lines (ignore (unindented) page break) next unless s/^ //; chomp; # append line contents if (s/^ {15}/ /) { # continuation line (add to last entry) $line[-1] .= $_; } else { # add a new entry push @line, $_; } } # output perl code of hash # (assume no backslashes or curlies, so we can just q{} w/o escaping) say "# automatically generated by $0"; say 'use utf8;'; say '+{'; for (@line) { my ($mnem, $chrhex, $name) = split / +/, $_, 3; next if length $mnem != 2; my $chrnum = hex $chrhex; my $chr = $replace{$chrnum} // chr $chrnum or next; my $chrstr = pp $chr; say "q{$mnem} => $chrstr, # $name"; } say '}'; __END__ =head1 NAME mkdigraphs-rfc - Output digraph data from RFC-1345 =head1 SYNOPSIS Extract digraphs from text specifications as a perl hash: mkdigraphs-rfc rfc1345.txt >digraphs-rfc.inc.pl Input can be the literal RFC (or similar) document: curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphlist - Test by printing the character for DO (should be a dollar sign): perl -e'$di = do "digraphs-rfc.inc.pl"; print chr $di->{DO}' =head1 DESCRIPTION Parses the official RFC-1345 document, searching the 'character mnemonic table' for all digraph definitions. If successful, Perl code is output resulting in a hash with Unicode code points keyed by digraph. Obsolete values (references to private use area) are converted to modern alternatives. Any errors and warnings are given at STDERR. =head1 AUTHOR Mischa POSLAWSKY =head1 LICENSE Licensed under the GNU Affero General Public License version 3.