From: Mischa POSLAWSKY Date: Sat, 21 Feb 2015 01:34:39 +0000 (+0100) Subject: digraphs: separate rfc parser from include generator X-Git-Tag: v1.7~124 X-Git-Url: http://git.shiar.nl/sheet.git/commitdiff_plain/67ae97362cec317f2cfc3ae97f9d3810f70d615c?hp=248fad5ca4456d48cab13b603b582006ef6c4659 digraphs: separate rfc parser from include generator --- diff --git a/Makefile b/Makefile index 0cfdd89..46d6817 100644 --- a/Makefile +++ b/Makefile @@ -12,9 +12,12 @@ unicode-age.inc.pl: tools/mkcharver data/DerivedAge.txt data/rfc1345.txt: tools/wget-ifmodified http://www.ietf.org/rfc/$(@F) $@ -digraphs.inc.pl: tools/mkdigraphlist data/rfc1345.txt shiar.inc.txt +data/digraphs-rfc.inc.pl: tools/mkdigraphs-rfc data/rfc1345.txt $< $(word 2,$^) >$@ +digraphs.inc.pl: tools/mkdigraphlist data/digraphs-rfc.inc.pl shiar.inc.txt + $< >$@ + unicode-char.inc.pl: tools/mkcharinfo digraphs.inc.pl unicode-age.inc.pl $< >$@ diff --git a/tools/mkdigraphlist b/tools/mkdigraphlist index 96ec534..66ffd2a 100755 --- a/tools/mkdigraphlist +++ b/tools/mkdigraphlist @@ -7,92 +7,11 @@ use utf8; use open OUT => ':utf8', ':std'; use Data::Dumper; -our $VERSION = '1.02'; - -# expect input data source at command line -@ARGV or die "Specify input source file or - for STDIN\n"; - -# skip everything until a character indented by 1 space (table start) -do { - $_ = readline; - defined or die "Premature input end"; -} until /^\s\S/; - -my @t = $_; # add first line (already read, assume it's ok) - -# read the rest of the character table -while ($_ = readline) { - # check for table end (chapter 4) - last if /^\d/; - - # parse table lines (ignore (unindented) page break) - next unless s/^ //; - chomp; - - # add the line to @t - if (s/^ {15}/ /) { - # continuation line (add to last entry) - $t[-1] .= $_; - } - else { - # add a new entry - push @t, $_; - } -} +our $VERSION = '1.03'; # create a hash of desired input -my %di; -for (@t) { - my ($mnem, $char, $name) = split / +/, $_, 3; - next if length $mnem != 2; - $di{$mnem} = hex $char; -} - -# XXX -my %trans = ( - 0xE001 => 0, # join lines: not accepted - 0xE004 => 0, # umlaut is no different from diaeresis 0x0308 - 0xE005 => 0x0344, # discouraged - 0xE006 => 0x0300, - 0xE007 => 0x0301, - 0xE008 => 0x0302, - 0xE009 => 0x0303, - 0xE00A => 0x0304, - 0xE00B => 0x0306, - 0xE00C => 0x0307, - 0xE00D => 0x0308, - 0xE00E => 0x030A, - 0xE00F => 0x030B, - 0xE010 => 0x030C, - 0xE011 => 0x0327, - 0xE012 => 0x0328, - 0xE013 => 0x0332, - 0xE014 => 0x0333, - 0xE015 => 0x0338, - 0xE016 => 0x0345, - 0xE017 => 0x0314, - 0xE018 => 0x0313, - 0xE019 => 0x1FFE, - 0xE01A => 0x1FBF, - 0xE01B => 0x03D0, # middle beta = curled beta? - 0xE01C => 0x25CB, - 0xE01D => 0x0192, - 0xE01E => 0x0292, - 0xE01F => 0x33C2, # am, compatibility char - 0xE020 => 0x33D8, # pm, compatibility char - 0xE021 => 0x2121, - 0xE022 => 0xFE8E, - 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway - 0xE024 => 0x0393, - 0xE025 => 0x20D7, # also 20D1; non-spacing - 0xE026 => 0x1FEF, - 0xE027 => 0x1FC0, - 0xE028 => 0x01F0, #but uppercase -); -for (values %di) { - $_ >= 0xE000 or next; - $_ = $trans{$_} if defined $trans{$_}; -} +my $di = do 'data/digraphs-rfc.inc.pl' + or die "error reading digraphs include: ", $@ // $!; # personal addendums my @extra; @@ -100,23 +19,23 @@ if (-r 'shiar.inc.txt') { open my $include, '<:utf8', 'shiar.inc.txt'; for (readline $include) { m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next; - warn("$1 already defined"), next if defined $di{$1}; - $di{$1} = ord $2; + warn("$1 already defined"), next if defined $di->{$1}; + $di->{$1} = ord $2; push @extra, $1; } } warn $@ if $@; -$di{chr $_} = $_ for 32 .. 126; -$di{'\\'.$_} = delete $di{$_} for '{', '}', '\\'; +$di->{chr $_} = $_ for 32 .. 126; +$di->{'\\'.$_} = delete $di->{$_} for '{', '}', '\\'; # optionally get unicode character information my %info = eval { require Unicode::UCD; map { - $_ => Unicode::UCD::charinfo($di{$_}) + $_ => Unicode::UCD::charinfo($di->{$_}) || { block => '?', category => 'Xn', name => '', script => '' } - } keys %di; + } keys %{$di}; }; # add custom categories for certain blocks @@ -128,24 +47,24 @@ for (values %info) { # mark unofficial extras as such $info{$_}->{category} .= ' Xz' for @extra; -for (keys %di) { - $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining}; +for (keys %{$di}) { + $info{$_}->{string} = chr(9676) . chr($di->{$_}) if $info{$_}->{combining}; # find control characters (first 32 chars from 0 and 128) - next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127; + next unless ($di->{$_} & ~0b1001_1111) == 0 or $di->{$_} == 127; # rename to something more descriptive $info{$_}->{name} = $info{$_}->{unicode10} ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful - : sprintf('', $di{$_}); # at least identify by value + : sprintf('', $di->{$_}); # at least identify by value # show descriptive symbols instead of control chars themselves - $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD); + $info{$_}->{string} = $di->{$_} < 32 ? chr($di->{$_} + 0x2400) : chr(0xFFFD); } # presentational string for some control(lish) entries -$info{$_}->{string} = '-' for grep { $di{$_} == 0x00AD } keys %di; -$info{$_}->{string} = '␣' for grep { $di{$_} == 0x200B } keys %di; -$info{$_}->{string} = '|' for grep { $di{$_} == 0x200C } keys %di; -$info{$_}->{string} = '⁀' for grep { $di{$_} == 0x200D } keys %di; -$info{$_}->{string} = '→' for grep { $di{$_} == 0x200E } keys %di; -$info{$_}->{string} = '←' for grep { $di{$_} == 0x200F } keys %di; +$info{$_}->{string} = '-' for grep { $di->{$_} == 0x00AD } keys %{$di}; +$info{$_}->{string} = '␣' for grep { $di->{$_} == 0x200B } keys %{$di}; +$info{$_}->{string} = '|' for grep { $di->{$_} == 0x200C } keys %{$di}; +$info{$_}->{string} = '⁀' for grep { $di->{$_} == 0x200D } keys %{$di}; +$info{$_}->{string} = '→' for grep { $di->{$_} == 0x200E } keys %{$di}; +$info{$_}->{string} = '←' for grep { $di->{$_} == 0x200F } keys %{$di}; # convert info hashes into arrays of strings to output in display order for my $row (values %info) { @@ -160,34 +79,25 @@ print "# automatically generated by $0\n"; print "use utf8;\n"; print "+{\n"; printf '(map {$_=>0} qw{%s}),'."\n", join(' ', - map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di + map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %{$di} ); printf "q{%s}=>[%s],\n", $_, join(',', - $di{$_}, # original code point + $di->{$_}, # original code point $info{$_} # optional additional arguments ? map {"'$_'"} @{ $info{$_} } : () -) for sort keys %di; +) for sort keys %{$di}; print "}\n"; __END__ =head1 NAME -mkdigraphlist - Output digraph data from RFC-1345 +mkdigraphlist - Output character list of combined digraph data =head1 SYNOPSIS -Extract digraphs from text specifications as a perl hash: - - mkdigraphlist rfc1345.txt custom.txt > digraphs.inc.pl - -Input can be the literal RFC (or similar) document: - - curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphlist - - -Test by printing the character for DO (should be a dollar sign): - + mkdigraphlist >digraphs.inc.pl perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]' =head1 DESCRIPTION diff --git a/tools/mkdigraphs-rfc b/tools/mkdigraphs-rfc new file mode 100755 index 0000000..ab03a42 --- /dev/null +++ b/tools/mkdigraphs-rfc @@ -0,0 +1,140 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use utf8; +use open OUT => ':utf8', ':std'; + +our $VERSION = '1.00'; + +# expect input data source at command line +@ARGV or die "Specify input source file or - for STDIN\n"; + +# skip everything until a character indented by 1 space (table start) +do { + $_ = readline; + defined or die "Premature input end"; +} until /^\s\S/; + +my @t = $_; # add first line (already read, assume it's ok) + +# read the rest of the character table +while ($_ = readline) { + # check for table end (chapter 4) + last if /^\d/; + + # parse table lines (ignore (unindented) page break) + next unless s/^ //; + chomp; + + # add the line to @t + if (s/^ {15}/ /) { + # continuation line (add to last entry) + $t[-1] .= $_; + } + else { + # add a new entry + push @t, $_; + } +} + +# create a hash of desired input +my %di; +for (@t) { + my ($mnem, $char, $name) = split / +/, $_, 3; + next if length $mnem != 2; + $di{$mnem} = hex $char; +} + +# XXX +my %trans = ( + 0xE001 => 0, # join lines: not accepted + 0xE004 => 0, # umlaut is no different from diaeresis 0x0308 + 0xE005 => 0x0344, # discouraged + 0xE006 => 0x0300, + 0xE007 => 0x0301, + 0xE008 => 0x0302, + 0xE009 => 0x0303, + 0xE00A => 0x0304, + 0xE00B => 0x0306, + 0xE00C => 0x0307, + 0xE00D => 0x0308, + 0xE00E => 0x030A, + 0xE00F => 0x030B, + 0xE010 => 0x030C, + 0xE011 => 0x0327, + 0xE012 => 0x0328, + 0xE013 => 0x0332, + 0xE014 => 0x0333, + 0xE015 => 0x0338, + 0xE016 => 0x0345, + 0xE017 => 0x0314, + 0xE018 => 0x0313, + 0xE019 => 0x1FFE, + 0xE01A => 0x1FBF, + 0xE01B => 0x03D0, # middle beta = curled beta? + 0xE01C => 0x25CB, + 0xE01D => 0x0192, + 0xE01E => 0x0292, + 0xE01F => 0x33C2, # am, compatibility char + 0xE020 => 0x33D8, # pm, compatibility char + 0xE021 => 0x2121, + 0xE022 => 0xFE8E, + 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway + 0xE024 => 0x0393, + 0xE025 => 0x20D7, # also 20D1; non-spacing + 0xE026 => 0x1FEF, + 0xE027 => 0x1FC0, + 0xE028 => 0x01F0, #but uppercase +); +for (values %di) { + $_ >= 0xE000 or next; + $_ = $trans{$_} if defined $trans{$_}; +} + +# output perl code of hash +# (assume no backslashes or curlies, so we can just q{} w/o escaping) +print "# automatically generated by $0\n"; +print "use utf8;\n"; +print "+{\n"; +printf "q{%s}=>%s,\n", $_, $di{$_} for sort keys %di; +print "}\n"; + +__END__ + +=head1 NAME + +mkdigraphs-rfc - Output digraph data from RFC-1345 + +=head1 SYNOPSIS + +Extract digraphs from text specifications as a perl hash: + + mkdigraphs-rfc rfc1345.txt >digraphs-rfc.inc.pl + +Input can be the literal RFC (or similar) document: + + curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphlist - + +Test by printing the character for DO (should be a dollar sign): + + perl -e'$di = do "digraphs-rfc.inc.pl"; print chr $di->{DO}' + +=head1 DESCRIPTION + +Parses the official RFC-1345 document, searching the +'character mnemonic table' for all digraph definitions. +If successful, Perl code is output resulting in a hash +with Unicode code points keyed by digraph. +Obsolete values (references to private use area) +are converted to modern alternatives. +Any errors and warnings are given at STDERR. + +=head1 AUTHOR + +Mischa POSLAWSKY + +=head1 LICENSE + +Licensed under the GNU Affero General Public License version 3. +