X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/dbddc268edc610c50c7ec27de30e13bdd585b377..3aab2658e078a4f424343fb10817f8d0eab9343b:/rfc1345convert diff --git a/rfc1345convert b/rfc1345convert old mode 100644 new mode 100755 index c6131ad..4fd3940 --- a/rfc1345convert +++ b/rfc1345convert @@ -2,26 +2,39 @@ use strict; use warnings; +use utf8; +use open OUT => ':utf8', ':std'; use Data::Dumper; -our $VERSION = '1.00'; +our $VERSION = '1.01'; -if (0) { - #TODO: automatic download if not specified on stdin +# determine input data source +my $input; +if (@ARGV) { + # manual contents specified (either piped or filename(s) given) + $input = \*ARGV; +} +else { + # automatic download from official website require LWP::Simple; - LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt'); + my $contents = LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt') + or die "Couldn't download RFC-1345 from ietf.org"; + open $input, '<', \$contents; # emulate file handle } # skip everything until a character indented by 1 space (table start) -do {$_ = <>} until /^\s\S/; +do { + $_ = readline $input; + defined or die "Premature input end"; +} until /^\s\S/; my @t = $_; # add first line (already read, assume it's ok) # read the rest of the character table -while ($_ = <>) { +while ($_ = readline $input) { # check for table end (chapter 4) - last if /^4/; + last if /^\d/; # parse table lines (ignore (unindented) page break) next unless s/^ //; @@ -97,7 +110,7 @@ my @extra; if (-r 'shiar.inc.txt') { open my $include, '<:utf8', 'shiar.inc.txt'; for (readline $include) { - m{^([!"%'-Z_a-z]{2}) (.)} or next; + m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next; warn("$1 already defined"), next if defined $di{$1}; $di{$1} = ord $2; push @extra, $1; @@ -105,6 +118,9 @@ if (-r 'shiar.inc.txt') { } warn $@ if $@; +$di{chr $_} = $_ for 32 .. 126; +$di{'\\'.$_} = delete $di{$_} for '{', '}', '\\'; + # optionally get unicode character information my %info = eval { require Unicode::UCD; @@ -124,14 +140,26 @@ for (values %info) { $info{$_}->{category} .= ' Xz' for @extra; for (keys %di) { + $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining}; # find control characters (first 32 chars from 0 and 128) - next if $di{$_} & ~0b1001_1111; + next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127; # rename to something more descriptive $info{$_}->{name} = $info{$_}->{unicode10} ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful : sprintf('', $di{$_}); # at least identify by value # show descriptive symbols instead of control chars themselves - $di{$_} += 0x2400 if $di{$_} < 32; + $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD); +} +# presentational string for some control(lish) entries +$info{$_}->{string} = '-' for grep { $di{$_} == 0x00AD } keys %di; +$info{$_}->{string} = '→' for grep { $di{$_} == 0x200E } keys %di; +$info{$_}->{string} = '←' for grep { $di{$_} == 0x200F } keys %di; + +# convert info hashes into arrays of strings to output in display order +for my $row (values %info) { + $row = [ map { $row->{$_} } qw/name category script string/ ]; + # strip off trailing missing values (especially string may be unknown) + defined $row->[-1] ? last : pop @$row for 1 .. @$row; } # output perl code of hash @@ -141,10 +169,59 @@ printf '(map {$_=>0} qw{%s}),'."\n", join(' ', map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di ); printf "q{%s}=>[%s],\n", $_, join(',', - $di{$_}, # glyph code point + $di{$_}, # original code point $info{$_} # optional additional arguments - ? map {"'$_'"} @{ $info{$_} }{qw/name category script/} + ? map {"'$_'"} @{ $info{$_} } : () ) for sort keys %di; print "}\n"; +__END__ + +=head1 NAME + +rfc1345convert - Output digraph data from RFC-1345 + +=head1 SYNOPSIS + +Download and convert the digraph specification from ietf.org: + + rfc1345convert > digraphs.inc.pl + +Test by printing the character for DO (should be a dollar sign): + + perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]' + +Manual specification of source retrieval: + + rfc1345convert rfc1345.txt + curl $url | rfc1345convert - + +=head1 DESCRIPTION + +Parses the official RFC-1345 document, searching the +'character mnemonic table' for all digraph definitions. +If successful, Perl code is output resulting in a hash +with character data keyed by digraph. +Any errors and warnings are given at STDERR. + +The value can either be a scalar string containing another +digraph which can be considered identical (usually inverted), +or an array ref containing at least the resulting character's +Unicode code point value. If available, the following UCD data +is appended: character name, category, script, and output string. +For example: + + +{ + AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'], + EA => 'AE', + } + +=head1 AUTHOR + +Mischa POSLAWSKY + +=head1 LICENSE + +Licensed under the GNU Affero General Public License version 3. +