X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/a531724f57f9cad3fd401fa2846bd61f35aaf9f2..a0ba9298856b2426c5c66b6d2f2b284d98cee594:/rfc1345convert diff --git a/rfc1345convert b/rfc1345convert old mode 100644 new mode 100755 index 3157e04..f932753 --- a/rfc1345convert +++ b/rfc1345convert @@ -3,25 +3,37 @@ use strict; use warnings; +use open OUT => ':utf8', ':std'; use Data::Dumper; -our $VERSION = '1.00'; +our $VERSION = '1.01'; -if (0) { - #TODO: automatic download if not specified on stdin +# determine input data source +my $input; +if (@ARGV) { + # manual contents specified (either piped or filename(s) given) + $input = \*ARGV; +} +else { + # automatic download from official website require LWP::Simple; - LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt'); + my $contents = LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt') + or die "Couldn't download RFC-1345 from ietf.org"; + open $input, '<', \$contents; # emulate file handle } # skip everything until a character indented by 1 space (table start) -do {$_ = <>} until /^\s\S/; +do { + $_ = readline $input; + defined or die "Premature input end"; +} until /^\s\S/; my @t = $_; # add first line (already read, assume it's ok) # read the rest of the character table -while ($_ = <>) { +while ($_ = readline $input) { # check for table end (chapter 4) - last if /^4/; + last if /^\d/; # parse table lines (ignore (unindented) page break) next unless s/^ //; @@ -105,6 +117,9 @@ if (-r 'shiar.inc.txt') { } warn $@ if $@; +$di{chr $_} = $_ for 32 .. 126; +$di{'\\'.$_} = delete $di{$_} for '{', '}', '\\'; + # optionally get unicode character information my %info = eval { require Unicode::UCD; @@ -156,3 +171,52 @@ printf "q{%s}=>[%s],\n", $_, join(',', ) for sort keys %di; print "}\n"; +__END__ + +=head1 NAME + +rfc1345convert - Output digraph data from RFC-1345 + +=head1 SYNOPSIS + +Download and convert the digraph specification from ietf.org: + + rfc1345convert > digraphs.inc.pl + +Test by printing the character for DO (should be a dollar sign): + + perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]' + +Manual specification of source retrieval: + + rfc1345convert rfc1345.txt + curl $url | rfc1345convert - + +=head1 DESCRIPTION + +Parses the official RFC-1345 document, searching the +'character mnemonic table' for all digraph definitions. +If successful, Perl code is output resulting in a hash +with character data keyed by digraph. +Any errors and warnings are given at STDERR. + +The value can either be a scalar string containing another +digraph which can be considered identical (usually inverted), +or an array ref containing at least the resulting character's +Unicode code point value. If available, the following UCD data +is appended: character name, category, script, and output string. +For example: + + +{ + AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'], + EA => 'AE', + } + +=head1 AUTHOR + +Mischa POSLAWSKY + +=head1 LICENSE + +Licensed under the GNU Affero General Public License version 3. +