X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/0d71f602a3268f8059dc48042280fb25c1abb4ba..b5b3537710ed9f73e1c867e0cc27d50439eaf4cd:/rfc1345convert diff --git a/rfc1345convert b/rfc1345convert deleted file mode 100755 index 33b6ee9..0000000 --- a/rfc1345convert +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; -use utf8; - -use open OUT => ':utf8', ':std'; -use Data::Dumper; - -our $VERSION = '1.02'; - -# expect input data source at command line -@ARGV or die "Specify input source file or - for STDIN\n"; - -# skip everything until a character indented by 1 space (table start) -do { - $_ = readline; - defined or die "Premature input end"; -} until /^\s\S/; - -my @t = $_; # add first line (already read, assume it's ok) - -# read the rest of the character table -while ($_ = readline) { - # check for table end (chapter 4) - last if /^\d/; - - # parse table lines (ignore (unindented) page break) - next unless s/^ //; - chomp; - - # add the line to @t - if (s/^ {15}/ /) { - # continuation line (add to last entry) - $t[-1] .= $_; - } - else { - # add a new entry - push @t, $_; - } -} - -# create a hash of desired input -my %di; -for (@t) { - my ($mnem, $char, $name) = split / +/, $_, 3; - next if length $mnem != 2; - $di{$mnem} = hex $char; -} - -# XXX -my %trans = ( - 0xE001 => 0, # join lines: not accepted - 0xE004 => 0, # umlaut is no different from diaeresis 0x0308 - 0xE005 => 0x0344, # discouraged - 0xE006 => 0x0300, - 0xE007 => 0x0301, - 0xE008 => 0x0302, - 0xE009 => 0x0303, - 0xE00A => 0x0304, - 0xE00B => 0x0306, - 0xE00C => 0x0307, - 0xE00D => 0x0308, - 0xE00E => 0x030A, - 0xE00F => 0x030B, - 0xE010 => 0x030C, - 0xE011 => 0x0327, - 0xE012 => 0x0328, - 0xE013 => 0x0332, - 0xE014 => 0x0333, - 0xE015 => 0x0338, - 0xE016 => 0x0345, - 0xE017 => 0x0314, - 0xE018 => 0x0313, - 0xE019 => 0x1FFE, - 0xE01A => 0x1FBF, - 0xE01B => 0x03D0, # middle beta = curled beta? - 0xE01C => 0x25CB, - 0xE01D => 0x0192, - 0xE01E => 0x0292, - 0xE01F => 0x33C2, # am, compatibility char - 0xE020 => 0x33D8, # pm, compatibility char - 0xE021 => 0x2121, - 0xE022 => 0xFE8E, - 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway - 0xE024 => 0x0393, - 0xE025 => 0x20D7, # also 20D1; non-spacing - 0xE026 => 0x1FEF, - 0xE027 => 0x1FC0, - 0xE028 => 0x01F0, #but uppercase -); -for (values %di) { - $_ >= 0xE000 or next; - $_ = $trans{$_} if defined $trans{$_}; -} - -# personal addendums -my @extra; -if (-r 'shiar.inc.txt') { - open my $include, '<:utf8', 'shiar.inc.txt'; - for (readline $include) { - m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next; - warn("$1 already defined"), next if defined $di{$1}; - $di{$1} = ord $2; - push @extra, $1; - } -} -warn $@ if $@; - -$di{chr $_} = $_ for 32 .. 126; -$di{'\\'.$_} = delete $di{$_} for '{', '}', '\\'; - -# optionally get unicode character information -my %info = eval { - require Unicode::UCD; - map { - $_ => Unicode::UCD::charinfo($di{$_}) - || { block => '?', category => 'Xn', name => '', script => '' } - } keys %di; -}; - -# add custom categories for certain blocks -for (values %info) { - $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin'; - $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement'; -} - -# mark unofficial extras as such -$info{$_}->{category} .= ' Xz' for @extra; - -for (keys %di) { - $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining}; - # find control characters (first 32 chars from 0 and 128) - next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127; - # rename to something more descriptive - $info{$_}->{name} = $info{$_}->{unicode10} - ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful - : sprintf('', $di{$_}); # at least identify by value - # show descriptive symbols instead of control chars themselves - $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD); -} -# presentational string for some control(lish) entries -$info{$_}->{string} = '-' for grep { $di{$_} == 0x00AD } keys %di; -$info{$_}->{string} = '→' for grep { $di{$_} == 0x200E } keys %di; -$info{$_}->{string} = '←' for grep { $di{$_} == 0x200F } keys %di; - -# convert info hashes into arrays of strings to output in display order -for my $row (values %info) { - $row = [ map { $row->{$_} } qw/name category script string/ ]; - # strip off trailing missing values (especially string may be unknown) - defined $row->[-1] ? last : pop @$row for 1 .. @$row; -} - -# output perl code of hash -# (assume no backslashes or curlies, so we can just q{} w/o escaping) -print "+{\n"; -printf '(map {$_=>0} qw{%s}),'."\n", join(' ', - map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di -); -printf "q{%s}=>[%s],\n", $_, join(',', - $di{$_}, # original code point - $info{$_} # optional additional arguments - ? map {"'$_'"} @{ $info{$_} } - : () -) for sort keys %di; -print "}\n"; - -__END__ - -=head1 NAME - -rfc1345convert - Output digraph data from RFC-1345 - -=head1 SYNOPSIS - -Extract digraphs from text specifications as a perl hash: - - rfc1345convert rfc1345.txt custom.txt > digraphs.inc.pl - -Input can be the literal RFC (or similar) document: - - curl http://www.ietf.org/rfc/rfc1345.txt | rfc1345convert - - -Test by printing the character for DO (should be a dollar sign): - - perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]' - -=head1 DESCRIPTION - -Parses the official RFC-1345 document, searching the -'character mnemonic table' for all digraph definitions. -If successful, Perl code is output resulting in a hash -with character data keyed by digraph. -Any errors and warnings are given at STDERR. - -The value can either be a scalar string containing another -digraph which can be considered identical (usually inverted), -or an array ref containing at least the resulting character's -Unicode code point value. If available, the following UCD data -is appended: character name, category, script, and output string. -For example: - - +{ - AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'], - EA => 'AE', - } - -=head1 AUTHOR - -Mischa POSLAWSKY - -=head1 LICENSE - -Licensed under the GNU Affero General Public License version 3. -