X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/fa15370b8f30686099ea3cbb1c91ed140c9e70bc..7e678eaffb1a57d300e8aa77078d93ab481f8006:/tools/mkdigraphlist diff --git a/tools/mkdigraphlist b/tools/mkdigraphlist index a1bfbcd..4cacc6c 100755 --- a/tools/mkdigraphlist +++ b/tools/mkdigraphlist @@ -7,116 +7,27 @@ use utf8; use open OUT => ':utf8', ':std'; use Data::Dumper; -our $VERSION = '1.02'; - -# expect input data source at command line -@ARGV or die "Specify input source file or - for STDIN\n"; - -# skip everything until a character indented by 1 space (table start) -do { - $_ = readline; - defined or die "Premature input end"; -} until /^\s\S/; - -my @t = $_; # add first line (already read, assume it's ok) - -# read the rest of the character table -while ($_ = readline) { - # check for table end (chapter 4) - last if /^\d/; - - # parse table lines (ignore (unindented) page break) - next unless s/^ //; - chomp; - - # add the line to @t - if (s/^ {15}/ /) { - # continuation line (add to last entry) - $t[-1] .= $_; - } - else { - # add a new entry - push @t, $_; - } -} +our $VERSION = '1.03'; # create a hash of desired input -my %di; -for (@t) { - my ($mnem, $char, $name) = split / +/, $_, 3; - next if length $mnem != 2; - $di{$mnem} = hex $char; -} - -# XXX -my %trans = ( - 0xE001 => 0, # join lines: not accepted - 0xE004 => 0, # umlaut is no different from diaeresis 0x0308 - 0xE005 => 0x0344, # discouraged - 0xE006 => 0x0300, - 0xE007 => 0x0301, - 0xE008 => 0x0302, - 0xE009 => 0x0303, - 0xE00A => 0x0304, - 0xE00B => 0x0306, - 0xE00C => 0x0307, - 0xE00D => 0x0308, - 0xE00E => 0x030A, - 0xE00F => 0x030B, - 0xE010 => 0x030C, - 0xE011 => 0x0327, - 0xE012 => 0x0328, - 0xE013 => 0x0332, - 0xE014 => 0x0333, - 0xE015 => 0x0338, - 0xE016 => 0x0345, - 0xE017 => 0x0314, - 0xE018 => 0x0313, - 0xE019 => 0x1FFE, - 0xE01A => 0x1FBF, - 0xE01B => 0x03D0, # middle beta = curled beta? - 0xE01C => 0x25CB, - 0xE01D => 0x0192, - 0xE01E => 0x0292, - 0xE01F => 0x33C2, # am, compatibility char - 0xE020 => 0x33D8, # pm, compatibility char - 0xE021 => 0x2121, - 0xE022 => 0xFE8E, - 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway - 0xE024 => 0x0393, - 0xE025 => 0x20D7, # also 20D1; non-spacing - 0xE026 => 0x1FEF, - 0xE027 => 0x1FC0, - 0xE028 => 0x01F0, #but uppercase -); -for (values %di) { - $_ >= 0xE000 or next; - $_ = $trans{$_} if defined $trans{$_}; -} +my $di = do 'data/digraphs-rfc.inc.pl' + or die "error reading digraphs include: ", $@ // $!; # personal addendums -my @extra; -if (-r 'shiar.inc.txt') { - open my $include, '<:utf8', 'shiar.inc.txt'; - for (readline $include) { - m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next; - warn("$1 already defined"), next if defined $di{$1}; - $di{$1} = ord $2; - push @extra, $1; - } -} -warn $@ if $@; +my $extra = do 'data/digraphs-shiar.inc.pl' + or warn "could not include shiar proposals: ", $@ // $!; +$di = { %{$di}, %{$extra // {}} }; -$di{chr $_} = $_ for 32 .. 126; -$di{'\\'.$_} = delete $di{$_} for '{', '}', '\\'; +$di->{chr $_} = $_ for 32 .. 126; +$di->{'\\'.$_} = delete $di->{$_} for '{', '}', '\\'; # optionally get unicode character information my %info = eval { require Unicode::UCD; map { - $_ => Unicode::UCD::charinfo($di{$_}) + $_ => Unicode::UCD::charinfo($di->{$_}) || { block => '?', category => 'Xn', name => '', script => '' } - } keys %di; + } keys %{$di}; }; # add custom categories for certain blocks @@ -126,23 +37,26 @@ for (values %info) { } # mark unofficial extras as such -$info{$_}->{category} .= ' Xz' for @extra; +$info{$_}->{category} .= ' Xz' for keys %{$extra}; -for (keys %di) { - $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining}; +for (keys %{$di}) { + $info{$_}->{string} = chr(9676) . chr($di->{$_}) if $info{$_}->{combining}; # find control characters (first 32 chars from 0 and 128) - next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127; + next unless ($di->{$_} & ~0b1001_1111) == 0 or $di->{$_} == 127; # rename to something more descriptive $info{$_}->{name} = $info{$_}->{unicode10} ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful - : sprintf('', $di{$_}); # at least identify by value + : sprintf('', $di->{$_}); # at least identify by value # show descriptive symbols instead of control chars themselves - $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD); + $info{$_}->{string} = $di->{$_} < 32 ? chr($di->{$_} + 0x2400) : chr(0xFFFD); } # presentational string for some control(lish) entries -$info{$_}->{string} = '-' for grep { $di{$_} == 0x00AD } keys %di; -$info{$_}->{string} = '→' for grep { $di{$_} == 0x200E } keys %di; -$info{$_}->{string} = '←' for grep { $di{$_} == 0x200F } keys %di; +$info{$_}->{string} = '-' for grep { $di->{$_} == 0x00AD } keys %{$di}; +$info{$_}->{string} = '␣' for grep { $di->{$_} == 0x200B } keys %{$di}; +$info{$_}->{string} = '|' for grep { $di->{$_} == 0x200C } keys %{$di}; +$info{$_}->{string} = '⁀' for grep { $di->{$_} == 0x200D } keys %{$di}; +$info{$_}->{string} = '→' for grep { $di->{$_} == 0x200E } keys %{$di}; +$info{$_}->{string} = '←' for grep { $di->{$_} == 0x200F } keys %{$di}; # convert info hashes into arrays of strings to output in display order for my $row (values %info) { @@ -153,37 +67,29 @@ for my $row (values %info) { # output perl code of hash # (assume no backslashes or curlies, so we can just q{} w/o escaping) +print "# automatically generated by $0\n"; print "use utf8;\n"; print "+{\n"; printf '(map {$_=>0} qw{%s}),'."\n", join(' ', - map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di + map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %{$di} ); printf "q{%s}=>[%s],\n", $_, join(',', - $di{$_}, # original code point + $di->{$_}, # original code point $info{$_} # optional additional arguments ? map {"'$_'"} @{ $info{$_} } : () -) for sort keys %di; +) for sort keys %{$di}; print "}\n"; __END__ =head1 NAME -mkdigraphlist - Output digraph data from RFC-1345 +mkdigraphlist - Output character list of combined digraph data =head1 SYNOPSIS -Extract digraphs from text specifications as a perl hash: - - mkdigraphlist rfc1345.txt custom.txt > digraphs.inc.pl - -Input can be the literal RFC (or similar) document: - - curl http://www.ietf.org/rfc/rfc1345.txt | mkdigraphlist - - -Test by printing the character for DO (should be a dollar sign): - + mkdigraphlist >digraphs.inc.pl perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]' =head1 DESCRIPTION