X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/7e678eaffb1a57d300e8aa77078d93ab481f8006..e41fa43a164e0932a2f8fe9efa5ff6f74b6b7de0:/tools/mkdigraphlist diff --git a/tools/mkdigraphlist b/tools/mkdigraphlist index 4cacc6c..0714a2b 100755 --- a/tools/mkdigraphlist +++ b/tools/mkdigraphlist @@ -5,7 +5,6 @@ use warnings; use utf8; use open OUT => ':utf8', ':std'; -use Data::Dumper; our $VERSION = '1.03'; @@ -16,53 +15,19 @@ my $di = do 'data/digraphs-rfc.inc.pl' # personal addendums my $extra = do 'data/digraphs-shiar.inc.pl' or warn "could not include shiar proposals: ", $@ // $!; -$di = { %{$di}, %{$extra // {}} }; - -$di->{chr $_} = $_ for 32 .. 126; -$di->{'\\'.$_} = delete $di->{$_} for '{', '}', '\\'; +my $vim = do 'data/digraphs-vim.inc.pl' + or warn "could not include vim extensions ", $@ // $!; +$di = { %{$vim // {}}, %{$di}, %{$extra // {}} }; # optionally get unicode character information -my %info = eval { - require Unicode::UCD; - map { - $_ => Unicode::UCD::charinfo($di->{$_}) - || { block => '?', category => 'Xn', name => '', script => '' } - } keys %{$di}; -}; - -# add custom categories for certain blocks -for (values %info) { - $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin'; - $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement'; -} - -# mark unofficial extras as such -$info{$_}->{category} .= ' Xz' for keys %{$extra}; - -for (keys %{$di}) { - $info{$_}->{string} = chr(9676) . chr($di->{$_}) if $info{$_}->{combining}; - # find control characters (first 32 chars from 0 and 128) - next unless ($di->{$_} & ~0b1001_1111) == 0 or $di->{$_} == 127; - # rename to something more descriptive - $info{$_}->{name} = $info{$_}->{unicode10} - ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful - : sprintf('', $di->{$_}); # at least identify by value - # show descriptive symbols instead of control chars themselves - $info{$_}->{string} = $di->{$_} < 32 ? chr($di->{$_} + 0x2400) : chr(0xFFFD); -} -# presentational string for some control(lish) entries -$info{$_}->{string} = '-' for grep { $di->{$_} == 0x00AD } keys %{$di}; -$info{$_}->{string} = '␣' for grep { $di->{$_} == 0x200B } keys %{$di}; -$info{$_}->{string} = '|' for grep { $di->{$_} == 0x200C } keys %{$di}; -$info{$_}->{string} = '⁀' for grep { $di->{$_} == 0x200D } keys %{$di}; -$info{$_}->{string} = '→' for grep { $di->{$_} == 0x200E } keys %{$di}; -$info{$_}->{string} = '←' for grep { $di->{$_} == 0x200F } keys %{$di}; +my $uninfo = do 'unicode-char.inc.pl' + or warn "could not include unicode details: ", $@ // $!; # convert info hashes into arrays of strings to output in display order -for my $row (values %info) { - $row = [ map { $row->{$_} } qw/name category script string/ ]; - # strip off trailing missing values (especially string may be unknown) - defined $row->[-1] ? last : pop @$row for 1 .. @$row; +for my $row (values %{$uninfo}) { + my ($class, $name, $di, $html, $string) = @{$row}; + $row = [$name, $class]; + push @{$row}, '', $string if defined $string; } # output perl code of hash @@ -71,13 +36,12 @@ print "# automatically generated by $0\n"; print "use utf8;\n"; print "+{\n"; printf '(map {$_=>0} qw{%s}),'."\n", join(' ', + grep { !defined $di->{$_} } map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %{$di} ); printf "q{%s}=>[%s],\n", $_, join(',', $di->{$_}, # original code point - $info{$_} # optional additional arguments - ? map {"'$_'"} @{ $info{$_} } - : () + (map {"'$_'"} @{ $uninfo->{ chr $di->{$_} } // [] }), # optional additional arguments ) for sort keys %{$di}; print "}\n"; @@ -94,11 +58,8 @@ mkdigraphlist - Output character list of combined digraph data =head1 DESCRIPTION -Parses the official RFC-1345 document, searching the -'character mnemonic table' for all digraph definitions. -If successful, Perl code is output resulting in a hash -with character data keyed by digraph. -Any errors and warnings are given at STDERR. +Combines precompiled digraph includes of rfc (1345), vim, and shiar +and outputs a complete map including character details and usage classes. The value can either be a scalar string containing another digraph which can be considered identical (usually inverted),