7 use open OUT => ':utf8', ':std';
10 our $VERSION = '1.03';
12 # create a hash of desired input
13 my $di = do 'data/digraphs-rfc.inc.pl'
14 or die "error reading digraphs include: ", $@ // $!;
18 if (-r 'shiar.inc.txt') {
19 open my $include, '<:utf8', 'shiar.inc.txt';
20 for (readline $include) {
21 m{^(\$?[!"%'-Z_a-z]{2}) (.)} or next;
22 warn("$1 already defined"), next if defined $di->{$1};
29 $di->{chr $_} = $_ for 32 .. 126;
30 $di->{'\\'.$_} = delete $di->{$_} for '{', '}', '\\';
32 # optionally get unicode character information
36 $_ => Unicode::UCD::charinfo($di->{$_})
37 || { block => '?', category => 'Xn', name => '', script => '' }
41 # add custom categories for certain blocks
43 $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
44 $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
47 # mark unofficial extras as such
48 $info{$_}->{category} .= ' Xz' for @extra;
51 $info{$_}->{string} = chr(9676) . chr($di->{$_}) if $info{$_}->{combining};
52 # find control characters (first 32 chars from 0 and 128)
53 next unless ($di->{$_} & ~0b1001_1111) == 0 or $di->{$_} == 127;
54 # rename to something more descriptive
55 $info{$_}->{name} = $info{$_}->{unicode10}
56 ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful
57 : sprintf('<control U+%04X>', $di->{$_}); # at least identify by value
58 # show descriptive symbols instead of control chars themselves
59 $info{$_}->{string} = $di->{$_} < 32 ? chr($di->{$_} + 0x2400) : chr(0xFFFD);
61 # presentational string for some control(lish) entries
62 $info{$_}->{string} = '-' for grep { $di->{$_} == 0x00AD } keys %{$di};
63 $info{$_}->{string} = '␣' for grep { $di->{$_} == 0x200B } keys %{$di};
64 $info{$_}->{string} = '|' for grep { $di->{$_} == 0x200C } keys %{$di};
65 $info{$_}->{string} = '⁀' for grep { $di->{$_} == 0x200D } keys %{$di};
66 $info{$_}->{string} = '→' for grep { $di->{$_} == 0x200E } keys %{$di};
67 $info{$_}->{string} = '←' for grep { $di->{$_} == 0x200F } keys %{$di};
69 # convert info hashes into arrays of strings to output in display order
70 for my $row (values %info) {
71 $row = [ map { $row->{$_} } qw/name category script string/ ];
72 # strip off trailing missing values (especially string may be unknown)
73 defined $row->[-1] ? last : pop @$row for 1 .. @$row;
76 # output perl code of hash
77 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
78 print "# automatically generated by $0\n";
81 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
82 map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %{$di}
84 printf "q{%s}=>[%s],\n", $_, join(',',
85 $di->{$_}, # original code point
86 $info{$_} # optional additional arguments
87 ? map {"'$_'"} @{ $info{$_} }
89 ) for sort keys %{$di};
96 mkdigraphlist - Output character list of combined digraph data
100 mkdigraphlist >digraphs.inc.pl
101 perl -e'$di = do "digraphs.inc.pl"; print chr $di->{DO}->[0]'
105 Parses the official RFC-1345 document, searching the
106 'character mnemonic table' for all digraph definitions.
107 If successful, Perl code is output resulting in a hash
108 with character data keyed by digraph.
109 Any errors and warnings are given at STDERR.
111 The value can either be a scalar string containing another
112 digraph which can be considered identical (usually inverted),
113 or an array ref containing at least the resulting character's
114 Unicode code point value. If available, the following UCD data
115 is appended: character name, category, script, and output string.
119 AE => [198, 'LATIN CAPITAL LETTER AE', 'Lu Xl', 'Latin'],
125 Mischa POSLAWSKY <perl@shiar.org>
129 Licensed under the GNU Affero General Public License version 3.