11 #TODO: automatic download if not specified on stdin
13 LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt');
16 # skip everything until a character indented by 1 space (table start)
17 do {$_ = <>} until /^\s\S/;
19 my @t = $_; # add first line (already read, assume it's ok)
21 # read the rest of the character table
23 # check for table end (chapter 4)
26 # parse table lines (ignore (unindented) page break)
32 # continuation line (add to last entry)
41 # create a hash of desired input
44 my ($mnem, $char, $name) = split / +/, $_, 3;
45 next if length $mnem != 2;
46 $di{$mnem} = hex $char;
51 if (-r 'shiar.inc.txt') {
52 open my $include, '<:utf8', 'shiar.inc.txt';
53 for (readline $include) {
54 m{^([!"%'-Z_a-z]{2}) (.)} or next;
55 warn("$1 already defined"), next if defined $di{$1};
62 # optionally get unicode character information
66 $_ => Unicode::UCD::charinfo($di{$_})
67 || { block => '?', category => 'Xn', name => '', script => '' }
71 # add custom categories for certain blocks
73 $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
74 $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
77 # mark unofficial extras as such
78 $info{$_}->{category} .= ' Xz' for @extra;
81 # find control characters (first 32 chars from 0 and 128)
82 next if $di{$_} & ~0b1001_1111;
83 # rename to something more descriptive
84 $info{$_}->{name} = $info{$_}->{unicode10}
85 ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful
86 : sprintf('<control U+%04X>', $di{$_}); # at least identify by value
87 # show descriptive symbols instead of control chars themselves
88 $di{$_} += 0x2400 if $di{$_} < 32;
91 # output perl code of hash
92 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
94 printf "q{%s}=>[%s],\n", $_, join(',',
95 $di{$_}, # glyph code point
96 $info{$_} # optional additional arguments
97 ? map {"'$_'"} @{ $info{$_} }{qw/name category script/}