11 #TODO: automatic download if not specified on stdin
13 LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt');
16 # skip everything until a character indented by 1 space (table start)
17 do {$_ = <>} until /^\s\S/;
19 my @t = $_; # add first line (already read, assume it's ok)
21 # read the rest of the character table
23 # check for table end (chapter 4)
26 # parse table lines (ignore (unindented) page break)
32 # continuation line (add to last entry)
41 # create a hash of desired input
44 my ($mnem, $char, $name) = split / +/, $_, 3;
45 next if length $mnem != 2;
46 $di{$mnem} = hex $char;
51 0xE001 => 0, # join lines: not accepted
52 0xE004 => 0, # umlaut is no different from diaeresis 0x0308
53 0xE005 => 0x0344, # discouraged
75 0xE01B => 0x03D0, # middle beta = curled beta?
79 0xE01F => 0x33C2, # am, compatibility char
80 0xE020 => 0x33D8, # pm, compatibility char
83 0xE023 => 0, # dutch guilder 0192 is already encoded, and not very useful anyway
85 0xE025 => 0x20D7, # also 20D1; non-spacing
88 0xE028 => 0x01F0, #but uppercase
92 $_ = $trans{$_} if defined $trans{$_};
97 if (-r 'shiar.inc.txt') {
98 open my $include, '<:utf8', 'shiar.inc.txt';
99 for (readline $include) {
100 m{^([!"%'-Z_a-z]{2}) (.)} or next;
101 warn("$1 already defined"), next if defined $di{$1};
108 # optionally get unicode character information
110 require Unicode::UCD;
112 $_ => Unicode::UCD::charinfo($di{$_})
113 || { block => '?', category => 'Xn', name => '', script => '' }
117 # add custom categories for certain blocks
119 $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin';
120 $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement';
123 # mark unofficial extras as such
124 $info{$_}->{category} .= ' Xz' for @extra;
127 $info{$_}->{string} = chr(9676) . chr($di{$_}) if $info{$_}->{combining};
128 # find control characters (first 32 chars from 0 and 128)
129 next unless ($di{$_} & ~0b1001_1111) == 0 or $di{$_} == 127;
130 # rename to something more descriptive
131 $info{$_}->{name} = $info{$_}->{unicode10}
132 ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful
133 : sprintf('<control U+%04X>', $di{$_}); # at least identify by value
134 # show descriptive symbols instead of control chars themselves
135 $info{$_}->{string} = $di{$_} < 32 ? chr($di{$_} + 0x2400) : chr(0xFFFD);
138 # output perl code of hash
139 # (assume no backslashes or curlies, so we can just q{} w/o escaping)
141 printf '(map {$_=>0} qw{%s}),'."\n", join(' ',
142 map { substr($_, 1, 1).substr($_, 0, 1) } sort keys %di
144 printf "q{%s}=>[%s],\n", $_, join(',',
145 $di{$_}, # original code point
146 $info{$_} # optional additional arguments
147 ? map {"'$_'"} @{ $info{$_} }{qw/name category script/},
148 $info{$_}->{string} || ()