#!/usr/bin/env perl use strict; use warnings; use Data::Dumper; our $VERSION = '1.00'; if (0) { #TODO: automatic download if not specified on stdin require LWP::Simple; LWP::Simple::get('http://www.ietf.org/rfc/rfc1345.txt'); } # skip everything until a character indented by 1 space (table start) do {$_ = <>} until /^\s\S/; my @t = $_; # add first line (already read, assume it's ok) # read the rest of the character table while ($_ = <>) { # check for table end (chapter 4) last if /^4/; # parse table lines (ignore (unindented) page break) next unless s/^ //; chomp; # add the line to @t if (s/^ {15}/ /) { # continuation line (add to last entry) $t[-1] .= $_; } else { # add a new entry push @t, $_; } } # create a hash of desired input my %di; for (@t) { my ($mnem, $char, $name) = split / +/, $_, 3; next if length $mnem != 2; $di{$mnem} = hex $char; } # personal addendums my @extra; if (-r 'shiar.inc.txt') { open my $include, '<:utf8', 'shiar.inc.txt'; for (readline $include) { m{^([!"%'-Z_a-z]{2}) (.)} or next; warn("$1 already defined"), next if defined $di{$1}; $di{$1} = ord $2; push @extra, $1; } } warn $@ if $@; # optionally get unicode character information my %info = eval { require Unicode::UCD; map { $_ => Unicode::UCD::charinfo($di{$_}) || { block => '?', category => 'Xn', name => '', script => '' } } keys %di; }; # add custom categories for certain blocks for (values %info) { $_->{category} .= ' Xa' if $_->{block} eq 'Basic Latin'; $_->{category} .= ' Xl' if $_->{block} eq 'Latin-1 Supplement'; } # mark unofficial extras as such $info{$_}->{category} .= ' Xz' for @extra; for (keys %di) { # find control characters (first 32 chars from 0 and 128) next if $di{$_} & ~0b1001_1111; # rename to something more descriptive $info{$_}->{name} = $info{$_}->{unicode10} ? '<'.$info{$_}->{unicode10}.'>' # the old name was much more useful : sprintf('', $di{$_}); # at least identify by value # show descriptive symbols instead of control chars themselves $di{$_} += 0x2400 if $di{$_} < 32; } # output perl code of hash # (assume no backslashes or curlies, so we can just q{} w/o escaping) print "{\n"; printf "q{%s}=>[%s],\n", $_, join(',', $di{$_}, # glyph code point $info{$_} # optional additional arguments ? map {"'$_'"} @{ $info{$_} }{qw/name category script/} : () ) for sort keys %di; print "}\n";