X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/3d6f1a6afd0d5bd83e105821553124b37e036668..f2e965b21a4be2aea69d887f619481e04f3ca58a:/tools/convert-unicode.pl diff --git a/tools/convert-unicode.pl b/tools/convert-unicode.pl new file mode 100755 index 0000000..731210b --- /dev/null +++ b/tools/convert-unicode.pl @@ -0,0 +1,116 @@ +#!/usr/bin/env perl +use 5.010; +use strict; +use warnings; +use utf8; + +use open OUT => ':utf8', ':std'; +use Data::Dump 'pp'; + +our $VERSION = '1.00'; + +my %info = ( + "\xAD" => {string => '-'}, + "\x{200E}" => {string => '→'}, + "\x{200F}" => {string => '←'}, +); +$info{chr $_} //= {} for 32 .. 126; + +my %diinc = ( + 'digraphs.inc.pl' => 'u-di', +); +for (keys %diinc) { + -e $_ or next; + my $di = do $_ or die "Error reading digraphs file $_: ", $@ || $!; + while (my ($mnem, $cp) = each %$di) { + length $mnem == 2 or next; # limit to digraphs + $cp = chr $cp->[0] if ref $cp; # old style array + $info{$cp}->{di} //= $mnem; + $info{$cp}->{class}->{$_}++ for $diinc{$_}; + } +} + +for my $chr (keys %info) { + my $cp = ord $chr; + # attempt to get unicode character information + my $info = eval { + require Unicode::UCD; + Unicode::UCD::charinfo($cp) + || { block => '?', category => 'Xn', name => '', script => '' } + } or next; + + $info->{$_} = $info{$chr}->{$_} for qw(di class string); + + # categorise by unicode types and writing script + $info->{class}->{$_}++ for $info->{category}; + $info->{class}->{"u-$_"}++ for $info->{script} || (); + + # add custom categories for certain blocks + $info->{class}->{Xa}++ if $info->{block} eq 'Basic Latin'; + $info->{class}->{Xl}++ if $info->{block} eq 'Latin-1 Supplement'; + + given ($cp) { + when ($info->{string}) { + # keep predefined presentational string + } + when ($info->{combining}) { + # overlay combining accents + $info->{string} = chr(9676) . $chr; + } + when (($cp & ~0b1001_1111) == 0 or $cp == 127) { + # control characters (first 32 chars from 0 and 128) + # rename to something more descriptive + $info->{name} = $info->{unicode10} + ? '<'.$info->{unicode10}.'>' # the old name was much more useful + : sprintf('', $cp); # at least identify by value + # show descriptive symbols instead of control chars themselves + $info->{string} = $cp < 32 ? chr($cp + 0x2400) : chr(0xFFFD); + } + } + + $info{$chr} = $info; +} + +# output perl code of hash +say '+{'; +for my $cp (sort keys %info) { + $info{$cp}->{classstr} = join(' ', sort keys %{ $info{$cp}->{class} }); + # convert info hashes into arrays of strings to output in display order + my $row = [ map { $info{$cp}->{$_} } qw/classstr name di string/ ]; + # strip off trailing missing values (especially string may be unknown) + defined $row->[-1] ? last : pop @$row for 1 .. @$row; + # final line (assume safe within single quotes) + say sprintf '"\x{%X}" => [%s],', + ord $cp, join(',', map { escapeq($_) } @$row); +} +say '}'; + +sub escapeq { + my $_ = shift; + return 'undef' if not defined; + s/(['\\])/\\$1/g; + return "'$_'"; +} + +__END__ + +=head1 NAME + +convert-unicode.pl - Gather Unicode character details in Perl array + +=head1 SYNOPSIS + + convert-unicode > unicode-char.inc.pl + +Test by printing the description of U+0041 (latin A): + + perl -e'$u = do "unicode-char.inc.pl"; print $u->{A}->[1]' + +=head1 AUTHOR + +Mischa POSLAWSKY + +=head1 LICENSE + +Licensed under the GNU Affero General Public License version 3. +