From: Mischa POSLAWSKY Date: Wed, 5 Jan 2011 21:15:54 +0000 (+0100) Subject: unicode: dedicated include with character details X-Git-Tag: v1.5~82 X-Git-Url: http://git.shiar.nl/sheet.git/commitdiff_plain/f2e965b21a4be2aea69d887f619481e04f3ca58a unicode: dedicated include with character details --- diff --git a/Shiar_Sheet/FormatChar.pm b/Shiar_Sheet/FormatChar.pm index 4668c55..f22f42b 100644 --- a/Shiar_Sheet/FormatChar.pm +++ b/Shiar_Sheet/FormatChar.pm @@ -6,11 +6,9 @@ use warnings; use Data::Dump 'pp'; use PLP::Functions 'EscapeHTML'; -our $VERSION = '1.05'; +our $VERSION = '1.06'; -our $diinfo = do 'digraphs.inc.pl'; -our %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} } - sort { length $a <=> length $b } keys %$diinfo; +our $uc = do 'unicode-char.inc.pl'; sub new { my ($class) = @_; @@ -19,29 +17,27 @@ sub new { sub glyph_info { my ($self, $codepoint) = @_; - if (defined (my $mnem = $di{$codepoint})) { - return ($diinfo->{$mnem}, length $mnem == 2 ? $mnem : undef); - } - require Unicode::UCD; - if (my $fullinfo = Unicode::UCD::charinfo($codepoint)) { - return [$codepoint, @$fullinfo{qw/name category script string/}]; - } - return [$codepoint]; + return $uc->{chr $codepoint} || eval { + require Unicode::UCD; + if (my $fullinfo = Unicode::UCD::charinfo($codepoint)) { + return [@$fullinfo{qw/category name - string/}]; + } + } || []; } sub glyph_html { my ($self, $char) = @_; - my ($info, $mnem) = $self->glyph_info(ord $char); - my ($codepoint, $name, $prop, $script, $string) = @$info; + my $codepoint = ord $char; + my $info = $self->glyph_info($codepoint); + my ($class, $name, $mnem, $string) = @$info; my $cell = EscapeHTML($string || $char); my $title = sprintf 'U+%04X%s', $codepoint, $name && " ($name)"; - my @class = ('X', grep {$_} $prop, $script); - $cell = "$cell" if $prop and $prop eq 'Zs'; + $cell = "$cell" if $class =~ /\bZs\b/; $cell = ' ' if $cell eq ''; - return ($cell, EscapeHTML($title), join(' ', @class), $mnem); + return ($cell, EscapeHTML($title), "X $class", $mnem); } sub glyphs_html { @@ -85,9 +81,11 @@ sub cell { ($cell, $title, my $class, $mnem) = $self->glyphs_html($input); if ($self->{style} = 'di') { - if (defined $mnem) { - push @class, $class =~ /\bXz\b/ ? ('l2', 'u-prop') # unofficial - : ('l3', 'u-di'); # standard digraph + if ($class =~ /\bu-di\b/) { + push @class, ('l3', 'u-di'); # standard digraph + } + elsif ($class =~ /\bu-prop\b/) { + push @class, ('l2', 'u-prop'); # unofficial } } else { diff --git a/tools/convert-unicode.pl b/tools/convert-unicode.pl new file mode 100755 index 0000000..731210b --- /dev/null +++ b/tools/convert-unicode.pl @@ -0,0 +1,116 @@ +#!/usr/bin/env perl +use 5.010; +use strict; +use warnings; +use utf8; + +use open OUT => ':utf8', ':std'; +use Data::Dump 'pp'; + +our $VERSION = '1.00'; + +my %info = ( + "\xAD" => {string => '-'}, + "\x{200E}" => {string => '→'}, + "\x{200F}" => {string => '←'}, +); +$info{chr $_} //= {} for 32 .. 126; + +my %diinc = ( + 'digraphs.inc.pl' => 'u-di', +); +for (keys %diinc) { + -e $_ or next; + my $di = do $_ or die "Error reading digraphs file $_: ", $@ || $!; + while (my ($mnem, $cp) = each %$di) { + length $mnem == 2 or next; # limit to digraphs + $cp = chr $cp->[0] if ref $cp; # old style array + $info{$cp}->{di} //= $mnem; + $info{$cp}->{class}->{$_}++ for $diinc{$_}; + } +} + +for my $chr (keys %info) { + my $cp = ord $chr; + # attempt to get unicode character information + my $info = eval { + require Unicode::UCD; + Unicode::UCD::charinfo($cp) + || { block => '?', category => 'Xn', name => '', script => '' } + } or next; + + $info->{$_} = $info{$chr}->{$_} for qw(di class string); + + # categorise by unicode types and writing script + $info->{class}->{$_}++ for $info->{category}; + $info->{class}->{"u-$_"}++ for $info->{script} || (); + + # add custom categories for certain blocks + $info->{class}->{Xa}++ if $info->{block} eq 'Basic Latin'; + $info->{class}->{Xl}++ if $info->{block} eq 'Latin-1 Supplement'; + + given ($cp) { + when ($info->{string}) { + # keep predefined presentational string + } + when ($info->{combining}) { + # overlay combining accents + $info->{string} = chr(9676) . $chr; + } + when (($cp & ~0b1001_1111) == 0 or $cp == 127) { + # control characters (first 32 chars from 0 and 128) + # rename to something more descriptive + $info->{name} = $info->{unicode10} + ? '<'.$info->{unicode10}.'>' # the old name was much more useful + : sprintf('', $cp); # at least identify by value + # show descriptive symbols instead of control chars themselves + $info->{string} = $cp < 32 ? chr($cp + 0x2400) : chr(0xFFFD); + } + } + + $info{$chr} = $info; +} + +# output perl code of hash +say '+{'; +for my $cp (sort keys %info) { + $info{$cp}->{classstr} = join(' ', sort keys %{ $info{$cp}->{class} }); + # convert info hashes into arrays of strings to output in display order + my $row = [ map { $info{$cp}->{$_} } qw/classstr name di string/ ]; + # strip off trailing missing values (especially string may be unknown) + defined $row->[-1] ? last : pop @$row for 1 .. @$row; + # final line (assume safe within single quotes) + say sprintf '"\x{%X}" => [%s],', + ord $cp, join(',', map { escapeq($_) } @$row); +} +say '}'; + +sub escapeq { + my $_ = shift; + return 'undef' if not defined; + s/(['\\])/\\$1/g; + return "'$_'"; +} + +__END__ + +=head1 NAME + +convert-unicode.pl - Gather Unicode character details in Perl array + +=head1 SYNOPSIS + + convert-unicode > unicode-char.inc.pl + +Test by printing the description of U+0041 (latin A): + + perl -e'$u = do "unicode-char.inc.pl"; print $u->{A}->[1]' + +=head1 AUTHOR + +Mischa POSLAWSKY + +=head1 LICENSE + +Licensed under the GNU Affero General Public License version 3. + diff --git a/unicode.plp b/unicode.plp index 4febc1c..f0e435f 100644 --- a/unicode.plp +++ b/unicode.plp @@ -11,7 +11,7 @@ Html({ unicode glyph char character reference common ipa symbol sign mark table digraph '], stylesheet => [qw'light dark mono circus red'], - data => [qw'digraphs.inc.pl unicode-table.inc.pl'], + data => [qw'unicode-table.inc.pl unicode-char.inc.pl'], }); :>