unicode: dedicated include with character details

author Mischa POSLAWSKY <perl@shiar.org>

Wed, 5 Jan 2011 21:15:54 +0000 (22:15 +0100)

committer Mischa POSLAWSKY <perl@shiar.org>

Tue, 11 Jan 2011 22:41:38 +0000 (23:41 +0100)
author Mischa POSLAWSKY <perl@shiar.org>
Wed, 5 Jan 2011 21:15:54 +0000 (22:15 +0100)
committer Mischa POSLAWSKY <perl@shiar.org>
Tue, 11 Jan 2011 22:41:38 +0000 (23:41 +0100)
diff --git a/Shiar_Sheet/FormatChar.pm b/Shiar_Sheet/FormatChar.pm

index 4668c5542dc1f52feacdfd972dd9f50755179064..f22f42bebd517cc1ffbd5e43a0d86b6ccd7595ad 100644 (file)
--- a/Shiar_Sheet/FormatChar.pm
+++ b/Shiar_Sheet/FormatChar.pm
@@ -6,11 +6,9 @@ use warnings;
  use Data::Dump 'pp';
  use PLP::Functions 'EscapeHTML';
  
-our $VERSION = '1.05';
+our $VERSION = '1.06';
  
-our $diinfo = do 'digraphs.inc.pl';
-our %di = map { $diinfo->{$_}->[0] => $_ } grep { ref $diinfo->{$_} }
-       sort { length $a <=> length $b } keys %$diinfo;
+our $uc = do 'unicode-char.inc.pl';
  
  sub new {
         my ($class) = @_;
@@ -19,29 +17,27 @@ sub new {
  
  sub glyph_info {
         my ($self, $codepoint) = @_;
-       if (defined (my $mnem = $di{$codepoint})) {
-               return ($diinfo->{$mnem}, length $mnem == 2 ? $mnem : undef);
-       }
-       require Unicode::UCD;
-       if (my $fullinfo = Unicode::UCD::charinfo($codepoint)) {
-               return [$codepoint, @$fullinfo{qw/name category script string/}];
-       }
-       return [$codepoint];
+       return $uc->{chr $codepoint} || eval {
+               require Unicode::UCD;
+               if (my $fullinfo = Unicode::UCD::charinfo($codepoint)) {
+                       return [@$fullinfo{qw/category name - string/}];
+               }
+       } || [];
  }
  
  sub glyph_html {
         my ($self, $char) = @_;
-       my ($info, $mnem) = $self->glyph_info(ord $char);
-       my ($codepoint, $name, $prop, $script, $string) = @$info;
+       my $codepoint = ord $char;
+       my $info = $self->glyph_info($codepoint);
+       my ($class, $name, $mnem, $string) = @$info;
  
         my $cell = EscapeHTML($string || $char);
         my $title = sprintf 'U+%04X%s', $codepoint, $name && " ($name)";
-       my @class = ('X', grep {$_} $prop, $script);
  
-       $cell = "<span>$cell</span>" if $prop and $prop eq 'Zs';
+       $cell = "<span>$cell</span>" if $class =~ /\bZs\b/;
         $cell = '&nbsp;' if $cell eq '';
  
-       return ($cell, EscapeHTML($title), join(' ', @class), $mnem);
+       return ($cell, EscapeHTML($title), "X $class", $mnem);
  }
  
  sub glyphs_html {
@@ -85,9 +81,11 @@ sub cell {
                 ($cell, $title, my $class, $mnem) = $self->glyphs_html($input);
  
                 if ($self->{style} = 'di') {
-                       if (defined $mnem) {
-                               push @class, $class =~ /\bXz\b/ ? ('l2', 'u-prop') # unofficial
-                                       : ('l3', 'u-di'); # standard digraph
+                       if ($class =~ /\bu-di\b/) {
+                               push @class, ('l3', 'u-di'); # standard digraph
+                       }
+                       elsif ($class =~ /\bu-prop\b/) {
+                               push @class, ('l2', 'u-prop'); # unofficial
                         }
                 }
                 else {
diff --git a/tools/convert-unicode.pl b/tools/convert-unicode.pl

new file mode 100755 (executable)

index 0000000..731210b
--- /dev/null
+++ b/tools/convert-unicode.pl
@@ -0,0 +1,116 @@
+#!/usr/bin/env perl
+use 5.010;
+use strict;
+use warnings;
+use utf8;
+
+use open OUT => ':utf8', ':std';
+use Data::Dump 'pp';
+
+our $VERSION = '1.00';
+
+my %info = (
+       "\xAD"     => {string => '-'},
+       "\x{200E}" => {string => '→'},
+       "\x{200F}" => {string => '←'},
+);
+$info{chr $_} //= {} for 32 .. 126;
+
+my %diinc = (
+       'digraphs.inc.pl' => 'u-di',
+);
+for (keys %diinc) {
+       -e $_ or next;
+       my $di = do $_ or die "Error reading digraphs file $_: ", $@ || $!;
+       while (my ($mnem, $cp) = each %$di) {
+               length $mnem == 2 or next;  # limit to digraphs
+               $cp = chr $cp->[0] if ref $cp;  # old style array
+               $info{$cp}->{di} //= $mnem;
+               $info{$cp}->{class}->{$_}++ for $diinc{$_};
+       }
+}
+
+for my $chr (keys %info) {
+       my $cp = ord $chr;
+       # attempt to get unicode character information
+       my $info = eval {
+               require Unicode::UCD;
+               Unicode::UCD::charinfo($cp)
+                       || { block => '?', category => 'Xn', name => '', script => '' }
+       } or next;
+
+       $info->{$_} = $info{$chr}->{$_} for qw(di class string);
+
+       # categorise by unicode types and writing script
+       $info->{class}->{$_}++ for $info->{category};
+       $info->{class}->{"u-$_"}++ for $info->{script} || ();
+
+       # add custom categories for certain blocks
+       $info->{class}->{Xa}++ if $info->{block} eq 'Basic Latin';
+       $info->{class}->{Xl}++ if $info->{block} eq 'Latin-1 Supplement';
+
+       given ($cp) {
+               when ($info->{string}) {
+                       # keep predefined presentational string
+               }
+               when ($info->{combining}) {
+                       # overlay combining accents
+                       $info->{string} = chr(9676) . $chr;
+               }
+               when (($cp & ~0b1001_1111) == 0 or $cp == 127) {
+                       # control characters (first 32 chars from 0 and 128)
+                       # rename to something more descriptive
+                       $info->{name} = $info->{unicode10}
+                               ? '<'.$info->{unicode10}.'>'  # the old name was much more useful
+                               : sprintf('<control U+%04X>', $cp);  # at least identify by value
+                       # show descriptive symbols instead of control chars themselves
+                       $info->{string} = $cp < 32 ? chr($cp + 0x2400) : chr(0xFFFD);
+               }
+       }
+
+       $info{$chr} = $info;
+}
+
+# output perl code of hash
+say '+{';
+for my $cp (sort keys %info) {
+       $info{$cp}->{classstr} = join(' ', sort keys %{ $info{$cp}->{class} });
+       # convert info hashes into arrays of strings to output in display order
+       my $row = [ map { $info{$cp}->{$_} } qw/classstr name di string/ ];
+       # strip off trailing missing values (especially string may be unknown)
+       defined $row->[-1] ? last : pop @$row for 1 .. @$row;
+       # final line (assume safe within single quotes)
+       say sprintf '"\x{%X}" => [%s],',
+               ord $cp, join(',', map { escapeq($_) } @$row);
+}
+say '}';
+
+sub escapeq {
+       my $_ = shift;
+       return 'undef' if not defined;
+       s/(['\\])/\\$1/g;
+       return "'$_'";
+}
+
+__END__
+
+=head1 NAME
+
+convert-unicode.pl - Gather Unicode character details in Perl array
+
+=head1 SYNOPSIS
+
+    convert-unicode > unicode-char.inc.pl
+
+Test by printing the description of U+0041 (latin A):
+
+    perl -e'$u = do "unicode-char.inc.pl"; print $u->{A}->[1]'
+
+=head1 AUTHOR
+
+Mischa POSLAWSKY <perl@shiar.org>
+
+=head1 LICENSE
+
+Licensed under the GNU Affero General Public License version 3.
+
diff --git a/unicode.plp b/unicode.plp

index 4febc1cd488082bb8d310e906780addc2df2562c..f0e435f6940a8c31f093fadaa95978211d9213e7 100644 (file)
--- a/unicode.plp
+++ b/unicode.plp
@@ -11,7 +11,7 @@ Html({
                 unicode glyph char character reference common ipa symbol sign mark table digraph
         '],
         stylesheet => [qw'light dark mono circus red'],
-       data => [qw'digraphs.inc.pl unicode-table.inc.pl'],
+       data => [qw'unicode-table.inc.pl unicode-char.inc.pl'],
  });
  
  :>
author	Mischa POSLAWSKY <perl@shiar.org>
	Wed, 5 Jan 2011 21:15:54 +0000 (22:15 +0100)
committer	Mischa POSLAWSKY <perl@shiar.org>
	Tue, 11 Jan 2011 22:41:38 +0000 (23:41 +0100)
Shiar_Sheet/FormatChar.pm		patch \| blob \| history
tools/convert-unicode.pl	[new file with mode: 0755]	patch \| blob
unicode.plp		patch \| blob \| history