X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/3d6f1a6afd0d5bd83e105821553124b37e036668..f2e965b21a4be2aea69d887f619481e04f3ca58a:/tools/convert-unicode.pl

diff --git a/tools/convert-unicode.pl b/tools/convert-unicode.pl
new file mode 100755
index 0000000..731210b
--- /dev/null
+++ b/tools/convert-unicode.pl
@@ -0,0 +1,116 @@
+#!/usr/bin/env perl
+use 5.010;
+use strict;
+use warnings;
+use utf8;
+
+use open OUT => ':utf8', ':std';
+use Data::Dump 'pp';
+
+our $VERSION = '1.00';
+
+my %info = (
+	"\xAD"     => {string => '-'},
+	"\x{200E}" => {string => 'â'},
+	"\x{200F}" => {string => 'â'},
+);
+$info{chr $_} //= {} for 32 .. 126;
+
+my %diinc = (
+	'digraphs.inc.pl' => 'u-di',
+);
+for (keys %diinc) {
+	-e $_ or next;
+	my $di = do $_ or die "Error reading digraphs file $_: ", $@ || $!;
+	while (my ($mnem, $cp) = each %$di) {
+		length $mnem == 2 or next;  # limit to digraphs
+		$cp = chr $cp->[0] if ref $cp;  # old style array
+		$info{$cp}->{di} //= $mnem;
+		$info{$cp}->{class}->{$_}++ for $diinc{$_};
+	}
+}
+
+for my $chr (keys %info) {
+	my $cp = ord $chr;
+	# attempt to get unicode character information
+	my $info = eval {
+		require Unicode::UCD;
+		Unicode::UCD::charinfo($cp)
+			|| { block => '?', category => 'Xn', name => '', script => '' }
+	} or next;
+
+	$info->{$_} = $info{$chr}->{$_} for qw(di class string);
+
+	# categorise by unicode types and writing script
+	$info->{class}->{$_}++ for $info->{category};
+	$info->{class}->{"u-$_"}++ for $info->{script} || ();
+
+	# add custom categories for certain blocks
+	$info->{class}->{Xa}++ if $info->{block} eq 'Basic Latin';
+	$info->{class}->{Xl}++ if $info->{block} eq 'Latin-1 Supplement';
+
+	given ($cp) {
+		when ($info->{string}) {
+			# keep predefined presentational string
+		}
+		when ($info->{combining}) {
+			# overlay combining accents
+			$info->{string} = chr(9676) . $chr;
+		}
+		when (($cp & ~0b1001_1111) == 0 or $cp == 127) {
+			# control characters (first 32 chars from 0 and 128)
+			# rename to something more descriptive
+			$info->{name} = $info->{unicode10}
+				? '<'.$info->{unicode10}.'>'  # the old name was much more useful
+				: sprintf('<control U+%04X>', $cp);  # at least identify by value
+			# show descriptive symbols instead of control chars themselves
+			$info->{string} = $cp < 32 ? chr($cp + 0x2400) : chr(0xFFFD);
+		}
+	}
+
+	$info{$chr} = $info;
+}
+
+# output perl code of hash
+say '+{';
+for my $cp (sort keys %info) {
+	$info{$cp}->{classstr} = join(' ', sort keys %{ $info{$cp}->{class} });
+	# convert info hashes into arrays of strings to output in display order
+	my $row = [ map { $info{$cp}->{$_} } qw/classstr name di string/ ];
+	# strip off trailing missing values (especially string may be unknown)
+	defined $row->[-1] ? last : pop @$row for 1 .. @$row;
+	# final line (assume safe within single quotes)
+	say sprintf '"\x{%X}" => [%s],',
+		ord $cp, join(',', map { escapeq($_) } @$row);
+}
+say '}';
+
+sub escapeq {
+	my $_ = shift;
+	return 'undef' if not defined;
+	s/(['\\])/\\$1/g;
+	return "'$_'";
+}
+
+__END__
+
+=head1 NAME
+
+convert-unicode.pl - Gather Unicode character details in Perl array
+
+=head1 SYNOPSIS
+
+    convert-unicode > unicode-char.inc.pl
+
+Test by printing the description of U+0041 (latin A):
+
+    perl -e'$u = do "unicode-char.inc.pl"; print $u->{A}->[1]'
+
+=head1 AUTHOR
+
+Mischa POSLAWSKY <perl@shiar.org>
+
+=head1 LICENSE
+
+Licensed under the GNU Affero General Public License version 3.
+