X-Git-Url: http://git.shiar.nl/sheet.git/blobdiff_plain/b5b3537710ed9f73e1c867e0cc27d50439eaf4cd..HEAD:/tools/mkcharinfo

diff --git a/tools/mkcharinfo b/tools/mkcharinfo
index 577ffb1..8bf3d54 100755
--- a/tools/mkcharinfo
+++ b/tools/mkcharinfo
@@ -1,23 +1,31 @@
 #!/usr/bin/env perl
-use 5.010;
-use strict;
+use 5.014;
 use warnings;
 use utf8;
+no if $] >= 5.018, warnings => 'experimental::smartmatch';
+use lib '.';
 
-use open OUT => ':utf8', ':std';
+use open OUT => ':encoding(utf-8)', ':std';
 use Data::Dump 'pp';
 
-our $VERSION = '1.00';
+our $VERSION = '1.03';
 
 my %info = (
+	# prepare presentational string for some control(lish) entries
 	"\xAD"     => {string => '-'},
 	"\x{200E}" => {string => 'â'},
 	"\x{200F}" => {string => 'â'},
+	"\x{200B}" => {string => 'â£'}, # nbsp: ~ in TeX
+	"\x{200C}" => {string => '|'}, # ISO-9995-7-081 lookalike (alt: â£ âº â)
+	"\x{200D}" => {string => 'â'}, # join (alt: â¤ |Í¯ â¨)
+	(map {( $_ => {string => chr(9676).$_.chr(9676)} )} map {chr} # combining double
+		0x35C .. 0x362, 0x1DCD, 0x1DFC,
+	),
 );
 $info{chr $_} //= {} for 32 .. 126;
 
 eval {
-	my $tables = do 'unicode-table.inc.pl' or die $@ || $!;
+	my $tables = do './unicode-table.inc.pl' or die $@ || $!;
 	for (values %$tables) {
 		for (values %$_) {
 			for (@$_) {
@@ -30,9 +38,19 @@ eval {
 	1;
 } or warn "Failed reading unicode tables: $@";
 
+for my $layout ('macos-abc', 'windows') {
+	eval {
+		my $kbd = do "./keyboard/altgr/$layout.eng.inc.pl" or die $@ || $!;
+		$info{$_} //= {} for map {s/â//g; m/\A./g} values %{ $kbd->{key} };
+		1;
+	} or warn "Failed reading additional keyboard map $layout: $@";
+}
+
 eval {
 	require HTML::Entities;
-	while (my ($char, $entity) = each %HTML::Entities::char2entity) {
+	our %char2entity;
+	HTML::Entities->import('%char2entity');
+	while (my ($char, $entity) = each %char2entity) {
 		$entity =~ /[a-zA-Z]/ or next;  # only actual aliases
 		$info{$char}->{html} = substr($entity, 1, -1);
 	}
@@ -40,26 +58,35 @@ eval {
 } or warn "Failed importing html entities: $@";
 
 my %diinc = (
-	'digraphs.inc.pl' => 'u-di',
+	'./data/digraphs-rfc.inc.pl' => 'u-di',  # digraph source file => class flagged on its characters
+	'./data/digraphs-shiar.inc.pl' => 'u-prop',
+	'./data/digraphs-vim.inc.pl' => 'u-vim',
 );
-for (keys %diinc) {
+for (sort keys %diinc) {  # fixed order (rfc, shiar, vim) so the first-wins //= below is reproducible
 	-e $_ or next;
 	my $di = do $_ or die "Error reading digraphs file $_: ", $@ || $!;
-	while (my ($mnem, $cp) = each %$di) {
+	for my $mnem (sort keys %{$di}) {  # stable order: the first mnemonic seen for a character is kept
+		my $cp = $di->{$mnem};  # NOTE(review): used as an %info key below, so presumably a character string - confirm
 		length $mnem == 2 or next;  # limit to digraphs
 		my $class = $diinc{$_};
-		if (ref $cp) {
-			# old style array
-			$class = 'u-prop' if $cp->[2] and $cp->[2] =~ m/\bXz\b/;
-			$cp = chr $cp->[0];
-		}
 		$info{$cp}->{di} //= $mnem;
 		$info{$cp}->{class}->{$class}++;
 	}
 }
 
+eval {
+	# flag every known character with the unicode version that introduced it (class u-v<version>)
+	my $agemap = do './data/unicode-age.inc.pl' or die $@ || $!;
+	for my $chr (keys %info) {
+		my $version = $agemap->{ord $chr} or next;  # looked up by code point; skip if no (true) entry
+		$info{$chr}->{class}->{'u-v'.$version}++
+	}
+	1;  # true result, so a successful eval does not fall through to the warning
+} or warn "Failed including unicode version data: $@";
+
 for my $chr (keys %info) {
 	my $cp = ord $chr;
+	#my $info = glyph_mkinfo($cp) or next;
 	# attempt to get unicode character information
 	my $info = eval {
 		require Unicode::UCD;
@@ -67,7 +94,11 @@ for my $chr (keys %info) {
 			|| { block => '?', category => 'Xn', name => '', script => '' }
 	} or next;
 
-	$info->{$_} = $info{$chr}->{$_} for qw(di html class string);
+	$info->{$_} = $info{$chr}->{$_} for keys %{ $info{$chr} };
+
+	# drop the u-vim flag: redundant if the character already has u-di (rfc), otherwise count it as u-prop
+	$info->{class}->{'u-di'} or $info->{class}->{'u-prop'}++
+		if delete $info->{class}->{'u-vim'};
 
 	# categorise by unicode types and writing script
 	$info->{class}->{$_}++ for $info->{category};
@@ -77,22 +108,24 @@ for my $chr (keys %info) {
 	$info->{class}->{Xa}++ if $info->{block} eq 'Basic Latin';
 	$info->{class}->{Xl}++ if $info->{block} eq 'Latin-1 Supplement';
 
-	given ($cp) {
-		when ($info->{string}) {
+	{
+		if ($info->{string}) {
 			# keep predefined presentational string
 		}
-		when ($info->{combining}) {
+		elsif ($info->{combining}) {
 			# overlay combining accents
 			$info->{string} = chr(9676) . $chr;
 		}
-		when (($cp & ~0b1001_1111) == 0 or $cp == 127) {
+		elsif (($cp & ~0b1001_1111) == 0 or $cp == 127) {
 			# control characters (first 32 chars from 0 and 128)
 			# rename to something more descriptive
 			$info->{name} = $info->{unicode10}
 				? '<'.$info->{unicode10}.'>'  # the old name was much more useful
 				: sprintf('<control U+%04X>', $cp);  # at least identify by value
 			# show descriptive symbols instead of control chars themselves
-			$info->{string} = $cp < 32 ? chr($cp + 0x2400) : chr(0xFFFD);
+			$info->{string} = $cp < 32   ? chr($cp + 0x2400) :
+			                  $cp == 127 ? chr(0x2421) :
+			                               chr(0xFFFD);
 		}
 	}
 
@@ -100,6 +133,7 @@ for my $chr (keys %info) {
 }
 
 # output perl code of hash
+say "# automatically generated by $0";
 say 'use utf8;';
 say '+{';
 for my $cp (sort keys %info) {
@@ -115,7 +149,7 @@ for my $cp (sort keys %info) {
 say '}';
 
 sub escapeq {
-	my $_ = shift;
+	local $_ = shift;
 	return 'undef' if not defined;
 	s/(['\\])/\\$1/g;
 	return "'$_'";