#!/usr/bin/env perl
use 5.014;
use warnings;
use utf8;
no if $] >= 5.018, warnings => 'experimental::smartmatch';
use lib '.';

use open OUT => ':encoding(utf-8)', ':std';
use Data::Dump 'pp';

our $VERSION = '1.03';

# Character details collected so far, keyed by the character itself.
# Seeded with presentational strings for invisible formatting characters,
# so the generated table can show something printable in their place.
my %info = (
	"\xAD"     => {string => '-'},  # soft hyphen
	"\x{200B}" => {string => '␣'},  # zero width space
	"\x{200C}" => {string => '|'},  # zero width non-joiner: ISO-9995-7-081 lookalike (alt: ∣ ⊺ ⟙)
	"\x{200D}" => {string => '⁀'},  # zero width joiner (alt: ∤ |ͯ ⨝)
	"\x{200E}" => {string => '→'},  # left-to-right mark
	"\x{200F}" => {string => '←'},  # right-to-left mark
);

# combining double diacritics are displayed in between
# a pair of dotted circles (U+25CC, decimal 9676)
for my $double (0x35C .. 0x362, 0x1DCD, 0x1DFC) {
	my $mark = chr $double;
	$info{$mark} = {string => chr(9676) . $mark . chr(9676)};
}

# printable ASCII is always included, even without any further details
for my $ascii (32 .. 126) {
	$info{chr $ascii} //= {};
}

# Register every character listed in the unicode tables include
# (a hash of hashes of arrays, going by how it is traversed here).
# Failure is not fatal: warn and carry on with what is known already.
eval {
	my $tables = do './unicode-table.inc.pl' or die $@ || $!;
	for my $group (values %{$tables}) {
		for my $cells (values %{$group}) {
			for my $cell (@{$cells}) {
				# only single characters are taken; longer values are meta
				next unless length($cell) == 1;
				# unescape by dropping the first backslash
				# (the loop variable is an alias, so this alters the table itself)
				# NOTE(review): coming after the length check, this can only
				# ever turn a lone backslash into an empty key - confirm intent
				$cell =~ s/\\//;
				$info{$cell} //= {};
			}
		}
	}
	1;
} or warn "Failed reading unicode tables: $@";

# Also register the leading character of each key label
# found in additional keyboard layouts; a missing layout only warns.
for my $layout ('macos-abc', 'windows') {
	eval {
		my $kbd = do "./keyboard/altgr/$layout.eng.inc.pl" or die $@ || $!;
		for my $label (values %{ $kbd->{key} }) {
			# strip dotted circle placeholders (in place, label is an alias)
			$label =~ s/◌//g;
			# take the first remaining character, if there is one
			my ($first) = $label =~ m/\A(.)/ or next;
			$info{$first} //= {};
		}
		1;
	} or warn "Failed reading additional keyboard map $layout: $@";
}

# Note the HTML entity name of each character which has one.
# This also adds entries for characters not encountered before.
eval {
	require HTML::Entities;
	our %char2entity;
	HTML::Entities->import('%char2entity');
	for my $char (keys %char2entity) {
		my $entity = $char2entity{$char};
		# only actual aliases: skip anything without a single letter
		next unless $entity =~ /[a-zA-Z]/;
		# keep the bare name by cutting off the first and last character
		# (presumably the & and ; delimiters)
		$info{$char}->{html} = substr($entity, 1, -1);
	}
	1;
} or warn "Failed importing html entities: $@";

# Digraph include files, each with the class to flag on its characters.
my %diinc = (
	'./data/digraphs-rfc.inc.pl' => 'u-di',
	'./data/digraphs-shiar.inc.pl' => 'u-prop',
	'./data/digraphs-vim.inc.pl' => 'u-vim',
);

# Files are processed in sorted order (rfc, shiar, vim); since a mnemonic
# is only set where none exists yet, earlier files take precedence.
# Absent files are skipped silently, unreadable ones are fatal.
for my $file (sort keys %diinc) {
	-e $file or next;
	# a lexical loop variable is used instead of the global $_,
	# so that code run by the included file cannot clobber it
	my $di = do $file or die "Error reading digraphs file $file: ", $@ || $!;
	my $class = $diinc{$file};  # identical for all entries of this file

	for my $mnem (sort keys %{$di}) {
		length $mnem == 2 or next;  # limit to digraphs
		# the value is used as a key into %info
		# (presumably the character itself; verify against the data files)
		my $chr = $di->{$mnem};
		$info{$chr}->{di} //= $mnem;
		$info{$chr}->{class}->{$class}++;
	}
}

# Flag each known character with the unicode version introducing it,
# as found by code point in the age map; failure to load only warns.
eval {
	my $agemap = do './data/unicode-age.inc.pl' or die $@ || $!;
	for my $chr (keys %info) {
		my $version = $agemap->{ord $chr};
		next unless $version;  # nothing known for this code point
		$info{$chr}{class}{"u-v$version"}++;
	}
	1;
} or warn "Failed including unicode version data: $@";

# Replace each entry by its record from the Unicode Character Database,
# extended with the details collected above and derived classes.
for my $chr (keys %info) {
	my $cp = ord $chr;

	# attempt to get unicode character information,
	# with a placeholder record for code points it does not know
	my $details = eval {
		require Unicode::UCD;
		Unicode::UCD::charinfo($cp)
			|| { block => '?', category => 'Xn', name => '', script => '' }
	} or next;  # only false if the lookup died: leave the entry untouched

	# carry over everything gathered before
	my $known = $info{$chr};
	@{$details}{keys %{$known}} = values %{$known};

	my $class = $details->{class} //= {};

	# ignore vim flag in addition to rfc support, replace otherwise:
	# u-vim is always removed, and becomes u-prop unless u-di is set
	if (delete $class->{'u-vim'}) {
		$class->{'u-prop'}++ unless $class->{'u-di'};
	}

	# categorise by unicode types and writing script
	$class->{ $details->{category} }++;
	$class->{ $details->{script} }++ if $details->{script};

	# add custom categories for certain blocks
	my $block = $details->{block};
	$class->{Xa}++ if $block eq 'Basic Latin';
	$class->{Xl}++ if $block eq 'Latin-1 Supplement';

	# determine a presentational string unless one was predefined
	if (not $details->{string}) {
		if ($details->{combining}) {
			# overlay combining accents on a dotted circle
			$details->{string} = chr(9676) . $chr;
		}
		elsif ($cp <= 0x1F or $cp >= 0x7F && $cp <= 0x9F) {
			# control characters (first 32 chars from 0 and 128, and 127)
			# rename to something more descriptive
			my $oldname = $details->{unicode10};
			$details->{name} = $oldname
				? "<$oldname>"  # the old name was much more useful
				: sprintf('<control U+%04X>', $cp);  # at least identify by value
			# show descriptive symbols instead of control chars themselves:
			# the matching picture at U+2400 and up, or else U+FFFD
			$details->{string} = chr(
				$cp < 32   ? $cp + 0x2400 :
				$cp == 127 ? 0x2421 :
				             0xFFFD
			);
		}
	}

	$info{$chr} = $details;
}

# Print the collected table as perl source of a hash reference,
# mapping each character to an array of its details.
say "# automatically generated by $0";
say 'use utf8;';
say '+{';
for my $chr (sort keys %info) {
	my $entry = $info{$chr};
	$entry->{classstr} = join(' ', sort keys %{ $entry->{class} });
	# pick the values to output, in display order
	my @row = @{$entry}{qw/classstr name di html string/};
	# strip off trailing missing values (especially string may be unknown)
	pop @row while @row and not defined $row[-1];
	# key as escaped code point, each value quoted by escapeq
	my $values = join(',', map { escapeq($_) } @row);
	say sprintf '"\x{%X}" => [%s],', ord $chr, $values;
}
say '}';

# Format a value as perl source: a single-quoted string literal with
# embedded single quotes and backslashes escaped by a backslash,
# or the bareword undef if the value is not defined.
sub escapeq {
	my ($value) = @_;
	defined $value or return 'undef';
	my $escaped = $value =~ s/(['\\])/\\$1/gr;
	return "'" . $escaped . "'";
}

__END__

=head1 NAME

mkcharinfo - Gather Unicode character details in a Perl hash of arrays

=head1 SYNOPSIS

    mkcharinfo > unicode-char.inc.pl

Test by printing the description of U+0041 (latin A):

    perl -e'$u = do "./unicode-char.inc.pl"; print $u->{A}->[1]'

=head1 AUTHOR

Mischa POSLAWSKY <perl@shiar.org>

=head1 LICENSE

Licensed under the GNU Affero General Public License version 3.