use strict;
use warnings;
use utf8;
+no if $] >= 5.018, warnings => 'experimental::smartmatch';
use open OUT => ':utf8', ':std';
use Data::Dump 'pp';
eval {  # anything that dies while loading or reading the entity table is trapped by this eval
require HTML::Entities;
- while (my ($char, $entity) = each %HTML::Entities::char2entity) {
+ our %char2entity;  # declared so the name is usable under strict; expected to be aliased to the module's table by the import on the next line
+ HTML::Entities->import('%char2entity');
+ while (my ($char, $entity) = each %char2entity) {
$entity =~ /[a-zA-Z]/ or next; # only actual aliases: entities containing no ASCII letter are skipped (presumably numeric references - confirm)
$info{$char}->{html} = substr($entity, 1, -1);  # store the entity without its first and last character (presumably '&' and ';')
}
}
}  # NOTE(review): one more closing brace than visible openers - the matching opener presumably lies outside this excerpt
+eval {  # best effort: a die in here only triggers the warning attached after the block
+ # tag each known character with the Unicode version that introduced it
+ my $agemap = do 'unicode-age.inc.pl' or die $@ || $!;  # do() yields the file's final value; $@ or $! holds the reason on failure. NOTE(review): bare relative path - perl 5.26+ no longer searches '.' for do FILE, confirm the file is reachable via @INC
+ for my $chr (keys %info) {
+ my $version = $agemap->{ord $chr} or next;  # the map is indexed by numeric code point; characters without a true entry are skipped
+ $info{$chr}->{class}->{'u-v'.$version}++  # recorded as a class key made of 'u-v' followed by the version
+ }
+ 1;  # make the eval return true on success so that "or warn" fires only after a die
+} or warn "Failed including unicode version data $@";  # NOTE(review): no separator precedes $@ in this message
+
for my $chr (keys %info) {
my $cp = ord $chr;
# attempt to get unicode character information
|| { block => '?', category => 'Xn', name => '', script => '' }
} or next;
- $info->{$_} = $info{$chr}->{$_} for qw(di html class string);
+ $info->{$_} = $info{$chr}->{$_} for keys %{ $info{$chr} };
# categorise by unicode types and writing script
$info->{class}->{$_}++ for $info->{category};
? '<'.$info->{unicode10}.'>' # the old name was much more useful
: sprintf('<control U+%04X>', $cp); # at least identify by value
# show descriptive symbols instead of control chars themselves
- $info->{string} = $cp < 32 ? chr($cp + 0x2400) : chr(0xFFFD);
+ $info->{string} = $cp < 32 ? chr($cp + 0x2400) :
+ $cp == 127 ? chr(0x2421) :
+ chr(0xFFFD);
}
}
say '}';
sub escapeq {
- my $_ = shift;
+ local $_ = shift;
return 'undef' if not defined;
s/(['\\])/\\$1/g;
return "'$_'";