browser: new mkusage-wikimedia to parse analytics tsv
author Mischa POSLAWSKY <perl@shiar.org>
Thu, 27 Apr 2017 00:33:39 +0000 (02:33 +0200)
committer Mischa POSLAWSKY <perl@shiar.org>
Mon, 29 May 2017 16:51:18 +0000 (18:51 +0200)
Amazing export from analytics.wikimedia.org continuing previous squid stats
but all nicely prepared, only needing translation to caniuse agent identifiers.
Contains all samples, so restrict by year for now.

tools/mkusage-wikimedia [new file with mode: 0755]

diff --git a/tools/mkusage-wikimedia b/tools/mkusage-wikimedia
new file mode 100755 (executable)
index 0000000..5e264ed
--- /dev/null
@@ -0,0 +1,65 @@
+#!/usr/bin/perl
+use 5.014;
+use warnings;
+
+use Data::Dump 'pp';
+
+our $VERSION = '1.00';
+
+# Lookup table: browser family as named in the input (with spaces turned
+# into dashes) => caniuse agent identifier.  Families that are not listed
+# here are left out of the results entirely.
+my %BROWSERID = (
+       'IE'                      => 'ie',
+       'IE-Mobile'               => 'ie_mob',
+       'Edge'                    => 'edge',
+       'Edge-Mobile'             => 'edge',
+       'Firefox'                 => 'firefox',
+       'Firefox-Mobile'          => 'and_ff',
+       'Safari'                  => 'safari',
+       'Mobile-Safari'           => 'ios_saf',
+       'Mobile-Safari-UIWebView' => 'ios_saf',
+       'Chrome'                  => 'chrome',
+       'Chromium'                => 'chrome',
+       'Chrome-Mobile'           => 'and_chr',
+       'Chrome-Mobile-iOS'       => 'and_chr',
+       'Android'                 => 'android',
+       'Opera'                   => 'opera',
+       'Opera-Mini'              => 'op_mini',
+       'BlackBerry-WebKit'       => 'bb',
+       'UC-Browser'              => 'and_uc',
+);
+
+# Accumulated result: agent identifier => { version => share }, next to
+# dash-prefixed metadata entries (-title, -site, and while reading also
+# -total and -date).
+my %count = (
+       -title  => 'Wikimedia',
+       -site   => 'https://analytics.wikimedia.org/',
+);
+
+# Only rows whose date column matches this pattern are counted.
+my $recent = qr/^2017-/;
+
+# Sanity check: the header line must hold four tab-separated fields.
+die "unexpected amount of columns in header\n"
+       unless (readline =~ tr/\t//) == 3;
+
+while (defined(my $line = readline)) {
+       my ($day, $family, $major, $share) = split /\t/, $line;
+       next unless $day =~ $recent;
+
+       # Input names contain spaces where the lookup table uses dashes.
+       $family =~ tr/ /-/;
+       my $agent = $BROWSERID{$family}
+               or next;  # unmapped browsers are ignored
+
+       $count{$agent}{$major} += $share;
+       $count{-total}         += $share;
+       $count{-date}{$day}++;
+}
+
+# Without a single counted sample there is no total to scale by; the
+# division below would abort with "Illegal division by zero", so bail out
+# with an explanation instead.
+my $total = delete $count{-total}
+       or die "no matching samples found in input\n";
+
+# Replace the set of seen dates by a "first to last" description
+# (first and last according to a plain string sort).
+$_ = join ' to ', (sort keys %{$_})[0, -1] for $count{-date};
+
+# Rescale all shares so the counted browsers together add up to 100.
+my $mult = 100 / $total;
+for (values %count) {
+       ref $_ eq 'HASH' or next;  # skip scalar metadata (-title, -site, -date)
+       $_ *= $mult for values %{$_};
+}
+
+# Dump the structure to standard output, prefixed by a literal plus sign.
+say '+', pp(\%count);
+
+__END__
+
+=head1 NAME
+
+mkusage-wikimedia - convert Wikimedia browser statistics to caniuse agent usage
+
+=head1 USAGE
+
+       curl https://analytics.wikimedia.org/datasets/periodic/reports/metrics/browser/all_sites_by_browser_family_and_major_percent.tsv |
+       ./mkusage-wikimedia >browser-usage.inc.pl
+
+=head1 DESCRIPTION
+
+Reads tab-separated rows of date, browser family, version and percentage
+(preceded by one header line) from standard input or the files named as
+arguments.  Rows dated outside the hard-coded year and browser families
+without a known caniuse identifier are skipped.  The remaining percentages
+are summed per identifier and version, rescaled to add up to 100, and
+printed as a perl data structure.
+