198 lines
5 KiB
Perl
Executable file
198 lines
5 KiB
Perl
Executable file
#!/usr/bin/env perl
|
|
use 5.010;
|
|
use strict;
|
|
use warnings;
|
|
|
|
use Getopt::Long;
|
|
use Pod::Usage;
|
|
use MediaWiki::API;
|
|
use Test::More qw(no_plan);
|
|
use YAML::XS qw(Dump);
|
|
|
|
=head1 NAME
|
|
|
|
update-wiki-pages - Scrape the wiki for key/value wiki description pages
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
perl script/misc/update-wiki-pages config/wiki_pages.yml
|
|
|
|
Or with prove(1):
|
|
|
|
prove -e 'perl script/misc/update-wiki-pages' config/wiki_pages.yml
|
|
|
|
=cut
|
|
|
|
# Get the command-line options
|
|
Getopt::Long::Parser->new(
|
|
config => [ qw< bundling no_ignore_case no_require_order pass_through > ],
|
|
)->getoptions(
|
|
'h|help' => \my $help,
|
|
) or help();
|
|
|
|
# On --help
|
|
help() if $help;
|
|
|
|
my $out_file = $ARGV[0];
|
|
$out_file //= 'config/wiki_pages.yml';
|
|
|
|
help() unless -f $out_file;
|
|
|
|
# Get a API interface
|
|
my $mw = MediaWiki::API->new();
|
|
ok($mw, "Got a MediaWiki API");
|
|
$mw->{config}->{api_url} = 'https://wiki.openstreetmap.org/w/api.php';
|
|
$mw->{config}->{retries} = 5;
|
|
$mw->{config}->{retry_delay} = 30;
|
|
|
|
# All our goodies
|
|
my (%feature, %count);
|
|
|
|
# This is what you get on:
|
|
## http://wiki.openstreetmap.org/w/index.php?search=Template:KeyDescription&fulltext=Search&fulltext=Search
|
|
for my $lang ('', map { "${_}:" } qw[ Pt Fi De It HU Cz Fr RU Pl ]) {
|
|
ok(1, " Templates for language '$lang'");
|
|
|
|
# Key pages
|
|
ok(1, " Getting key pages");
|
|
my $cnt = stick_content_in_hash("key", "Template:${lang}KeyDescription", \%feature);
|
|
$cnt += stick_content_in_hash("key", "Template:${lang}Feature", \%feature);
|
|
ok(1, " Got $cnt key pages");
|
|
$count{key} += $cnt;
|
|
|
|
# Key prefix pages
|
|
ok(1, " Getting key prefix pages");
|
|
$cnt = stick_content_in_hash("key", "Template:${lang}KeyPrefixDescription", \%feature);
|
|
ok(1, " Got $cnt key prefix pages");
|
|
$count{keyprefix} += $cnt;
|
|
|
|
# Value pages
|
|
ok(1, " Getting value pages");
|
|
$cnt = stick_content_in_hash("tag", "Template:${lang}ValueDescription", \%feature);
|
|
ok(1, " Got $cnt value pages");
|
|
$count{value} += $cnt;
|
|
}
|
|
|
|
ok(1, "Got a total of $count{$_} ${_}s") for qw[ key keyprefix value ];
|
|
|
|
# Dump to .yml file
|
|
open my $out, ">", $out_file or die "Can't open file '$out_file' supplied on the command line";
|
|
say $out "# THIS FILE IS AUTOGENERATED WITH THE script/misc/update-wiki-pages";
|
|
say $out "# PROGRAM DO NOT MANUALLY EDIT IT";
|
|
say $out "";
|
|
say $out Dump(\%feature);
|
|
close $out;
|
|
|
|
exit 0;
|
|
|
|
sub stick_content_in_hash
|
|
{
|
|
my ($key, $title, $hash) = @_;
|
|
my $ukey = ucfirst $key;
|
|
|
|
my $space_to_underscore = sub {
|
|
my $txt = shift;
|
|
$txt =~ s/ /_/g;
|
|
$txt;
|
|
};
|
|
|
|
my $count = 0;
|
|
|
|
my $process_link = sub {
|
|
my $link = shift;
|
|
$count++;
|
|
ok(1, " ... got $count links") if $count % 200 == 0;
|
|
my $title = $link->{title};
|
|
my $lang;
|
|
my $key_name;
|
|
if ($title =~ /^$ukey:(?<key_name>.*?)$/) {
|
|
# English by default
|
|
$lang = "en";
|
|
$key_name = $space_to_underscore->($+{key_name});
|
|
} elsif ($title =~ /^(?<lang>[^:]+):$ukey:(?<key_name>.*?)$/) {
|
|
$lang = lc $+{lang};
|
|
$key_name = $space_to_underscore->($+{key_name});
|
|
}
|
|
if ($lang && !exists($hash->{$lang}->{$key}->{$key_name})) {
|
|
$hash->{$lang}->{$key}->{$key_name} = $title;
|
|
}
|
|
};
|
|
|
|
get_embeddedin(
|
|
$title,
|
|
sub {
|
|
my $link = shift;
|
|
$process_link->($link);
|
|
get_redirects(
|
|
$link->{title},
|
|
sub {
|
|
my $link = shift;
|
|
$process_link->($link) if exists($link->{redirect});
|
|
}
|
|
);
|
|
}
|
|
);
|
|
|
|
return $count;
|
|
}
|
|
|
|
sub process_list
|
|
{
|
|
my $callback = shift;
|
|
my $links = shift;
|
|
for my $link (@$links) {
|
|
$callback->($link);
|
|
}
|
|
}
|
|
|
|
sub get_embeddedin
|
|
{
|
|
my ($title, $callback) = @_;
|
|
my $articles = $mw->list(
|
|
{
|
|
action => 'query',
|
|
list => 'embeddedin',
|
|
eititle => $title,
|
|
eifilterredir => 'nonredirects',
|
|
# Doesn't work for De:* and anything non-en. Odd.
|
|
# einamespace => '0|8',
|
|
eilimit => '200',
|
|
},
|
|
{
|
|
max => '0',
|
|
hook => sub { process_list($callback, @_) },
|
|
skip_encoding => 1,
|
|
}
|
|
) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
|
|
}
|
|
|
|
sub get_redirects
|
|
{
|
|
my ($title, $callback) = @_;
|
|
my $articles = $mw->list(
|
|
{
|
|
action => 'query',
|
|
list => 'backlinks',
|
|
bltitle => $title,
|
|
blfilterredir => 'redirects',
|
|
# Doesn't work for De:* and anything non-en. Odd.
|
|
# einamespace => '0|8',
|
|
bllimit => '200',
|
|
},
|
|
{
|
|
max => '0',
|
|
hook => sub { process_list($callback, @_) },
|
|
skip_encoding => 1,
|
|
}
|
|
) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
|
|
}
|
|
|
|
sub help
|
|
{
|
|
my %arg = @_;
|
|
|
|
Pod::Usage::pod2usage(
|
|
-verbose => $arg{ verbose },
|
|
-exitval => $arg{ exitval } || 0,
|
|
);
|
|
}
|