eggdrop/parse-fileext-wiki.pl
2011-08-16 10:51:41 +02:00

42 lines
922 B
Perl

#!/usr/bin/perl
#use strict;
#use warnings;
use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;
# Strictures are switched on here (they are lexically scoped, so they cover
# the rest of the file) to catch undeclared variables such as the loop
# counter, which used to be an implicit package global.
use strict;
use warnings;

# Scrape the German Wikipedia list of file extensions and print one
# "<extension> <description>" line per accepted table row to STDOUT.
my $wikiurl = "http://de.wikipedia.org/wiki/Liste_der_Dateiendungen";

# Index page: collects the text of every link found in a table cell of the
# content area. Each link text is later appended to $wikiurl as a sub-page.
my $index_scraper = scraper {
    process '//div[@id="bodyContent"]/table/tr/td/a', 'chars[]' => 'TEXT';
};

# Sub-page: collects the text of every cell of the "prettytable" tables as
# one flat list. Built once, because it does not depend on the page.
my $page_scraper = scraper {
    process '//div[@id="bodyContent"]/table[@class="prettytable"]/tr/td', 'table[]' => 'TEXT';
};

# Turn the flat list of cell texts of one sub-page into a list of
# [extension, description] pairs.
#
# Parameters:
#   $cells - array reference of cell texts in document order.
# Returns:
#   a list of two-element array references.
#
# Cells are consumed in groups of three (presumably extension, description
# and one further column -- verify against the live page layout). When the
# third cell of a group starts with a dot, it is taken to be the first cell
# of the next row, so the walk only advances by two.
sub extension_pairs {
    my ($cells) = @_;

    my @pairs;
    my $i = 0;
    while ($i <= $#{$cells}) {
        # Cells past the end of the list are treated as empty strings, so
        # that no undefined value is ever fed to a pattern match.
        my $ext   = defined $cells->[$i]     ? $cells->[$i]     : '';
        my $desc  = $i + 1 <= $#{$cells} && defined $cells->[$i + 1]
                  ? $cells->[$i + 1] : '';
        my $third = $i + 2 <= $#{$cells} && defined $cells->[$i + 2]
                  ? $cells->[$i + 2] : '';

        # Skip entries whose first cell contains more than one dot, and
        # entries whose description is empty or a single character.
        if ($ext !~ /\..*(\..*)+/ && $desc !~ /^.?$/) {
            push @pairs, [$ext, $desc];
        }

        $i += ($third =~ /^\./) ? 2 : 3;
    }

    return @pairs;
}

my $index    = $index_scraper->scrape(URI->new($wikiurl));
my $subpages = $index->{'chars'} || [];    # key is absent when nothing matched

binmode(STDOUT, ":utf8");

for my $subpage (@{$subpages}) {
    my $page  = $page_scraper->scrape(URI->new("$wikiurl/$subpage"));
    my $cells = $page->{'table'} || [];    # key is absent when nothing matched

    for my $pair (extension_pairs($cells)) {
        print "$pair->[0] $pair->[1]\n";
    }
}