blob: 906751be52774aee203d684c6fec8ee88a19a5e4 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
#!/usr/bin/perl
# Scrapes the German Wikipedia page "Liste der Dateiendungen" (list of file
# extensions) and prints one "<extension> <description>" line per entry to
# STDOUT, encoded as UTF-8.
#
# Flow:
#   1. Fetch the index page and collect the text of every link found in its
#      tables; each link text is used as the name of a subpage.
#   2. Fetch "<index url>/<link text>" for each of them and read all cells of
#      the tables with class "prettytable" as one flat list.
#   3. Walk that flat list and print the extension/description pairs.
use strict;
use warnings;
use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

my $wikiurl = "http://de.wikipedia.org/wiki/Liste_der_Dateiendungen";

# Scraper for the index page: text of every table link.
my $index_scraper = scraper {
    process '//div[@id="bodyContent"]/table/tr/td/a', 'chars[]' => 'TEXT';
};

# Scraper for a subpage: text of every table cell, flattened.  It does not
# depend on the subpage, so it is built once instead of once per iteration.
my $table_scraper = scraper {
    process '//div[@id="bodyContent"]/table[@class="prettytable"]/tr/td', 'table[]' => 'TEXT';
};

binmode(STDOUT, ":encoding(UTF-8)");

my $index = $index_scraper->scrape(URI->new($wikiurl));

# The key may be missing when nothing matched; treat that as "no subpages".
my $subpages = $index->{'chars'} || [];

for my $subpage (@$subpages) {
    # The link text is page content, so escape it before using it as a path
    # segment (it may contain non-ASCII or URL-reserved characters).
    my $url = URI->new($wikiurl . '/' . uri_escape_utf8($subpage));

    my $result = $table_scraper->scrape($url);
    my $cells  = $result->{'table'} || [];

    for my $pair (extension_pairs($cells)) {
        print "$pair->[0] $pair->[1]\n";
    }
}

# extension_pairs(\@cells)
#
# Walks a flat list of table cell texts and returns a list of
# [extension, description] array refs.
#
# A row is taken to be (extension, description, third cell).  When the cell in
# the third position starts with ".", it is treated as the extension of the
# next row, so the walk advances by two cells instead of three
# (NOTE(review): presumably such rows have only two cells - confirm against
# the page markup).
#
# An entry is skipped when its extension cell contains two or more dots or
# when its description is empty or a single character.  Cells past the end of
# the list are treated as empty strings.
sub extension_pairs {
    my ($cells) = @_;

    my @pairs;
    my $i = 0;
    while ($i <= $#$cells) {
        my $extension   = defined $cells->[$i]     ? $cells->[$i]     : '';
        my $description = defined $cells->[$i + 1] ? $cells->[$i + 1] : '';
        my $third       = defined $cells->[$i + 2] ? $cells->[$i + 2] : '';

        if ($extension !~ /\..*(\..*)+/ && $description !~ /^.?$/) {
            push @pairs, [ $extension, $description ];
        }

        # Two-cell row detected: the "third" cell is the next row's extension.
        $i += ($third =~ /^\./) ? 2 : 3;
    }

    return @pairs;
}
|