summaryrefslogtreecommitdiffstats
path: root/parse-fileext-wiki.pl
blob: 906751be52774aee203d684c6fec8ee88a19a5e4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/perl

#use strict;
#use warnings;
use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

# Holder for the per-subpage table scraper; it is assigned later in the
# script, before the subpages are scraped.
my $scrap;

# Address of the index page; subpage addresses are formed by appending
# "/<link text>" to it.
my $wikiurl = 'http://de.wikipedia.org/wiki/Liste_der_Dateiendungen';

# Scraper for the index page: gathers the text of every link found in a
# table cell below the "bodyContent" div into the list stored under 'chars'.
my $index_scraper = scraper {
   process '//div[@id="bodyContent"]/table/tr/td/a',
           'chars[]' => 'TEXT';
};

# Fetch the index page and keep only the list of link texts.
my $url  = URI->new($wikiurl);
my $list = $index_scraper->scrape($url)->{'chars'};

# Everything printed from here on goes through the :utf8 output layer.
binmode STDOUT, ':utf8';

# Scraper for the subpages: collects the text of every cell of each
# "prettytable" below the "bodyContent" div into one flat list stored under
# 'table'.  It does not depend on the subpage being fetched, so it is built
# once here rather than once per iteration.
$scrap = scraper {
   process '//div[@id="bodyContent"]/table[@class="prettytable"]/tr/td', 'table[]' => 'TEXT';
};

# Visit the subpage named by each link text taken from the index page and
# print one "<extension> <description>" line per accepted table entry.
foreach my $subpage (@$list) {
   my $res   = $scrap->scrape(URI->new("$wikiurl/$subpage"));
   my $cells = $res->{'table'} || [];   # no matching table => nothing to walk

   # The cells arrive as a flat list that is walked entry by entry:
   # extension, description, and one further cell.  All variables are
   # lexical, so this loop also runs cleanly under strict and warnings.
   my $i = 0;
   while ($i <= $#$cells) {
      my $ext = $$cells[$i];

      # A trailing partial entry leaves these slots undefined; reading them
      # as empty strings gives the same outcome as before (no output, normal
      # step) without operating on undef.
      my $desc  = defined $$cells[$i+1] ? $$cells[$i+1] : '';
      my $third = defined $$cells[$i+2] ? $$cells[$i+2] : '';

      # Skip entries whose first cell contains two or more dots and entries
      # whose description is empty or a single character.
      if ($ext !~ /\..*(\..*)+/ && $desc !~ /^.?$/) {
         print "$ext $desc\n";
      }

      # A third cell that starts with a dot is taken as the beginning of the
      # next entry (presumably the current row had only two cells), so the
      # index advances by two instead of three.
      $i += ($third =~ /^\./) ? 2 : 3;
   }
}