diff options
author | Stefan Ritter <xeno@thehappy.de> | 2011-08-16 10:51:41 +0200 |
---|---|---|
committer | Stefan Ritter <xeno@thehappy.de> | 2011-08-16 10:51:41 +0200 |
commit | ba46c304bdffdce0b6019a47f6759ce6abbeef90 (patch) | |
tree | 40a5b204637cb65c5bb91bd4f26a248ea498e019 /parse-fileext-wiki.pl |
Initial commit
Diffstat (limited to 'parse-fileext-wiki.pl')
-rw-r--r-- | parse-fileext-wiki.pl | 42 |
1 files changed, 42 insertions, 0 deletions
#!/usr/bin/perl
#
# parse-fileext-wiki.pl
#
# Fetches the German Wikipedia page "Liste der Dateiendungen", collects the
# text of every link found inside its body tables, and treats each link text
# as the name of a sub-page ("<list URL>/<link text>").  For every sub-page
# the text of all cells of the "prettytable" tables is collected and printed
# to STDOUT (UTF-8) as "<first cell> <second cell>", one entry per line.
#
# NOTE(review): the cells presumably are file extension, description and an
# optional third column -- confirm against the live page layout.
#
# Any failed HTTP request is fatal: Web::Scraper's scrape() dies on error.

use strict;
use warnings;
use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

my $wikiurl = "http://de.wikipedia.org/wiki/Liste_der_Dateiendungen";

# Scraper for the index page: link texts inside the body tables.
my $index_scraper = scraper {
    process '//div[@id="bodyContent"]/table/tr/td/a', 'chars[]' => 'TEXT';
};

# Scraper for the sub-pages: flat list of all cell texts.  It does not depend
# on the page being scraped, so it is built once instead of once per page.
my $table_scraper = scraper {
    process '//div[@id="bodyContent"]/table[@class="prettytable"]/tr/td',
        'table[]' => 'TEXT';
};

my $index    = $index_scraper->scrape(URI->new($wikiurl));
my $subpages = $index->{'chars'} || [];    # key may be missing if nothing matched

binmode(STDOUT, ":utf8");

foreach my $subpage (@$subpages) {
    my $res   = $table_scraper->scrape(URI->new("$wikiurl/$subpage"));
    my $table = $res->{'table'} || [];     # page without a matching table

    # The cells arrive as one flat list and are normally consumed in groups
    # of three.
    my $i = 0;
    while ($i <= $#$table) {
        my $first  = $table->[$i];
        my $second = $table->[$i + 1];     # undef past the end of the list
        my $third  = $table->[$i + 2];     # undef past the end of the list

        # Skip entries whose first cell contains two or more dots, and
        # entries whose second cell is missing, empty or a single character.
        if (   $first !~ /\..*(\..*)+/
            && defined $second
            && $second !~ /^.?$/ )
        {
            print "$first $second\n";
        }

        # A third cell starting with a dot is handled as the start of the
        # next group (presumably the row had only two cells), so advance by
        # two instead of three.
        if (defined $third && $third =~ /^\./) {
            $i += 2;
        }
        else {
            $i += 3;
        }
    }
}