#!/usr/bin/perl
#
# parse-fileext-wiki.pl
#
# Prints one "<extension> <description>" line per file extension listed on
# the German Wikipedia page "Liste der Dateiendungen" and on the sub-pages
# it links to.
#
# Provenance (taken from the git patch this script was recovered from):
#   commit  ba46c304bdffdce0b6019a47f6759ce6abbeef90
#   author  Stefan Ritter
#   date    Tue, 16 Aug 2011 10:51:41 +0200
#   subject Initial commit (new file parse-fileext-wiki.pl, mode 100644)

use strict;
use warnings;
use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

my $wikiurl = "http://de.wikipedia.org/wiki/Liste_der_Dateiendungen";

# Collects the link texts found in the table cells of the index page; each
# text is later appended to $wikiurl to address one sub-page.
my $index_scraper = scraper {
    process '//div[@id="bodyContent"]/table/tr/td/a', 'chars[]' => 'TEXT';
};

# Collects the text of every cell of the "prettytable" tables on a sub-page
# as one flat list.  The scraper does not depend on the page being fetched,
# so it is built once instead of once per sub-page.
my $table_scraper = scraper {
    process '//div[@id="bodyContent"]/table[@class="prettytable"]/tr/td', 'table[]' => 'TEXT';
};

# extension_rows(\@cells)
#
# Walks the flat cell list of a sub-page and returns a list of
# [ $extension, $description ] pairs that should be printed.
#
#   - Cells are consumed in groups of three; only the first two are used.
#   - If the third cell of a group starts with a dot, the group is taken to
#     be only two cells wide and the third cell becomes the first cell of
#     the next group (the walk advances by two instead of three).
#   - A group is skipped when its first cell matches /\..*(\..*)+/ (more
#     than one dot) or when its second cell is empty or a single character.
#   - Cells missing at the end of the list are treated as empty strings, so
#     a truncated final group never triggers "uninitialized" warnings.
sub extension_rows {
    my ($cells) = @_;

    my @rows;
    my $i = 0;
    while ($i <= $#$cells) {
        my $ext   = defined $cells->[$i]     ? $cells->[$i]     : '';
        my $desc  = defined $cells->[$i + 1] ? $cells->[$i + 1] : '';
        my $third = defined $cells->[$i + 2] ? $cells->[$i + 2] : '';

        if ($ext !~ /\..*(\..*)+/ && $desc !~ /^.?$/) {
            push @rows, [ $ext, $desc ];
        }

        # A leading dot in the third cell means it is already the next
        # extension, so step over two cells only.
        $i += ($third =~ /^\./) ? 2 : 3;
    }

    return @rows;
}

# scrape() dies on a failed HTTP request; that is left uncaught on purpose
# so an incomplete list is never mistaken for a complete one.
my $index = $index_scraper->scrape(URI->new($wikiurl));

# 'chars' is absent when the index page contains no matching links.
my $pages = $index->{'chars'} || [];

binmode(STDOUT, ":utf8");

for my $page (@$pages) {
    my $page_url = URI->new("$wikiurl/$page");
    my $result   = $table_scraper->scrape($page_url);

    # 'table' is absent when the sub-page contains no matching table cells.
    my $cells = $result->{'table'} || [];

    for my $row (extension_rows($cells)) {
        print "$row->[0] $row->[1]\n";
    }
}