summary | refs | log | tree | commit | diff | stats
path: root/parse-fileext-wiki.pl
diff options
context:
space:
mode:
author: Stefan Ritter <xeno@thehappy.de> 2011-08-16 10:51:41 +0200
committer: Stefan Ritter <xeno@thehappy.de> 2011-08-16 10:51:41 +0200
commit: ba46c304bdffdce0b6019a47f6759ce6abbeef90 (patch)
tree: 40a5b204637cb65c5bb91bd4f26a248ea498e019 /parse-fileext-wiki.pl
Initial commit
Diffstat (limited to 'parse-fileext-wiki.pl')
-rw-r--r--parse-fileext-wiki.pl42
1 files changed, 42 insertions, 0 deletions
diff --git a/parse-fileext-wiki.pl b/parse-fileext-wiki.pl
new file mode 100644
index 0000000..906751b
--- /dev/null
+++ b/parse-fileext-wiki.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl
+
+# Scrapes the German Wikipedia "Liste der Dateiendungen" index page, follows
+# every link text found in its tables as a sub-page name, and prints one
+# "<extension> <description>" line per usable table row to STDOUT as UTF-8.
+
+use strict;
+use warnings;
+use Web::Scraper;
+use URI;
+use HTML::Entities;
+use Encode;
+use URI::Escape;
+use LWP::UserAgent;
+
+my $wikiurl = "http://de.wikipedia.org/wiki/Liste_der_Dateiendungen";
+
+# Collects the text of every link inside the index page's tables; each text
+# is later appended to $wikiurl to address a sub-page.
+my $index_scraper = scraper {
+    process '//div[@id="bodyContent"]/table/tr/td/a', 'chars[]' => 'TEXT';
+};
+
+# Collects the text of every cell of the "prettytable" tables on a sub-page
+# as one flat list.  The scraper does not depend on the sub-page, so it is
+# built once here rather than once per loop iteration.
+my $page_scraper = scraper {
+    process '//div[@id="bodyContent"]/table[@class="prettytable"]/tr/td', 'table[]' => 'TEXT';
+};
+
+binmode(STDOUT, ":utf8");
+
+# NOTE(review): fetch errors are not caught here; Web::Scraper is expected to
+# die when a page cannot be retrieved, which aborts the script -- confirm.
+my $index = $index_scraper->scrape(URI->new($wikiurl));
+
+# Fall back to an empty list when the index page yielded no matching links.
+my $subpages = $index->{'chars'} || [];
+
+foreach my $subpage (@$subpages) {
+    my $res   = $page_scraper->scrape(URI->new("$wikiurl/$subpage"));
+    my $cells = $res->{'table'} || [];
+
+    # Walk the flat cell list one table row at a time.  A row is taken to be
+    # three consecutive cells, starting with the extension and its description.
+    my $i = 0;
+    while ($i <= $#$cells) {
+        my $ext = cell_text($cells, $i);
+        my $description = cell_text($cells, $i + 1);
+        my $third = cell_text($cells, $i + 2);
+
+        # Skip entries whose extension contains more than one dot and entries
+        # whose description is empty or a single character.
+        if ($ext !~ /\..*(\..*)+/ && $description !~ /^.?$/) {
+            print "$ext $description\n";
+        }
+
+        # A third cell that itself starts with a dot is treated as the
+        # extension of the next row (i.e. this row had only two cells), so
+        # advance by two cells instead of three.
+        $i += ($third =~ /^\./) ? 2 : 3;
+    }
+}
+
+# Returns the text of cell $idx from the array ref $cells, or the empty
+# string when the index is past the end of the list or the cell is undefined.
+# This keeps the look-ahead at $i+1 / $i+2 free of "uninitialized" warnings.
+sub cell_text {
+    my ($cells, $idx) = @_;
+    return '' if $idx > $#$cells;
+    my $text = $cells->[$idx];
+    return defined $text ? $text : '';
+}