aboutsummaryrefslogtreecommitdiffstats
path: root/download_webscraper.pl
diff options
context:
space:
mode:
authorlookshe <lookshe@lookshe-laptop.(none)>2014-09-21 00:39:52 +0200
committerlookshe <lookshe@lookshe-laptop.(none)>2014-09-21 00:39:52 +0200
commit61672bf575c84e1f3b313fe9ea2dd4de75b601ae (patch)
treee42d542afd5de8ead38bdc42a002fe0c336b6b75 /download_webscraper.pl
parent404e0e28825c805b54d129f8b9d7c5690a27ef17 (diff)
parent9dd1e786b135cbf333e861f790eb53091904a607 (diff)
Merge branch 'master' of github.com:lookshe/scripts
Diffstat (limited to 'download_webscraper.pl')
-rw-r--r--download_webscraper.pl35
1 files changed, 35 insertions, 0 deletions
diff --git a/download_webscraper.pl b/download_webscraper.pl
new file mode 100644
index 0000000..a8bd1ea
--- /dev/null
+++ b/download_webscraper.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+# Download every link marked class="subjectlink" on the page at $wikiurl
+# into the current working directory, one wget invocation per link.
+
+use strict;
+use warnings;
+use Web::Scraper;
+use URI;
+use HTML::Entities;
+use Encode;
+use URI::Escape;
+use LWP::UserAgent;
+
+my $wikiurl = "http://www.bildung-lsa.de/unterricht/zentrale_leistungserhebungen__schriftliche_pruefungen__zentrale_klassenarbeiten__vergleichsarbeiten____/schriftliche_abiturpruefung.html";
+
+binmode(STDOUT, ":utf8");
+
+# 'href[]' collects the href attribute of every matching anchor in a list.
+my $scrap = scraper {
+    process '//a[@class="subjectlink"]', 'href[]' => '@href';
+};
+
+# Handing scrape() a URI object makes Web::Scraper fetch the page itself, so
+# the extra LWP GET the script used to issue beforehand was a wasted request.
+my $res = $scrap->scrape(URI->new($wikiurl));
+
+# The 'href' key may be missing when nothing matched; treat that as no links.
+for my $link (@{ $res->{'href'} || [] }) {
+    # List-form system() bypasses the shell, so quotes, backticks or '$' in a
+    # scraped URL cannot be interpreted as shell syntax. "$link" stringifies
+    # the value in case it is a URI object rather than a plain string.
+    if (system('wget', '-q', "$link") != 0) {
+        warn "wget failed for $link (status $?)\n";
+    }
+}