diff options
author | lookshe <lookshe@lookshe-server.(none)> | 2014-09-21 00:38:54 +0200 |
---|---|---|
committer | lookshe <lookshe@lookshe-server.(none)> | 2014-09-21 00:38:54 +0200 |
commit | 9dd1e786b135cbf333e861f790eb53091904a607 (patch) | |
tree | 542210a8ed74db270a1c83b3c46f81ccf35444af | |
parent | 7aa21bd2fd75f8949cd6db69db8ba7a801df37b5 (diff) | |
parent | c7898d86561d26b796d0b2b73eb7121ce73c1e26 (diff) |
Merge branch 'master' of github.com:lookshe/scripts
-rw-r--r-- | download_webscraper.pl | 35 |
1 files changed, 35 insertions, 0 deletions
#!/usr/bin/perl
#
# download_webscraper.pl
#
# Loads the index page at $index_url, collects the href attribute of every
# <a class="subjectlink"> element found on it, and hands each link to
# `wget -q` for download.

use strict;
use warnings;

use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

my $index_url = "http://www.bildung-lsa.de/unterricht/zentrale_leistungserhebungen__schriftliche_pruefungen__zentrale_klassenarbeiten__vergleichsarbeiten____/schriftliche_abiturpruefung.html";

binmode(STDOUT, ":utf8");

# 'href[]' gathers the @href attribute of every matching anchor into an
# array reference stored under the result key 'href'.
my $link_scraper = scraper {
    process '//a[@class="subjectlink"]', 'href[]' => '@href';
};

# Handing scrape() a URI object makes Web::Scraper fetch the page itself, so
# the separate LWP request the script used to issue beforehand (whose result
# was overwritten before use) is not needed.
my $result = $link_scraper->scrape(URI->new($index_url));

# Fall back to an empty list so a page without matching anchors does not
# dereference an undefined value.
my @links = @{ $result->{'href'} || [] };

for my $link (@links) {
    # Stringify explicitly in case the scraper returned an object.
    my $target = "$link";

    # List-form system() bypasses the shell: the hrefs come from a remote
    # page and must not be interpolated into a shell command line.
    system('wget', '-q', $target) == 0
        or warn "wget failed for $target (status $?)\n";
}