diff options
author | lookshe <lookshe@lookshe-laptop.(none)> | 2014-09-21 00:39:52 +0200 |
---|---|---|
committer | lookshe <lookshe@lookshe-laptop.(none)> | 2014-09-21 00:39:52 +0200 |
commit | 61672bf575c84e1f3b313fe9ea2dd4de75b601ae (patch) | |
tree | e42d542afd5de8ead38bdc42a002fe0c336b6b75 /download_webscraper.pl | |
parent | 404e0e28825c805b54d129f8b9d7c5690a27ef17 (diff) | |
parent | 9dd1e786b135cbf333e861f790eb53091904a607 (diff) |
Merge branch 'master' of github.com:lookshe/scripts
Diffstat (limited to 'download_webscraper.pl')
-rw-r--r-- | download_webscraper.pl | 35 |
1 files changed, 35 insertions, 0 deletions
#!/usr/bin/perl

# download_webscraper.pl
#
# Fetches the overview page at $wikiurl, collects the href attribute of
# every <a class="subjectlink"> element on it, and downloads each linked
# resource into the current working directory by invoking wget.
#
# Exits with an error if the overview page cannot be fetched; a failed
# download of an individual link only produces a warning so the remaining
# links are still attempted.

use strict;
use warnings;
use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

my $wikiurl = "http://www.bildung-lsa.de/unterricht/zentrale_leistungserhebungen__schriftliche_pruefungen__zentrale_klassenarbeiten__vergleichsarbeiten____/schriftliche_abiturpruefung.html";

binmode(STDOUT, ":utf8");

# Fetch the page exactly once. (Previously the page was requested here and
# then a second time by the scraper, and the result of this first request
# was thrown away.)
my $ua  = LWP::UserAgent->new;
my $res = $ua->get($wikiurl);
die "GET $wikiurl failed: " . $res->status_line . "\n"
    unless $res->is_success;

# Collect the href of every link carrying the "subjectlink" class.
my $scrap = scraper {
    process '//a[@class="subjectlink"]', 'href[]' => '@href';
};

# Web::Scraper accepts an HTTP::Response directly, so the already
# downloaded document is parsed instead of being requested again.
my $result = $scrap->scrape($res);

# The 'href' key is absent when no link matched; fall back to an empty list.
my $links = $result->{'href'} || [];

for my $link (@{$links}) {
    # List-form system() bypasses the shell, so quotes, backticks or other
    # metacharacters in a scraped href cannot be interpreted as shell
    # syntax. The "--" stops wget from treating the value as an option.
    system('wget', '-q', '--', "$link") == 0
        or warn "wget failed for $link (wait status $?)\n";
}