diff options
author | lookshe <github@lookshe.org> | 2014-09-21 00:35:30 +0200 |
---|---|---|
committer | lookshe <github@lookshe.org> | 2014-09-21 00:35:30 +0200 |
commit | c7898d86561d26b796d0b2b73eb7121ce73c1e26 (patch) | |
tree | 921295232040823dd96fc403cb3d67b7aa264675 | |
parent | c4d2aba012967a8636069b723c428622680b969b (diff) |
one example for analysing webpages with scraper
-rw-r--r-- | download_webscraper.pl | 35 |
1 files changed, 35 insertions, 0 deletions
#!/usr/bin/perl
#
# download_webscraper.pl
#
# Example of analysing a web page with Web::Scraper: fetch one index page,
# collect the href of every <a class="subjectlink"> element and hand each
# link to wget for download.

use strict;
use warnings;
use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

# Index page whose "subjectlink" anchors point at the files to download.
my $wikiurl = "http://www.bildung-lsa.de/unterricht/zentrale_leistungserhebungen__schriftliche_pruefungen__zentrale_klassenarbeiten__vergleichsarbeiten____/schriftliche_abiturpruefung.html";

binmode(STDOUT, ":utf8");

# Scraper definition: the XPath selects every anchor carrying the
# "subjectlink" class; 'href[]' gathers the @href attribute of all matches
# into an array reference stored under the 'href' key of the result.
my $scrap = scraper {
    process '//a[@class="subjectlink"]', 'href[]' => '@href';
};

# Handing scrape() a URI object lets Web::Scraper fetch the page itself,
# so no separate LWP request is issued beforehand.
my $res = $scrap->scrape(URI->new($wikiurl));

# Fall back to an empty list so the loop is simply skipped when the result
# carries no 'href' entry (dereferencing undef would die under strict).
my $href = $res->{'href'} || [];

for my $link (@$href) {
    # Stringify explicitly: the scraped value may be an object rather than
    # a plain string.
    my $target = "$link";

    # List-form system() bypasses the shell, so characters such as quotes,
    # backticks or '$' inside a scraped URL cannot be interpreted as shell
    # syntax. A failed download is reported but does not stop the run.
    system('wget', '-q', $target) == 0
        or warn "wget failed for $target (wait status $?)\n";
}