From c7898d86561d26b796d0b2b73eb7121ce73c1e26 Mon Sep 17 00:00:00 2001
From: lookshe
Date: Sun, 21 Sep 2014 00:35:30 +0200
Subject: one example for analysing webpages with scraper

---
Review note: the script below was revised during review, so the blob id on
the "index" line predates the revision.

 download_webscraper.pl | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 download_webscraper.pl

diff --git a/download_webscraper.pl b/download_webscraper.pl
new file mode 100644
index 0000000..a8bd1ea
--- /dev/null
+++ b/download_webscraper.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/perl
+
+# Example of analysing a web page with Web::Scraper: fetch one page, collect
+# the target of every <a class="subjectlink"> link and download each of them
+# with wget.
+
+use strict;
+use warnings;
+use Web::Scraper;
+use URI;
+use HTML::Entities;
+use Encode;
+use URI::Escape;
+use LWP::UserAgent;
+
+# Page whose "subjectlink" anchors are harvested.
+my $page_url = "http://www.bildung-lsa.de/unterricht/zentrale_leistungserhebungen__schriftliche_pruefungen__zentrale_klassenarbeiten__vergleichsarbeiten____/schriftliche_abiturpruefung.html";
+
+binmode(STDOUT, ":utf8");
+
+# Fetch the page once and stop early with the HTTP status if that fails.
+my $ua       = LWP::UserAgent->new;
+my $response = $ua->get($page_url);
+die "GET $page_url failed: " . $response->status_line . "\n"
+    unless $response->is_success;
+
+# URL of the request that produced the final response, i.e. after any
+# redirects; relative links are resolved against it.
+my $base_url = $response->request->uri;
+
+# 'href[]' collects the href attribute of every matching anchor into an
+# array reference stored under the key "href".
+my $link_scraper = scraper {
+    process '//a[@class="subjectlink"]', 'href[]' => '@href';
+};
+
+# Scrape the response already in hand instead of requesting the page again.
+my $result = $link_scraper->scrape($response, $base_url);
+
+# Guard against a result without an "href" entry (e.g. no anchor matched).
+my $links = $result->{'href'} || [];
+
+for my $link (@$links) {
+    # List-form system() bypasses the shell, so quotes or metacharacters in
+    # a scraped URL cannot be interpreted as shell syntax.
+    system('wget', '-q', '--', "$link") == 0
+        or warn "wget failed for $link (status $?)\n";
+}
-- 
cgit v1.2.3