aboutsummaryrefslogtreecommitdiffstats
path: root/download_webscraper.pl
blob: a8bd1eab822fdae3f1d37518a3e8b65c929317db (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/perl

#use strict;
#use warnings;
use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

use strict;
use warnings;

# Fetch the index page named in $wikiurl, collect the target of every
# <a class="subjectlink"> element on it, and hand each target to wget.
#
# The script takes no arguments. It dies if the index page cannot be
# fetched; a failing wget only produces a warning so that the remaining
# links are still attempted.

# Page whose "subjectlink" anchors are harvested.
my $wikiurl = "http://www.bildung-lsa.de/unterricht/zentrale_leistungserhebungen__schriftliche_pruefungen__zentrale_klassenarbeiten__vergleichsarbeiten____/schriftliche_abiturpruefung.html";

binmode(STDOUT, ":utf8");

# Fetch the page exactly once and reuse the response for scraping.  The
# response remembers the final (post-redirect) request URI, which the
# scraper uses as the base when it absolutises relative hrefs.
my $ua  = LWP::UserAgent->new;
my $res = $ua->get($wikiurl);
die "GET $wikiurl failed: " . $res->status_line . "\n"
    unless $res->is_success;

# 'href[]' collects the href attribute of *every* matching anchor into an
# array reference stored under the key 'href'.
my $scrap = scraper {
    process '//a[@class="subjectlink"]', 'href[]' => '@href';
};

my $found = $scrap->scrape($res);

# No matching anchors must not be fatal under strict refs.
my $href = $found->{'href'} || [];

for my $link (@$href) {
    next unless defined $link;

    # Scraped hrefs may be URI objects; stringify before passing them on.
    my $url = "$link";
    next unless length $url;

    # List-form system() bypasses the shell, so characters such as quotes,
    # backticks or '$' inside a scraped URL cannot inject commands.  The
    # '--' stops wget from treating a hostile value as an option.
    my $status = system('wget', '-q', '--', $url);

    if ($status == -1) {
        warn "could not run wget for $url: $!\n";
    }
    elsif ($status != 0) {
        warn "wget failed for $url (exit status " . ($status >> 8) . ")\n";
    }
}