#!/usr/bin/perl
#
# Look up a term on Wikipedia and print a short plain-text answer.
#
# Usage: <script> <search term> [language code]
#
#   * If the search lands on an article, the first paragraph is printed with
#     parenthesised and bracketed asides removed, shortened to at most
#     $MAX_SUMMARY_CHARS characters.
#   * If the article looks like a disambiguation page, the first
#     $MAX_DISAMBIG_ITEMS list entries are printed together with the page URL.
#   * If the search lands on the search-result page, the links of the first
#     $MAX_SEARCH_HITS results are printed, separated by " || ".
#
# The language code selects the Wikipedia subdomain and defaults to "de".

use strict;
use warnings;

use Web::Scraper;
use URI;
use HTML::Entities;
use Encode;
use URI::Escape;
use LWP::UserAgent;

# Output limits.
my $MAX_SUMMARY_CHARS  = 448;
my $MAX_DISAMBIG_ITEMS = 3;
my $MAX_SEARCH_HITS    = 5;

my ($term, $lang) = @ARGV;
die "Usage: $0 <search term> [language]\n"
    unless defined $term && length $term;

$lang ||= "de";

# The language code becomes part of the host name, so refuse anything that
# could change which host is contacted.
die "Invalid language code: $lang\n"
    unless $lang =~ m/\A[A-Za-z][A-Za-z0-9-]*\z/;

# Percent-encode the term so characters such as '&', '#', '+' or '%' stay
# part of the search string instead of altering the query.
my $wikiurl = "http://$lang.wikipedia.org/wiki/Special:Search?search="
            . uri_escape($term)
            . "&go=Go";

# First request: only used to learn where the search ends up (LWP follows
# redirects, so the URI of the final request is the landing page).
my $ua       = LWP::UserAgent->new;
my $response = $ua->get($wikiurl);
my $origurl  = $response->request->uri;

# Reduce the landing URL to the part after ".../wiki/" (the page title).
( my $page = "$origurl" ) =~ s{.*/wiki/}{};

binmode(STDOUT, ":utf8");

# NOTE(review): the scraper below fetches $wikiurl a second time instead of
# reusing $response. Web::Scraper fetches with its own user agent, so reusing
# the LWP response might change what the server returns -- confirm before
# collapsing the two requests into one.
if ($page !~ m/Special:Search/) {
    # Landed on an article.
    print_article(URI->new($wikiurl), $origurl);
}
else {
    # No article: still on the search-result page.
    print_search_results(URI->new($wikiurl), $term);
}

# Scrape an article page and print either its summary or, for a
# disambiguation page, the first few list entries plus a link.
sub print_article {
    my ($page_uri, $more_url) = @_;

    my $article = scraper {
        process '//div[@id="bodyContent"]/p',     'text[]'  => 'TEXT';
        process '//img',                          'img[]'   => '@src';
        process '//div[@id="bodyContent"]/ul/li', 'list[]'  => 'TEXT';
        process '//table/tr/td',                  'table[]' => 'TEXT';
    };
    my $found = $article->scrape($page_uri);

    # A key may be absent when nothing matched; fall back to empty lists so
    # the dereferences below are safe under strict.
    my $paragraphs = $found->{'text'}  || [];
    my $images     = $found->{'img'}   || [];
    my $entries    = $found->{'list'}  || [];
    my $cells      = $found->{'table'} || [];

    # Disambiguation heuristic: the page has more than one table cell and
    # at least one image whose source mentions "Disambig".
    my $is_disambiguation = 0;
    if (@$cells > 1) {
        $is_disambiguation = 1
            if grep { defined && m/Disambig/i } @$images;
    }

    if (!$is_disambiguation) {
        print summarize($paragraphs->[0]), "\n";
        return;
    }

    my $last = $#$entries < $MAX_DISAMBIG_ITEMS - 1
             ? $#$entries
             : $MAX_DISAMBIG_ITEMS - 1;
    print "$_\n" for @{$entries}[ 0 .. $last ];
    print "For more see $more_url\n";
    return;
}

# Scrape the search-result page and print the first few result links.
sub print_search_results {
    my ($page_uri, $query) = @_;

    my $search = scraper {
        process '//ul[@class="mw-search-results"]/li/div/a', 'href[]' => '@href';
    };
    my $found = $search->scrape($page_uri);

    my @links = grep { defined } @{ $found->{'href'} || [] };
    if (!@links) {
        print "No matches with $query\n";
        return;
    }

    my $last = $#links < $MAX_SEARCH_HITS - 1 ? $#links : $MAX_SEARCH_HITS - 1;
    print join(" || ", @links[ 0 .. $last ]), "\n";
    return;
}

# Turn the raw text of a paragraph into a short one-line summary.
sub summarize {
    my ($raw) = @_;

    my $text = decode_entities(defined $raw ? $raw : '');

    # Drop "(...)" and "[...]" asides. Each pass removes the innermost
    # groups, so repeat until nothing is left to handle nesting.
    1 while $text =~ s/\([^()]*\)|\[[^\[\]]*\]//g;

    $text =~ s/\s+/ /g;             # collapse whitespace (also removes newlines)
    $text =~ s/\s([,.?!])/$1/g;     # no space before punctuation

    if (length($text) >= $MAX_SUMMARY_CHARS) {
        $text = substr($text, 0, $MAX_SUMMARY_CHARS);

        # Cut back to the last complete sentence, if there is one, and mark
        # the omission.
        $text =~ s/^(.*[.!?])[^.!?]*$/$1 (...)/;
    }

    return $text;
}