| author | lookshe <lookshe@fumuga.com> | 2012-09-09 21:02:03 +0200 |
| --- | --- | --- |
| committer | lookshe <lookshe@fumuga.com> | 2012-09-09 21:02:03 +0200 |
| commit | 009a87ffad8d3ea804dba65aedef7fddec7aa6d2 (patch) | |
| tree | 4225ac012adf5eecca15f81756f69a16922e383e /wiki_export.pl | |
| parent | e92f4f4425e9afab798910880c146567c22898c8 (diff) | |
don't know what changed, but there's something ;-)
Diffstat (limited to '')
-rw-r--r-- | wiki_export.pl | 100 |
1 file changed, 100 insertions, 0 deletions
diff --git a/wiki_export.pl b/wiki_export.pl
new file mode 100644
index 0000000..b41a341
--- /dev/null
+++ b/wiki_export.pl
@@ -0,0 +1,100 @@
+#!/usr/bin/perl
+
+#use strict;
+#use warnings;
+use Web::Scraper;
+use URI;
+use HTML::Entities;
+use Encode;
+use URI::Escape;
+use LWP::UserAgent;
+
+my $scrap;
+
+my $lang = $ARGV[1];
+if (!$lang) {
+    $lang = "de";
+}
+my $wikiurl = "http://$lang.wikipedia.org/wiki/Special:Search?search=$ARGV[0]&go=Go";
+
+my $ua = new LWP::UserAgent;
+my $req = HTTP::Request->new('GET', $wikiurl);
+my $res = $ua->request($req);
+my $url = $res->request->uri;
+my $origurl = $url;
+$url =~ s/.*\/wiki\///;
+
+binmode(STDOUT, ":utf8");
+
+if ($url !~ m/Special:Search/) {
+# article found
+
+    $scrap = scraper {
+        process '//div[@id="bodyContent"]/p', 'text[]' => 'TEXT';
+        process '//img', 'img[]' => '@src';
+        process '//div[@id="bodyContent"]/ul/li', 'list[]' => 'TEXT';
+        process '//table/tr/td', 'table[]' => 'TEXT';
+    };
+    $url = URI->new($wikiurl);
+
+    my $res = $scrap->scrape($url);
+    my $text = $res->{'text'};
+    my $img = $res->{'img'};
+    my $list = $res->{'list'};
+    my $table = $res->{'table'};
+    my $isDis = 0;
+
+    if ($#$table > 0) {
+        foreach (@$img) {
+#print "$_\n";
+#            if ($_ =~ m/^http:\/\/upload\.wikimedia\.org\/wikipedia\/commons\/thumb\/.*\/.*\/Disambig/) {
+            if ($_ =~ m/Disambig/i) {
+                $isDis = 1;
+                last;
+            }
+        }
+    }
+    if (!$isDis) {
+        $text = decode_entities($$text[0]);
+        $text =~ s/\([^\(\)]*\)|\[[^\[\]]*\]//g;
+        $text =~ s/\([^\(\)]*\)|\[[^\[\]]*\]//g;
+        $text =~ s/\([^\(\)]*\)|\[[^\[\]]*\]//g;
+        $text =~ s/\([^\(\)]*\)|\[[^\[\]]*\]//g;
+        $text =~ s/\s+/ /g;
+        $text =~ s/\s([,.\?!])/$1/g;
+
+        if ($text =~ m/.{448}.*/) {
+            $text =~ s/^(.{448}).*$/$1/;
+            $text =~ s/^(.*[\.!\?])[^\.!\?]*$/$1 (...)/;
+        }
+
+        print $text, "\n";
+    } else {
+        for ($count = 0; $count < 3 && $count <= $#$list; $count++) {
+            print "$$list[$count]\n";
+        }
+        print "For more see $origurl\n";
+    }
+
+} else {
+# no article, show search results
+
+    $scrap = scraper {
+        process '//div[@class="searchresult"]', 'text[]' => 'TEXT';
+        process '//ul[@class="mw-search-results"]/li/div/a', 'href[]' => '@href';
+    };
+    $url = URI->new($wikiurl);
+
+    my $res = $scrap->scrape($url);
+    if (keys(%$res)) {
+        my $text = $res->{'text'};
+        my $href = $res->{'href'};
+        my $result = "";
+        for ($count = 0; $count < 5 && $count <= $#$text; $count++) {
+            $result = ($result?"$result || ":"").$$href[$count];
+        }
+        print "$result\n";
+    } else {
+        print "No matches with $ARGV[0]\n";
+    }
+}
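The script is run as `perl wiki_export.pl <search term> [language code]`, with the language defaulting to `de` when the second argument is missing. Below is a minimal standalone sketch of the Web::Scraper pattern the script is built around, assuming Web::Scraper, URI and network access are available; the article title and language code in it are example values only and are not taken from the commit.

```perl
#!/usr/bin/perl
# Minimal sketch of the scraping step used by wiki_export.pl:
# fetch a Wikipedia article and print its first body paragraph.
use strict;
use warnings;
use Web::Scraper;
use URI;

my $lang    = "de";      # example value; the script reads this from $ARGV[1]
my $article = "Perl";    # example value; the script reads the search term from $ARGV[0]

# Same XPath selector the script uses for the article body paragraphs.
my $scrap = scraper {
    process '//div[@id="bodyContent"]/p', 'text[]' => 'TEXT';
};

my $res  = $scrap->scrape( URI->new("http://$lang.wikipedia.org/wiki/$article") );
my $text = $res->{'text'};

binmode(STDOUT, ":utf8");
print $$text[0], "\n" if $text && @$text;
```

The same `process '<xpath>', 'key[]' => 'TEXT'` calls are how wiki_export.pl collects paragraphs, image URLs, list items and table cells from the article page, and hrefs from the search-results page.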