author    lookshe <lookshe@fumuga.com>  2012-09-09 21:02:03 +0200
committer lookshe <lookshe@fumuga.com>  2012-09-09 21:02:03 +0200
commit    009a87ffad8d3ea804dba65aedef7fddec7aa6d2 (patch)
tree      4225ac012adf5eecca15f81756f69a16922e383e /wiki_export.pl
parent    e92f4f4425e9afab798910880c146567c22898c8 (diff)
don't know what changed, but there's something ;-)
Diffstat (limited to 'wiki_export.pl')
-rw-r--r--	wiki_export.pl	118
1 file changed, 118 insertions, 0 deletions
diff --git a/wiki_export.pl b/wiki_export.pl
new file mode 100644
index 0000000..b41a341
--- /dev/null
+++ b/wiki_export.pl
@@ -0,0 +1,118 @@
+#!/usr/bin/perl
+
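+# Usage: perl wiki_export.pl <search term> [language code]
+# Looks up the term on Wikipedia (default language: de) and prints a short
+# plain-text summary of the article, the first entries of a disambiguation
+# page, or the links from the search results page.
+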
+use strict;
+use warnings;
+use Web::Scraper;
+use URI;
+use HTML::Entities;
+use Encode;
+use URI::Escape;
+use LWP::UserAgent;
+use HTTP::Request;
+
+my $scrap;
+
+my $lang = $ARGV[1];
+if (!$lang) {
+ $lang = "de";
+}
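+# "go=Go" makes MediaWiki jump straight to the article when the term matches
+# a title; otherwise we end up on the search results page.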
+my $wikiurl = "http://$lang.wikipedia.org/wiki/Special:Search?search=" . uri_escape($ARGV[0]) . "&go=Go";
+
+my $ua = LWP::UserAgent->new;
+my $req = HTTP::Request->new('GET', $wikiurl);
+my $res = $ua->request($req);
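+# The final URI after redirects tells us whether the search resolved to an
+# article ("/wiki/<title>") or stayed on the search page.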
+my $url = $res->request->uri;
+my $origurl = $url;
+$url =~ s/.*\/wiki\///;
+
+binmode(STDOUT, ":utf8");
+
+if ($url !~ m/Special:Search/) {
+# article page
+
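+	# Collect paragraph text, image URLs, list items and table cells from
+	# the article body.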
+ $scrap = scraper {
+ process '//div[@id="bodyContent"]/p', 'text[]' => 'TEXT';
+ process '//img', 'img[]' => '@src';
+ process '//div[@id="bodyContent"]/ul/li', 'list[]' => 'TEXT';
+ process '//table/tr/td', 'table[]' => 'TEXT';
+ };
+ $url = URI->new($wikiurl);
+
+ my $res = $scrap->scrape($url);
+ my $text = $res->{'text'};
+ my $img = $res->{'img'};
+ my $list = $res->{'list'};
+ my $table = $res->{'table'};
+ my $isDis = 0;
+
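+	# Heuristic: treat the page as a disambiguation page when the "Disambig"
+	# icon shows up among its images.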
+ if ($#$table > 0) {
+ foreach (@$img) {
+ if ($_ =~ m/Disambig/i) {
+ $isDis = 1;
+ last;
+ }
+ }
+ }
+ if (!$isDis) {
+ $text = decode_entities($$text[0]);
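+		# Strip parenthesized and bracketed asides; one pass removes one
+		# nesting level, so repeat a few times.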
+		for (1 .. 4) {
+			$text =~ s/\([^\(\)]*\)|\[[^\[\]]*\]//g;
+		}
+ $text =~ s/\s+/ /g;
+ $text =~ s/\s([,.\?!])/$1/g;
+
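+		# Hard-truncate to 448 characters, then cut back to the last full
+		# sentence and mark the cut with "(...)".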
+		if (length($text) >= 448) {
+			$text = substr($text, 0, 448);
+			$text =~ s/^(.*[.!?])[^.!?]*$/$1 (...)/;
+		}
+
+ print $text, "\n";
+ } else {
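+		# Disambiguation page: print the first three entries.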
+		for (my $count = 0; $count < 3 && $count <= $#$list; $count++) {
+ print "$$list[$count]\n";
+ }
+ print "For more see $origurl\n";
+ }
+
+} else {
+# no direct hit: we are on the search results page
+
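+	# Scrape the result snippets and the links to the matching articles.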
+ $scrap = scraper {
+ process '//div[@class="searchresult"]', 'text[]' => 'TEXT';
+ process '//ul[@class="mw-search-results"]/li/div/a', 'href[]' => '@href';
+ };
+ $url = URI->new($wikiurl);
+
+ my $res = $scrap->scrape($url);
+ if (keys(%$res)) {
+ my $text = $res->{'text'};
+ my $href = $res->{'href'};
+ my $result = "";
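+		# Join the first five result links with " || ".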
+		for (my $count = 0; $count < 5 && $count <= $#$text; $count++) {
+			$result = ($result ? "$result || " : "") . $$href[$count];
+		}
+ print "$result\n";
+ } else {
+		print "No matches for $ARGV[0]\n";
+ }
+}