path: root/wiki_export.pl
#!/usr/bin/perl
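# wiki_export.pl - print the introduction of a Wikipedia article as plain
# text. A Special:Search URL resolves the search term: a unique match
# redirects straight to the article, anything else stays on the results
# page. Disambiguation pages yield their first entries; failed searches
# yield up to five result links.
#
# Usage: wiki_export.pl <search term> [language code, default "de"]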

use strict;
use warnings;

use Web::Scraper;
use URI;
use HTML::Entities;
use URI::Escape;
use LWP::UserAgent;
use HTTP::Request;

my $scrap;

die "Usage: $0 <search term> [language]\n" unless defined $ARGV[0];

my $lang = $ARGV[1] || "de";

# Escape the search term so spaces and non-ASCII characters survive in the
# URL; use https, which Wikipedia enforces anyway.
my $wikiurl = "https://$lang.wikipedia.org/wiki/Special:Search?search="
            . uri_escape($ARGV[0]) . "&go=Go";

my $ua = LWP::UserAgent->new;
my $req = HTTP::Request->new('GET', $wikiurl);
my $res = $ua->request($req);

# LWP follows the redirect; the final URL tells us where we ended up.
my $url = $res->request->uri->as_string;
my $origurl = $url;
$url =~ s/.*\/wiki\///;   # reduce to the page title

# Wikipedia serves UTF-8; keep STDOUT in UTF-8 so non-ASCII text survives.
binmode(STDOUT, ":utf8");

# A unique match means Special:Search redirected straight to the article;
# otherwise the final URL is still a Special:Search results page.
if ($url !~ m/Special:Search/) {
# article

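   # Pull the intro paragraphs, every image (needed for the disambiguation
   # check), list items, and table cells out of the rendered article.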
   $scrap = scraper {
      process '//div[@id="bodyContent"]/p', 'text[]' => 'TEXT';
      process '//img', 'img[]' => '@src';
      process '//div[@id="bodyContent"]/ul/li', 'list[]' => 'TEXT';
      process '//table/tr/td', 'table[]' => 'TEXT';
   };
   # Scrape the article page we were already redirected to, which skips a
   # second round trip through Special:Search.
   $url = URI->new($origurl);

   my $res = $scrap->scrape($url);
   my $text  = $res->{'text'}  || [];
   my $img   = $res->{'img'}   || [];
   my $list  = $res->{'list'}  || [];
   my $table = $res->{'table'} || [];
   my $isDis = 0;

   # Disambiguation pages embed a "Disambig" marker icon; only scan the
   # images when the page has more than one table cell.
   if (@$table > 1) {
      foreach (@$img) {
         if (m/Disambig/i) {
            $isDis = 1;
            last;
         }
      }
   }
   if (!$isDis) {
      # Take the first intro paragraph and strip footnote markers ("[1]")
      # and parenthesised asides; repeat until nothing matches so nested
      # brackets are peeled away layer by layer.
      $text = decode_entities($$text[0] || "");
      1 while $text =~ s/\([^()]*\)|\[[^\[\]]*\]//g;
      $text =~ s/\s+/ /g;
      $text =~ s/\s([,.?!])/$1/g;

      # Truncate long intros to 448 characters, cutting back to the last
      # complete sentence and marking the cut.
      if (length($text) > 448) {
         $text = substr($text, 0, 448);
         $text =~ s/^(.*[.!?])[^.!?]*$/$1 (...)/;
      }

      print $text, "\n";
   } else {
      # Disambiguation page: show the first three entries and point at the
      # full list.
      for (my $count = 0; $count < 3 && $count <= $#$list; $count++) {
         print "$$list[$count]\n";
      }
      print "For more see $origurl\n";
   }

} else {
# no article

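   # Grab the result snippets and the link of each matching article.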
   $scrap = scraper {
      process '//div[@class="searchresult"]', 'text[]' => 'TEXT';
      process '//ul[@class="mw-search-results"]/li/div/a', 'href[]' => '@href';
   };
   $url = URI->new($wikiurl);

   my $res = $scrap->scrape($url);
   if (keys(%$res)) {
      my $href = $res->{'href'} || [];
      my $result = "";
      # Join up to five result links with " || " separators.
      for (my $count = 0; $count < 5 && $count <= $#$href; $count++) {
         $result = ($result ? "$result || " : "") . $$href[$count];
      }
      print "$result\n";
   } else {
      print "No matches with $ARGV[0]\n";
   }
}
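
# Typical invocations (a usage sketch; output depends on the live wiki):
#   ./wiki_export.pl Perl en
#   ./wiki_export.pl Kamelle        # language defaults to "de"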