User:Eloquence/Wikinfo import script

From Wikipedia, the free encyclopedia

Copy this from the source, not from here.

This is the first pre-release of the new import script. It runs as a local web server on port 8450 and does some cool stuff:

  1. Filters Special:Newpages for new articles that are not "from Wikipedia"
  2. Does not require a cookies.txt - uses its own account. Works with Wikinfo's new login requirement
  3. Auto-generates diffs of new articles that exist in both Wikinfo and Wikipedia
  4. Strips signatures from titles when importing

Note that to do all this it has to load quite a few pages, which takes some time for a full set of 500 new pages.

To do:

  • Cache previous runs in wikinfo.db so we don't have to wait 60 seconds
  • Use Special:Export

If you want to use this in some way or another, make sure you first install all the modules it uses from CPAN.

#!/usr/bin/perl
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Daemon;
use HTTP::Status;
use HTTP::Response;
use URI::Escape;
use Text::ParagraphDiff;
use GDBM_File ;
# --- Start-up: persistent store, HTTP client, wiki logins, local daemon ---

# GDBM-backed hash tied to wikinfo.db. The code visible in this file only
# ties and unties it; nothing reads or writes entries yet.
tie(%storage, 'GDBM_File', "wikinfo.db", GDBM_WRCREAT(), 0640)
    or die "Cannot open wikinfo.db: $!\n";
$SIG{INT} = \&catch_zap;  # untie the database before dying on Ctrl-C

# NOTE(review): presumably key prefixes for the planned wikinfo.db cache;
# they are not referenced anywhere else in the visible code.
$WKPREFIX="WIKINFO_";
$WKSPREFIX="WIKINFOSIZE_";
$LCPREFIX="LASTCHECK_";
$WPPREFIX="WIKIPEDIA_";
$DIPREFIX="DIFF_";

# Shared user agent. The in-memory cookie jar keeps the login sessions
# alive for all later requests.
$browser=LWP::UserAgent->new();
$browser->cookie_jar( {} );

# Browser-like request headers sent with every request. The Accept value
# is kept on a single line so that no line break ends up inside the header.
@ns_headers = (
   'User-Agent' => 'Mozilla/4.76 [en] (Win98; U)',
   'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
   'Accept-Charset' => 'iso-8859-1,*,utf-8',
   'Accept-Language' => 'en-US',
);

# Log in to both wikis with the script's own account.
my @login_form = (wpName=>"Testuser",wpPassword=>"testpass",wpRemember=>"1",wpLoginAttempt=>"LOG IN");
for my $login_url (
    "http://www.wikinfo.org/wiki.phtml?title=Special:Userlogin&action=submit",
    "http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submit",
) {
    my $login = $browser->post($login_url,@ns_headers,Content=>[@login_form]);
    # Only 4xx/5xx counts as failure here: a redirect answer to the login
    # POST is not an error.
    warn "Login request to $login_url failed: ".$login->status_line."\n"
        if $login->is_error;
}

# Local web front end. $d stays a package global because get_wikinfo_new()
# uses $d->url to build the import links.
$d=HTTP::Daemon->new(LocalHost=>'localhost', LocalPort => '8450', Reuse=>1)
    or die "Cannot start HTTP daemon on localhost:8450: $@\n";
print "Please contact me at: ".$d->url. "\n";

# Uncomment to run a single listing pass without serving requests:
#get_wikinfo_new();
#exit 0;

# Main request loop: one HTTP request is handled per accepted connection.
#   GET /          -> table of new Wikinfo pages returned by get_wikinfo_new()
#   GET /<title>   -> import_wikinfo(<title>) and show its report
#   anything else  -> 403 Forbidden
while (my $conn = $d->accept) {
    my $request = $conn->get_request;

    # Page header shared by the listing view and the import view.
    my $html = <<HTML;
<html>
<head>
<style TYPE="text/css">
<!--
body { margin-left:2em;margin-right:2em;background:#eeeeee;}
a { text-decoration:none;color:blue;}
a.ext { color:green;cursor:help; }
-->
</STYLE>
</head>

<body>
<h1>Wikinfo Import Script</h1>
HTML

    # get_request returns nothing for an unreadable request; in that case
    # the connection is simply closed below.
    if ($request) {
        my $path = $request->uri->path;

        if ($request->method ne 'GET') {
            $conn->send_error(RC_FORBIDDEN);
        }
        elsif ($path eq "/favicon.ico") {
            # Browsers request this on their own. Without this guard it
            # would be handled as an import of a page named "favicon.ico".
            $conn->send_error(RC_NOT_FOUND);
        }
        elsif ($path eq "/") {
            $html .= <<HTML;
<table border="1" width="100%">
<tr><td><b>Wikinfo page</B></td><td><b>Corresponding Wikipedia page</B></td><td><b>Import</B></td></tr>
HTML

            # get_wikinfo_new() returns a flat list with eight fields per
            # page, consumed here one record at a time.
            my @fields = get_wikinfo_new();
            while (my ($linkopen, $linktitle, $linkclose, $bytes,
                       $comment, $wikipedia, $diff, $import) = splice(@fields, 0, 8)) {
                $html .= "<tr valign='top'><td>".$linkopen .$linktitle. $linkclose ." (".$bytes." bytes)";
                if ($comment) { $html .= "<br><I>$comment</I>"; }
                $html .= "</td><td>$wikipedia</td><td>$import</td></tr>";

                # "N/A" marks pages for which no diff was produced.
                if (defined $diff and $diff ne "N/A") {
                    $html .= "<tr><td colspan=3 bgcolor=\"#dddddd\"><b>Diff:</B><P><font size=-1>".
                             "$diff</font></td></tr>";
                }
            }

            $html .= <<HTML;
</table>
</body>
</html>
HTML

            my $reply = HTTP::Response->new(RC_OK);
            $reply->header("Content-Type" => "text/html");
            $reply->content($html);
            $conn->send_response($reply);
        }
        else {
            # Everything after the leading slash is the (still URL-encoded)
            # Wikinfo page title to import.
            my $page = substr($path, 1);
            $html .= import_wikinfo($page);
            $html .= "</body></html>";

            my $reply = HTTP::Response->new(RC_OK);
            $reply->header("Content-Type" => "text/html");
            $reply->content($html);
            $conn->send_response($reply);
        }
    }

    $conn->close;  # one request per connection
}

# Scan Wikinfo's Special:Newpages and report every new page whose edit
# comment does not mark it as "from Wikipedia (note changes here)".
#
# Returns a flat list with eight elements per reported page, in this order:
#   link open tag, link title, link close tag, size in bytes, edit comment,
#   Wikipedia cell (HTML), diff (HTML, or "N/A"), import cell (HTML, or "N/A").
# Returns an empty list when the new-pages listing cannot be found.
#
# Reads the globals $browser, @ns_headers and $d (for the import link base).
sub get_wikinfo_new {

	my $response = $browser->get(
	"http://www.wikinfo.org/wiki.phtml?title=Special:Newpages&limit=500&offset=0",
	@ns_headers);

	# No <ol> listing (fetch failed or layout changed): nothing to report.
	# Checking the match also keeps a stale $1 from being split below.
	return unless $response->content=~m/<ol start=.*?>(.*?)<\/ol>/s;
	my @entries=split(/<LI>/i,$1);
	print $#entries;  # console progress hint: index of the last list chunk

	my @checklines;

	foreach my $line (@entries) {
		next unless $line=~m/(.*?)(<a href.*?>)(.*?)(<\/a>).*?\((.*?) bytes\)/i;
		my ($linkopen,$linktitle,$linkclose,$bytes)=($2,$3,$4,$5);

		my $comment=($line=~m/<em>\((.*)\)<\/em>/i) ? $1 : "";

		# Pages already flagged as Wikipedia imports are skipped.
		next if $comment=~m/from wikipedia \(note changes here\)/i;

		my $wikititle=to_url($linktitle);
		my $pediatitle=to_url(strip_sig($linktitle));

		my $pediaurl="http://en.wikipedia.org/w/wiki.phtml?title=".$pediatitle .
		  "&action=edit";
		my $pediasource=_edit_box_source($pediaurl);

		my ($wikipedia_cell,$diff_cell,$import_cell);
		if($pediasource=~m/\w+/) {

			# The page exists on Wikipedia too: show a diff instead of
			# offering an import.
			$wikipedia_cell="<a href='$pediaurl'>$pediatitle</a>";

			my $wikinfosource=_edit_box_source(
				"http://www.wikinfo.org/wiki.phtml?title=" .
				$wikititle .
				"&action=edit");
			my $diff=text_diff($pediasource, $wikinfosource, {string=>1, plain=>1, escape=>1});
			# Keep only the paragraph part when present; otherwise use
			# the diff output as it is.
			$diff=$1 if $diff=~m/<p>(.*)<\/p>/si;
			$diff=~s/ size="\+1">/>/gi;

			$diff_cell=$diff;
			$import_cell="N/A"; # exists, no import possible

		} else {

			$wikipedia_cell="<a href='http://en.wikipedia.org/wiki/$pediatitle'>N/A</A>"; # no Wikipedia URL
			$diff_cell="N/A"; # no diff
			my $importurl=$d->url.$wikititle;
			$import_cell="<a href='$importurl'>Go!</a>";

		}

		push @checklines,
			$linkopen,$linktitle,$linkclose,$bytes,$comment,
			$wikipedia_cell,$diff_cell,$import_cell;
	}

	return @checklines;
}

# Fetch an edit page and return the text inside its <textarea>, or the empty
# string when the response contains no textarea.
sub _edit_box_source {
	my $url=shift;
	my $response=$browser->get($url,@ns_headers);
	return $response->content=~m/<textarea.*?>(.*)<\/textarea>/is ? $1 : "";
}

# Copy one Wikinfo article to Wikipedia, if Wikipedia has no page of that
# name yet.
#
# Parameter: the URL-encoded Wikinfo page title (as taken from the request
#            path).
# Returns:   an HTML fragment describing what happened (page not found,
#            posted, posting failed, or page already exists).
#
# Reads the globals $browser and @ns_headers.
sub import_wikinfo {

	my $title=shift;
	my $editurl="http://www.wikinfo.org/wiki.phtml?title=".$title."&action=edit";
	my $viewurl="http://www.wikinfo.org/wiki.phtml?title=".$title;
	my $pagetitle=to_wiki($title);
	my $pediaurl=to_url(strip_sig($pagetitle));
	my $rv;

	# The wikitext is taken from the edit form. A response without a
	# textarea yields an empty source rather than a stale regex capture.
	my $response = $browser->get($editurl,@ns_headers);
	my $source = $response->content=~m/<textarea.*?>(.*)<\/textarea>/is ? $1 : "";

	# The edit form HTML-escapes the wikitext; turn it back into plain text.
	# &amp; goes last so that e.g. "&amp;lt;" ends up as "&lt;", not "<".
	$source=~s/&quot;/"/gi;
	$source=~s/&gt;/>/gi;
	$source=~s/&lt;/</gi;
	$source=~s/&amp;/&/gi;

	if(!($source=~m/\w+/)) {
		$rv.= "The page with the specified title was not found: <A HREF='$viewurl'>$viewurl</A> (<a href='$editurl'>edit</a>)";
		return $rv;
	}

	# Attribution footer appended to the imported text.
	$source.="\n\n''Adapted from the [[Wikinfo]] article [$viewurl $pagetitle], licensed under the [[GNU Free Documentation License]].''";

	$rv.="Checking for duplicate of <A HREF='$viewurl'>$viewurl</A>..<P>";

	my $tryurl="http://en.wikipedia.org/wiki/".$pediaurl;
	$response=$browser->get($tryurl,@ns_headers);
	if($response->content=~m/There is currently no text in this page/) {

		my $wpurl="http://en.wikipedia.org/w/wiki.phtml?title=".$pediaurl."&action=submit";
		my $posted=$browser->post($wpurl,@ns_headers,Content=>
		[
 		wpTextbox1=>$source,
 		wpSave=>"Save page",
 		wpSummary=>"Imported from Wikinfo via [[User:Eloquence/Wikinfo import script]]"
		]);

		# Report success only after the request went through. A redirect
		# answer is not an error, so only 4xx/5xx counts as failure.
		if($posted->is_error) {
			$rv.="Posting to <a href='$tryurl'>$tryurl</A> failed: ".$posted->status_line."<P>";
		} else {
			$rv.="Posted new article to <a href='$tryurl'>$tryurl</A>!<P>";
		}
	} else {

  		$rv.="Page already exists on Wikipedia: <a href='$tryurl'>$tryurl</A>! You have to merge by hand. :-("

	}

	return $rv;

}

    # Signal handler: release the tied database hash, then terminate with a
    # message that names the received signal.
    sub catch_zap {
        my ($signal) = @_;

        untie %storage;
        die "Program terminated: Received $signal";
    }

    # Remove a trailing " by <author>" signature from a page title.
    #
    # Parameter: the page title.
    # Returns:   the title without the signature; titles that do not end in
    #            a known author signature are returned unchanged.
    sub strip_sig {
        my $title = shift;

        # Lexical loop variables: no package globals are touched by a call.
        for my $author ("Levan Urushadze", "Fred Bauder") {
            # \Q...\E keeps the author name a literal string in the pattern.
            $title =~ s/ by \Q$author\E$//;
        }
        return $title;
    }

    # Convert a page title into the form used in wiki URLs: spaces become
    # underscores and the result is URI-escaped.
    sub to_url {
        my ($page_name) = @_;

        $page_name =~ s/ /_/g;
        my $encoded = uri_escape($page_name);

        # Some URI::Escape versions leave apostrophes unescaped; make sure
        # they are always encoded.
        $encoded =~ s/'/%27/g;

        return $encoded;
    }

    # Convert a URL-form page title back into a readable title: undo the
    # URI escaping, then turn underscores back into spaces.
    sub to_wiki {
        my ($url_title) = @_;

        my $readable = uri_unescape($url_title);

        # Any %27 still present after unescaping becomes an apostrophe.
        $readable =~ s/%27/'/g;
        $readable =~ s/_/ /g;

        return $readable;
    }