User:Ap/LaTeX conversion utility

From Wikipedia, the free encyclopedia
#!/usr/bin/perl -w
##
## Copyright (C) 2003 Arno W. Peters.
## released under GNU GPL version 2 or higher.

use strict;
use DBI();

## Print the LaTeX preamble to the currently selected output handle:
## document class and packages, the \sup and \sub helper macros, an active
## "/" that allows a line break after every slash, and \begin{document}.
sub texheader {
  my @preamble = (
    '\documentclass[10pt,english,a4paper,twocolumn]{book}',
    '\usepackage[latin1]{inputenc}',
    '\usepackage[T1]{fontenc}',
    '\usepackage{geometry}',
    '\usepackage[cm]{fullpage}',
    '\usepackage{babel}',
    '',
    '\def\sup#1{\ensuremath{^#1}}',
    '\def\sub#1{\ensuremath{_#1}}',
    '\def\slash{/}',
    '\catcode`\/=\active',
    '\def/{\slash\discretionary{}{}{}}',
    '',
    '\begin{document}',
  );

  ## One print per preamble line, each terminated by a newline.
  print "$_\n" for @preamble;
}

## Print the line that closes the LaTeX document opened by the preamble.
sub texfooter {
  my $closing = '\end{document}' . "\n";
  print $closing;
}

## Replacement text for every named HTML character entity that article2tex
## understands, keyed by the entity name without the surrounding '&' and ';'.
## Capital Greek letters that look like a Latin capital have no LaTeX macro
## and are therefore written as that Latin letter.
my %tex_for_entity = (
  ## Accented Latin letters.
  'euml'   => q{\"e},
  'ouml'   => q{\"o},
  'uuml'   => q{\"u},
  'iuml'   => q{\"{\i}},
  'aacute' => q{\'a},
  'eacute' => q{\'e},
  'egrave' => q{\`e},
  'Eacute' => q{\'E},

  ## Entities whose LaTeX math macro has the same name as the entity.
  (map { ($_, '\ensuremath{\\' . $_ . '}') }
     qw(alpha beta gamma Gamma delta Delta epsilon zeta eta theta Theta
        iota kappa lambda Lambda mu nu xi Xi pi Pi rho sigma Sigma tau
        upsilon Upsilon phi Phi chi psi Psi omega Omega
        forall cup int)),

  ## Capital Greek letters typeset as Latin capitals.
  'Alpha'   => 'A',
  'Beta'    => 'B',
  'Epsilon' => 'E',
  'Zeta'    => 'Z',
  'Eta'     => 'H',
  'Iota'    => 'I',
  'Kappa'   => 'K',
  'Mu'      => 'M',
  'Nu'      => 'N',
  'Rho'     => 'P',
  'Tau'     => 'T',
  'Chi'     => 'X',

  ## Miscellaneous symbols.
  'sup2'   => '\sup{2}',
  'nbsp'   => '~',
  'deg'    => '\ensuremath{^\circ}',
  'lt'     => '\ensuremath{<}',
  'gt'     => '\ensuremath{>}',
  'le'     => '\ensuremath{\leq}',
  'ge'     => '\ensuremath{\geq}',
  'uarr'   => '\ensuremath{\uparrow}',
  'darr'   => '\ensuremath{\downarrow}',
  'rarr'   => '\ensuremath{\rightarrow}',
  'larr'   => '\ensuremath{\leftarrow}',
  'isin'   => '\ensuremath{\in}',
  'mdash'  => '---',
  'clubs'  => '\ensuremath{\clubsuit}',
  'spades' => '\ensuremath{\spadesuit}',
  'diams'  => '\ensuremath{\diamondsuit}',
  'hearts' => '\ensuremath{\heartsuit}',
);

## Attribute-less inline HTML tags and the LaTeX text that opens the matching
## group; every corresponding closing tag becomes a single "}".
my @tex_for_inline_tag = (
  [ 'strong', '\textbf{' ],
  [ 'b',      '\textbf{' ],
  [ 'i',      '\emph{'   ],
  [ 'var',    '\emph{'   ],
  [ 'emph',   '\emph{'   ],
  [ 'sup',    '\sup{'    ],
  [ 'sub',    '\sub{'    ],
  [ 'small',  '{\small ' ],
);

## Convert one wiki article to LaTeX and print it to the currently selected
## output handle as an unnumbered \section*.
##
## Arguments:
##   $title - article title; underscores become spaces and the characters
##            & # % $ are escaped for TeX.
##   $text  - article source (wiki markup mixed with HTML); an undefined
##            value is treated as an empty article.
##
## Returns nothing useful.  The substitutions below are order dependent:
## TeX special characters already present in the text are escaped first, so
## that the braces and backslashes introduced by later rules stay intact.
## <math> sections get no special treatment.
sub article2tex {
  my ($title, $text) = @_;

  $text = '' unless defined $text;

  $title =~ s/_/ /g;
  $title =~ s/([#%&\$])/\\$1/g;

  ## Neutralise characters that are special to TeX.  Backslashes are turned
  ## into slashes, and braces are escaped BEFORE the caret rule because that
  ## rule emits braces of its own.
  $text =~ s/\r//g;
  $text =~ s/\\/\//g;
  $text =~ s/\$/\\\$/g;
  $text =~ s/([{}])/\\$1/g;
  $text =~ s/\^/\\^{ }/g;

  ## Named character entities; unknown ones are left alone and get their
  ## ampersand escaped further down.
  $text =~ s/&(\w+);/exists $tex_for_entity{$1} ? $tex_for_entity{$1} : "&$1;"/ge;

  $text =~ s/^#redirect/See /i;

  $text =~ s/([&#%_])/\\$1/g;

  ## Wiki emphasis, longest quote run first.
  $text =~ s/''''''(.+?)''''''/\{\}$1\{\}/mg;
  $text =~ s/'''''(.+?)'''''/\\emph{\\textbf{$1}}/mg;
  $text =~ s/''''(.+?)''''/\\textbf{'$1'}/mg;
  $text =~ s/'''(.+?)'''/\\textbf{$1}/mg;
  $text =~ s/''(.+?)''/\\emph{$1}/mg;

  ## Ignore verbatim 
  ## $text =~ s/^ (.*)$/\\begin{verbatim}$1\n\\end{verbatim}/mg;

  ## Wiki headings, deepest level first, and number ranges.
  $text =~ s/^=====(.*)=====/\\subparagraph*{$1}\n/mg;
  $text =~ s/^====(.*)====/\\paragraph*{$1}\n/mg;
  $text =~ s/^===(.*)===/\\subsubsection*{$1}\n/mg;
  $text =~ s/^==(.*)==/\\subsection*{$1}\n/mg;
  $text =~ s/(\d)\-(\d)/$1--$2/mg;

  ## Wiki lists and indentation.  The "#" of a numbered list has already
  ## been escaped to "\#" at this point.
  $text =~ s/^\*\*\*/\\par\\noindent\\hangindent=6em\\hskip5em\\llap{\\ensuremath{\\bullet}}\\quad /mg;
  $text =~ s/^\*\*/\\par\\noindent\\hangindent=4em\\hskip3em\\llap{\\ensuremath{\\bullet}}\\quad /mg;
  $text =~ s/^\*/\\par\\noindent\\hangindent=2em\\quad\\llap{\\ensuremath{\\bullet}}\\quad /mg;
  $text =~ s/^:::/\\par\\noindent\\hangindent=6em\\hskip6em /mg;
  $text =~ s/^::/\\par\\noindent\\hangindent=4em\\hskip4em /mg;
  $text =~ s/^:/\\par\\noindent\\hangindent=2em\\qquad /mg;
  $text =~ s/^\\#\\#/\\par\\noindent\\hangindent=4em\\hskip3em\\llap{\\#}\\quad /mg;
  $text =~ s/^\\#/\\par\\noindent\\hangindent=2em\\quad\\llap{\\#}\\quad /mg;

  ## HTML list items and list/paragraph containers.
  $text =~ s/<li[^>]*?>/\\par\\noindent\\qquad /ig;
  $text =~ s/<\/li>/ /ig;
  $text =~ s/<dd[^>]*?>/\\par\\noindent\\qquad /ig;
  $text =~ s/<\/dd>/ /ig;

  $text =~ s/<ul[^>]*?>/\n\n/ig;
  $text =~ s/<\/ul>/ /ig;
  $text =~ s/<dl[^>]*?>/\n\n/ig;
  $text =~ s/<\/dl>/ /ig;
  $text =~ s/<ol[^>]*?>/\n\n/ig;
  $text =~ s/<\/ol>/ /ig;
  $text =~ s/<p[^>]*?>/\n\n/ig;
  $text =~ s/<\/p>/ /ig;

  ## HTML headings.
  $text =~ s/<h1>/\\subsection*{/ig;
  $text =~ s/<h2>/\\subsubsection*{/ig;
  $text =~ s/<h3>/\\paragraph*{/ig;
  $text =~ s/<h4>/\\subparagraph*{/ig;
  $text =~ s/<\/h\d>/}\n/ig;

  $text =~ s/<blockquote>/\\begin{quotation}\n/ig;
  $text =~ s/<\/blockquote>/\\end{quotation}\n/ig;

  ## Inline formatting tags.
  for my $pair (@tex_for_inline_tag) {
    my ($tag, $opening) = @$pair;
    $text =~ s/<$tag>/$opening/ig;
    $text =~ s/<\/$tag>/}/ig;
  }

  $text =~ s/<br[^>]*?>/\\\\[.5\\baselineskip]/ig;

  ## Horizontal rules, written either as four or more dashes or as <hr>.
  my $rule = '\vskip.25\baselineskip \hbox to\hsize{\hfil\vrule width5cm height1pt\hfil}\vskip.25\baselineskip ';
  $text =~ s/----+/$rule/g;
  $text =~ s/<hr[^>]*?>/$rule/ig;

  ## Ignore tables for now: every table tag simply becomes a space.
  $text =~ s/<table[^>]*?>/ /ig;
  $text =~ s/<\/table>/ /ig;
  $text =~ s/<tr[^>]*?>/ /ig;
  $text =~ s/<\/tr>/ /ig;
  $text =~ s/<td[^>]*?>/ /ig;
  $text =~ s/<\/td>/ /ig;
  $text =~ s/<th[^>]*?>/ /ig;
  $text =~ s/<\/th>/ /ig;

  $text =~ s/<center[^>]*?>/\\begin{center}\n/ig;
  $text =~ s/<\/center>/\\end{center}\n/ig;

  $text =~ s/<div[^>]*?>/ /ig;
  $text =~ s/<\/div>/ /ig;
  $text =~ s/<font[^>]*?>//ig;
  $text =~ s/<\/font>//ig;

  $text =~ s/²/\\sup{2}/g;
  $text =~ s/³/\\sup{3}/g;

  ## Numeric entities for apostrophe, en dash and em dash.  Their "&" and
  ## "#" were escaped above, so the escaped spelling is what must be matched.
  $text =~ s/\\&\\#146;/'/g;
  $text =~ s/\\&\\#150;/--/g;
  $text =~ s/\\&\\#151;/---/g;

  print "\\section*{$title}\n\n";
  print "\\begingroup\n";
  print $text;
  print "\\par\\endgroup\n";
  print "\n\n";
}

## Main program: print a complete LaTeX document containing every article
## in namespace 0 whose title starts with the prefix given as the first
## command-line argument.

# Validate the command line before touching the database.
die "No argument specified" if $#ARGV == -1;
my $letter = shift;

# Connect to the database.
my $dbh = DBI->connect("DBI:mysql:database=wikipedia-en;host=localhost",
                       "arno", "",
                       {'RaiseError' => 1});

print STDERR "Generating all articles starting with $letter.\n";

# Now retrieve data from the table.  The title prefix is passed as a bound
# parameter instead of being interpolated into the SQL text, so quotes in
# the argument cannot alter the statement.
my $sth = $dbh->prepare("SELECT cur_title, cur_text FROM cur " .
                        "WHERE cur_title LIKE ? and cur_namespace = 0");
$sth->execute($letter . '%');
texheader();
while (my $ref = $sth->fetchrow_hashref) {
  article2tex($ref->{'cur_title'}, $ref->{'cur_text'});
}
$sth->finish;
texfooter();

# Disconnect from the database.
$dbh->disconnect;