User:Hendrik Brummermann/XHTMLDumper.java


//

// This work is licensed under CC-BY
// (Creative Commons License - Attribution 2.0).
// see: http://creativecommons.org/licenses/by/2.0/

// You need the program "tidy" in your system's search path.

/*
 * $Log: XHTMLDumper.java,v $
 * Revision 1.8  2005/01/08 12:01:30  nhb
 * Fixing invocation of wget
 *
 * Revision 1.7  2004/12/11 18:08:28  nhb
 * Store output of tidy into a file instead of reading it directly from stdout.
 * Do not depend on node.toString() dumping the whole XML tree.
 *
 * Revision 1.6  2004/09/28 19:50:46  nhb
 * Bugfix: removed the duplicate head element and set the left column to 0 in the inline stylesheet
 *
 * Revision 1.5  2004/09/28 19:09:55  nhb
 * - Script-based downloading of all links on a page.
 * - The head section is replaced.
 * - Image URLs are rewritten accordingly.
 * - Output of a wget script for images
 * - no more re-encoding from UTF-8 to ISO-8859-1.
 *
 * Revision 1.4  2004/08/29 18:12:50  nhb
 * New class: Book
 *
 * Revision 1.3  2004/08/28 21:27:26  nhb
 * *** empty log message ***
 *
 * Revision 1.2  2004/08/28 08:16:09  nhb
 * Refactoring
 *
 * Revision 1.1  2004/08/23 22:16:56  nhb
 * initial check-in
 *
 */
package nhb.wikipedia;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringWriter;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.xpath.XPathAPI;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

/**
 * fetches a collection of articles to disk.
 * It can rewrite links and remove the MediaWiki navigation.
 * Several articles can be combined into one file.
 *
 * @author Hendrik Brummermann <nhb_web@nexgo.de>
 * @link http://creativecommons.org/licenses/by/2.0/
 */
public class XHTMLDumper {

    // --> --> --> --> --> --> --> --> --> --> --> --> --> --> -->
    private static final String URL_PREFIX = "http://localhost:10080"; //"http://wiki";
    private static final String WIKI_PATH = "/mediawiki/index.php/";
    private static final String UPLOAD_PATH = "/mediawiki/images";
    private static final String TARGET = "/tmp/wiki";
    private static final String IMAGE_FOLDER = "wiki_files";
    private static final String ID_SEP = "_____";
    // <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <--
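    // Example (with the defaults above): an article titled "Main_Page" would be
    // fetched from http://localhost:10080/mediawiki/index.php/Main_Page and
    // written to /tmp/wiki/Main_Page.html.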


    public XHTMLDumper() {
    }


    public class Article {
        // variables
        // private final Namespace NS_HTML = Namespace.getNamespace("http://www.w3.org/1999/xhtml");
        private String title = null;
        private String url = null;
        private Element root = null;
        private Element content = null;
        private boolean unifyIDs = false;
        private boolean convertShortTags = true;
        private boolean fetchPageRequisites = true;
        private boolean rewriteLocalURLs = true;
        private Set pageRequisites = new HashSet();
        private String head = "<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\r\n"
    + "<link rel=\"shortcut icon\" href=\"wiki_files/favicon.ico\" />\r\n"
    + "<style type=\"text/css\" media=\"screen,projection\">/*<![CDATA[*/ @import \"wiki_files/main.css\"; /*]]>*/#content {margin: 0}</style>\r\n"
    + "<link rel=\"stylesheet\" type=\"text/css\" media=\"print\" href=\"wiki_files/commonPrint.css\" />\r\n"
    + "<script type=\"text/javascript\" src=\"wiki_files/wikibits.js\"> </script>\r\n"
    + "<title>HISLSF - Dokumentation</title>\r\n"
    + "</head>";
        // <title>DB-Interface-Admin - His</title>

        public Article(String title) {
            this.title = title;
            url = URL_PREFIX + WIKI_PATH + title;
            fetchAsXHTML();
        }

        public void process() {
            extractContent();
            unifyIDsAndConvertShortTags(root);
        }

        /**
         * Stores the file to disk.
         *
         * @throws IOException on an I/O error
         * @throws ParserConfigurationException if the XML parser is misconfigured
         * @throws SAXException on an XML error
         */
        public void saveToDisk() throws SAXException, IOException, ParserConfigurationException {
            process();
            replaceHead();
//            fetchPageRequisites();
            String filename = TARGET + "/" + title.replace(' ', '_').replace('/', '-') + ".html";
            OutputStream of = new FileOutputStream(filename);
            of.write(XMLUtils.dumpXML(root).getBytes("UTF-8"));
            of.close();
        }

        /**
         * Replaces the head element with the fixed head defined above
         * (the original title is not preserved).
         *
         * @throws IOException on an I/O error
         * @throws ParserConfigurationException if the XML parser is misconfigured
         * @throws SAXException on an XML error
         */
        private void replaceHead() throws SAXException, IOException, ParserConfigurationException {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new ByteArrayInputStream(head.getBytes("UTF-8")));
            Node oldHead = null;
            for (int i = 0; i < root.getChildNodes().getLength(); i++) {
                oldHead = root.getChildNodes().item(i);
                if ((oldHead instanceof Element) && ((Element) oldHead).getNodeName().equalsIgnoreCase("head")) {
                    break;
                }
            }
            Node newHead = doc.getDocumentElement();
            newHead = root.getOwnerDocument().importNode(newHead, true);
            root.insertBefore(newHead, oldHead);
            root.removeChild(oldHead);
        }

        /**
         * Downloads an HTML document, converts it to XHTML using tidy
         * and parses it into an XML object tree.
         */
        private void fetchAsXHTML() {
            try {
                // fetch
                String file = NetUtil.fetchDocumentAsFile(url);

                // run tidy
                //"tidy -asxhtml -utf8 $1 >$1.html 2> /dev/null"
//              Process process = Runtime.getRuntime().exec("tidy -q -asxhtml -utf8 " + file);
                String outFile = File.createTempFile("xhtml", ".html").getAbsolutePath();
                // use the array form of exec() so that paths containing spaces are passed as single arguments
                Process process = Runtime.getRuntime().exec(new String[] {"tidy", "-q", "-asxhtml", "-utf8", "-o", outFile, file});
/*                System.out.println("sleeping");
                Thread.sleep(5000);
                System.out.println("sleeped");*/
                process.waitFor();
                //Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(process.getInputStream());
                Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new FileInputStream(outFile));
                root = doc.getDocumentElement();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Extracts the content (i.e. strips the navigation).
         */
        private void extractContent() {
            try {
                content = (Element) XPathAPI.selectSingleNode(root, "//div[@id='content']");

                // remove some elements
//                XMLUtils.removeChildren(root, "//self::node()[@id='contentSub' or @id='siteSub' or @id='toc' or @class='printfooter' or @id='catlinks' or @class='editsection']");
                XMLUtils.removeChildren(root, "//self::node()[@id='column-one' or @id='footer' or @id='contentSub' or @id='siteSub' or @class='printfooter' or @id='catlinks' or @class='editsection']");

            } catch (TransformerException e) {
                e.printStackTrace();
            }
        }

/*
name: top
id: contentTop
id: bodyContent
id: contentSub
*/

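        /**
         * Walks the element tree recursively and, depending on the flags above,
         * prefixes anchor names, IDs and fragment links with the article title,
         * inserts empty text nodes into empty a/div elements so they are not
         * serialized as short tags, strips the wiki path from internal links,
         * and records image URLs, rewriting upload paths to the local image folder.
         *
         * @param element element whose children are processed
         */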
        private void unifyIDsAndConvertShortTags(Element element) {
            NodeList nodes = element.getChildNodes();
            for (int i = 0; i < nodes.getLength(); i++) {
                Node node = nodes.item(i);
                if (node instanceof Element) {
                    Element e = (Element) node;
                    String nodeName = node.getNodeName();

                    // unify IDs
                    if (unifyIDs && "a".equals(nodeName)) {
                        String val = e.getAttribute("name");
                        if (!val.equals("")) {
                            e.setAttribute("name", title + ID_SEP + val);
                            e.setAttribute("id", title + ID_SEP + val);
                        }
                        val = e.getAttribute("href");
                        if ((val.length() > 1) && val.charAt(0) == '#') {
                            e.setAttribute("href", title + ID_SEP + val.substring(1));
                            System.out.println("#" + title + ID_SEP + val.substring(1));
                        }
                    }

                    // convert short tags
                    if (convertShortTags && ("a".equals(nodeName) || "div".equals(nodeName)) && (e.getFirstChild() == null)) {
                        e.appendChild(e.getOwnerDocument().createTextNode(""));
                    }

                    // convert links to other pages
                    if (rewriteLocalURLs && "a".equals(nodeName)) {
                        String val = e.getAttribute("href");
                        if (val.startsWith(WIKI_PATH)) {
                            e.setAttribute("href", val.substring(WIKI_PATH.length()));
                        }
                    }

                    // collect image URLs and rewrite img src links.
                    if (fetchPageRequisites && "img".equals(nodeName)) {
                        String url = e.getAttribute("src");
                        pageRequisites.add(url);
                        if (url.startsWith(UPLOAD_PATH)) {
                            url = IMAGE_FOLDER + url.substring(UPLOAD_PATH.length() + 5);
                            e.setAttribute("src", url);
                        }
                    }

                    // go to the next level
                    unifyIDsAndConvertShortTags((Element) node);
                }
            }
        }

        /**
         * Returns a set of page requisites (like images)
         *
         * @return Set
         */
        public Set getPageRequisites() {
            return pageRequisites;
        }

        /**
         * Returns the XML object: the content element if available, otherwise the document root.
         *
         * @return Element
         */
        public Element getXML() {
            if (content != null) {
                return content;
            } else {
                return root;
            }
        }
    }


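    /**
     * A book is a set of wiki pages collected from the links on a single page.
     */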
    public class Book {
        private Set pages = new HashSet();
        private Set pageRequisites = new HashSet();

        /**
         * creates a new book
         *
         * @param name page containing a list of links
         * @throws IOException
         */
        public Book(String name) throws IOException {
            // fetch wiki text
            BufferedReader br = NetUtil.fetchDocumentAsBufferedReader(URL_PREFIX + WIKI_PATH + name + "?action=raw");
            fetchLinkList(br);
            br.close();
        }

        /**
         * Fetches all pages of this book
         * @throws IOException
         * @throws ParserConfigurationException
         * @throws SAXException
         */
        public void fetchBook() throws SAXException, IOException, ParserConfigurationException {
            Iterator itr = pages.iterator();
            while (itr.hasNext()) {
                String page = (String) itr.next();
                System.out.println("fetching " + page + "...");
                Article article = new Article(page);
                article.saveToDisk();
                pageRequisites.addAll(article.getPageRequisites());
            }
            fetchPageRequisites();
        }

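        /**
         * Prints a wget script that downloads the collected page requisites
         * (images etc.) into the image folder.
         */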
        private void fetchPageRequisites() {
            System.out.println("cd " + TARGET + "/" + IMAGE_FOLDER);
            Iterator itr = pageRequisites.iterator();
            while (itr.hasNext()) {
                System.out.println("wget -N " + URL_PREFIX + itr.next());
            }
        }


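        /**
         * Scans the raw wiki text for [[...]] links and records the linked page names.
         *
         * @param br reader on the raw wiki text
         * @throws IOException on an I/O error
         */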
        private void fetchLinkList(BufferedReader br) throws IOException {
            String line = br.readLine();
            while (line != null) {
                int pos = line.indexOf("[[");
                while (pos > -1) {
                    line = line.substring(pos + 2);
                    int posEnd = line.indexOf("]]");
                    if (posEnd == -1) { // is the link closed?
                        break;
                    }
                    String link = line.substring(0, posEnd);
                    pos = link.indexOf("|");
                    if (pos > -1) {
                        link = link.substring(0, pos);
                    }
                    link = link.trim();
                    if (link.length() > 0) { // ignore empty link targets
                        String page = link.replace(' ', '_');
                        pages.add(page);
                    }

                    // find next link
                    line = line.substring(posEnd + 2);
                    pos = line.indexOf("[[");
                }
                line = br.readLine();
            }
        }
    }


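    /**
     * A cover page: the article body and the navigation are removed,
     * leaving only the page frame.
     */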
    public class Cover {
        private Element root = null;

        public Cover(String name) {
            Article cover = new Article(name);
            root = cover.getXML();
            XMLUtils.removeChildren(root, "//div[@id='content']/*");
            XMLUtils.removeChildren(root, "//div[@id='column-one' or @id='footer']");
            try {
                Element content = (Element) XPathAPI.selectSingleNode(root, "//div[@id='content']");

            } catch (TransformerException e) {
                e.printStackTrace();
            }
        }

        public Element getXML() {
            return root;
        }
    }


    public static class XMLUtils {
        /** hide constructor */
        private XMLUtils() { }

        /**
         * Dumps an XML-tree into a String
         *
         * @param node xml-node
         * @return String
         */
        public static String dumpXML(Node node) {
            try {
                // Message-ID: <gX009.6306$Ky3.363117@newsread2.prod.itd.earthlink.net> From: "Billy Ng"
                DOMSource source = new DOMSource(node);
                TransformerFactory tfFactory = TransformerFactory.newInstance();
                Transformer transformer = tfFactory.newTransformer();
                StringWriter sw = new StringWriter();
                StreamResult result = new StreamResult(sw);
                transformer.transform(source, result);
                return sw.toString();
            } catch (TransformerConfigurationException e) {
                e.printStackTrace();
            } catch (TransformerException e) {
                e.printStackTrace();
            }
            return "";
        }

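        /**
         * Removes all nodes matched by the XPath expression from the tree.
         * The matches are collected into a set first so that removing them
         * does not interfere with the node iterator.
         *
         * @param parent element the XPath expression is evaluated against
         * @param xpath  XPath expression selecting the nodes to remove
         */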
        public static void removeChildren(Element parent, String xpath) {
            try {
                NodeIterator itr = XPathAPI.selectNodeIterator(parent, xpath);
                Node node = itr.nextNode();
                Set set = new HashSet();
                while (node != null) {
                    set.add(node);
                    node = itr.nextNode();
                }
                Iterator itr2 = set.iterator();
                while (itr2.hasNext()) {
                    node = (Node) itr2.next();
                    node.getParentNode().removeChild(node);
                }
            } catch (TransformerException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Utility class for network access.
     */
    public static final class NetUtil {
        private static final int BUFFER_SIZE = 10240;

        /** Hide constructor */
        private NetUtil() {
        }

        /**
         * Returns a BufferedReader reading from the given URL.
         *
         * @param urlString URL
         * @return BufferedReader
         * @throws IOException on an I/O error
         */
        public static BufferedReader fetchDocumentAsBufferedReader(String urlString) throws IOException {
            URL url = new URL(urlString);
            InputStream is = url.openStream();
            return new BufferedReader(new InputStreamReader(is));
        }

        /**
         * Downloads a document from the network and
         * stores it in a local temporary file.
         *
         * @param urlString URL
         * @return name of the local file
         * @throws IOException on an error
         */
        public static String fetchDocumentAsFile(String urlString) throws IOException {
            byte[] temp = new byte[BUFFER_SIZE + 1];
            URL url = new URL(urlString);
            BufferedInputStream is = new BufferedInputStream(url.openStream());
            File file = File.createTempFile("dump", ".html");
            file.deleteOnExit();
            String tempFile = file.getAbsolutePath();
            BufferedOutputStream os = new BufferedOutputStream(new FileOutputStream(tempFile));
            while (true) {
                int aval = is.available();
                if (aval == 0) {
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                int count = is.read(temp, 0, BUFFER_SIZE);
                if (count == -1) {
                    break;
                }
                os.write(temp, 0, count);
            }
            is.close();
            os.close();
            return tempFile;
        }

    }

    /**
     * main entry point
     *
     * @param args command line arguments
     * @throws Exception if something unexpected happened
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.err.println("Aufruf: nhb.wikipedia.XHTMLDumper title-of-link-list");
            System.err.println("    title-of-link-list is the title of the page containing a list of links.");
            System.exit(1);
        }
        XHTMLDumper xd = new XHTMLDumper();
        Book book = xd.new Book(args[0]);
        book.fetchBook();

        /*article.process();
        System.out.println(article.getXML());*/

        System.out.println("Fertig.");
    }
}

//