/*
 * Decompiled with CFR 0.152.
 */
package org.openimaj.web.readability;

import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.EnumSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.cyberneko.html.parsers.DOMParser;
import org.openimaj.web.readability.Anchor;
import org.pojava.datetime.DateTime;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.bootstrap.DOMImplementationRegistry;
import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSSerializer;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.TreeWalker;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class Readability {
    /** Link-density cutoff: {@code cleanHeaders} removes headers whose link density exceeds this value. */
    public static float LINK_DENSITY_THRESHOLD = 0.33f;
    /** The DOM document being processed; the methods of this class modify it in place. */
    protected Document document;
    /** Deep clone of the body element taken by {@code init()} and used there when retrying extraction. */
    private Node bodyCache;
    /** Active heuristics. Starts with every {@code Flag}; {@code init()} removes flags when it retries. */
    protected EnumSet<Flag> flags = EnumSet.allOf(Flag.class);
    /** Title computed by {@code findArticleTitle()} during {@code init()}. */
    protected String articleTitle;
    /** Element returned by {@code grabArticle()}, or {@code null} when too little text was extracted. */
    protected Element articleContent;
    /** Raw date text located by {@code findArticleDate()}; may be {@code null}. */
    protected String article_date_string;
    /** Parsed form of {@link #article_date_string}; stays {@code null} when nothing was found or parsing failed. */
    protected Date article_date;
    /** The {@code content} attribute of the Content-Type {@code meta} element, if one was found. */
    protected String article_contentType;
    /** When {@code true}, {@code dbg(String)} writes its messages to {@code System.err}. */
    protected boolean debug = false;
    /** When {@code true}, {@code init()} prepends an {@code h1} holding the article title to the content. */
    protected boolean addTitle = false;

    /**
     * Processes {@code document} with debug output and title insertion both disabled.
     *
     * @param document the DOM document to process; it is modified in place
     */
    public Readability(Document document) {
        this(document, false, false);
    }

    /**
     * Processes {@code document} without inserting a title element.
     *
     * @param document the DOM document to process; it is modified in place
     * @param debug    whether diagnostic messages are written to {@code System.err}
     */
    public Readability(Document document, boolean debug) {
        this(document, debug, false);
    }

    /**
     * Stores the configuration, gives every element an id and immediately runs
     * the extraction via {@code init()}.
     *
     * @param document the DOM document to process; it is modified in place
     * @param debug    whether diagnostic messages are written to {@code System.err}
     * @param addTitle whether an {@code h1} with the article title is prepended to the content
     */
    public Readability(Document document, boolean debug, boolean addTitle) {
        this.document = document;
        this.debug = debug;
        this.addTitle = addTitle;
        augmentDocument(document);
        init();
    }

    /**
     * Ensures every element of {@code document} has a non-empty {@code id}
     * attribute, generating {@code gen-id-N} values where one is missing.
     *
     * @param document the document to augment; must also implement {@link DocumentTraversal}
     */
    public static void augmentDocument(Document document) {
        // whatToShow == 1 is NodeFilter.SHOW_ELEMENT: the walker only visits elements.
        TreeWalker elementWalker =
                ((DocumentTraversal)document).createTreeWalker(document, 1, null, true);
        traverseLevel(elementWalker, 0);
    }

    /**
     * Depth-first visit of the walker's current node and its descendants,
     * assigning {@code gen-id-<counter>} to each element that has no id.
     *
     * @param walker  walker positioned on the node to start from; its position is restored on return
     * @param counter next number to use for a generated id
     * @return the next unused counter value
     */
    private static int traverseLevel(TreeWalker walker, int counter) {
        Node current = walker.getCurrentNode();
        if (current instanceof Element) {
            Element element = (Element)current;
            if (element.getAttribute("id").length() == 0) {
                element.setAttribute("id", "gen-id-" + counter);
                counter++;
            }
        }
        for (Node child = walker.firstChild(); child != null; child = walker.nextSibling()) {
            counter = traverseLevel(walker, counter);
        }
        // Step back up so the caller's nextSibling() continues from this node.
        walker.setCurrentNode(current);
        return counter;
    }

    /**
     * Writes a diagnostic message to {@code System.err} when debugging is enabled.
     *
     * @param s the message to print
     */
    protected void dbg(String s) {
        if (!debug) {
            return;
        }
        System.err.println(s);
    }

    /**
     * Reads the text of the document's first {@code title} element.
     *
     * @return the title text, or the empty string when there is no {@code title} element
     */
    protected String getTitle() {
        NodeList titles = document.getElementsByTagName("title");
        return titles.getLength() == 0 ? "" : titles.item(0).getTextContent();
    }

    /**
     * Collects every non-overlapping match of {@code regex} in {@code input}.
     *
     * @param input the text to scan
     * @param regex the regular expression to look for
     * @return the matched substrings in order of appearance; empty when nothing matches
     */
    protected String[] match(String input, String regex) {
        List<String> found = new ArrayList<String>();
        for (Matcher m = Pattern.compile(regex).matcher(input); m.find(); ) {
            found.add(m.group());
        }
        return found.toArray(new String[0]);
    }

    /**
     * Reports whether extraction produced an article element.
     *
     * @return {@code true} when {@code articleContent} is not {@code null}
     */
    public boolean hasContent() {
        return articleContent != null;
    }

    /**
     * Finds the first occurrence of {@code regex} in {@code input}.
     *
     * @param input the text to scan
     * @param regex the regular expression to look for
     * @return the start index of the first match, or {@code -1} when there is none
     */
    protected int search(String input, String regex) {
        Matcher m = Pattern.compile(regex).matcher(input);
        return m.find() ? m.start() : -1;
    }

    /**
     * Stores the {@code content} attribute of the first
     * {@code <meta http-equiv="Content-Type">} element in {@code article_contentType}.
     * The {@code http-equiv} value is matched case-insensitively, so the common
     * lower-case spelling {@code content-type} is recognised too. The field is
     * left untouched when no such element exists.
     */
    protected void findArticleEncoding() {
        NodeList metas = this.document.getElementsByTagName("meta");
        for (int j = 0; j < metas.getLength(); ++j) {
            Element meta = (Element)metas.item(j);
            if (!meta.getAttribute("http-equiv").equalsIgnoreCase("Content-Type")) {
                continue;
            }
            this.article_contentType = meta.getAttribute("content");
            return;
        }
    }

    /**
     * Looks for the article's date and stores it in {@code article_date_string}
     * and {@code article_date}. The first strategy that finds something wins:
     * <ol>
     * <li>a {@code meta} element named {@code OriginalPublicationDate} or {@code DC.date.issued};</li>
     * <li>a {@code time} element carrying a {@code pubdate} attribute;</li>
     * <li>an element whose {@code class} mentions "date"/"Date" but not "update"/"Update";</li>
     * <li>the same test applied to the {@code id} attribute;</li>
     * <li>text of the form "Last updated at 10:30 AM on 1st August 2011".</li>
     * </ol>
     * Strategies 1 and 2 parse with {@code DateTime.parse} and do not catch its
     * exceptions; strategies 3 and 4 delegate to {@code parseDate()}.
     */
    protected void findArticleDate() {
        NodeList nl = this.document.getElementsByTagName("meta");
        for (int j = 0; j < nl.getLength(); ++j) {
            Element meta = (Element)nl.item(j);
            String name = meta.getAttribute("name");
            if (!name.equals("OriginalPublicationDate") && !name.equals("DC.date.issued")) {
                continue;
            }
            this.article_date_string = meta.getAttribute("content");
            this.article_date = DateTime.parse(this.article_date_string).toDate();
            return;
        }

        nl = this.document.getElementsByTagName("time");
        for (int j = 0; j < nl.getLength(); ++j) {
            Element time = (Element)nl.item(j);
            if (time.getAttributeNode("pubdate") == null) {
                continue;
            }
            this.article_date_string = time.getAttribute("datetime");
            this.article_date = DateTime.parse(this.article_date_string).toDate();
            return;
        }

        // All elements are scanned by class first; ids are only consulted when no class matched.
        nl = this.document.getElementsByTagName("*");
        String[] markerAttributes = {"class", "id"};
        for (String attribute : markerAttributes) {
            for (int j = 0; j < nl.getLength(); ++j) {
                String value = ((Element)nl.item(j)).getAttribute(attribute);
                boolean mentionsDate = value.contains("date") || value.contains("Date");
                // "update" itself contains "date", hence the explicit exclusion.
                boolean mentionsUpdate = value.contains("update") || value.contains("Update");
                if (!mentionsDate || mentionsUpdate) {
                    continue;
                }
                this.article_date_string = this.getInnerTextSep(nl.item(j)).trim();
                this.parseDate();
                return;
            }
        }

        Pattern lastUpdated = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)");
        for (int j = 0; j < nl.getLength(); ++j) {
            String text = nl.item(j).getTextContent();
            if (text == null) {
                continue;
            }
            Matcher m = lastUpdated.matcher(text);
            if (!m.find()) {
                continue;
            }
            this.article_date_string = m.group(1);
            // Strip the ordinal suffix only where it follows the day number; a blanket
            // removal of "st" would also turn "August" into "Augu".
            String cleaned = this.article_date_string.replaceAll("(\\d+)(?:st|nd|rd|th)\\b", "$1");
            // The matched text is English (AM/PM, English month names), so parse it as such
            // regardless of the platform's default locale.
            SimpleDateFormat sdf = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy", java.util.Locale.ENGLISH);
            try {
                this.article_date = sdf.parse(cleaned);
            }
            catch (ParseException ignored) {
                // Best effort: keep the raw string, leave article_date as it was.
                this.dbg("Could not parse article date: " + cleaned);
            }
            return;
        }
    }

    /**
     * Parses {@code article_date_string} into {@code article_date}.
     * <p>
     * Strings containing "Today" are read as {@code 'Today @' HH:mm z} and combined
     * with the current year, month and day. Everything else is tried against a
     * list of fixed patterns and finally handed to {@code DateTime.parse}.
     * {@code article_date} is left unchanged when the string is null, blank or
     * cannot be parsed.
     */
    protected void parseDate() {
        if (this.article_date_string == null || this.article_date_string.trim().isEmpty()) {
            return;
        }

        if (this.article_date_string.contains("Today")) {
            try {
                Date timeOfDay = new SimpleDateFormat("'Today @' HH:mm z").parse(this.article_date_string);
                java.util.Calendar parsed = java.util.Calendar.getInstance();
                parsed.setTime(timeOfDay);
                java.util.Calendar now = java.util.Calendar.getInstance();
                // Set year, month and day in one step. Setting them one at a time on the
                // parsed 1970 date rolls over on days such as 29 February.
                parsed.set(now.get(java.util.Calendar.YEAR),
                        now.get(java.util.Calendar.MONTH),
                        now.get(java.util.Calendar.DAY_OF_MONTH));
                this.article_date = parsed.getTime();
            }
            catch (ParseException ignored) {
                // Not in the expected "Today @ HH:mm z" form; leave article_date unchanged.
            }
            return;
        }

        String[] patterns = {
            "h:mm z',' E',' dd M yyyy",
            "dd.MM.yyyy '@' HH:mm z",
            "dd/MM/yyyy"
        };
        for (String pattern : patterns) {
            try {
                this.article_date = new SimpleDateFormat(pattern).parse(this.article_date_string);
                return;
            }
            catch (ParseException ignored) {
                // Fall through to the next pattern.
            }
        }

        // Last resort: the general-purpose parser.
        try {
            this.article_date = DateTime.parse(this.article_date_string).toDate();
        }
        catch (IllegalArgumentException ignored) {
            // Unparseable text; leave article_date unchanged.
        }
        catch (ArrayIndexOutOfBoundsException e) {
            this.dbg("Could not parse article date: " + this.article_date_string);
        }
    }

    /**
     * Derives the article title from the document title and headings.
     * <p>
     * The longest {@code h1}-{@code h6} text that is contained in the document
     * title wins outright. Otherwise the document title is cut at a title
     * separator or at a colon, or replaced by the sole {@code h1} when its length
     * is above 150 or below 15 characters. A result of three words or fewer falls
     * back to the unmodified document title.
     *
     * @return the chosen title
     */
    protected String findArticleTitle() {
        final String origTitle = this.getTitle();
        String curTitle = origTitle;

        List<String> headings = new ArrayList<String>();
        for (int level = 1; level <= 6; ++level) {
            NodeList found = this.document.getElementsByTagName("h" + level);
            for (int k = 0; k < found.getLength(); ++k) {
                headings.add(found.item(k).getTextContent().trim());
            }
        }

        String best = null;
        int bestLength = 0;
        for (String heading : headings) {
            if (heading.length() > bestLength && curTitle.contains(heading)) {
                best = heading;
                bestLength = heading.length();
            }
        }
        if (best != null) {
            return best;
        }

        if (this.match(curTitle, " [" + Regexps.titleSeparatorRe + "]+ ").length > 0) {
            // Keep what precedes the last separator; if that is too short, keep what follows the first.
            curTitle = origTitle.replaceAll("(.*) [" + Regexps.titleSeparatorRe + "]+ .*", "$1");
            if (curTitle.split(" ").length < 3) {
                curTitle = origTitle.replaceAll("(?i)[^" + Regexps.titleSeparatorRe + "]*[" + Regexps.titleSeparatorRe + "]+(.*)", "$1");
            }
        } else if (curTitle.indexOf(": ") != -1) {
            // Keep what follows the last colon; if that is too short, what follows the first.
            curTitle = origTitle.replaceAll("(?i).*:(.*)", "$1");
            if (curTitle.split(" ").length < 3) {
                curTitle = origTitle.replaceAll("(?i)[^:]*[:](.*)", "$1");
            }
        } else if (curTitle.length() > 150 || curTitle.length() < 15) {
            NodeList hOnes = this.document.getElementsByTagName("h1");
            if (hOnes.getLength() == 1) {
                curTitle = this.getInnerText((Element)hOnes.item(0));
            }
        }

        curTitle = curTitle.replaceAll(Regexps.trimRe, "");
        if (curTitle.split(" ").length <= 3) {
            curTitle = origTitle;
        }
        return curTitle;
    }

    /**
     * Looks up the document's first {@code body} element.
     *
     * @return the body element, or {@code null} when the document has none
     */
    protected Element getBody() {
        NodeList bodies = document.getElementsByTagName("body");
        return bodies.getLength() == 0 ? null : (Element)bodies.item(0);
    }

    /**
     * Runs the extraction pipeline: date, encoding, document preparation, title
     * and article content.
     * <p>
     * When the extracted text is shorter than 250 characters the body is restored
     * from the cached copy and the pipeline is re-run with one heuristic fewer:
     * first without {@code FLAG_STRIP_UNLIKELYS}, then without
     * {@code FLAG_WEIGHT_CLASSES}. If both retries are used up,
     * {@code articleContent} is set to {@code null}.
     */
    protected void init() {
        if (this.getBody() != null && this.bodyCache == null) {
            this.bodyCache = this.getBody().cloneNode(true);
        }
        this.findArticleDate();
        this.findArticleEncoding();
        this.prepDocument();
        this.articleTitle = this.findArticleTitle();
        this.articleContent = this.grabArticle();

        if (this.getInnerText(this.articleContent, false).length() < 250) {
            Flag relaxed = null;
            if (this.flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) {
                relaxed = Flag.FLAG_STRIP_UNLIKELYS;
            } else if (this.flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
                relaxed = Flag.FLAG_WEIGHT_CLASSES;
            }
            if (relaxed != null) {
                this.flags.remove(relaxed);
                Element body = this.getBody();
                if (this.bodyCache != null && body != null) {
                    // Insert a copy, not the cache itself: otherwise the cache becomes the
                    // live body, gets mutated by this attempt, and the next retry would no
                    // longer start from the original markup.
                    body.getParentNode().replaceChild(this.bodyCache.cloneNode(true), body);
                }
                this.init();
                return;
            }
            this.articleContent = null;
        }

        if (this.addTitle && this.articleContent != null) {
            Element titleNode = this.document.createElement("h1");
            titleNode.setAttribute("id", "title");
            titleNode.appendChild(this.document.createTextNode(this.getArticleTitle()));
            this.articleContent.insertBefore(titleNode, this.articleContent.getFirstChild());
        }
    }

    /**
     * Prepares the document for scoring: guarantees a {@code body} element,
     * removes all {@code script}, {@code style} and {@code meta} elements,
     * rewrites the body markup (break runs become paragraph boundaries, font
     * tags become spans) and strips comments.
     */
    protected void prepDocument() {
        if (this.getBody() == null) {
            Element created = this.document.createElement("body");
            Element root = this.document.getDocumentElement();
            if (root != null) {
                // A Document may hold only one element child, so the new body has to go
                // under the existing root rather than next to it.
                root.appendChild(created);
            } else {
                this.document.appendChild(created);
            }
        }

        String[] strippedTags = {"script", "style", "meta"};
        for (String tag : strippedTags) {
            NodeList doomed = this.document.getElementsByTagName(tag);
            // Walk backwards: the list is live and shrinks as nodes are removed.
            for (int i = doomed.getLength() - 1; i >= 0; --i) {
                Node victim = doomed.item(i);
                victim.getParentNode().removeChild(victim);
            }
        }

        Element body = this.getBody();
        Node frag = this.stringToNode(this.getInnerHTML(body).replaceAll(Regexps.replaceBrsRe, "</P><P>").replaceAll(Regexps.replaceFontsRe, "<$1span>"));
        this.removeChildren(body);
        body.appendChild(frag);
        this.removeComments(this.document);
    }

    /**
     * Removes every comment node found at or below {@code n}.
     *
     * @param n root of the subtree to clean; a comment node passed here is detached from its parent
     */
    protected void removeComments(Node n) {
        if (n.getNodeType() == Node.COMMENT_NODE) {
            n.getParentNode().removeChild(n);
            return;
        }
        NodeList children = n.getChildNodes();
        // Iterate from the end because the child list is live.
        int index = children.getLength();
        while (--index >= 0) {
            removeComments(children.item(index));
        }
    }

    /**
     * Cleans up the extracted article element: strips inline styles, collapses
     * breaks, removes forms, objects, iframes and stray headers, conditionally
     * removes tables, lists and divs, drops empty paragraphs and normalises the
     * remaining ones.
     *
     * @param articleContent the article element to clean in place
     */
    protected void prepArticle(Element articleContent) {
        this.cleanStyles(articleContent);
        this.killBreaks(articleContent);
        this.clean(articleContent, "form");
        this.clean(articleContent, "object");
        this.clean(articleContent, "h1");
        // A single h2 is removed along with the h1s.
        if (articleContent.getElementsByTagName("h2").getLength() == 1) {
            this.clean(articleContent, "h2");
        }
        this.clean(articleContent, "iframe");
        this.cleanHeaders(articleContent);
        this.cleanConditionally(articleContent, "table");
        this.cleanConditionally(articleContent, "ul");
        this.cleanConditionally(articleContent, "div");

        // Remove paragraphs that carry neither media nor text.
        NodeList articleParagraphs = articleContent.getElementsByTagName("p");
        for (int i = articleParagraphs.getLength() - 1; i >= 0; --i) {
            Element paragraph = (Element)articleParagraphs.item(i);
            int imgCount = paragraph.getElementsByTagName("img").getLength();
            int embedCount = paragraph.getElementsByTagName("embed").getLength();
            int objectCount = paragraph.getElementsByTagName("object").getLength();
            // Compare by content: a reference comparison against "" is practically never
            // equal for computed text, so the removal below would never run.
            boolean hasText = this.getInnerText(paragraph, false).length() != 0;
            if (imgCount != 0 || embedCount != 0 || objectCount != 0 || hasText) {
                continue;
            }
            paragraph.getParentNode().removeChild(paragraph);
        }

        Node n = this.stringToNode(this.getInnerHTML(articleContent).replaceAll("(?i)<br[^>]*>\\s*<p", "<P"));
        this.removeChildren(articleContent);
        articleContent.appendChild(n);

        NodeList nl = articleContent.getElementsByTagName("p");
        for (int i = nl.getLength() - 1; i >= 0; --i) {
            Node paragraph = nl.item(i);
            if (paragraph.getTextContent().trim().length() == 0) {
                paragraph.getParentNode().removeChild(paragraph);
                continue;
            }
            if (paragraph.getChildNodes().getLength() == 1 && paragraph.getChildNodes().item(0).getNodeType() == Node.TEXT_NODE) {
                // Text-only paragraph: trim it and surround it with newlines.
                paragraph.setTextContent("\n" + paragraph.getTextContent().trim() + "\n");
                continue;
            }
            if (!((Element)paragraph).getAttribute("class").equals("readability-styled")) {
                continue;
            }
            // Paragraphs marked "readability-styled" are unwrapped back into plain text.
            paragraph.getParentNode().replaceChild(this.document.createTextNode(paragraph.getTextContent()), paragraph);
        }
    }

    /**
     * Detaches all children of {@code n}, leaving it empty.
     *
     * @param n the node to empty
     */
    protected void removeChildren(Node n) {
        while (n.hasChildNodes()) {
            n.removeChild(n.getFirstChild());
        }
    }

    /**
     * Gives {@code node} its starting score, stored as a {@code Float} under the
     * user-data key {@code "readability"}: a tag-dependent base value plus the
     * class weight of the node.
     *
     * @param node the element to initialise
     */
    protected void initializeNode(Element node) {
        // Tag names are compared by content; identity comparison only works when the
        // DOM implementation happens to intern its names.
        String tag = node.getTagName();
        float contentScore = 0.0f;
        if ("DIV".equals(tag)) {
            contentScore += 5.0f;
        } else if (java.util.Arrays.asList("PRE", "TD", "BLOCKQUOTE").contains(tag)) {
            contentScore += 3.0f;
        } else if (java.util.Arrays.asList("ADDRESS", "OL", "UL", "DL", "DD", "DT", "LI", "FORM").contains(tag)) {
            contentScore -= 3.0f;
        } else if (java.util.Arrays.asList("H1", "H2", "H3", "H4", "H5", "H6", "TH").contains(tag)) {
            contentScore -= 5.0f;
        }
        contentScore += (float)this.getClassWeight(node);
        node.setUserData("readability", Float.valueOf(contentScore), null);
    }

    /**
     * Scores an element by its {@code class} and {@code id} attributes: each
     * attribute contributes -25 when it matches {@code Regexps.negativeRe} and
     * +25 when it matches {@code Regexps.positiveRe}.
     *
     * @param e the element to score
     * @return the combined weight, or 0 when {@code FLAG_WEIGHT_CLASSES} is not active
     */
    protected int getClassWeight(Element e) {
        if (!this.flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
            return 0;
        }
        int weight = 0;
        String[] attributes = {"class", "id"};
        for (String attribute : attributes) {
            String value = e.getAttribute(attribute);
            // Emptiness is tested by content rather than by identity with "".
            if (value == null || value.length() == 0) {
                continue;
            }
            if (this.search(value, Regexps.negativeRe) != -1) {
                weight -= 25;
            }
            if (this.search(value, Regexps.positiveRe) != -1) {
                weight += 25;
            }
        }
        return weight;
    }

    /**
     * Removes inline {@code style} attributes from the whole document, starting
     * at its root element. A {@code Document} is not an {@code Element}, so the
     * root element is passed instead of casting the document itself; a document
     * without a root element is a no-op.
     */
    protected void cleanStyles() {
        this.cleanStyles(this.document.getDocumentElement());
    }

    /**
     * Recursively removes the {@code style} attribute from {@code e} and all of
     * its descendant elements, skipping any element whose {@code class} is
     * exactly {@code readability-styled}.
     *
     * @param e root of the subtree to clean; {@code null} is ignored
     */
    protected void cleanStyles(Element e) {
        if (e == null) {
            return;
        }
        if (!e.getAttribute("class").equals("readability-styled")) {
            e.removeAttribute("style");
        }
        for (Node child = e.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Element childElement = (Element)child;
            if (!childElement.getAttribute("class").equals("readability-styled")) {
                childElement.removeAttribute("style");
            }
            cleanStyles(childElement);
        }
    }

    /**
     * Rewrites the content of {@code e} so that every match of
     * {@code Regexps.killBreaksRe} in its markup becomes a single {@code <BR />}.
     *
     * @param e the element whose children are replaced by the rewritten markup
     */
    protected void killBreaks(Element e) {
        String markup = this.getInnerHTML(e);
        String collapsed = markup.replaceAll(Regexps.killBreaksRe, "<BR />");
        Node rebuilt = this.stringToNode(collapsed);
        this.removeChildren(e);
        e.appendChild(rebuilt);
    }

    /**
     * Removes every descendant of {@code e} with the given tag name. For
     * {@code object} and {@code embed} tags, elements whose attribute values or
     * inner markup match {@code Regexps.videoRe} are kept.
     *
     * @param e   the element to clean
     * @param tag the tag name to remove
     */
    protected void clean(Element e, String tag) {
        NodeList targets = e.getElementsByTagName(tag);
        boolean keepVideos = tag.equals("object") || tag.equals("embed");
        // Backwards, because the list is live and shrinks on removal.
        for (int y = targets.getLength() - 1; y >= 0; --y) {
            Node target = targets.item(y);
            if (keepVideos) {
                StringBuilder joined = new StringBuilder();
                int attributeCount = target.getAttributes().getLength();
                for (int a = 0; a < attributeCount; ++a) {
                    joined.append(target.getAttributes().item(a).getNodeValue()).append("|");
                }
                boolean isVideo = this.search(joined.toString(), Regexps.videoRe) != -1
                        || this.search(this.getInnerHTML(target), Regexps.videoRe) != -1;
                if (isVideo) {
                    continue;
                }
            }
            target.getParentNode().removeChild(target);
        }
    }

    /**
     * Removes {@code h1}-{@code h6} descendants of {@code e} that have a negative
     * class weight or a link density above {@link #LINK_DENSITY_THRESHOLD}.
     *
     * @param e the element to clean
     */
    protected void cleanHeaders(Element e) {
        for (int level = 1; level <= 6; ++level) {
            NodeList headers = e.getElementsByTagName("h" + level);
            for (int i = headers.getLength() - 1; i >= 0; --i) {
                Element header = (Element)headers.item(i);
                boolean unwanted = this.getClassWeight(header) < 0
                        || this.getLinkDensity(header) > LINK_DENSITY_THRESHOLD;
                if (unwanted) {
                    header.getParentNode().removeChild(header);
                }
            }
        }
    }

    /**
     * Computes the share of an element's text that sits inside {@code a} elements.
     *
     * @param e the element to measure
     * @return length of the link text divided by the length of all text, or 0 when
     *         there is no link text
     */
    protected float getLinkDensity(Element e) {
        NodeList anchors = e.getElementsByTagName("a");
        int totalLength = this.getInnerText(e).length();
        int anchorLength = 0;
        int anchorCount = anchors.getLength();
        for (int k = 0; k < anchorCount; ++k) {
            anchorLength += this.getInnerText((Element)anchors.item(k)).length();
        }
        return anchorLength == 0 ? 0.0f : (float)anchorLength / (float)totalLength;
    }

    /**
     * Removes descendants of {@code e} with the given tag name that look like
     * non-content.
     * <p>
     * An element is removed outright when its class weight plus stored score is
     * negative. Elements with fewer than 10 commas are then checked against a set
     * of heuristics (image, list-item, input and embed counts, text length and
     * link density). An element whose only content is a single image with
     * non-zero dimensions is always kept.
     *
     * @param e   the element to clean
     * @param tag the tag name of the candidates to inspect
     */
    protected void cleanConditionally(Element e, String tag) {
        NodeList tagsList = e.getElementsByTagName(tag);
        // Compared by content, so the check also holds for tag names that are not literals.
        boolean isList = "ul".equals(tag) || "ol".equals(tag);
        // Backwards, because the list is live and shrinks on removal.
        for (int i = tagsList.getLength() - 1; i >= 0; --i) {
            Element node = (Element)tagsList.item(i);
            int weight = this.getClassWeight(node);
            Object storedScore = node.getUserData("readability");
            float contentScore = storedScore != null ? ((Float)storedScore).floatValue() : 0.0f;
            String label = node + " (" + node.getAttribute("class") + ":" + node.getAttribute("id") + ")";
            this.dbg("Cleaning Conditionally " + label + (storedScore != null ? " with score " + storedScore : ""));

            if ((float)weight + contentScore < 0.0f) {
                this.dbg("Removing " + label);
                node.getParentNode().removeChild(node);
                continue;
            }
            // Comma-rich elements are kept without further checks.
            if (this.getCharCount(node, ",") >= 10) {
                continue;
            }

            int p = node.getElementsByTagName("p").getLength();
            int img = node.getElementsByTagName("img").getLength();
            // Offset by 100, so list items only outnumber paragraphs in very long lists.
            int li = node.getElementsByTagName("li").getLength() - 100;
            int input = node.getElementsByTagName("input").getLength();

            // Count embeds whose src does not match the video pattern.
            int embedCount = 0;
            NodeList embeds = node.getElementsByTagName("embed");
            int embedTotal = embeds.getLength();
            for (int ei = 0; ei < embedTotal; ++ei) {
                if (this.search(((Element)embeds.item(ei)).getAttribute("src"), Regexps.videoRe) == -1) {
                    ++embedCount;
                }
            }

            float linkDensity = this.getLinkDensity(node);
            int contentLength = this.getInnerText(node).length();

            boolean toRemove = img > p
                    || (li > p && !isList)
                    // p is never negative, so integer division equals floor(p / 3).
                    || input > p / 3
                    || (contentLength < 25 && (img == 0 || img > 2))
                    || (weight < 25 && (double)linkDensity > 0.2)
                    || (weight >= 25 && (double)linkDensity > 0.5)
                    || (embedCount == 1 && contentLength < 75)
                    || embedCount > 1;

            if (img == 1 && p == 0 && contentLength == 0) {
                // A lone image is kept unless it is declared with a zero width or height.
                Element theImg = (Element)node.getElementsByTagName("img").item(0);
                String w = theImg.getAttribute("width");
                if (w == null) {
                    w = "";
                }
                String h = theImg.getAttribute("height");
                if (h == null) {
                    h = "";
                }
                if (!w.equals("0") && !h.equals("0")) {
                    toRemove = false;
                }
            }

            if (!toRemove) {
                continue;
            }
            this.dbg("Removing " + label);
            node.getParentNode().removeChild(node);
        }
    }

    /**
     * Counts how often the separator {@code s} occurs in the inner text of {@code e}.
     * <p>
     * The text is split with a negative limit so that trailing empty pieces are
     * kept: separators at the end of the text are counted, and a text consisting
     * only of separators no longer yields -1.
     *
     * @param e the element whose text is inspected
     * @param s the separator, interpreted as a regular expression by {@code String.split}
     * @return the number of occurrences of {@code s}
     */
    protected int getCharCount(Element e, String s) {
        return this.getInnerText(e).split(s, -1).length - 1;
    }

    /**
     * Counts the commas in the inner text of {@code e}.
     *
     * @param e the element whose text is inspected
     * @return the result of {@code getCharCount(e, ",")}
     */
    protected int getCharCount(Element e) {
        final String comma = ",";
        return getCharCount(e, comma);
    }

    /**
     * Returns the title determined during extraction.
     *
     * @return the article title
     */
    public String getArticleTitle() {
        return articleTitle;
    }

    /**
     * Returns the content type read from the document's Content-Type {@code meta} element.
     *
     * @return the stored content-type string; {@code null} if none was found
     */
    public String getArticleContentType() {
        return article_contentType;
    }

    protected Element grabArticle() {
        boolean stripUnlikelyCandidates = this.flags.contains((Object)Flag.FLAG_STRIP_UNLIKELYS);
        Element node = null;
        ArrayList<Element> nodesToScore = new ArrayList<Element>();
        int nodeIndex = 0;
        while ((node = (Element)this.document.getElementsByTagName("*").item(nodeIndex)) != null) {
            String unlikelyMatchString;
            if (stripUnlikelyCandidates && this.search(unlikelyMatchString = node.getAttribute("class") + node.getAttribute("id"), Regexps.unlikelyCandidatesRe) != -1 && this.search(unlikelyMatchString, Regexps.okMaybeItsACandidateRe) == -1 && !node.getTagName().equals("BODY")) {
                this.dbg("Removing unlikely candidate - " + unlikelyMatchString);
                node.getParentNode().removeChild(node);
                --nodeIndex;
            } else {
                if (node.getTagName().equals("P") || node.getTagName().equals("TD")) {
                    nodesToScore.add(node);
                }
                if (node.getTagName().equals("DIV")) {
                    if (this.search(this.getInnerHTML(node), Regexps.divToPElementsRe) == -1) {
                        this.dbg("Altering div to p");
                        Element newNode = this.document.createElement("P");
                        NodeList nl = node.getChildNodes();
                        for (int i = 0; i < nl.getLength(); ++i) {
                            newNode.appendChild(nl.item(i));
                        }
                        node.getParentNode().replaceChild(newNode, node);
                        --nodeIndex;
                    } else {
                        int il = node.getChildNodes().getLength();
                        for (int i = 0; i < il; ++i) {
                            Node childNode = node.getChildNodes().item(i);
                            if (childNode.getNodeType() != 3) continue;
                            this.dbg("replacing text node with a p tag with the same content.");
                            Element p = this.document.createElement("p");
                            p.setNodeValue(childNode.getNodeValue());
                            p.setTextContent(childNode.getTextContent());
                            p.setAttribute("class", "readability-styled");
                            childNode.getParentNode().replaceChild(p, childNode);
                        }
                    }
                }
            }
            ++nodeIndex;
        }
        ArrayList<Element> candidates = new ArrayList<Element>();
        for (int pt = 0; pt < nodesToScore.size(); ++pt) {
            Element parentNode = (Element)((Element)nodesToScore.get(pt)).getParentNode();
            Element grandParentNode = (Element)parentNode.getParentNode();
            String innerText = this.getInnerText((Element)nodesToScore.get(pt));
            if (innerText.length() < 25) continue;
            if (parentNode.getUserData("readability") == null) {
                this.initializeNode(parentNode);
                candidates.add(parentNode);
            }
            if (grandParentNode.getUserData("readability") == null) {
                this.initializeNode(grandParentNode);
                candidates.add(grandParentNode);
            }
            float contentScore = 0.0f;
            contentScore += 1.0f;
            contentScore += (float)innerText.split(",").length;
            contentScore = (float)((double)contentScore + Math.min(Math.floor((float)innerText.length() / 100.0f), 3.0));
            parentNode.setUserData("readability", Float.valueOf(((Float)parentNode.getUserData("readability")).floatValue() + contentScore), null);
            grandParentNode.setUserData("readability", Float.valueOf(((Float)grandParentNode.getUserData("readability")).floatValue() + contentScore / 2.0f), null);
        }
        Element topCandidate = null;
        int cl = candidates.size();
        for (int c = 0; c < cl; ++c) {
            ((Element)candidates.get(c)).setUserData("readability", Float.valueOf(((Float)((Element)candidates.get(c)).getUserData("readability")).floatValue() * (1.0f - this.getLinkDensity((Element)candidates.get(c)))), null);
            this.dbg("Candidate: " + candidates.get(c) + " (" + ((Element)candidates.get(c)).getAttribute("class") + ":" + ((Element)candidates.get(c)).getAttribute("id") + ") with score " + ((Element)candidates.get(c)).getUserData("readability"));
            if (topCandidate != null && !(((Float)((Element)candidates.get(c)).getUserData("readability")).floatValue() > ((Float)topCandidate.getUserData("readability")).floatValue())) continue;
            topCandidate = (Element)candidates.get(c);
        }
        if (topCandidate != null) {
            this.dbg("==> TOP Candidate: " + topCandidate + " (" + topCandidate.getAttribute("class") + ":" + topCandidate.getAttribute("id") + ") with score " + topCandidate.getUserData("readability"));
        }
        if (topCandidate == null || topCandidate.getTagName().equals("BODY")) {
            topCandidate = this.document.createElement("DIV");
            NodeList nl = this.getBody().getChildNodes();
            for (int i = 0; i < nl.getLength(); ++i) {
                topCandidate.appendChild(nl.item(i));
            }
            this.getBody().appendChild(topCandidate);
            this.initializeNode(topCandidate);
        }
        Element articleContent = this.document.createElement("DIV");
        articleContent.setAttribute("id", "readability-content");
        float siblingScoreThreshold = Math.max(10.0f, ((Float)topCandidate.getUserData("readability")).floatValue() * 0.2f);
        NodeList siblingNodes = topCandidate.getParentNode().getChildNodes();
        int sl = siblingNodes.getLength();
        for (int s = 0; s < sl; ++s) {
            Node siblingNode = siblingNodes.item(s);
            boolean append = false;
            if (siblingNode instanceof Element) {
                this.dbg("Looking at sibling node: " + siblingNode + " (" + ((Element)siblingNode).getAttribute("class") + ":" + ((Element)siblingNode).getAttribute("id") + ")" + (siblingNode.getUserData("readability") != null ? " with score " + siblingNode.getUserData("readability") : ""));
            }
            this.dbg("Sibling has score " + (siblingNode.getUserData("readability") != null ? siblingNode.getUserData("readability") : "Unknown"));
            if (siblingNode == topCandidate) {
                append = true;
            }
            float contentBonus = 0.0f;
            if (siblingNode instanceof Element && ((Element)siblingNode).getAttribute("class").equals(topCandidate.getAttribute("class")) && !topCandidate.getAttribute("class").equals("")) {
                contentBonus += ((Float)topCandidate.getUserData("readability")).floatValue() * 0.2f;
            }
            if (siblingNode.getUserData("readability") != null && ((Float)siblingNode.getUserData("readability")).floatValue() + contentBonus >= siblingScoreThreshold) {
                append = true;
            }
            if (siblingNode.getNodeName().equals("P")) {
                float linkDensity = this.getLinkDensity((Element)siblingNode);
                String nodeContent = this.getInnerText((Element)siblingNode);
                int nodeLength = nodeContent.length();
                if (nodeLength > 80 && (double)linkDensity < 0.25) {
                    append = true;
                } else if (nodeLength < 80 && linkDensity == 0.0f && this.search(nodeContent, "\\.( |$)") != -1) {
                    append = true;
                }
            }
            if (!append) continue;
            this.dbg("Appending node: " + siblingNode);
            Node nodeToAppend = null;
            if (!siblingNode.getNodeName().equals("DIV") && !siblingNode.getNodeName().equals("P")) {
                this.dbg("Altering siblingNode of " + siblingNode.getNodeName() + " to div.");
                nodeToAppend = this.document.createElement("div");
                if (siblingNode instanceof Element) {
                    ((Element)nodeToAppend).setAttribute("id", ((Element)siblingNode).getAttribute("id"));
                }
                NodeList nl = siblingNode.getChildNodes();
                for (int i = 0; i < nl.getLength(); ++i) {
                    nodeToAppend.appendChild(nl.item(i));
                }
            } else {
                nodeToAppend = siblingNode;
                --s;
                --sl;
            }
            if (nodeToAppend instanceof Element) {
                ((Element)nodeToAppend).setAttribute("class", "");
            }
            articleContent.appendChild(nodeToAppend);
        }
        this.prepArticle(articleContent);
        return articleContent;
    }

    /**
     * Builds a markup string for the children of a node.
     *
     * <p>A text node yields its own text content. For any other node each child
     * is rendered in order: text children contribute their text, comment children
     * are wrapped in {@code <!-- ... -->}, and every other child is serialised
     * through {@link #nodeToString(Node)}.
     *
     * @param n the node whose contents are rendered
     * @return the concatenated rendering of the node's children
     */
    protected String getInnerHTML(Node n) {
        if (n.getNodeType() == Node.TEXT_NODE) {
            return n.getTextContent();
        }
        StringBuilder html = new StringBuilder();
        NodeList kids = n.getChildNodes();
        for (int k = 0; k < kids.getLength(); ++k) {
            Node kid = kids.item(k);
            switch (kid.getNodeType()) {
                case Node.TEXT_NODE:
                    html.append(kid.getTextContent());
                    break;
                case Node.COMMENT_NODE:
                    html.append("<!-- ").append(kid.getTextContent()).append(" -->");
                    break;
                default:
                    html.append(this.nodeToString(kid));
                    break;
            }
        }
        return html.toString();
    }

    /**
     * Serialises a node to a string without pretty-printing.
     *
     * @param n the node to serialise
     * @return the result of {@code nodeToString(n, false)}
     */
    protected String nodeToString(Node n) {
        final boolean pretty = false;
        return Readability.nodeToString(n, pretty);
    }

    /**
     * Serialises a DOM node to a string using the DOM Load/Save serializer.
     *
     * <p>The XML declaration is always switched off; pretty-printing is enabled
     * only on request.
     *
     * @param n      the node to serialise
     * @param pretty whether to set the serializer's {@code format-pretty-print} parameter
     * @return the serialised form of {@code n}
     * @throws RuntimeException wrapping any exception raised while obtaining or
     *                          running the serializer
     */
    protected static String nodeToString(Node n, boolean pretty) {
        try {
            DOMImplementationLS loadSave =
                    (DOMImplementationLS) DOMImplementationRegistry.newInstance().getDOMImplementation("LS");
            LSSerializer serializer = loadSave.createLSSerializer();
            serializer.getDomConfig().setParameter("xml-declaration", Boolean.FALSE);
            if (pretty) {
                serializer.getDomConfig().setParameter("format-pretty-print", Boolean.TRUE);
            }
            return serializer.writeToString(n);
        }
        catch (Exception failure) {
            throw new RuntimeException(failure);
        }
    }

    /**
     * Parses a markup string into a document fragment owned by this
     * instance's document.
     *
     * @param str the markup to parse
     * @return the populated {@link DocumentFragment}
     * @throws RuntimeException wrapping any exception raised while creating the
     *                          fragment or parsing the string
     */
    protected Node stringToNode(String str) {
        try {
            DOMFragmentParser fragmentParser = new DOMFragmentParser();
            DocumentFragment target = this.document.createDocumentFragment();
            InputSource source = new InputSource(new StringReader(str));
            fragmentParser.parse(source, target);
            return target;
        }
        catch (Exception failure) {
            throw new RuntimeException(failure);
        }
    }

    /**
     * Returns the text content of an element with leading and trailing
     * whitespace removed.
     *
     * @param e               the element to read
     * @param normalizeSpaces when {@code true}, every run matched by
     *                        {@code Regexps.normalizeRe} is replaced by a single space
     * @return the trimmed (and optionally normalised) text
     */
    protected String getInnerText(Element e, boolean normalizeSpaces) {
        String trimmed = e.getTextContent().replaceAll(Regexps.trimRe, "");
        return normalizeSpaces ? trimmed.replaceAll(Regexps.normalizeRe, " ") : trimmed;
    }

    /**
     * Collects the text of a subtree, appending a single space after the text
     * of every childless node.
     *
     * <p>Children whose node name equals {@code script} (ignoring case) are
     * skipped together with everything below them.
     *
     * @param e the root of the subtree to read
     * @return the space-separated text of the subtree
     */
    protected String getInnerTextSep(Node e) {
        if (!e.hasChildNodes()) {
            return e.getTextContent() + " ";
        }
        StringBuilder joined = new StringBuilder();
        NodeList kids = e.getChildNodes();
        for (int k = 0; k < kids.getLength(); ++k) {
            Node kid = kids.item(k);
            if (!kid.getNodeName().equalsIgnoreCase("script")) {
                joined.append(this.getInnerTextSep(kid));
            }
        }
        return joined.toString();
    }

    /**
     * Returns the trimmed text of an element with whitespace normalisation
     * switched on.
     *
     * @param e the element to read
     * @return the result of {@code getInnerText(e, true)}
     */
    protected String getInnerText(Element e) {
        final boolean normalizeSpaces = true;
        return this.getInnerText(e, normalizeSpaces);
    }

    /**
     * Returns the extracted article as pretty-printed markup.
     *
     * @return the serialised article content, or the empty string when no
     *         article content is set
     */
    public String getArticleHTML() {
        return this.articleContent == null
                ? ""
                : Readability.nodeToString(this.articleContent, true);
    }

    /**
     * Returns the extracted article as a DOM node.
     *
     * @return the article content element; {@code null} when none is set
     */
    public Node getArticleHTML_DOM() {
        Node content = this.articleContent;
        return content;
    }

    /**
     * Returns the stored article date string.
     *
     * @return the current value of the {@code article_date_string} field
     */
    protected String getArticleDateString() {
        String dateText = this.article_date_string;
        return dateText;
    }

    /**
     * Returns the stored article date.
     *
     * @return the current value of the {@code article_date} field
     */
    public Date getArticleDate() {
        Date date = this.article_date;
        return date;
    }

    /**
     * Returns the extracted article as plain text.
     *
     * <p>The text content of the article element is trimmed, every run of two
     * or more line-break characters is collapsed to exactly one blank line, and
     * every run of two or more spaces is collapsed to a single space.
     *
     * @return the cleaned article text, or the literal message
     *         {@code "Unable to find article content"} when no article content is set
     */
    public String getArticleText() {
        if (this.articleContent == null) {
            return "Unable to find article content";
        }
        String raw = this.articleContent.getTextContent().trim();
        // Inside a character class '|' is a literal, not an alternation operator. The previous
        // pattern "[\r|\n|\r\n]{2,}" therefore also matched runs of pipe characters and turned
        // text such as "a||b" into "a\n\nb". Only CR and LF belong in the class.
        return raw.replaceAll("[\r\n]{2,}", "\n\n").replaceAll(" {2,}", " ");
    }

    /**
     * Lists the anchors found inside the extracted article.
     *
     * <p>Each {@code a} element below the article content becomes an
     * {@link Anchor} built from the element's inner text and its {@code href}
     * attribute.
     *
     * @return the anchors in document order; an empty list when no article
     *         content is set
     */
    public List<Anchor> getArticleLinks() {
        List<Anchor> links = new ArrayList<Anchor>();
        if (this.articleContent != null) {
            NodeList linkNodes = this.articleContent.getElementsByTagName("a");
            for (int k = 0; k < linkNodes.getLength(); ++k) {
                Element link = (Element) linkNodes.item(k);
                links.add(new Anchor(this.getInnerText(link), link.getAttribute("href")));
            }
        }
        return links;
    }

    /**
     * Lists every anchor found in the whole document.
     *
     * <p>Each {@code a} element of the document becomes an {@link Anchor} built
     * from the element's inner text and its {@code href} attribute.
     *
     * @return the anchors in document order
     */
    public List<Anchor> getAllLinks() {
        List<Anchor> links = new ArrayList<Anchor>();
        NodeList linkNodes = this.document.getElementsByTagName("a");
        int k = 0;
        while (k < linkNodes.getLength()) {
            Element link = (Element) linkNodes.item(k);
            String label = this.getInnerText(link);
            String target = link.getAttribute("href");
            links.add(new Anchor(label, target));
            ++k;
        }
        return links;
    }

    /**
     * Lists the {@code src} attribute of every {@code img} element inside the
     * extracted article.
     *
     * @return the image sources in document order; an empty list when no
     *         article content is set
     */
    public List<String> getArticleImages() {
        List<String> sources = new ArrayList<String>();
        if (this.articleContent != null) {
            NodeList imageNodes = this.articleContent.getElementsByTagName("img");
            for (int k = 0; k < imageNodes.getLength(); ++k) {
                Element image = (Element) imageNodes.item(k);
                sources.add(image.getAttribute("src"));
            }
        }
        return sources;
    }

    /**
     * Collects the text of the article's sub-headings.
     *
     * <p>Heading levels {@code h1} to {@code h6} are tried in order and only
     * the first level that has any elements is used. If no heading elements
     * exist at all, the text of every element whose {@code class} attribute
     * matches {@code Regexps.likelySubheadCandidateRe} is returned instead.
     *
     * @return the sub-heading texts; an empty list when no article content is
     *         set or nothing matches
     */
    public List<String> getArticleSubheadings() {
        List<String> headings = new ArrayList<String>();
        if (this.articleContent == null) {
            return headings;
        }
        // Stop at the first heading level that contributes anything.
        for (int level = 1; level <= 6 && headings.isEmpty(); ++level) {
            NodeList found = this.articleContent.getElementsByTagName("h" + level);
            for (int k = 0; k < found.getLength(); ++k) {
                headings.add(found.item(k).getTextContent());
            }
        }
        if (headings.isEmpty()) {
            NodeList everything = this.articleContent.getElementsByTagName("*");
            for (int k = 0; k < everything.getLength(); ++k) {
                Node candidate = everything.item(k);
                if (candidate instanceof Element) {
                    String cssClass = ((Element) candidate).getAttribute("class");
                    if (cssClass != null && this.search(cssClass, Regexps.likelySubheadCandidateRe) != -1) {
                        headings.add(candidate.getTextContent());
                    }
                }
            }
        }
        return headings;
    }

    /**
     * Finds the direct children of a node that have a given node name.
     *
     * <p>The comparison is an exact, case-sensitive {@code equals} on
     * {@link Node#getNodeName()}; descendants below the first level are not
     * examined.
     *
     * @param parent the node whose children are searched
     * @param name   the node name to look for
     * @return the matching children in document order, possibly empty
     */
    protected List<Node> findChildNodesWithName(Node parent, String name) {
        NodeList kids = parent.getChildNodes();
        List<Node> matches = new ArrayList<Node>();
        int k = 0;
        while (k < kids.getLength()) {
            Node kid = kids.item(k);
            ++k;
            if (kid == null) {
                continue;
            }
            String kidName = kid.getNodeName();
            if (kidName != null && kidName.equals(name)) {
                matches.add(kid);
            }
        }
        return matches;
    }

    /**
     * Returns the position of a node among the direct children of a parent.
     *
     * <p>Children are compared by reference identity.
     *
     * @param parent      the node whose child list is searched
     * @param childToFind the child to locate
     * @return the zero-based index of {@code childToFind}, or {@code -1} when
     *         it is not a direct child of {@code parent}
     */
    protected int findChildNodeIndex(Node parent, Node childToFind) {
        int position = 0;
        while (position < parent.getChildNodes().getLength()) {
            Node candidate = parent.getChildNodes().item(position);
            if (candidate == childToFind) {
                return position;
            }
            ++position;
        }
        return -1;
    }

    /**
     * Recursively walks the subtree below the walker's current node and
     * records every non-blank text node whose parent carries an {@code id}
     * attribute.
     *
     * <p>Each recorded entry is keyed by {@code parentId[childIndex]}, where
     * {@code childIndex} is the text node's position among its parent's
     * children, and holds the text node's value. The walker is repositioned on
     * the node it started at before the method returns.
     *
     * @param walker the tree walker positioned on the subtree root
     * @param map    the list that receives the entries
     * @throws DOMException if a DOM operation fails
     */
    protected void getArticleTextMapping(TreeWalker walker, List<MappingNode> map) throws DOMException {
        Node current = walker.getCurrentNode();
        if (current.getNodeType() == Node.TEXT_NODE) {
            Node owner = current.getParentNode();
            Node idAttribute = owner.getAttributes().getNamedItem("id");
            if (idAttribute != null && current.getTextContent().trim().length() > 0) {
                int position = this.findChildNodeIndex(owner, current);
                if (position != -1) {
                    String key = idAttribute.getNodeValue() + "[" + position + "]";
                    map.add(new MappingNode(key, current.getNodeValue()));
                }
            }
        }
        for (Node child = walker.firstChild(); child != null; child = walker.nextSibling()) {
            this.getArticleTextMapping(walker, map);
        }
        // Recursion moved the walker around; put it back where the caller left it.
        walker.setCurrentNode(current);
    }

    /**
     * Builds the id-to-text mapping for the extracted article.
     *
     * <p>A tree walker restricted to element and text nodes is created over the
     * article content and handed to the recursive overload.
     *
     * @return the collected mapping entries, or {@code null} when no article
     *         content is set
     */
    public List<MappingNode> getArticleTextMapping() {
        if (this.articleContent == null) {
            return null;
        }
        List<MappingNode> mapping = new ArrayList<MappingNode>();
        DocumentTraversal traversal = (DocumentTraversal) this.document;
        // whatToShow 5 == SHOW_ELEMENT (1) | SHOW_TEXT (4)
        TreeWalker walker = traversal.createTreeWalker(this.articleContent, 5, null, true);
        this.getArticleTextMapping(walker, mapping);
        return mapping;
    }

    /**
     * Parses an HTML string and runs readability extraction on it without
     * adding the title.
     *
     * @param html the HTML source
     * @return the result of {@code getReadability(html, false)}
     * @throws SAXException if the HTML cannot be parsed
     * @throws IOException  if reading the input fails
     */
    public static Readability getReadability(String html) throws SAXException, IOException {
        final boolean addTitle = false;
        return Readability.getReadability(html, addTitle);
    }

    /**
     * Parses an HTML string into a DOM document and wraps it in a
     * {@code Readability} instance with debugging switched off.
     *
     * @param html     the HTML source
     * @param addTitle forwarded as the {@code addTitle} constructor argument
     * @return a new instance built from the parsed document
     * @throws SAXException if the HTML cannot be parsed
     * @throws IOException  if reading the input fails
     */
    public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
        DOMParser htmlParser = new DOMParser();
        InputSource source = new InputSource(new StringReader(html));
        htmlParser.parse(source);
        Document parsed = htmlParser.getDocument();
        return new Readability(parsed, false, addTitle);
    }

    /**
     * Demo entry point: downloads a fixed blog post, runs the extraction with
     * debugging and title insertion enabled, and prints the article markup, the
     * text mapping and the image sources to standard output.
     *
     * @param argv ignored
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void main(String[] argv) throws Exception {
        URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");
        DOMParser parser = new DOMParser();
        // Close the network stream even when parsing fails; it was previously left open.
        java.io.InputStream pageStream = input.openStream();
        try {
            parser.parse(new InputSource(pageStream));
        }
        finally {
            pageStream.close();
        }
        Readability r = new Readability(parser.getDocument(), true, true);
        System.out.println(r.getArticleHTML());
        System.out.println();
        System.out.println("***");
        System.out.println();
        // getArticleTextMapping() returns null when no article was found; iterating
        // that directly would throw a NullPointerException.
        List<MappingNode> mapping = r.getArticleTextMapping();
        if (mapping != null) {
            for (MappingNode s : mapping) {
                System.out.println(s);
            }
        }
        System.out.println(r.getArticleImages());
    }

    /**
     * A pair of an identifier and a text value, as produced by
     * {@code getArticleTextMapping}.
     */
    protected class MappingNode {
        String id;
        String text;

        /**
         * @param id   the identifier to store
         * @param text the text to store
         */
        public MappingNode(String id, String text) {
            this.text = text;
            this.id = id;
        }

        /** @return the stored identifier */
        public String getId() {
            return id;
        }

        /** @return the stored text */
        public String getText() {
            return text;
        }

        /** @return {@code MappingNode(<id> -> <text>)} */
        @Override
        public String toString() {
            StringBuilder rendered = new StringBuilder("MappingNode(");
            rendered.append(id).append(" -> ").append(text).append(")");
            return rendered.toString();
        }
    }

    /**
     * Switches held in the enclosing class's {@code flags} set, which is
     * initialised with every value of this enum.
     */
    static enum Flag {
        // NOTE(review): presumably enables removal of unlikely candidate nodes — confirm at the use site.
        FLAG_STRIP_UNLIKELYS,
        // NOTE(review): presumably enables class/id based score weighting — confirm at the use site.
        FLAG_WEIGHT_CLASSES;

    }

    /**
     * Regular-expression sources used by the enclosing class, kept as plain
     * strings. The fields are public, static and non-final, so they can be
     * reassigned at runtime. Patterns starting with {@code (?i)} are
     * case-insensitive.
     */
    protected static class Regexps {
        // Alternation of name fragments (comment, sidebar, banner, ...).
        // NOTE(review): presumably matched against class/id values to drop unlikely nodes — confirm at the use site.
        public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner";
        // Alternation of name fragments (article, body, main, ...).
        // NOTE(review): presumably overrides unlikelyCandidatesRe — confirm at the use site.
        public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
        // Alternation of name fragments (content, entry, post, ...).
        // NOTE(review): presumably raises a node's score — confirm at the use site.
        public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
        // Alternation of name fragments (footer, promo, widget, ...).
        // NOTE(review): presumably lowers a node's score — confirm at the use site.
        public static String negativeRe = "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
        // One of the listed tag names, captured as group 1. Unanchored, so it also matches inside longer strings.
        public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
        // Two or more consecutive <br> tags, each optionally followed by spaces, newlines, carriage returns or tabs.
        public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
        // An opening or closing <font> tag; group 1 captures the optional slash.
        public static String replaceFontsRe = "(?i)<(\\/?)font[^>]*>";
        // Leading or trailing whitespace; used by getInnerText to trim text content.
        public static String trimRe = "^\\s+|\\s+$";
        // Runs of two or more whitespace characters; getInnerText replaces each with a single space.
        public static String normalizeRe = "\\s{2,}";
        // One or more <br> tags, each optionally followed by whitespace or "&nbsp" / "&nbsp;". Case-sensitive.
        public static String killBreaksRe = "(<br\\s*\\/?>(\\s|&nbsp;?)*){1,}";
        // An http (not https) URL prefix on youtube.com or vimeo.com, with an optional "www." prefix.
        public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";
        // As written this matches the literal three-character sequence "|-/".
        // NOTE(review): may have been intended as a character class of separators — confirm against the use site.
        public static String titleSeparatorRe = "\\|\\-\\/";
        // The text "cross-head"; getArticleSubheadings searches class attributes with it.
        public static String likelySubheadCandidateRe = "(?i)cross-head";

        protected Regexps() {
        }
    }
}

