package org.fbk.cit.hlt.thewikimachine.xmldump;

import info.bliki.api.AbstractXMLParser;
import java.io.FileInputStream;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.log4j.Logger;
import org.tukaani.xz.common.Util;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/AbstractWikipediaXmlDumpParser.class */
/**
 * SAX-based reader for MediaWiki XML dumps.
 *
 * <p>The handler tracks the current element path while the document is parsed and collects the
 * content of {@code /mediawiki/page/title}, {@code /mediawiki/page/id},
 * {@code /mediawiki/page/revision/text} and the {@code title} attribute of
 * {@code /mediawiki/page/redirect}. When a {@code /mediawiki/page} element ends, the collected
 * values are wrapped in an {@link AnalyzePage} task and handed to a bounded thread pool, which
 * calls {@link #getPage(String, String, int, String)}.
 *
 * <p>The pool uses a caller-runs rejection policy: when its queue is full the parsing thread runs
 * the task itself, so {@code getPage} may be invoked from either a worker thread or the parsing
 * thread.
 *
 * <p>The executor is shut down at the end of a parse (successful or not), so an instance can
 * process a single dump only.
 */
public abstract class AbstractWikipediaXmlDumpParser extends DefaultHandler implements LexicalHandler {
    static Logger logger = Logger.getLogger(AbstractWikipediaXmlDumpParser.class.getName());
    protected static final String LEXICAL_HANDLER_PROPERTY_ID = "http://xml.org/sax/properties/lexical-handler";
    protected static final String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser";

    /** Current nesting depth of open elements. */
    protected int fElementDepth;
    /** Reset to {@code false} in {@link #startDocument()}; not otherwise used by this class. */
    protected boolean fXML11;
    /** Reset to {@code false} in {@link #startDocument()}; not otherwise used by this class. */
    protected boolean fInCDATA;
    /** Wall-clock time (ms) at which the document started. */
    protected long begin;
    /** Wall-clock time (ms) at which the document ended. */
    protected long end;
    /** Formatter used by {@link #printSituation()}; created in {@link #startProcess(String)}. */
    protected DecimalFormat decimalFormat;
    /** Slash-separated path of the currently open elements, e.g. {@code /mediawiki/page/title}. */
    protected String xpath;

    public static final int DEFAULT_THREADS_NUMBER = 1;
    private int numThreads;
    private ExecutorService myExecutor;
    public static final int DEFAULT_QUEUE_SIZE = 10000;
    public AtomicLong genBegin = new AtomicLong();
    public AtomicLong genEnd = new AtomicLong();
    /** Page counter reported by {@link #printSituation()}; this class never increments it itself. */
    public AtomicInteger generalCount = new AtomicInteger();

    // State of the page currently being parsed.
    private String currentTitle = "";
    private int currentWikiID = 0;
    private String currentRedirect = null;
    private String currentText = "";

    /** Character data of the innermost open element; cleared on every start tag. */
    protected StringBuilder content = new StringBuilder();

    /** Task that forwards one parsed page to {@link #getPage(String, String, int, String)}. */
    public class AnalyzePage implements Runnable {
        private final String text;
        private final String title;
        private final int wikiID;
        private final String redirect;

        /**
         * @param text     content of the page's {@code revision/text} element
         * @param title    page title with spaces replaced by underscores
         * @param wikiID   numeric page id
         * @param redirect redirect target with spaces replaced by underscores, or {@code null}
         */
        public AnalyzePage(String text, String title, int wikiID, String redirect) {
            this.text = text;
            this.title = title;
            this.wikiID = wikiID;
            this.redirect = redirect;
        }

        @Override // java.lang.Runnable
        public void run() {
            AbstractWikipediaXmlDumpParser.this.getPage(this.text, this.title, this.wikiID, this.redirect);
        }
    }

    /**
     * Creates the parser and its fixed-size worker pool.
     *
     * @param numThreads number of worker threads (used as both core and maximum pool size)
     */
    public AbstractWikipediaXmlDumpParser(int numThreads) {
        this.numThreads = numThreads;
        logger.info("creating the thread executor (" + numThreads + ")");
        this.myExecutor = new ThreadPoolExecutor(
                numThreads,
                numThreads,
                1L,
                TimeUnit.MINUTES,
                new ArrayBlockingQueue<Runnable>(DEFAULT_QUEUE_SIZE),
                new ThreadPoolExecutor.CallerRunsPolicy());
    }

    /** @return the number of worker threads this parser was created with */
    public int getNumThreads() {
        return this.numThreads;
    }

    /**
     * Parses the given dump. Files whose name ends with {@code .bz2} are decompressed on the fly;
     * any other name is passed to the XML reader as a system identifier.
     *
     * <p>Errors are logged rather than propagated. If parsing stops before
     * {@link #endDocument()} has shut the executor down, the executor is shut down here so that
     * its worker threads do not outlive the parse; {@link #endProcess()} is not called in that
     * case.
     *
     * @param fileName path (or system identifier) of the dump to read
     */
    public void startProcess(String fileName) {
        this.xpath = "";
        this.decimalFormat = new DecimalFormat("###,###,###,###");
        try {
            XMLReader reader = XMLReaderFactory.createXMLReader(DEFAULT_PARSER_NAME);
            reader.setContentHandler(this);
            reader.setErrorHandler(this);
            reader.setProperty(LEXICAL_HANDLER_PROPERTY_ID, this);
            if (fileName.endsWith(".bz2")) {
                logger.info("parsing a bz2 file");
                FileInputStream fileIn = new FileInputStream(fileName);
                try {
                    reader.parse(new InputSource(new BZip2CompressorInputStream(fileIn)));
                } finally {
                    // Always release the file handle, even when parsing fails part-way.
                    fileIn.close();
                }
            } else {
                reader.parse(fileName);
            }
        } catch (SAXParseException e) {
            logger.error("SAXParseException at " + this.currentTitle + " " + e, e);
        } catch (Exception e) {
            logger.error("Error at " + this.currentTitle + " " + e.getMessage(), e);
        } finally {
            // endDocument() normally shuts the pool down; cover the case where it never ran.
            if (!this.myExecutor.isShutdown()) {
                shutdownExecutorAndWait();
            }
        }
    }

    /** Logs the current value of {@link #generalCount} and the elapsed time between begin and end. */
    public void printSituation() {
        logger.info(String.format(
                "Pages: %10s - %7s ms - %s",
                this.decimalFormat.format(this.generalCount.intValue()),
                this.decimalFormat.format(this.end - this.begin),
                new Date()));
    }

    /** Hook for subclasses; does nothing by default. */
    public void printLog() {
    }

    /**
     * Called once per {@code page} element of the dump, from a worker thread or, when the task
     * queue is full, from the parsing thread.
     *
     * @param text     content of the page's {@code revision/text} element
     * @param title    page title with spaces replaced by underscores
     * @param wikiID   numeric page id
     * @param redirect redirect target with spaces replaced by underscores, or {@code null} when
     *                 the page has no non-blank redirect title
     */
    public abstract void getPage(String text, String title, int wikiID, String redirect);

    /** Hook invoked after the document ended and all queued pages were processed. */
    public void endProcess() {
    }

    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void startDocument() throws SAXException {
        logger.info("Process started at " + new Date());
        this.begin = System.currentTimeMillis();
        this.genBegin.set(System.currentTimeMillis());
        this.fElementDepth = 0;
        this.fXML11 = false;
        this.fInCDATA = false;
    }

    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void endDocument() throws SAXException {
        this.end = System.currentTimeMillis();
        printSituation();
        logger.info("Finished to read the document " + new Date());
        boolean terminated = shutdownExecutorAndWait();
        logger.info("ending process " + terminated + " " + new Date() + "...");
        endProcess();
    }

    /**
     * Stops accepting new tasks and blocks until every queued page has been processed.
     *
     * @return {@code true} if the executor terminated, {@code false} if the wait was interrupted
     */
    private boolean shutdownExecutorAndWait() {
        this.myExecutor.shutdown();
        logger.debug("waiting to end " + new Date() + "...");
        try {
            // Effectively "wait forever".
            return this.myExecutor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            logger.error("Interrupted while waiting for the executor to terminate", e);
            // Preserve the interrupt for whoever owns this thread.
            Thread.currentThread().interrupt();
            return false;
        }
    }

    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void characters(char[] ch, int start, int length) throws SAXException {
        try {
            this.content.append(ch, start, length);
        } catch (Exception e) {
            logger.error("Error at characters " + this.currentTitle, e);
        }
    }

    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        try {
            characters(ch, start, length);
        } catch (Exception e) {
            logger.error("Error at ignorableWhitespace " + this.currentTitle, e);
        }
    }

    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        this.xpath += "/" + qName;
        this.fElementDepth++;
        this.content.setLength(0);
        if (this.xpath.equals("/mediawiki/page")) {
            // Start every page from a clean slate so that a page missing one of its child
            // elements does not inherit the previous page's values.
            this.currentTitle = "";
            this.currentWikiID = 0;
            this.currentText = "";
            this.currentRedirect = null;
        }
        if (this.xpath.equals("/mediawiki/page/redirect")) {
            // getValue returns null when the attribute is absent.
            String target = attributes.getValue(AbstractXMLParser.TITLE_ID);
            if (target != null && target.trim().length() > 0) {
                this.currentRedirect = target.replace(' ', '_');
            }
        }
    }

    @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
    public void endElement(String uri, String localName, String qName) throws SAXException {
        this.fElementDepth--;
        if (this.xpath.equals("/mediawiki/page/title")) {
            this.currentTitle = this.content.toString().replace(' ', '_');
        } else if (this.xpath.equals("/mediawiki/page/id")) {
            // Tolerate whitespace around the number.
            this.currentWikiID = Integer.parseInt(this.content.toString().trim());
        } else if (this.xpath.equals("/mediawiki/page/revision/text")) {
            this.currentText = this.content.toString();
        }
        if (this.xpath.equals("/mediawiki/page")) {
            try {
                this.myExecutor.execute(
                        new AnalyzePage(this.currentText, this.currentTitle, this.currentWikiID, this.currentRedirect));
            } catch (Exception e) {
                logger.error("Error submitting page " + this.currentTitle, e);
            }
        }
        // Pop "/" + qName from the tracked path.
        this.xpath = this.xpath.substring(0, (this.xpath.length() - qName.length()) - 1);
    }

    @Override // org.xml.sax.ext.LexicalHandler
    public void startDTD(String name, String publicId, String systemId) throws SAXException {
    }

    @Override // org.xml.sax.ext.LexicalHandler
    public void endDTD() throws SAXException {
    }

    @Override // org.xml.sax.ext.LexicalHandler
    public void startEntity(String name) throws SAXException {
    }

    @Override // org.xml.sax.ext.LexicalHandler
    public void endEntity(String name) throws SAXException {
    }

    @Override // org.xml.sax.ext.LexicalHandler
    public void startCDATA() throws SAXException {
    }

    @Override // org.xml.sax.ext.LexicalHandler
    public void endCDATA() throws SAXException {
    }

    @Override // org.xml.sax.ext.LexicalHandler
    public void comment(char[] ch, int start, int length) throws SAXException {
    }
}
