package ch.epfl.bbp.uima.pdf.cr;

import ch.epfl.bbp.StringUtils;
import ch.epfl.bbp.uima.AbbreviationExpander;
import ch.epfl.bbp.uima.cr.AbstractFileReader;
import ch.epfl.bbp.uima.pdf.BBlock;
import ch.epfl.bbp.uima.pdf.BDocument;
import ch.epfl.bbp.uima.pdf.BLine;
import ch.epfl.bbp.uima.pdf.BlockHandler;
import ch.epfl.bbp.uima.pdf.cleanup.HyphenRemover;
import ch.epfl.bbp.uima.types.DataTable;
import ch.epfl.bbp.uima.types.DocumentBlock;
import ch.epfl.bbp.uima.types.DocumentLine;
import ch.epfl.bbp.uima.types.DocumentPage;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.snowtide.pdf.PDFTextStream;
import de.julielab.jules.types.Header;
import de.julielab.jules.types.Section;
import edu.psu.seersuite.extractors.tableextractor.extraction.PdfBoxParser;
import edu.psu.seersuite.extractors.tableextractor.extraction.TableExtractor;
import edu.psu.seersuite.extractors.tableextractor.model.Table;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FloatArray;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * Collection reader that turns each PDF file supplied by {@link AbstractFileReader}
 * into a CAS: a {@link Header} with the document id and source path, the cleaned-up
 * document text, and layout annotations ({@link DocumentLine}, {@link DocumentBlock},
 * {@link DocumentPage}). Tables can optionally be extracted as {@link DataTable}
 * annotations.
 */
@TypeCapability(outputs = {"de.julielab.jules.types.Header"})
public class PdfCollectionReader extends AbstractFileReader {
    public static final String PARAM_EXTRACT_TABLES = "extractTables";

    /** When true, a {@link TableExtractor} is created at initialization and run on every file. */
    @ConfigurationParameter(name = PARAM_EXTRACT_TABLES, defaultValue = {"false"}, description = "whether to extract tables")
    private boolean extractTables;
    public static final String PARAM_EXPAND_ABBREVIATIONS = "expandAbbrevs";

    /** When true, abbreviations are collected over the whole document and expanded in each block. */
    @ConfigurationParameter(name = PARAM_EXPAND_ABBREVIATIONS, defaultValue = {"false"}, description = "whether to expand Abbreviations")
    private boolean expandAbbrevs;
    public static final String PARAM_EXTRACT_REFERENCES = "extractReferences";

    /** Declared configuration parameter; not read by any code visible in this class. */
    @ConfigurationParameter(name = PARAM_EXTRACT_REFERENCES, defaultValue = {"false"}, description = "whether to extract references")
    private boolean extractReferences;

    /** Only set when {@link #extractTables} is enabled. */
    private TableExtractor tableExtractor;
    private static final Logger LOG = LoggerFactory.getLogger(PdfCollectionReader.class);
    public static final String COMPONENT_ID = PdfCollectionReader.class.getName();

    /**
     * Matches a "References" heading at the start of a line. MULTILINE is required:
     * without it {@code ^} only matches at offset 0 of the whole document text.
     */
    private static final Pattern REFERENCES = Pattern.compile("^(?:REFERENCES|References)", Pattern.MULTILINE);

    /**
     * Restricts the file filter to PDFs, delegates to the superclass, and sets up
     * the table extractor if table extraction is enabled.
     *
     * @param uimaContext the UIMA context passed on to the superclass
     * @throws ResourceInitializationException if the table extractor cannot be created
     */
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        // Must be set before super.initialize(), as in the original ordering.
        this.fileExtensionFilter = "pdf";
        super.initialize(uimaContext);
        try {
            if (this.extractTables) {
                this.tableExtractor = new TableExtractor();
                this.tableExtractor.setParser(new PdfBoxParser());
            }
        } catch (Exception e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Reads the next PDF file into the given CAS.
     *
     * @param jCas the CAS to populate
     * @throws IOException if the PDF cannot be read
     * @throws CollectionException declared by the reader contract
     */
    public void getNext(JCas jCas) throws IOException, CollectionException {
        File file = (File) this.fileIterator.next();

        Header header = new Header(jCas);
        // Document id is the file name with ".pdf" and anything after it removed.
        header.setDocId(file.getName().replaceAll("\\.pdf.*", ""));
        header.setSource(file.getAbsolutePath());
        header.addToIndexes();

        BlockHandler blockHandler = new BlockHandler();
        PDFTextStream pdfStream = new PDFTextStream(file);
        try {
            pdfStream.pipe(blockHandler);
        } finally {
            // Release the PDF even when parsing fails.
            pdfStream.close();
        }

        extractText(jCas, blockHandler.getDoc(), header.getDocId(), this.expandAbbrevs);
        if (this.extractTables) {
            extractTables(this.tableExtractor, file, jCas);
        }
    }

    /**
     * Sets the CAS document text to the concatenation of all cleaned-up blocks and
     * adds line, block and page annotations.
     *
     * @param jCas the CAS to populate
     * @param bDocument the parsed PDF document
     * @param str the document id, passed on to the dehyphenation step
     * @param z whether to expand abbreviations in the block text
     */
    public static void extractText(JCas jCas, BDocument bDocument, String str, boolean z) {
        // First pass (only when expanding): collect abbreviations over the whole text.
        Set abbrevs = null;
        if (z) {
            StringBuilder wholeText = new StringBuilder();
            for (BBlock block : bDocument.getBlocks()) {
                wholeText.append(cleanupText(block, str));
            }
            abbrevs = AbbreviationExpander.getAbbrevs(wholeText.toString());
        }

        StringBuilder documentText = new StringBuilder();
        int blockBegin = 0;
        int pageBegin = 0;
        int currentPageId = 0;
        for (BBlock block : bDocument.getBlocks()) {
            // Close the previous page BEFORE adding this block, so that the first
            // block of a new page is not attributed to the preceding page.
            if (block.getPageId() > currentPageId) {
                addPage(jCas, pageBegin, blockBegin, currentPageId);
                pageBegin = blockBegin;
                currentPageId = block.getPageId();
            }

            String blockText = cleanupText(block, str);
            if (z) {
                blockText = AbbreviationExpander.expand(blockText, abbrevs);
            }

            // NOTE(review): line offsets are accumulated from the raw line text with
            // runs of spaces collapsed, while the block text goes through cleanupText
            // (and optional abbreviation expansion), so line spans may not align
            // exactly with the document text.
            int lineBegin = blockBegin;
            for (BLine line : block.getLines()) {
                String lineText = line.getText();
                int lineLength = lineText.replaceAll(" +", " ").length();
                DocumentLine documentLine = new DocumentLine(jCas, lineBegin, lineBegin + lineLength);
                lineBegin += lineLength;
                documentLine.setX(line.getRegion().x);
                documentLine.setY(line.getRegion().y);
                documentLine.setHeight(line.getRegion().height);
                documentLine.setWidth(line.getRegion().width);
                documentLine.setPageId(block.getPageId());
                documentLine.setBlock(block.getId());
                documentLine.setLineText(lineText);
                documentLine.setBeginnings(new FloatArray(jCas, line.getBeginnings().size()));
                for (int k = 0; k < line.getBeginnings().size(); k++) {
                    documentLine.setBeginnings(k, line.getBeginnings().get(k).floatValue());
                }
                documentLine.setEndings(new FloatArray(jCas, line.getEndings().size()));
                for (int k = 0; k < line.getEndings().size(); k++) {
                    documentLine.setEndings(k, line.getEndings().get(k).floatValue());
                }
                documentLine.addToIndexes();
            }

            DocumentBlock documentBlock = new DocumentBlock(jCas, blockBegin, blockBegin + blockText.length());
            blockBegin += blockText.length();
            documentText.append(blockText);
            documentBlock.setElementId(block.getId());
            documentBlock.setX(block.getRegion().x);
            documentBlock.setY(block.getRegion().y);
            documentBlock.setHeight(block.getRegion().height);
            documentBlock.setWidth(block.getRegion().width);
            documentBlock.setHasBold(block.isHasBold());
            documentBlock.setHasManyFontsizes(block.isHasManyFontsizes());
            documentBlock.setMedianFontsize(block.getMedianFontsize());
            documentBlock.setPageId(block.getPageId());
            documentBlock.addToIndexes();
        }
        // The last page has no following block to trigger its closing; close it here.
        addPage(jCas, pageBegin, blockBegin, currentPageId);
        jCas.setDocumentText(documentText.toString());
    }

    /** Adds a {@link DocumentPage} over [begin, end); pages covering no text are skipped. */
    private static void addPage(JCas jCas, int begin, int end, int pageId) {
        if (end <= begin) {
            return;
        }
        DocumentPage page = new DocumentPage(jCas);
        page.setBegin(begin);
        page.setEnd(end);
        page.setPageId(pageId);
        page.addToIndexes();
    }

    /**
     * Produces the text of one block as it appears in the CAS: NFKC-normalized,
     * dehyphenated, line breaks and repeated spaces collapsed to a single space,
     * TeX-style quotes replaced by a double quote, invalid XML characters stripped,
     * and terminated by "\r\n".
     */
    private static String cleanupText(BBlock bBlock, String str) {
        String normalized = Normalizer.normalize(bBlock.getText().trim(), Normalizer.Form.NFKC);
        String flattened = HyphenRemover.dehyphenate(normalized, str).trim()
                .replaceAll("[\\r\\n]+", " ")
                .replaceAll("''", "\"")
                .replaceAll("``", "\"")
                .trim()
                .replaceAll(" +", " ");
        return StringUtils.stripNonValidXMLCharacters(flattened) + "\r\n";
    }

    /**
     * Runs the table extractor on the file and adds one {@link DataTable} per
     * extracted table. Best effort: any failure is logged and otherwise ignored.
     * (Decompiler note: the original access modifier was package-private.)
     *
     * @param tableExtractor the extractor to run
     * @param file the PDF file
     * @param jCas the CAS receiving the table annotations
     */
    public static void extractTables(TableExtractor tableExtractor, File file, JCas jCas) {
        try {
            ArrayList<Table> tables = tableExtractor.extract(file, null);
            if (tables == null) {
                return;
            }
            for (Table table : tables) {
                DataTable dataTable = new DataTable(jCas);
                dataTable.setTableId(table.getOrder());
                dataTable.setPageNumber(table.getPageNumber());
                dataTable.setRowCount(table.getTableBody().size());
                dataTable.setColumnCount(table.getColumnNumber());
                dataTable.setCaption(table.getCaption());
                dataTable.setReferenceText(table.getRefTextList());

                ArrayList<String> heading = table.getHeading();
                dataTable.setHeadings(new StringArray(jCas, heading.size()));
                for (int i = 0; i < heading.size(); i++) {
                    dataTable.setHeadings(i, heading.get(i));
                }

                ArrayList<String> body = table.getTableBody();
                dataTable.setBody(new StringArray(jCas, body.size()));
                for (int i = 0; i < body.size(); i++) {
                    dataTable.setBody(i, body.get(i));
                }
                dataTable.addToIndexes();
            }
        } catch (Throwable th) {
            // Deliberately broad: table extraction must never abort the reader.
            LOG.warn("cannot extract tables from {}: {}", file.getAbsolutePath(), th);
        }
    }

    /**
     * Looks for a "References" heading at the start of a line in the document text.
     * A "references" {@link Section} covering the heading is indexed only when
     * exactly one such heading is found.
     *
     * @param jCas the CAS whose document text is searched
     * @return true if exactly one heading was found and indexed, false otherwise
     */
    static boolean extractReferencesNaively(JCas jCas) {
        ArrayList<Section> candidates = Lists.newArrayList();
        Matcher matcher = REFERENCES.matcher(jCas.getDocumentText());
        while (matcher.find()) {
            Section section = new Section(jCas, matcher.start(), matcher.end());
            section.setSectionType("references");
            candidates.add(section);
        }
        // Zero or several headings are ambiguous: index nothing in that case.
        if (candidates.size() != 1) {
            return false;
        }
        candidates.get(0).addToIndexes();
        return true;
    }

    /**
     * Writes a debugging HTML view of the document's blocks to the given file:
     * one absolutely positioned span per {@link DocumentBlock}, outlined in red
     * when the block covers a {@link Section} and in blue otherwise. Failures are
     * logged, not thrown.
     *
     * @param jCas the CAS holding the block annotations
     * @param file the HTML file to write (UTF-8)
     */
    public static void printHtml(JCas jCas, File file) {
        try {
            Document html = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            Element root = html.createElement("html");
            html.appendChild(root);
            Element head = html.createElement("head");
            root.appendChild(head);
            addElem(head, "meta", "http-equiv", "Content-Type", "content", "text/html; charset=utf-8");
            addElem(head, "script", "src", "http://ajax.googleapis.com/ajax/libs/jquery/1.10.1/jquery.min.js", "type", "text/javascript");

            for (DocumentBlock block : JCasUtil.select(jCas, DocumentBlock.class)) {
                String borderColor = JCasUtil.selectCovered(jCas, Section.class, block).isEmpty() ? "blue" : "red";
                int blockTop = (int) (block.getY() + block.getHeight());
                HashMap<String, String> attributes = Maps.newHashMap();
                // Each page is laid out in an 800-unit-high band; y is flipped within it.
                attributes.put("style", "font-size: 10px;position:absolute;top:" + (((block.getPageId() + 1) * 800) - blockTop) + ";left:" + ((int) block.getX()) + ";height:" + ((int) block.getHeight()) + ";width:" + ((int) block.getWidth()) + ";border:1px solid " + borderColor);
                root.appendChild(buildTextElt(html, "span", block.getCoveredText(), attributes));
            }

            FileOutputStream out = new FileOutputStream(file);
            try {
                // Encode explicitly as UTF-8 to match the charset declared in the meta tag.
                Writer writer = new OutputStreamWriter(out, "UTF-8");
                serializeXMLDocument(html, writer);
                writer.flush();
            } finally {
                out.close();
            }
        } catch (Exception e) {
            LOG.error("cannot write HTML to " + file, e);
        }
    }

    /**
     * Serializes a DOM document to a writer using a default transformer.
     *
     * @param document the document to serialize
     * @param writer the destination; not closed by this method
     * @throws TransformerException if the transformation fails
     */
    public static void serializeXMLDocument(Document document, Writer writer) throws TransformerException {
        DOMSource source = new DOMSource(document);
        StreamResult target = new StreamResult(writer);
        TransformerFactory.newInstance().newTransformer().transform(source, target);
    }

    /**
     * Creates a child element with the given attributes and appends it to the parent.
     *
     * @param element the parent element
     * @param str the tag name of the new element
     * @param strArr attribute names and values, alternating (name, value, name, value, ...)
     * @return the newly created element
     */
    private static Element addElem(Element element, String str, String... strArr) {
        if (strArr.length % 2 != 0) {
            throw new IllegalArgumentException("attributes must be given as name/value pairs");
        }
        Element child = element.getOwnerDocument().createElement(str);
        for (int i = 0; i < strArr.length; i += 2) {
            child.setAttribute(strArr[i], strArr[i + 1]);
        }
        element.appendChild(child);
        return child;
    }

    /**
     * Creates (but does not attach) an element with optional text content and attributes.
     *
     * @param document the owner document
     * @param str the tag name
     * @param str2 the text content; skipped when null or empty
     * @param map the attributes to set; may be null
     * @return the newly created element
     */
    private static Element buildTextElt(Document document, String str, String str2, Map<String, String> map) {
        Element element = document.createElement(str);
        if (str2 != null && str2.length() > 0) {
            element.appendChild(document.createTextNode(str2));
        }
        if (map != null) {
            for (Map.Entry<String, String> attribute : map.entrySet()) {
                element.setAttribute(attribute.getKey(), attribute.getValue());
            }
        }
        return element;
    }
}
