package org.fbk.cit.hlt.thewikimachine.xmldump;

import de.tudarmstadt.ukp.wikipedia.parser.Content;
import de.tudarmstadt.ukp.wikipedia.parser.Link;
import de.tudarmstadt.ukp.wikipedia.parser.Paragraph;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import de.tudarmstadt.ukp.wikipedia.parser.Span;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.WikiMarkupParser;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/WikipediaFirstSentenceExtractor.class */
/**
 * Extractor that, for every content page of a Wikipedia XML dump, collects the links found in
 * the opening paragraph of the page's first section and writes them to a UTF-8 text file.
 *
 * <p>Each output line has the form {@code <page>\t<normalized link target>\n}, where
 * {@code <page>} is the second argument received by {@link #contentPage(String, String, int)}.
 * All other page kinds (files, categories, disambiguations, templates, redirects, portals,
 * project pages) are ignored.
 */
public class WikipediaFirstSentenceExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
    static Logger logger = Logger.getLogger(WikipediaFirstSentenceExtractor.class.getName());

    /** Output writer; opened by {@link #start(ExtractorParameters)}, closed by {@link #endProcess()}. */
    private PrintWriter abstractWriter;

    /**
     * Creates the extractor; all arguments are forwarded unchanged to the superclass.
     *
     * @param i      first superclass argument ({@code main} passes the number of threads)
     * @param i2     second superclass argument ({@code main} passes the number of pages to process)
     * @param locale locale forwarded to the superclass
     */
    public WikipediaFirstSentenceExtractor(int i, int i2, Locale locale) {
        super(i, i2, locale);
    }

    /**
     * Opens the output file named by the parameters and starts processing the XML dump.
     *
     * <p>If the output file cannot be opened the error is logged and processing is not started,
     * because every subsequent write would otherwise hit a {@code null} writer.
     *
     * @param extractorParameters supplies the output file name and the XML dump file name
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void start(ExtractorParameters extractorParameters) {
        String outputFileName = extractorParameters.getWikipediaFirstSentenceFileName();
        try {
            this.abstractWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileName), "UTF-8")));
        } catch (IOException e) {
            logger.error("cannot open output file " + outputFileName, e);
            return;
        }
        startProcess(extractorParameters.getWikipediaXmlFileName());
    }

    /** No-op: file pages are ignored by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void filePage(String str, String str2, int i) {
    }

    /** No-op: category pages are ignored by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void categoryPage(String str, String str2, int i) {
    }

    /**
     * Extracts the opening-paragraph links of a content page and appends them to the output file.
     *
     * @param str  page text handed to the markup parser
     * @param str2 page identifier written in the first output column (presumably the page title)
     * @param i    not used by this extractor
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void contentPage(String str, String str2, int i) {
        try {
            String linksForFirstSentence = getLinksForFirstSentence(str, str2);
            // The writer is shared; serialize writes so that one page's lines are emitted together.
            synchronized (this) {
                this.abstractWriter.print(linksForFirstSentence);
            }
        } catch (IOException e) {
            logger.error("cannot extract first-sentence links for page '" + str2 + "'", e);
        }
    }

    /**
     * Builds the output lines for one page: one {@code str2\ttarget\n} line per link found in the
     * first suitable paragraph of the page's first section.
     *
     * <p>Links that point only to a fragment ({@code #anchor}) or have no target are skipped; for
     * all other links the fragment part is removed and the remainder is passed through
     * {@code normalizePageName}. Any exception raised while parsing or traversing the page is
     * logged as a warning and the lines collected so far are returned.
     *
     * @param str  page text to parse
     * @param str2 page identifier written in the first column of every line
     * @return the concatenated output lines, or an empty string if nothing was extracted
     * @throws IOException declared for callers; failures inside this method are logged, not thrown
     */
    public String getLinksForFirstSentence(String str, String str2) throws IOException {
        StringBuilder sb = new StringBuilder();
        try {
            List<Section> sections = WikiMarkupParser.getInstance().parsePage(str, new String[]{this.imagePrefix, this.filePrefix}).getSections();
            if (!sections.isEmpty()) {
                Paragraph paragraph = findFirstContentParagraph(sections.get(0).getParagraphs());
                if (paragraph != null) {
                    for (Link link : paragraph.getLinks()) {
                        String target = link.getTarget();
                        if (target == null || target.startsWith("#")) {
                            // Nothing to link to, or a link into the current page only.
                            continue;
                        }
                        // Drop the "#fragment" part; indexOf (unlike split) also handles a trailing '#'.
                        int fragmentStart = target.indexOf('#');
                        if (fragmentStart > 0) {
                            target = target.substring(0, fragmentStart);
                        }
                        sb.append(str2);
                        sb.append('\t');
                        sb.append(normalizePageName(target));
                        sb.append('\n');
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: a page that cannot be parsed must not abort the whole dump.
            logger.warn("cannot extract first-sentence links for page '" + str2 + "'", e);
        }
        return sb.toString();
    }

    /**
     * Returns the first paragraph that has non-blank text, does not start with ':' and does not
     * begin with an italic span (such paragraphs are presumably notes rather than the opening
     * sentence -- TODO confirm), or {@code null} if there is no such paragraph.
     */
    private static Paragraph findFirstContentParagraph(List<Paragraph> paragraphs) {
        for (Paragraph candidate : paragraphs) {
            String text = candidate.getText().trim();
            if (text.length() == 0 || text.startsWith(":") || startsWithItalic(candidate)) {
                continue;
            }
            return candidate;
        }
        return null;
    }

    /** Tells whether one of the paragraph's italic spans starts at offset 0. */
    private static boolean startsWithItalic(Paragraph paragraph) {
        for (Span span : paragraph.getFormatSpans(Content.FormatType.ITALIC)) {
            if (span.getStart() == 0) {
                return true;
            }
        }
        return false;
    }

    /** No-op: disambiguation pages are ignored by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void disambiguationPage(String str, String str2, int i) {
    }

    /** No-op: template pages are ignored by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void templatePage(String str, String str2, int i) {
    }

    /** No-op: redirect pages are ignored by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void redirectPage(String str, String str2, int i) {
    }

    /** No-op: portal pages are ignored by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void portalPage(String str, String str2, int i) {
    }

    /** No-op: project pages are ignored by this extractor. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void projectPage(String str, String str2, int i) {
    }

    /**
     * Runs the superclass end-of-process hook, then flushes and closes the output file.
     * A write error recorded by the writer is reported, since {@link PrintWriter} never throws.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaExtractor, org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void endProcess() {
        super.endProcess();
        if (this.abstractWriter == null) {
            // The output file was never opened; there is nothing to flush or close.
            return;
        }
        // checkError() flushes the stream and reports any I/O error swallowed by earlier writes.
        if (this.abstractWriter.checkError()) {
            logger.error("an I/O error occurred while writing the first-sentence links");
        }
        this.abstractWriter.close();
    }

    /**
     * Command-line entry point.
     *
     * <p>Options: {@code -d/--wikipedia-dump} (required), {@code -o/--output-dir} (required),
     * {@code -t/--num-threads} (default 1), {@code -p/--num-pages} (default all),
     * {@code -n/--notification-point} (default 10000) and {@code --base-dir}. The log4j
     * configuration file is read from the {@code log-config} system property, falling back to
     * {@code configuration/log-config.txt}. When the command line cannot be parsed, the error and
     * a usage message are printed.
     *
     * @param strArr command-line arguments
     * @throws IOException declared for callers; not thrown by the visible code
     */
    public static void main(String[] strArr) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);
        Options options = new Options();
        try {
            // OptionBuilder collects the settings of one option at a time until create(...) is called.
            OptionBuilder.withArgName("file");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("wikipedia xml dump file");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("wikipedia-dump");
            Option dumpOption = OptionBuilder.create("d");
            OptionBuilder.withArgName("dir");
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("output directory in which to store output files");
            OptionBuilder.isRequired();
            OptionBuilder.withLongOpt("output-dir");
            Option outputDirOption = OptionBuilder.create("o");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of threads (default 1)");
            OptionBuilder.withLongOpt("num-threads");
            Option numThreadsOption = OptionBuilder.create("t");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("number of pages to process (default all)");
            OptionBuilder.withLongOpt("num-pages");
            Option numPagesOption = OptionBuilder.create("p");
            OptionBuilder.withArgName(SchemaSymbols.ATTVAL_INT);
            OptionBuilder.hasArg();
            OptionBuilder.withDescription("receive notification every n pages (default 10000)");
            OptionBuilder.withLongOpt("notification-point");
            Option notificationPointOption = OptionBuilder.create("n");
            OptionBuilder.withDescription("if set, use the output folder as base dir");
            OptionBuilder.withLongOpt("base-dir");
            Option baseDirOption = OptionBuilder.create();
            options.addOption("h", "help", false, "print this message");
            options.addOption("v", "version", false, "output version information and exit");
            options.addOption(dumpOption);
            options.addOption(outputDirOption);
            options.addOption(numThreadsOption);
            options.addOption(numPagesOption);
            options.addOption(notificationPointOption);
            options.addOption(baseDirOption);
            CommandLine commandLine = new PosixParser().parse(options, strArr);

            int numThreads = 1;
            if (commandLine.hasOption("num-threads")) {
                numThreads = Integer.parseInt(commandLine.getOptionValue("num-threads"));
            }
            int numPages = Integer.MAX_VALUE;
            if (commandLine.hasOption("num-pages")) {
                numPages = Integer.parseInt(commandLine.getOptionValue("num-pages"));
            }
            int notificationPoint = 10000;
            if (commandLine.hasOption("notification-point")) {
                notificationPoint = Integer.parseInt(commandLine.getOptionValue("notification-point"));
            }

            String dumpFileName = commandLine.getOptionValue("wikipedia-dump");
            String outputDir = commandLine.getOptionValue("output-dir");
            ExtractorParameters extractorParameters;
            if (commandLine.hasOption("base-dir")) {
                extractorParameters = new ExtractorParameters(dumpFileName, outputDir, true);
            } else {
                extractorParameters = new ExtractorParameters(dumpFileName, outputDir);
            }
            logger.debug(extractorParameters);
            // Report the file this extractor actually writes (see start).
            logger.debug("extracting first sentence links (" + extractorParameters.getWikipediaFirstSentenceFileName() + ")...");
            WikipediaFirstSentenceExtractor wikipediaFirstSentenceExtractor = new WikipediaFirstSentenceExtractor(numThreads, numPages, extractorParameters.getLocale());
            wikipediaFirstSentenceExtractor.setNotificationPoint(notificationPoint);
            wikipediaFirstSentenceExtractor.start(extractorParameters);
            logger.info("extraction ended " + new Date());
        } catch (ParseException e) {
            System.out.println("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaFirstSentenceExtractor", "\n", options, "\n", true);
        }
    }
}
