package org.fbk.cit.hlt.thewikimachine.xmldump;

import de.tudarmstadt.ukp.wikipedia.parser.Link;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.parser.Section;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;
import org.fbk.cit.hlt.thewikimachine.analysis.Tokenizer;
import org.fbk.cit.hlt.thewikimachine.util.StringTable;
import org.fbk.cit.hlt.thewikimachine.util.SynchronizedCounter;
import org.fbk.cit.hlt.thewikimachine.util.SynchronizedIndexer;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.PageMap;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.PageSet;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.PageTypeExtractor;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageLink;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ParsedPageTitle;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.PersonInfoMap;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.ReversePageMap;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.WikiMarkupParser;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.WikiTemplate;
import org.fbk.cit.hlt.thewikimachine.xmldump.util.WikiTemplateParser;
import org.xerial.snappy.SnappyOutputStream;

/* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/WikipediaExampleExtractor.class */
public class WikipediaExampleExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
    // Column positions within one tab-separated example record.
    // NOTE(review): presumably these describe the records produced by Example.toString(int) and are
    // consumed by readers of the example file — confirm against those readers.
    public static final int ID_FORM_INDEX = 0;
    public static final int ID_PAGE_INDEX = 1;
    public static final int FORM_INDEX = 2;
    public static final int PAGE_INDEX = 3;
    public static final int SOURCE_INDEX = 4;
    public static final int TYPE_INDEX = 5;
    public static final int ID_INDEX = 6;
    public static final int LEFT_CONTEXT_INDEX = 7;
    public static final int RIGHT_CONTEXT_INDEX = 8;
    // Number of columns in a record.
    public static final int COLUMN_NUMBER = 9;
    // Output writers; all four are opened in start() and closed in endProcess().
    private PrintWriter exampleWriter;
    private PrintWriter pageCounterWriter;
    private PrintWriter formCounterWriter;
    private PrintWriter formIdWriter;
    // Lookup tables loaded from files in start().
    private PageMap redirectPageMap;
    private PageSet disambiguationPageSet;
    // Also used by Example.toString(int) to look up the value written in the second column.
    private PageMap contentPageMap;
    // Built from the same redirect file as redirectPageMap.
    private ReversePageMap reverseRedirectPageMap;
    private PersonInfoMap personInformationMap;
    private WikiMarkupParser wikiMarkupParser;
    private Tokenizer tokenizer;
    // Compiled from the SECTION_TITLE_SKIP_PATTERN resource (case-insensitive) in the constructor.
    private Pattern sectionTitleSkipPattern;
    // Both counters are incremented by the Example constructor.
    private SynchronizedCounter<String> formCounter;
    private SynchronizedCounter<String> pageCounter;
    // Supplies the numeric id written in the first column for each tokenized form.
    private SynchronizedIndexer<String> formIndexer;
    // Cap consulted by ExampleBuilder when adding examples; set to 1000 in the constructor.
    private int maximumNumberOfExamplesPerPage;
    static Logger logger = Logger.getLogger(WikipediaExampleExtractor.class.getName());
    // Running id handed to each example that is written out; static, so shared by all instances.
    private static AtomicInteger exampleCounter = new AtomicInteger();

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/WikipediaExampleExtractor$Example.class */
    /**
     * A single example: a surface form pointing to a page, the page it was extracted from,
     * the text on either side of the form, and a code telling where the example came from.
     */
    public class Example {
        // Source-type codes. Derived examples (redirect, nominal, person) append their own code to
        // the type of the example they were derived from.
        public static final String CONTENT_FROM_PERSON_INFORMATION = "I";
        public static final String CONTENT_FROM_REDIRECTION_PAGE = "R";
        public static final String CONTENT_FROM_LINK = "L";
        public static final String CONTENT_FROM_PAGE = "P";
        public static final String CONTENT_FROM_CATEGORY = "C";
        public static final String CONTENT_FROM_SECTION_TITLE = "S";
        public static final String CONTENT_FROM_NOMINAL = "N";
        public static final String CONTENT_FROM_TEXT = "T";
        public static final String CONTENT_FROM_TITLE_SUFFIX = "U";
        private String type;
        private String source;
        private String page;
        private String leftContext;
        private String form;
        private String rightContext;

        /**
         * Creates an example and, as a side effect, counts one more occurrence of the form and of
         * the page in the enclosing extractor's counters (whether or not the example is kept).
         *
         * @param str  surface form
         * @param str2 page the form refers to
         * @param str3 page the example was extracted from
         * @param str4 left context
         * @param str5 right context
         * @param str6 source-type code
         */
        Example(String str, String str2, String str3, String str4, String str5, String str6) {
            WikipediaExampleExtractor.this.formCounter.add(str);
            WikipediaExampleExtractor.this.pageCounter.add(str2);
            this.form = str;
            this.source = str3;
            this.page = str2;
            this.leftContext = str4;
            this.rightContext = str5;
            this.type = str6;
        }

        public String getType() {
            return this.type;
        }

        public void setType(String str) {
            this.type = str;
        }

        public String getSource() {
            return this.source;
        }

        public void setSource(String str) {
            this.source = str;
        }

        public String getPage() {
            return this.page;
        }

        public void setPage(String str) {
            this.page = str;
        }

        public String getLeftContext() {
            return this.leftContext;
        }

        public void setLeftContext(String str) {
            this.leftContext = str;
        }

        public String getForm() {
            return this.form;
        }

        public void setForm(String str) {
            this.form = str;
        }

        public String getRightContext() {
            return this.rightContext;
        }

        public void setRightContext(String str) {
            this.rightContext = str;
        }

        /**
         * Tells whether this example is unusable.
         *
         * @return true when form, page or source is null or empty, when either context is null,
         *         or when both contexts are empty
         */
        public boolean isEmpty() {
            if (this.form == null || this.form.length() == 0 || this.page == null || this.page.length() == 0 || this.source == null || this.source.length() == 0 || this.leftContext == null || this.rightContext == null) {
                return true;
            }
            return this.leftContext.length() == 0 && this.rightContext.length() == 0;
        }

        /** Formats the record with 0 as its running id. */
        @Override
        public String toString() {
            return toString(0);
        }

        /**
         * Formats this example as one tab-separated record whose columns follow the
         * {@code *_INDEX} constants of the enclosing class: form id, page id, form, page, source,
         * type, id, left context, right context. Form and contexts are written tokenized.
         *
         * @param i running id of the example
         * @return the record, without a trailing line separator
         */
        public String toString(int i) {
            StringBuilder sb = new StringBuilder();
            String str = WikipediaExampleExtractor.this.tokenizer.tokenizedString(this.form);
            int i2 = WikipediaExampleExtractor.this.formIndexer.get(str);
            String str2 = WikipediaExampleExtractor.this.contentPageMap.get(this.page);
            sb.append(i2);          // ID_FORM_INDEX
            sb.append('\t');
            sb.append(str2);        // ID_PAGE_INDEX
            sb.append('\t');
            sb.append(str);         // FORM_INDEX
            sb.append('\t');
            sb.append(this.page);   // PAGE_INDEX
            sb.append('\t');
            sb.append(this.source); // SOURCE_INDEX
            sb.append('\t');
            // The type goes before the id so that the columns agree with TYPE_INDEX (5) and
            // ID_INDEX (6); the two were previously written the other way round.
            sb.append(this.type);   // TYPE_INDEX
            sb.append('\t');
            sb.append(i);           // ID_INDEX
            sb.append('\t');
            sb.append(WikipediaExampleExtractor.this.tokenizer.tokenizedString(this.leftContext));
            sb.append('\t');
            sb.append(WikipediaExampleExtractor.this.tokenizer.tokenizedString(this.rightContext));
            return sb.toString();
        }
    }

    /* loaded from: input_file:org/fbk/cit/hlt/thewikimachine/xmldump/WikipediaExampleExtractor$ExampleBuilder.class */
    class ExampleBuilder {
        /** Context used when an example has no text on one side. */
        public static final String EMPTY_CONTEXT = "";
        /** Separator placed between the synthetic prefix and the real left context of a link. */
        public static final String END_OF_SENTENCE = ". ";
        /** Examples collected for the page being processed (diamond instead of the raw type). */
        List<Example> exampleList = new ArrayList<>();
        /** Result of PageTypeExtractor.isNominal() for the page; enables lower-cased variants. */
        boolean nominal;

        /**
         * Parses a page and collects its examples into {@link #exampleList}.
         *
         * @param str  text of the page
         * @param str2 title of the page
         * @param z    when true only the link examples are collected
         * @throws IOException declared by the constructor; propagated from the calls it makes
         */
        ExampleBuilder(String str, String str2, boolean z) throws IOException {
            String[] prefixes = {filePrefix, imagePrefix};
            ParsedPage page = wikiMarkupParser.parsePage(str, prefixes);
            ParsedPageTitle title = new ParsedPageTitle(str2);
            PageTypeExtractor typeExtractor = new PageTypeExtractor(str, title.getForm());
            nominal = typeExtractor.isNominal();
            if (z) {
                addLinkExamples(page, title);
                return;
            }
            addPageExamples(page, title);
            addTextExample(page, title);
            addCategoryExamples(str, title);
            addSectionTitleExamples(page, title);
            addLinkExamples(page, title);
        }

        /** Returns the list the examples were collected into (the internal list, not a copy). */
        public List<Example> getExampleList() {
            return exampleList;
        }

        /**
         * Keeps the example unless the page counter for its page has already gone past
         * maximumNumberOfExamplesPerPage.
         */
        private void addExample(Example example) {
            if (pageCounter.get(example.getPage()) > maximumNumberOfExamplesPerPage) {
                return;
            }
            exampleList.add(example);
        }

        /**
         * For a title that has a suffix, adds an example whose right context is that suffix,
         * together with its nominal, surname and redirect variants.
         */
        private void addSuffixExample(ParsedPageTitle parsedPageTitle) {
            if (!parsedPageTitle.hasSuffix()) {
                return;
            }
            String form = parsedPageTitle.getForm();
            String page = parsedPageTitle.getPage();
            String source = parsedPageTitle.getPage();
            Example suffixExample = new Example(form, page, source, "", parsedPageTitle.getSuffix(), Example.CONTENT_FROM_TITLE_SUFFIX);
            addExample(suffixExample);
            addNominalVariantExample(suffixExample);
            addPersonSurnameExample(suffixExample);
            addRedirectLinkExamples(suffixExample);
        }

        /**
         * Adds one example per section title of the page, using the title as right context.
         * Titles that are null or hit the section-title skip pattern are ignored, as are
         * disambiguation pages and non-compliant page titles.
         *
         * @param parsedPage      parsed page whose sections are scanned
         * @param parsedPageTitle title of the page being processed
         */
        private void addSectionTitleExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
            for (Section section : parsedPage.getSections()) {
                try {
                    String title = section.getTitle();
                    // find() rather than matches(): buildLeftContext applies the same pattern with
                    // find(), and the two must agree on which section titles are skipped.
                    if (title != null && !WikipediaExampleExtractor.this.sectionTitleSkipPattern.matcher(title).find() && !WikipediaExampleExtractor.this.disambiguationPageSet.contains(parsedPageTitle.getPage()) && parsedPageTitle.isCompliant()) {
                        Example example = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), "", title, Example.CONTENT_FROM_SECTION_TITLE);
                        addExample(example);
                        addNominalVariantExample(example);
                        addPersonSurnameExample(example);
                        addRedirectLinkExamples(example);
                    }
                } catch (Exception e) {
                    // One bad section must not stop the remaining sections from being processed.
                    WikipediaExampleExtractor.logger.error("Exception adding section examples for page " + parsedPageTitle.getPage() + ParsedPageLink.START_SUFFIX_PATTERN + WikipediaExampleExtractor.exampleCounter.intValue() + ")\n" + e);
                }
            }
        }

        /**
         * Parses the templates of the page and writes them to the debug log; no example is added.
         */
        private void addTemplateExamples(String str, ParsedPageTitle parsedPageTitle) {
            List<WikiTemplate> templates = WikiTemplateParser.parse(str, false);
            logger.debug(parsedPageTitle.getPage());
            for (WikiTemplate template : templates) {
                logger.debug(template.getHashMapOfParts());
            }
        }

        /**
         * Adds an example whose right context is the whole text of the page, plus its nominal,
         * surname and redirect variants. The example goes straight into the list, i.e. the
         * per-page cap applied by addExample is not consulted here.
         */
        private void addTextExample(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
            try {
                if (!parsedPageTitle.isCompliant()) {
                    return;
                }
                String form = parsedPageTitle.getForm();
                String page = parsedPageTitle.getPage();
                String source = parsedPageTitle.getPage();
                Example textExample = new Example(form, page, source, "", parsedPage.getText(), Example.CONTENT_FROM_TEXT);
                exampleList.add(textExample);
                addNominalVariantExample(textExample);
                addPersonSurnameExample(textExample);
                addRedirectLinkExamples(textExample);
            } catch (Exception e) {
                logger.error("Exception adding text examples for page " + parsedPageTitle.getPage() + ParsedPageLink.START_SUFFIX_PATTERN + exampleCounter.intValue() + ")\n" + e);
            }
        }

        /**
         * Adds one example per category reference found in the page text, using the category
         * name (cut at the first '|') as right context. Categories equal to the page form,
         * disambiguation pages and non-compliant titles are skipped. Examples go straight into
         * the list, i.e. the per-page cap applied by addExample is not consulted here.
         */
        private void addCategoryExamples(String str, ParsedPageTitle parsedPageTitle) {
            Matcher categoryMatcher = categoryPattern.matcher(str);
            while (categoryMatcher.find()) {
                try {
                    String category = str.substring(categoryMatcher.start(2), categoryMatcher.end(2));
                    int pipe = category.indexOf('|');
                    if (pipe != -1) {
                        category = category.substring(0, pipe);
                    }
                    if (category.equals(parsedPageTitle.getForm())) {
                        continue;
                    }
                    if (disambiguationPageSet.contains(parsedPageTitle.getPage())) {
                        continue;
                    }
                    if (!parsedPageTitle.isCompliant()) {
                        continue;
                    }
                    Example categoryExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), "", category, Example.CONTENT_FROM_CATEGORY);
                    exampleList.add(categoryExample);
                    addNominalVariantExample(categoryExample);
                    addPersonSurnameExample(categoryExample);
                    addRedirectLinkExamples(categoryExample);
                } catch (Exception e) {
                    logger.error("Exception adding category examples for page " + parsedPageTitle.getPage() + ParsedPageLink.START_SUFFIX_PATTERN + exampleCounter.intValue() + ")\n" + e);
                }
            }
        }

        /**
         * Adds one example per compliant internal link of every section. The link target is
         * first mapped through the redirect table (at most two hops); links whose final target
         * is not a content page, or is a disambiguation page, are dropped.
         */
        private void addLinkExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
            for (Section section : parsedPage.getSections()) {
                List<Link> internalLinks = section.getLinks(Link.type.INTERNAL);
                String sectionTitle = section.getTitle();
                for (Link link : internalLinks) {
                    try {
                        ParsedPageLink parsedLink = new ParsedPageLink(link);
                        if (!parsedLink.isCompliant()) {
                            continue;
                        }
                        String firstHop = redirectPageMap.get(parsedLink.getPage());
                        if (firstHop != null) {
                            String secondHop = redirectPageMap.get(firstHop);
                            if (secondHop == null) {
                                parsedLink.setPage(firstHop);
                            } else {
                                // Redirect to a redirect: report it and follow it once more.
                                logger.warn(parsedLink.getPage() + " -> " + firstHop + " -> " + secondHop);
                                parsedLink.setPage(secondHop);
                            }
                        }
                        if (contentPageMap.get(parsedLink.getPage()) == null) {
                            continue;
                        }
                        if (disambiguationPageSet.contains(parsedLink.getPage())) {
                            continue;
                        }
                        Example linkExample = new Example(
                                parsedLink.getForm(),
                                parsedLink.getPage(),
                                parsedPageTitle.getPage(),
                                buildLeftContext(parsedLink, parsedPageTitle, new ParsedPageTitle(parsedLink.getPage()), sectionTitle),
                                parsedLink.getRightContext(),
                                Example.CONTENT_FROM_LINK);
                        addExample(linkExample);
                        addPersonSurnameExample(linkExample);
                    } catch (Exception e) {
                        logger.error("Exception adding link examples for page " + parsedPageTitle.getPage() + ParsedPageLink.START_SUFFIX_PATTERN + exampleCounter.intValue() + ")\n" + e);
                    }
                }
            }
        }

        /**
         * Builds the left context of a link example: the form of the containing page, the
         * section title (unless null or hit by the skip pattern), the form and suffix of the
         * link target when the anchor differs from the target form, then ". " and the text to
         * the left of the link.
         *
         * @param parsedPageLink   the link being turned into an example
         * @param parsedPageTitle  title of the page that contains the link
         * @param parsedPageTitle2 not read by this method; the target title is re-parsed from the link
         * @param str              title of the section that contains the link, may be null
         * @return the assembled left context
         */
        private String buildLeftContext(ParsedPageLink parsedPageLink, ParsedPageTitle parsedPageTitle, ParsedPageTitle parsedPageTitle2, String str) {
            StringBuilder context = new StringBuilder();
            context.append(parsedPageTitle.getForm()).append(' ');
            boolean keepSectionTitle = str != null && !sectionTitleSkipPattern.matcher(str).find();
            if (keepSectionTitle) {
                context.append(str).append(' ');
            }
            ParsedPageTitle target = new ParsedPageTitle(parsedPageLink.getPage());
            boolean anchorIsTargetForm = parsedPageLink.getForm().equals(target.getForm());
            if (!anchorIsTargetForm) {
                context.append(target.getForm()).append(' ');
                if (target.hasSuffix()) {
                    context.append(target.getSuffix()).append(' ');
                }
            }
            context.append(END_OF_SENTENCE).append(parsedPageLink.getLeftContext());
            return context.toString();
        }

        /**
         * Tells whether the first character of the string is lower case. Throws on an empty
         * string, as the character at index 0 is read unconditionally.
         */
        private boolean isLowerCase(String str) {
            char first = str.charAt(0);
            return Character.isLowerCase(first);
        }

        /**
         * Adds variants of an example whose form is taken from the titles that redirect to the
         * example's page. Each distinct form is used once; the form of the given example itself
         * is never repeated. When the given example's form starts with a lower-case letter the
         * redirect form is lower-cased as well.
         *
         * @param example example to derive the redirect variants from
         */
        private void addRedirectLinkExamples(Example example) {
            Set<String> redirects = WikipediaExampleExtractor.this.reverseRedirectPageMap.get(example.getPage());
            if (redirects == null) {
                return;
            }
            Set<String> seenForms = new HashSet<>();
            seenForms.add(example.getForm());
            for (String redirect : redirects) {
                try {
                    ParsedPageTitle redirectTitle = new ParsedPageTitle(redirect);
                    if (!redirectTitle.isCompliant()) {
                        continue;
                    }
                    String form = redirectTitle.getForm();
                    if (isLowerCase(example.getForm())) {
                        // Strings are immutable: the lower-cased copy has to be assigned back,
                        // otherwise the call has no effect.
                        form = form.toLowerCase();
                    }
                    if (!seenForms.contains(form)) {
                        Example variant = new Example(form, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), example.getType() + Example.CONTENT_FROM_REDIRECTION_PAGE);
                        addExample(variant);
                        seenForms.add(form);
                        addNominalVariantExample(variant);
                    }
                } catch (Exception e) {
                    // One bad redirect title must not stop the remaining ones from being processed.
                    WikipediaExampleExtractor.logger.error("Exception adding redirect link examples (" + WikipediaExampleExtractor.exampleCounter.intValue() + ")\n" + e);
                }
            }
        }

        /**
         * For a page that is not a disambiguation page and has a compliant title, adds an
         * example with the title suffix as left context and the text of the first section as
         * right context, plus its nominal, surname and redirect-page variants.
         */
        private void addPageExamples(ParsedPage parsedPage, ParsedPageTitle parsedPageTitle) {
            try {
                if (disambiguationPageSet.contains(parsedPageTitle.getPage())) {
                    return;
                }
                Section firstSection = parsedPage.getSection(0);
                String rightContext = "";
                if (firstSection != null) {
                    rightContext = firstSection.getText();
                }
                String leftContext = "";
                if (parsedPageTitle.hasSuffix()) {
                    leftContext = parsedPageTitle.getSuffix();
                }
                if (!parsedPageTitle.isCompliant()) {
                    return;
                }
                Example pageExample = new Example(parsedPageTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), leftContext, rightContext, Example.CONTENT_FROM_PAGE);
                addExample(pageExample);
                addNominalVariantExample(pageExample);
                addPersonSurnameExample(pageExample);
                addRedirectPageExamples(pageExample, parsedPageTitle);
            } catch (Exception e) {
                logger.error("Exception adding page examples for page " + parsedPageTitle.getPage() + ParsedPageLink.START_SUFFIX_PATTERN + exampleCounter.intValue() + ")\n" + e);
            }
        }

        /**
         * Adds one variant of a page example for every compliant title that redirects to the
         * page. The variant uses the redirect's form, the redirect's own suffix (or the empty
         * context when it has none) as left context, and the right context of the given example.
         *
         * @param example         page example the variants are derived from
         * @param parsedPageTitle title of the page being processed
         */
        private void addRedirectPageExamples(Example example, ParsedPageTitle parsedPageTitle) {
            Set<String> redirects = WikipediaExampleExtractor.this.reverseRedirectPageMap.get(parsedPageTitle.getPage());
            if (redirects == null) {
                return;
            }
            for (String redirect : redirects) {
                try {
                    ParsedPageTitle redirectTitle = new ParsedPageTitle(redirect);
                    // Computed afresh for every redirect: a suffix must not carry over from a
                    // previously visited redirect to one that has no suffix of its own.
                    String suffix = redirectTitle.hasSuffix() ? redirectTitle.getSuffix() : EMPTY_CONTEXT;
                    if (redirectTitle.isCompliant()) {
                        Example variant = new Example(redirectTitle.getForm(), parsedPageTitle.getPage(), parsedPageTitle.getPage(), suffix, example.getRightContext(), example.getType() + Example.CONTENT_FROM_REDIRECTION_PAGE);
                        addExample(variant);
                        addNominalVariantExample(variant);
                    }
                } catch (Exception e) {
                    // One bad redirect title must not stop the remaining ones from being processed.
                    WikipediaExampleExtractor.logger.error("Exception adding redirect page examples (" + WikipediaExampleExtractor.exampleCounter.intValue() + ")\n" + e);
                }
            }
        }

        /**
         * When person information exists for the example's page and carries a non-empty
         * surname, adds a copy of the example whose form is that surname. The copy is kept only
         * while the form counter for the surname is within maximumNumberOfExamplesPerPage.
         */
        private void addPersonSurnameExample(Example example) {
            PersonInfoMap.Person person = personInformationMap.get(example.getPage());
            if (person == null) {
                return;
            }
            try {
                String surname = person.getSurname();
                if (surname.length() <= 0) {
                    return;
                }
                String derivedType = example.getType() + Example.CONTENT_FROM_PERSON_INFORMATION;
                Example surnameExample = new Example(surname, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), derivedType);
                if (formCounter.get(surname) <= maximumNumberOfExamplesPerPage) {
                    exampleList.add(surnameExample);
                }
            } catch (Exception e) {
                logger.error("Exception adding person info examples (" + exampleCounter.intValue() + ")\n" + e);
            }
        }

        /**
         * On pages flagged as nominal, adds a copy of the example with a lower-cased form. The
         * copy is kept only while the form counter for that form is within
         * maximumNumberOfExamplesPerPage.
         */
        private void addNominalVariantExample(Example example) {
            if (!nominal) {
                return;
            }
            String lowerForm = example.getForm().toLowerCase();
            String derivedType = example.getType() + Example.CONTENT_FROM_NOMINAL;
            Example nominalExample = new Example(lowerForm, example.getPage(), example.getSource(), example.getLeftContext(), example.getRightContext(), derivedType);
            if (formCounter.get(lowerForm) <= maximumNumberOfExamplesPerPage) {
                exampleList.add(nominalExample);
            }
        }
    }

    public WikipediaExampleExtractor(int i, int i2, Locale locale) throws IOException {
        super(i, i2, locale);
        if (this.resources.getString("SECTION_TITLE_SKIP_PATTERN") != null) {
            this.sectionTitleSkipPattern = Pattern.compile(this.resources.getString("SECTION_TITLE_SKIP_PATTERN"), 2);
        }
        this.tokenizer = HardTokenizer.getInstance();
        this.wikiMarkupParser = WikiMarkupParser.getInstance();
        this.maximumNumberOfExamplesPerPage = 1000;
    }

    /** Returns the cap consulted when examples are added for a page or form. */
    public int getMaximumNumberOfExamplesPerPage() {
        return maximumNumberOfExamplesPerPage;
    }

    /** Replaces the cap consulted when examples are added for a page or form. */
    public void setMaximumNumberOfExamplesPerPage(int i) {
        maximumNumberOfExamplesPerPage = i;
    }

    /**
     * Loads the lookup tables, opens the four output writers, creates the counters and the
     * form indexer, then starts processing the XML dump.
     *
     * <p>If any table or output file cannot be opened the error is logged and the dump is
     * not processed: running on with missing tables and writers could only fail page by page.
     *
     * @param extractorParameters supplies the names of all input and output files
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void start(ExtractorParameters extractorParameters) {
        try {
            this.redirectPageMap = new PageMap(new File(extractorParameters.getWikipediaRedirFileName()));
            logger.info(this.redirectPageMap.size() + " redirect pages");
            this.reverseRedirectPageMap = new ReversePageMap(new File(extractorParameters.getWikipediaRedirFileName()));
            logger.info(this.reverseRedirectPageMap.size() + " reverse redirect pages");
            this.disambiguationPageSet = new PageSet(new File(extractorParameters.getWikipediaDisambiguationFileName()));
            logger.info(this.disambiguationPageSet.size() + " disambiguation pages");
            this.contentPageMap = new PageMap(new File(extractorParameters.getWikipediaContentPageFileName()));
            logger.info(this.contentPageMap.size() + " content pages");
            this.personInformationMap = new PersonInfoMap(new File(extractorParameters.getWikipediaPersonInfoFileName()));
            logger.info(this.personInformationMap.size() + " person information");
            logger.info("example file: " + extractorParameters.getWikipediaExampleFileName());
            if (isCompress()) {
                // Only the example file is written through the Snappy stream.
                logger.info(extractorParameters.getWikipediaExampleFileName() + " is compressed");
                this.exampleWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new SnappyOutputStream(new FileOutputStream(extractorParameters.getWikipediaExampleFileName())), "UTF-8")));
            } else {
                this.exampleWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaExampleFileName()), "UTF-8")));
            }
            logger.info("form/freq file: " + extractorParameters.getWikipediaFormFreqFileName());
            this.formCounterWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaFormFreqFileName()), "UTF-8")));
            logger.info("page/freq file: " + extractorParameters.getWikipediaPageFreqFileName());
            this.pageCounterWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaPageFreqFileName()), "UTF-8")));
            logger.info("form/index file: " + extractorParameters.getWikipediaFormIdFileName());
            this.formIdWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(extractorParameters.getWikipediaFormIdFileName()), "UTF-8")));
            this.formIndexer = new SynchronizedIndexer<>();
            this.formCounter = new SynchronizedCounter<>();
            this.pageCounter = new SynchronizedCounter<>();
        } catch (IOException e) {
            // Pass the throwable so the stack trace is logged, and stop here instead of
            // parsing the dump with half-initialised state.
            logger.error("Unable to initialise the example extractor, the dump is not processed", e);
            return;
        }
        startProcess(extractorParameters.getWikipediaXmlFileName());
    }

    /** File pages produce no examples. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void filePage(String str, String str2, int i) {
        // intentionally empty
    }

    /**
     * Extracts the link examples of a disambiguation page and appends them to the example file.
     * Every non-empty example receives the next value of the shared example counter as its id.
     *
     * @param str  text of the page
     * @param str2 title of the page
     * @param i    number of the page, used only in the error message
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void disambiguationPage(String str, String str2, int i) {
        try {
            List<Example> exampleList = new ExampleBuilder(str, str2, true).getExampleList();
            StringBuilder sb = new StringBuilder();
            for (Example example : exampleList) {
                if (!example.isEmpty()) {
                    sb.append(example.toString(exampleCounter.incrementAndGet()));
                    sb.append('\n');
                }
            }
            // The records of one page are written in a single call under the lock.
            synchronized (this) {
                this.exampleWriter.print(sb.toString());
            }
        } catch (Exception e) {
            // Same page context as contentPage reports, plus the stack trace.
            logger.error("Error at page " + str2 + ParsedPageLink.START_SUFFIX_PATTERN + i + ")", e);
        }
    }

    /**
     * Logs one tab-separated progress line, preceded by a header line the first time it is
     * called. The values appear in the order of the header columns: total, content, redirect,
     * disambiguation, category, page, form, time, date.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaExtractor, org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void printLog() {
        if (this.printHeader) {
            logger.info("total\tcontent\tredirect\tdisambiguation\tcategory\tpage\tform\ttime\tdate");
            this.printHeader = false;
        }
        // Exactly one separator between values: a doubled tab between the page and form counts
        // used to shift the last three values one column to the right of their headers.
        logger.info(this.decimalFormat.format(this.generalCount.intValue())
                + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.countPageCounter)
                + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.redirectPageCounter)
                + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.disambiguationPageCounter)
                + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.categoryPageCounter)
                + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.pageCounter.size())
                + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.formCounter.size())
                + StringTable.HORIZONTAL_TABULATION + this.decimalFormat.format(this.genEnd.longValue() - this.genBegin.longValue())
                + StringTable.HORIZONTAL_TABULATION + new Date());
    }

    /**
     * Extracts every kind of example from a content page and appends them to the example file.
     * Every non-empty example receives the next value of the shared example counter as its id.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void contentPage(String str, String str2, int i) {
        try {
            StringBuilder records = new StringBuilder();
            for (Example example : new ExampleBuilder(str, str2, false).getExampleList()) {
                if (example.isEmpty()) {
                    continue;
                }
                records.append(example.toString(exampleCounter.incrementAndGet())).append('\n');
            }
            String chunk = records.toString();
            synchronized (this) {
                this.exampleWriter.print(chunk);
            }
        } catch (Exception e) {
            logger.error("Error at page " + str2 + ParsedPageLink.START_SUFFIX_PATTERN + i + ")");
            logger.error(e);
        }
    }

    /** Category pages produce no examples. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void categoryPage(String str, String str2, int i) {
        // intentionally empty
    }

    /** Template pages produce no examples. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void templatePage(String str, String str2, int i) {
        // intentionally empty
    }

    /** Redirect pages produce no examples. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void redirectPage(String str, String str2, int i) {
        // intentionally empty
    }

    /** Portal pages produce no examples. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void portalPage(String str, String str2, int i) {
        // intentionally empty
    }

    /** Project pages produce no examples. */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExtractor
    public void projectPage(String str, String str2, int i) {
        // intentionally empty
    }

    /**
     * Finishes the run: after the superclass has finished, writes the form counter, the page
     * counter and the form index to their files and closes all four writers.
     *
     * <p>Each writer is closed in a {@code finally} block, so a failure while writing one file
     * neither leaves that file open nor prevents the example file from being closed.
     */
    @Override // org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaExtractor, org.fbk.cit.hlt.thewikimachine.xmldump.AbstractWikipediaXmlDumpParser
    public void endProcess() {
        super.endProcess();
        try {
            try {
                logger.info("writing " + this.decimalFormat.format(this.formCounter.size()) + " forms (counter)...");
                writeFormCounter();
            } catch (IOException e) {
                logger.error("Error writing the form counter", e);
            } finally {
                // Writers are null when start() failed before opening them.
                if (this.formCounterWriter != null) {
                    this.formCounterWriter.close();
                }
            }
            try {
                logger.info("writing " + this.decimalFormat.format(this.pageCounter.size()) + " pages (counter)...");
                this.pageCounter.write(this.pageCounterWriter);
            } catch (IOException e2) {
                logger.error("Error writing the page counter", e2);
            } finally {
                if (this.pageCounterWriter != null) {
                    this.pageCounterWriter.close();
                }
            }
            try {
                logger.info("writing " + this.decimalFormat.format(this.formIndexer.size()) + " forms (indexer)...");
                this.formIndexer.write(this.formIdWriter);
            } catch (IOException e3) {
                logger.error("Error writing the form index", e3);
            } finally {
                if (this.formIdWriter != null) {
                    this.formIdWriter.close();
                }
            }
        } finally {
            logger.debug("closing the output stream...");
            if (this.exampleWriter != null) {
                this.exampleWriter.close();
            }
        }
    }

    /**
     * Writes the form counter to the form/frequency writer, one line per form: the count, a
     * tab, and the tokenized form. Lines follow the iteration order of the counter's sorted map.
     *
     * @throws IOException declared by the method; propagated from the calls it makes
     */
    public void writeFormCounter() throws IOException {
        SortedMap<AtomicInteger, List<String>> formsByCount = this.formCounter.getSortedMap();
        for (AtomicInteger count : formsByCount.keySet()) {
            for (String form : formsByCount.get(count)) {
                this.formCounterWriter.print(count.toString());
                this.formCounterWriter.print('\t');
                // toString() is kept so a null entry still fails here, as it did before.
                this.formCounterWriter.println(this.tokenizer.tokenizedString(form.toString()));
            }
        }
    }

    /**
     * Command-line entry point for the example extractor.
     *
     * Configures log4j from the file named by the {@code log-config} system property
     * (default {@code configuration/log-config.txt}), parses the command line, makes
     * sure the extraction output directory exists and runs a
     * {@code WikipediaExampleExtractor} over the given dump.
     *
     * Recognised options:
     * <ul>
     *   <li>{@code -d, --wikipedia-dump <file>} (required) wikipedia xml dump file</li>
     *   <li>{@code -o, --output-dir <dir>} (required) output directory</li>
     *   <li>{@code -t, --num-threads <int>} number of threads (default 1)</li>
     *   <li>{@code -p, --num-pages <int>} number of pages to process (default all)</li>
     *   <li>{@code -n, --notification-point <int>} notification interval (default 10000)</li>
     *   <li>{@code -m, --max-freq <max-freq>} maximum frequency (default 1000)</li>
     *   <li>{@code --base-dir} use the output folder as base dir</li>
     *   <li>{@code -h, --help} and {@code -v, --version}</li>
     * </ul>
     *
     * Any command-line error, including a non-numeric value for a numeric option,
     * is reported on standard output followed by the usage message. All numeric
     * options are validated before the output directory is created.
     *
     * NOTE(review): {@code -h} and {@code -v} are registered but never inspected
     * here; usage is only printed when parsing fails.
     *
     * @param strArr the command-line arguments
     * @throws IOException declared by the original signature; presumably raised by the
     *                     extraction itself — confirm against {@code start}
     */
    public static void main(String[] strArr) throws IOException {
        String logConfig = System.getProperty("log-config");
        if (logConfig == null) {
            logConfig = "configuration/log-config.txt";
        }
        PropertyConfigurator.configure(logConfig);

        Options options = createOptions();
        try {
            CommandLine commandLine = new PosixParser().parse(options, strArr);

            // Validate every numeric option before touching the file system.
            int numThreads = parseIntOption(commandLine, "num-threads", 1);
            int numPages = parseIntOption(commandLine, "num-pages", Integer.MAX_VALUE);
            int notificationPoint = parseIntOption(commandLine, "notification-point", 10000);
            int maxFreq = parseIntOption(commandLine, "max-freq", 1000);

            String dumpFileName = commandLine.getOptionValue("wikipedia-dump");
            String outputDirName = commandLine.getOptionValue("output-dir");
            ExtractorParameters extractorParameters;
            if (commandLine.hasOption("base-dir")) {
                extractorParameters = new ExtractorParameters(dumpFileName, outputDirName, true);
            } else {
                extractorParameters = new ExtractorParameters(dumpFileName, outputDirName);
            }

            File outputDir = new File(extractorParameters.getExtractionOutputDirName());
            if (outputDir.mkdirs()) {
                logger.info(outputDir + " created");
            }
            logger.debug(extractorParameters);
            logger.debug("filtering examples with frequency higher than " + maxFreq + "...");
            logger.debug("extracting examples (" + extractorParameters.getWikipediaExampleFileName() + ")...");

            WikipediaExampleExtractor extractor = new WikipediaExampleExtractor(numThreads, numPages, extractorParameters.getLocale());
            extractor.setNotificationPoint(notificationPoint);
            extractor.setMaximumNumberOfExamplesPerPage(maxFreq);
            extractor.start(extractorParameters);
            logger.info("extraction ended " + new Date());
        } catch (ParseException e) {
            System.out.println("Parsing failed: " + e.getMessage() + "\n");
            new HelpFormatter().printHelp(400, "java -cp dist/thewikimachine.jar org.fbk.cit.hlt.thewikimachine.xmldump.WikipediaExampleExtractor", "\n", options, "\n", true);
        }
    }

    /** Builds the set of command-line options understood by {@link #main(String[])}. */
    private static Options createOptions() {
        Options options = new Options();

        OptionBuilder.withArgName("file");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("wikipedia xml dump file");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("wikipedia-dump");
        Option dumpOption = OptionBuilder.create("d");

        OptionBuilder.withArgName("dir");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("output directory in which to store output files");
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("output-dir");
        Option outputDirOption = OptionBuilder.create("o");

        OptionBuilder.withArgName("int");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("number of threads (default 1)");
        OptionBuilder.withLongOpt("num-threads");
        Option numThreadsOption = OptionBuilder.create("t");

        OptionBuilder.withArgName("int");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("number of pages to process (default all)");
        OptionBuilder.withLongOpt("num-pages");
        Option numPagesOption = OptionBuilder.create("p");

        OptionBuilder.withArgName("int");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("receive notification every n pages (default 10000)");
        OptionBuilder.withLongOpt("notification-point");
        Option notificationPointOption = OptionBuilder.create("n");

        OptionBuilder.withArgName("max-freq");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("maximum frequency of wanted forms (default is 1000)");
        OptionBuilder.withLongOpt("max-freq");
        Option maxFreqOption = OptionBuilder.create("m");

        options.addOption("h", "help", false, "print this message");
        options.addOption("v", "version", false, "output version information and exit");

        // Long-only flag: no short name is passed to create().
        OptionBuilder.withDescription("if set, use the output folder as base dir");
        OptionBuilder.withLongOpt("base-dir");
        Option baseDirOption = OptionBuilder.create();

        options.addOption(dumpOption);
        options.addOption(outputDirOption);
        options.addOption(numThreadsOption);
        options.addOption(numPagesOption);
        options.addOption(notificationPointOption);
        options.addOption(maxFreqOption);
        options.addOption(baseDirOption);
        return options;
    }

    /**
     * Reads an integer-valued option, returning {@code defaultValue} when it is absent.
     *
     * @throws ParseException if the option is present but its value is not a valid integer
     */
    private static int parseIntOption(CommandLine commandLine, String longOpt, int defaultValue) throws ParseException {
        if (!commandLine.hasOption(longOpt)) {
            return defaultValue;
        }
        String value = commandLine.getOptionValue(longOpt);
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            // Report through ParseException so the caller prints the usage message.
            throw new ParseException("invalid integer value '" + value + "' for option --" + longOpt);
        }
    }
}
