package net.sf.okapi.steps.tokenization;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.ListUtil;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.pipeline.BasePipelineStep;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.StartDocument;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextUnitUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Pipeline step that tokenizes the content of translatable text units.
 *
 * <p>Depending on the step {@link Parameters}, the source and/or every target of each
 * non-empty, translatable text unit is tokenized, and the resulting tokens are stored in a
 * {@link TokensAnnotation} attached to that source or target.
 */
@UsingParameters(Parameters.class)
public class TokenizationStep extends BasePipelineStep {
    /** Matches a typographic or a straight apostrophe. */
    private static final Pattern APOSTROPHE = Pattern.compile("[’']");
    private final ITokenizer tokenizer;
    /**
     * Scratch list handed to {@code TextUnitUtil.getText} and then to {@code Tokens.fixRanges};
     * cleared at the start of every {@link #tokenize} call.
     */
    private final ArrayList<Integer> positions;
    private LocaleId targetLocale;
    private LocaleId sourceLocale;
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Parameters params = new Parameters();

    /** Creates the step with default parameters and an {@link RbbiTokenizer}. */
    public TokenizationStep() {
        setParameters(this.params);
        this.tokenizer = new RbbiTokenizer();
        this.positions = new ArrayList<>();
    }

    /**
     * Picks up the source locale from the start-document resource, when one is present.
     *
     * @param event the start-document event
     * @return the same event, unchanged
     */
    @Override
    public Event handleStartDocument(Event event) {
        StartDocument startDocument = (StartDocument) event.getResource();
        if (startDocument != null) {
            this.sourceLocale = startDocument.getLocale();
        }
        return event;
    }

    /**
     * Tokenizes the source and/or targets of the text unit carried by the event, as enabled
     * by the step parameters. Empty or non-translatable text units are passed through.
     *
     * @param event the text-unit event
     * @return the event returned by the superclass handler, or {@code null} if that handler
     *         returned {@code null}
     */
    @Override
    public Event handleTextUnit(Event event) {
        Event handled = super.handleTextUnit(event);
        if (handled == null) {
            return null;
        }
        ITextUnit textUnit = handled.getTextUnit();
        if (textUnit == null || textUnit.isEmpty() || !textUnit.isTranslatable()) {
            return handled;
        }
        if (this.params.isTokenizeSource()) {
            tokenizeSource(textUnit);
        }
        if (this.params.isTokenizeTargets()) {
            tokenizeTargets(textUnit);
        }
        return handled;
    }

    @Override
    public LocaleId getSourceLocale() {
        return this.sourceLocale;
    }

    @Override
    @StepParameterMapping(parameterType = StepParameterType.SOURCE_LOCALE)
    public void setSourceLocale(LocaleId localeId) {
        this.sourceLocale = localeId;
    }

    @Override
    public LocaleId getTargetLocale() {
        return this.targetLocale;
    }

    @Override
    @StepParameterMapping(parameterType = StepParameterType.TARGET_LOCALE)
    public void setTargetLocale(LocaleId localeId) {
        this.targetLocale = localeId;
    }

    /**
     * Tokenizes one text container for the given locale.
     *
     * @param textContainer the container to tokenize; may be {@code null}
     * @param localeId the locale of the container's text
     * @return the tokens filtered by the included token names of the step parameters, or
     *         {@code null} when the container is {@code null} or
     *         {@code Util.isNullOrEmpty(localeId)} is true
     */
    private Tokens tokenize(TextContainer textContainer, LocaleId localeId) {
        if (textContainer == null || Util.isNullOrEmpty(localeId)) {
            return null;
        }
        this.positions.clear();
        Tokens tokens = new Tokens();
        // A single-segment container is read directly; otherwise an unsegmented copy is used.
        // NOTE(review): 'positions' is presumably filled by getText with inline-code offsets
        // that fixRanges() then uses to shift token ranges -- confirm against TextUnitUtil.
        this.tokenizer.init(
                textContainer.contentIsOneSegment()
                        ? TextUnitUtil.getText(textContainer.getFirstContent(), this.positions)
                        : TextUnitUtil.getText(textContainer.getUnSegmentedContentCopy(), this.positions),
                localeId);
        while (this.tokenizer.hasNext()) {
            Token next = this.tokenizer.next();
            if (next != null) {
                tokens.addAll(postProcess(next, localeId));
            }
        }
        tokens.fixRanges(this.positions);
        return tokens.getFilteredList(ListUtil.stringListAsArray(this.params.getIncludedTokenNames()));
    }

    /**
     * Applies locale-specific post-processing to a token produced by the tokenizer.
     *
     * <p>For French and Italian, a token containing an apostrophe is split by
     * {@link #apostrophe(Token, LocaleId)}; every other token is returned as is.
     *
     * @param token the token to post-process
     * @param localeId the locale of the tokenized text
     * @return a list holding either the original token or the tokens it was split into
     */
    public Collection<? extends Token> postProcess(Token token, LocaleId localeId) {
        boolean elidingLanguage =
                LocaleId.FRENCH.sameLanguageAs(localeId) || LocaleId.ITALIAN.sameLanguageAs(localeId);
        if (elidingLanguage && APOSTROPHE.matcher(token.getValue()).find()) {
            return apostrophe(token, localeId);
        }
        List<Token> single = new ArrayList<>();
        single.add(token);
        return single;
    }

    /**
     * Splits a token at its first apostrophe into the text before it, the apostrophe itself
     * (as a {@code PUNCTUATION} token) and the text after it.
     *
     * <p>Empty parts are not emitted, so a leading or trailing apostrophe yields two tokens.
     * A token with no apostrophe, or consisting only of an apostrophe, is returned unchanged.
     * Any further apostrophes stay inside the last part, so the values of the returned tokens
     * always concatenate back to the original value and their ranges tile the original range.
     *
     * @param token the token to split
     * @param localeId the locale of the tokenized text (currently unused)
     * @return the resulting tokens, in text order; never empty
     */
    public List<Token> apostrophe(Token token, LocaleId localeId) {
        List<Token> result = new ArrayList<>();
        String value = token.getValue();
        Matcher matcher = APOSTROPHE.matcher(value);
        if (!matcher.find()) {
            // Nothing to split on; calling group() here would throw IllegalStateException.
            result.add(token);
            return result;
        }

        // Cut by match offsets rather than Pattern.split(): split() drops trailing empty
        // strings, which left fewer than two parts for tokens such as "l'" or "'".
        String head = value.substring(0, matcher.start());
        String mark = matcher.group();
        String tail = value.substring(matcher.end());
        if (head.isEmpty() && tail.isEmpty()) {
            result.add(token);
            return result;
        }

        int tokenStart = token.getRange().start;
        int tokenEnd = token.getRange().end;
        // Ranges are treated as [start, end) character offsets: the apostrophe begins right
        // where the head ends and the tail begins right after the apostrophe.
        int markStart = tokenStart + head.length();
        int markEnd = markStart + mark.length();

        String name = Tokens.getTokenName(token.getId());
        String description = Tokens.getTokenDescription(token.getId());

        if (!head.isEmpty()) {
            result.add(new Token(token.getId(), head, name, description, tokenStart, markStart));
        }
        int punctuationId = Tokens.getTokenId("PUNCTUATION");
        result.add(new Token(punctuationId, mark, "PUNCTUATION",
                Tokens.getTokenDescription(punctuationId), markStart, markEnd));
        if (!tail.isEmpty()) {
            result.add(new Token(token.getId(), tail, name, description, markEnd, tokenEnd));
        }
        return result;
    }

    /**
     * Tokenizes the source of a text unit with the current source locale and stores the
     * tokens in the source's {@link TokensAnnotation}, creating the annotation if needed.
     */
    private void tokenizeSource(ITextUnit iTextUnit) {
        if (iTextUnit == null) {
            return;
        }
        Tokens tokens = tokenize(iTextUnit.getSource(), getSourceLocale());
        if (tokens == null) {
            return;
        }
        TokensAnnotation annotation = TextUnitUtil.getSourceAnnotation(iTextUnit, TokensAnnotation.class);
        if (annotation == null) {
            TextUnitUtil.setSourceAnnotation(iTextUnit, new TokensAnnotation(tokens));
        } else {
            annotation.addTokens(tokens);
        }
    }

    /**
     * Tokenizes every target of a text unit with that target's own locale and stores the
     * tokens in the target's {@link TokensAnnotation}, creating the annotation if needed.
     */
    private void tokenizeTargets(ITextUnit iTextUnit) {
        if (iTextUnit == null) {
            return;
        }
        for (LocaleId localeId : iTextUnit.getTargetLocales()) {
            Tokens tokens = tokenize(iTextUnit.getTarget(localeId), localeId);
            if (tokens == null) {
                continue;
            }
            TokensAnnotation annotation =
                    TextUnitUtil.getTargetAnnotation(iTextUnit, localeId, TokensAnnotation.class);
            if (annotation == null) {
                TextUnitUtil.setTargetAnnotation(iTextUnit, localeId, new TokensAnnotation(tokens));
            } else {
                annotation.addTokens(tokens);
            }
        }
    }

    @Override
    public String getName() {
        return "Tokenization Step";
    }

    @Override
    public String getDescription() {
        return "Extracts tokens from the text units content of a document. Expects: filter events. Sends back: filter events.";
    }
}
