package net.sf.okapi.steps.tokenization;

import com.ibm.icu.text.BreakIterator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.FileLocation;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Range;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.StartDocument;
import net.sf.okapi.common.resource.TextUnitUtil;
import net.sf.okapi.steps.tokenization.common.Token;
import net.sf.okapi.steps.tokenization.common.TokensAnnotation;
import net.sf.okapi.steps.tokenization.engine.RbbiLexer;
import net.sf.okapi.steps.tokenization.engine.javacc.ParseException;
import net.sf.okapi.steps.tokenization.engine.javacc.SimpleCharStream;
import net.sf.okapi.steps.tokenization.engine.javacc.WordTokenizer;
import net.sf.okapi.steps.tokenization.engine.javacc.WordTokenizerTokenManager;
import net.sf.okapi.steps.tokenization.locale.LocaleUtil;
import net.sf.okapi.steps.tokenization.tokens.Tokens;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.slf4j.LoggerFactory;

/**
 * Tests for the tokenization step: locale/token filters, the {@link Tokenizer}
 * facade, the JavaCC word tokenizer and RBBI rule formatting.
 */
@RunWith(JUnit4.class)
public class TokenizationTest {
    /** Sample sentence mixing words, numbers, dates, markup, e-mail and internet addresses. */
    private static final String TEXT = "Jaguar will sell its new XJ-6 model in the U.S. for a small fortune :-). Expect to pay around USD 120ks ($120,000.00 on 05/30/2007 at 12.30PM). Custom options can set you back another few 10,000 dollars. For details, go to <a href=\"http://www.jaguar.com/sales\" alt=\"Click here\">Jaguar Sales</a> or contact xj-6@jaguar.com. See http://www.jaguar.com/sales, www.jaguar.com, AT&T, P&G, Johnson&Johnson, 192.168.0.5 for info 3.5pct.";
    private static final LocaleId LOC_EN_US = LocaleId.fromString("en-us");
    private static final LocaleId LOC_EN_GB = LocaleId.fromString("en-gb");
    private static final LocaleId LOC_DE_DE = LocaleId.fromString("de-de");
    private static final LocaleId LOC_DE_CH = LocaleId.fromString("de-ch");
    private static final LocaleId LOC_FR = LocaleId.fromString("fr");

    /** Step under test; re-created before every test by {@link #setUp()}. */
    private TokenizationStep ts;

    /**
     * Reads the whole stream as UTF-8 text and closes it.
     *
     * @param inputStream the stream to drain; closed when this method returns
     * @return the decoded content of the stream
     * @throws IOException if reading or closing the stream fails
     */
    private static String streamAsString(InputStream inputStream) throws IOException {
        // try-with-resources: closing the reader also closes the wrapped stream.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            StringBuilder content = new StringBuilder();
            char[] buffer = new char[2048];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
            return content.toString();
        }
    }

    /**
     * Sends {@link #TEXT} through the step as a monolingual en-us document
     * (start batch, start document, text unit, end batch) and collects the
     * tokens found in the source {@link TokensAnnotation} of the text unit.
     *
     * @return the collected tokens; empty if the text unit carries no annotation
     */
    private Tokens tokenizeText() {
        Tokens collected = new Tokens();
        ts.handleEvent(new Event(EventType.START_BATCH));

        StartDocument startDocument = new StartDocument("tokenization");
        startDocument.setLocale(LOC_EN_US);
        startDocument.setMultilingual(false);
        ts.handleEvent(new Event(EventType.START_DOCUMENT, startDocument));

        ITextUnit textUnit = TextUnitUtil.buildTU(TEXT);
        ts.handleEvent(new Event(EventType.TEXT_UNIT, textUnit));

        TokensAnnotation annotation = TextUnitUtil.getSourceAnnotation(textUnit, TokensAnnotation.class);
        if (annotation != null) {
            collected.addAll(annotation.getTokens());
        }

        ts.handleEvent(new Event(EventType.END_BATCH));
        return collected;
    }

    /** Creates a fresh {@link TokenizationStep} for each test. */
    @Before
    public void setUp() {
        ts = new TokenizationStep();
    }

    /** Smoke test: the ICU word break iterator can be obtained. */
    @Test
    public void testDefRules() {
        Assert.assertNotNull(BreakIterator.getWordInstance());
    }

    /** Checks language-code normalization to the Okapi and ICU forms. */
    @Test
    public void testLocaleUtil() {
        Assert.assertEquals("en-us", LocaleUtil.normalizeLanguageCode_Okapi("en_US"));
        Assert.assertEquals("en_US", LocaleUtil.normalizeLanguageCode_ICU("EN-US"));
    }

    /** Smoke test: the step handles a text unit between batch events without throwing. */
    @Test
    public void testTS() {
        // The step itself comes from setUp(); no START_DOCUMENT event is sent here.
        Event textUnitEvent = new Event(EventType.TEXT_UNIT, TextUnitUtil.buildTU(TEXT));
        ts.handleEvent(new Event(EventType.START_BATCH));
        ts.handleEvent(textUnitEvent);
        ts.handleEvent(new Event(EventType.END_BATCH));
    }

    /**
     * Writes one token per line to the debug log.
     *
     * @param tokens the tokens to list; {@code null} is ignored
     */
    private void listTokens(Tokens tokens) {
        if (tokens == null) {
            return;
        }
        StringBuilder listing = new StringBuilder();
        for (Token token : tokens) {
            listing.append(token.toString()).append("\n");
        }
        LoggerFactory.getLogger(getClass()).debug(listing.toString());
    }

    /** Logs the tokens produced for a short en-us sentence; makes no assertions. */
    @Test
    public void listTokenizerOutput() {
        listTokens(Tokenizer.tokenize("NASDAQ is a U.S. stock exchange.", LOC_EN_US, new String[0]));
    }

    /** Exercises the locale filter and the token-name filter of {@link Parameters}. */
    @Test
    public void testFilters() {
        Parameters parameters = new Parameters();
        ts.setParameters(parameters);

        // Freshly created parameters: everything is supported.
        Assert.assertNotNull(parameters.getLocaleFilter());
        Assert.assertTrue(parameters.supportsLanguage(LOC_EN_US));
        Assert.assertTrue(parameters.supportsToken("FAKE_TOKEN"));
        Assert.assertTrue(parameters.supportsToken(Integer.MAX_VALUE));

        // An empty locale filter accepts every tested locale.
        parameters.setLocaleFilter("");
        Assert.assertTrue(parameters.supportsLanguage(LOC_EN_US));
        Assert.assertTrue(parameters.supportsLanguage(LOC_EN_GB));
        Assert.assertTrue(parameters.supportsLanguage(LOC_DE_DE));
        Assert.assertTrue(parameters.supportsLanguage(LOC_DE_CH));

        // Includes and "!" excludes: en-gb and de-ch must be rejected.
        parameters.setLocaleFilter("en !en-gb de-*-* !de-ch");
        Assert.assertTrue(parameters.supportsLanguage(LOC_EN_US));
        Assert.assertFalse(parameters.supportsLanguage(LOC_EN_GB));
        Assert.assertTrue(parameters.supportsLanguage(LOC_DE_DE));
        Assert.assertFalse(parameters.supportsLanguage(LOC_DE_CH));

        // A null token-name list still supports every token.
        parameters.setTokenNames((String[]) null);
        Assert.assertTrue(parameters.supportsToken("FAKE_TOKEN"));
        Assert.assertTrue(parameters.supportsToken(Integer.MAX_VALUE));

        // An explicit token-name list restricts the supported tokens.
        parameters.setTokenNames(new String[]{"WORD", "PUNKTUATION"});
        Assert.assertFalse(parameters.supportsToken("FAKE_TOKEN"));
        Assert.assertFalse(parameters.supportsToken(Integer.MAX_VALUE));
        Assert.assertTrue(parameters.supportsToken("WORD"));

        // The parameters handed back by the step must show the same filters.
        Parameters stepParameters = ts.getParameters();
        ts.handleEvent(new Event(EventType.START_BATCH));
        Assert.assertTrue(stepParameters.supportsLanguage(LOC_EN_US));
        Assert.assertFalse(stepParameters.supportsLanguage(LOC_EN_GB));
        Assert.assertTrue(stepParameters.supportsLanguage(LOC_DE_DE));
        Assert.assertFalse(stepParameters.supportsLanguage(LOC_DE_CH));
        Assert.assertFalse(stepParameters.supportsToken("FAKE_TOKEN"));
        Assert.assertFalse(stepParameters.supportsToken(Integer.MAX_VALUE));
        Assert.assertTrue(stepParameters.supportsToken("WORD"));
        ts.handleEvent(new Event(EventType.END_BATCH));
    }

    /** Tokenizes {@link #TEXT} with the configuration {@code test_config1.tprm} and checks the token count. */
    @Test
    public void testTokenizer1() {
        ts.setConfiguration(getClass(), "test_config1.tprm");
        Assert.assertTrue(ts.getParameters().supportsToken("WORD"));
        Assert.assertEquals(1, ts.getLexers().size());

        Tokens tokens = tokenizeText();
        Assert.assertEquals(183, tokens.size());
    }

    /** Requests only {@code WORD} tokens and checks the values returned for three words. */
    @Test
    public void testTokenizer2() {
        Tokens tokens = Tokenizer.tokenize("word1 word2 word3", LOC_EN_US, new String[]{"WORD"});
        Assert.assertEquals(3, tokens.size());
        Assert.assertEquals("word1", tokens.get(0).getValue());
        Assert.assertEquals("word2", tokens.get(1).getValue());
        Assert.assertEquals("word3", tokens.get(2).getValue());
    }

    /** Drains the JavaCC word tokenizer over a sample string; makes no assertions on the tokens. */
    @Test
    public void testJavaCC() {
        String input = "This is a 1248-th test. U.S.A.F. read-through\n didn't AT&T, P&G, Johnson&Johnson \n\nadmin@yahoo.com 192.168.0.7";
        WordTokenizer tokenizer = new WordTokenizer(new WordTokenizerTokenManager(new SimpleCharStream(new StringReader(input))));
        try {
            // nextToken() returning null ends the loop.
            while (tokenizer.nextToken() != null) {
                // Tokens are only consumed, not inspected.
            }
        } catch (ParseException | IOException e) {
            // NOTE(review): a tokenizer failure is only logged, so this test still passes;
            // consider failing the test once it is confirmed no exception is expected here.
            LoggerFactory.getLogger(getClass()).error("JavaCC word tokenizer failed", e);
        }
    }

    /** Sanity check of {@code retainAll} / {@code removeAll} behaviour on {@link ArrayList}. */
    @Test
    public void testRetainRemove() {
        ArrayList<String> all = new ArrayList<>();
        all.add("A");
        all.add("B");
        all.add("C");

        ArrayList<String> toRetain = new ArrayList<>();
        toRetain.add("A");
        toRetain.add("B");

        ArrayList<String> toRemove = new ArrayList<>();
        toRemove.add("B");

        Assert.assertEquals(3, all.size());
        Assert.assertEquals("A", all.get(0));
        Assert.assertEquals("B", all.get(1));
        Assert.assertEquals("C", all.get(2));

        all.retainAll(toRetain);
        Assert.assertEquals(2, all.size());
        Assert.assertEquals("A", all.get(0));
        Assert.assertEquals("B", all.get(1));

        all.removeAll(toRemove);
        Assert.assertEquals(1, all.size());
        Assert.assertEquals("A", all.get(0));
    }

    /**
     * Applies six custom rules, in order, to the content of {@code rbbi_default.txt}
     * and expects the result to equal the newline-normalized content of
     * {@code rbbi_custom.txt}.
     *
     * @throws IOException if a rule resource cannot be read
     */
    @Test
    public void testFormRbbiRules() throws IOException {
        FileLocation location = FileLocation.fromClass(getClass());
        String expected = Util.normalizeNewlines(streamAsString(location.in("rbbi_custom.txt").asInputStream()));

        // Each call feeds the output of the previous one; the order matters.
        String rules = streamAsString(location.in("rbbi_default.txt").asInputStream());
        rules = RbbiLexer.formatRule(rules, "Abbreviation", "Abbreviation: Uppercase alpha chars separated by period and optionally followed by a period", "[A-Z0-9](\\.[A-Z0-9])+(\\.)*", 500);
        rules = RbbiLexer.formatRule(rules, "HyphenatedWord", "Hyphenated Word : sequence of letter or digit, (punctuated by - or _, with following letter or digit sequence)+", "[A-Za-z0-9]+([\\-_][A-Za-z0-9]+)+", 501);
        rules = RbbiLexer.formatRule(rules, "EmailAddress", "Email address: sequence of letters, digits and punctuation followed by @ and followed by another sequence", "[A-Za-z0-9_\\-\\.]+\\@[A-Za-z][A-Za-z0-9_]+\\.[a-z]+", 502);
        rules = RbbiLexer.formatRule(rules, "InternetAddress", "Internet Addresses: http://www.foo.com(/bar)", "[a-z]+\\:\\/\\/[a-z0-9]+(\\.[a-z0-9]+)+(\\/[a-z0-9][a-z0-9\\.]+)", 503);
        rules = RbbiLexer.formatRule(rules, "XmlMarkup", "XML markup: A run begins with < and ends with the first matching >", "\\<[^\\>]+\\>", 504);
        rules = RbbiLexer.formatRule(rules, "Emoticon", "Emoticon: A run that starts with :;B8{[ and contains only one or more of the following -=/{})(", "[B8\\:\\;\\{\\[][-=\\/\\{\\}\\)\\(]+", 505);

        Assert.assertEquals(expected, rules);
    }

    /**
     * Pins the current behaviour that two {@link Range} objects built from the
     * same bounds are distinct instances and are not {@code equals} to each other.
     */
    @Test
    public void testRange() {
        Range first = new Range(1, 5);
        Range second = new Range(1, 5);

        Assert.assertNotSame(first, second);
        Assert.assertFalse(first.equals(second));
        // NOTE(review): distinct hash codes for distinct objects are not guaranteed
        // by the Object contract; this assertion could in theory fail spuriously.
        Assert.assertFalse(first.hashCode() == second.hashCode());
        // Identity (not content) comparison of the two strings, as in the original check.
        Assert.assertNotSame(first.toString(), second.toString());
    }
}
