package net.sf.okapi.steps.tokenization;

import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Range;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.StartDocument;
import net.sf.okapi.common.resource.TextUnitUtil;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

/**
 * JUnit 4 tests exercising {@link Tokenizer#tokenize} directly and driving a
 * {@link TokenizationStep} with pipeline events, plus an equality check on {@link Range}.
 */
@RunWith(JUnit4.class)
public class TokenizationTest {
    /**
     * Sample sentence shared by the step and tokenizer tests. Kept in one place so the
     * expected token count asserted in {@link #testTokenizer1()} always refers to the same input.
     */
    private static final String TEXT = "Jaguar �� will sell its new XJ-6 model in the U.S. for a small fortune :-). Expect to pay around USD 120ks ($120,000.00 on 05/30/2007 at 12.30PM). Custom options can set you back another few 10,000 dollars. For details, go to <a href=\"http://www.jaguar.com/sales\" alt=\"Click here\">Jaguar Sales</a> or contact xj-6@jaguar.com. See http://www.jaguar.com/sales, www.jaguar.com, AT&T, P&G, Johnson&Johnson, 192.168.0.5 for info 3.5pct.";

    /** Locale passed to every tokenizer call in this class. */
    private final LocaleId locENUS = LocaleId.fromString("en-us");

    /** French locale; not referenced by any test currently in this class. */
    private final LocaleId locFR = LocaleId.fromString("fr");

    /** Step under test; re-created before each test by {@link #setUp()}. */
    private TokenizationStep ts;

    /**
     * Runs {@link #TEXT} through the step as a one-document batch and collects the tokens
     * the step attached to the text unit's source.
     *
     * <p>Not called by any test currently in this class.
     *
     * @return the tokens found in the source {@link TokensAnnotation}, or an empty
     *         {@link Tokens} when the step attached no such annotation
     */
    private Tokens tokenizeText() {
        Tokens collected = new Tokens();

        this.ts.handleEvent(new Event(EventType.START_BATCH));

        StartDocument doc = new StartDocument("tokenization");
        doc.setLocale(this.locENUS);
        doc.setMultilingual(false);
        this.ts.handleEvent(new Event(EventType.START_DOCUMENT, doc));

        ITextUnit textUnit = TextUnitUtil.buildGenericTU(TEXT);
        this.ts.handleEvent(new Event(EventType.TEXT_UNIT, textUnit));

        TokensAnnotation annotation = TextUnitUtil.getSourceAnnotation(textUnit, TokensAnnotation.class);
        if (annotation != null) {
            collected.addAll(annotation.getTokens());
        }

        this.ts.handleEvent(new Event(EventType.END_BATCH));
        return collected;
    }

    /** Creates a fresh {@link TokenizationStep} for every test. */
    @Before
    public void setUp() {
        this.ts = new TokenizationStep();
    }

    /**
     * Smoke test: sends START_BATCH, a TEXT_UNIT built from {@link #TEXT}, and END_BATCH to
     * the step. There are no assertions; the test passes as long as no exception is thrown.
     */
    @Test
    public void testTS() {
        // The step itself comes from setUp(); no need to construct it again here.
        Event textUnitEvent = new Event(EventType.TEXT_UNIT, TextUnitUtil.buildGenericTU(TEXT));
        this.ts.handleEvent(new Event(EventType.START_BATCH));
        this.ts.handleEvent(textUnitEvent);
        this.ts.handleEvent(new Event(EventType.END_BATCH));
    }

    /** Checks the total token count for a short mixed sentence with no token-name filter. */
    @Test
    public void listTokenizerOutput() {
        Tokens result = Tokenizer.tokenize("NASDAQ :-) hypen-word www.google.com is a U.S. stock 1.0006 100 exchange.", this.locENUS, new String[0]);
        Assert.assertEquals(22, result.size());
    }

    /** Checks the total token count and the first three token values for {@link #TEXT}. */
    @Test
    public void testTokenizer1() {
        Tokens result = Tokenizer.tokenize(TEXT, this.locENUS, new String[0]);
        Assert.assertEquals(132, result.size());
        Assert.assertEquals("Jaguar", ((Token) result.get(0)).getValue());
        Assert.assertEquals(" ", ((Token) result.get(1)).getValue());
        Assert.assertEquals("��", ((Token) result.get(2)).getValue());
    }

    /** Requests only "WORD" tokens and expects exactly the three words, without the spaces. */
    @Test
    public void testTokenizer2() {
        Tokens words = Tokenizer.tokenize("word word word", this.locENUS, new String[]{"WORD"});
        Assert.assertEquals(3, words.size());
        for (int i = 0; i < 3; i++) {
            Assert.assertEquals("word", ((Token) words.get(i)).getValue());
        }
    }

    /** Requests only "HYPHENATED_WORD" tokens and expects the whole input as a single token. */
    @Test
    public void hyphenatedWords() {
        Tokens hyphenated = Tokenizer.tokenize("word-word-word", this.locENUS, new String[]{"HYPHENATED_WORD"});
        Assert.assertEquals(1, hyphenated.size());
        Assert.assertEquals("word-word-word", ((Token) hyphenated.get(0)).getValue());
    }

    /** Checks the total token count for a string of times, a date and a currency amount. */
    @Test
    public void allTokens() {
        Tokens result = Tokenizer.tokenize("12:00pm 03/12/192 11:45 $300", this.locENUS, new String[0]);
        Assert.assertEquals(11, result.size());
    }

    /**
     * Two separately constructed ranges with the same bounds must be distinct objects that
     * are nevertheless equal and share a hash code.
     */
    @Test
    public void testRange() {
        Range first = new Range(1, 5);
        Range second = new Range(1, 5);
        Assert.assertNotSame(first, second);
        Assert.assertEquals(first, second);
        Assert.assertEquals(first.hashCode(), second.hashCode());
        // NOTE(review): this compares String identity, not content, so it only holds while
        // Range.toString() returns a new String per call - confirm this is the intended check.
        Assert.assertNotSame(first.toString(), second.toString());
    }
}
