001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.classification.sequence.example;
018
019import org.tribuo.Example;
020import org.tribuo.Feature;
021import org.tribuo.classification.Label;
022import org.tribuo.classification.LabelFactory;
023import org.tribuo.impl.ListExample;
024import org.tribuo.provenance.SimpleDataSourceProvenance;
025import org.tribuo.sequence.MutableSequenceDataset;
026import org.tribuo.sequence.SequenceExample;
027
028import java.time.OffsetDateTime;
029import java.util.ArrayList;
030import java.util.List;
031
032/**
033 * A data generator for smoke testing sequence label models.
034 */
035public final class SequenceDataGenerator {
036
037    private static final LabelFactory labelFactory = new LabelFactory();
038
039    private SequenceDataGenerator() { }
040
041    /**
042     * Generates a simple dataset consisting of numCopies repeats of two sequences.
043     * @param numCopies The number of times to repeat the two sequence examples.
044     * @return The dataset.
045     */
046    public static MutableSequenceDataset<Label> generateGorillaDataset(int numCopies) {
047        List<SequenceExample<Label>> examples = new ArrayList<>();
048
049        for (int i = 0; i < numCopies; i++) {
050            examples.add(generateGorillaA());
051            examples.add(generateGorillaB());
052        }
053
054        return new MutableSequenceDataset<>(examples, new SimpleDataSourceProvenance("ExampleSequenceDataset", OffsetDateTime.now(),labelFactory),labelFactory);
055    }
056
057    /**
058     * Generates a sequence example with a mixture of features and three labels "O", "Status" and "Monkey".
059     * @return A sequence example.
060     */
061    public static SequenceExample<Label> generateGorillaA() {
062        //"The silverback gorilla is angry"
063        List<Example<Label>> examples = new ArrayList<>();
064
065        Example<Label> the = new ListExample<>(new Label("O"));
066        the.add(new Feature("A",1.0));
067        the.add(new Feature("B",1.0));
068        the.add(new Feature("W=the",1.0));
069        examples.add(the);
070
071        Example<Label> silverback = new ListExample<>(new Label("Monkey"));
072        silverback.add(new Feature("C",1.0));
073        silverback.add(new Feature("D",1.0));
074        silverback.add(new Feature("W=silverback",1.0));
075        examples.add(silverback);
076
077        Example<Label> gorilla = new ListExample<>(new Label("Monkey"));
078        gorilla.add(new Feature("D",1.0));
079        gorilla.add(new Feature("E",1.0));
080        gorilla.add(new Feature("W=gorilla",1.0));
081        examples.add(gorilla);
082
083        Example<Label> is = new ListExample<>(new Label("O"));
084        is.add(new Feature("B",1.0));
085        is.add(new Feature("W=is",1.0));
086        examples.add(is);
087
088        Example<Label> angry = new ListExample<>(new Label("Status"));
089        angry.add(new Feature("F",1.0));
090        angry.add(new Feature("G",1.0));
091        angry.add(new Feature("W=angry",1.0));
092        examples.add(angry);
093
094        return new SequenceExample<>(examples);
095    }
096
097    /**
098     * Generates a sequence example with a mixture of features and three labels "O", "Status" and "Monkey".
099     * @return A sequence example.
100     */
101    public static SequenceExample<Label> generateGorillaB() {
102        //"That is one angry looking gorilla"
103        List<Example<Label>> examples = new ArrayList<>();
104
105        Example<Label> that = new ListExample<>(new Label("O"));
106        that.add(new Feature("A",1.0));
107        that.add(new Feature("B",1.0));
108        that.add(new Feature("W=that",1.0));
109        examples.add(that);
110
111        Example<Label> is = new ListExample<>(new Label("O"));
112        is.add(new Feature("B",1.0));
113        is.add(new Feature("W=is",1.0));
114        examples.add(is);
115
116        Example<Label> one = new ListExample<>(new Label("O"));
117        one.add(new Feature("B",1.0));
118        one.add(new Feature("H",1.0));
119        one.add(new Feature("W=one",1.0));
120        examples.add(one);
121
122        Example<Label> angry = new ListExample<>(new Label("Status"));
123        angry.add(new Feature("F",1.0));
124        angry.add(new Feature("G",1.0));
125        angry.add(new Feature("W=angry",1.0));
126        examples.add(angry);
127
128        Example<Label> looking = new ListExample<>(new Label("O"));
129        looking.add(new Feature("I",1.0));
130        looking.add(new Feature("J",1.0));
131        looking.add(new Feature("W=looking",1.0));
132        examples.add(looking);
133
134        Example<Label> gorilla = new ListExample<>(new Label("Monkey"));
135        gorilla.add(new Feature("D",1.0));
136        gorilla.add(new Feature("E",1.0));
137        gorilla.add(new Feature("W=gorilla",1.0));
138        examples.add(gorilla);
139
140        return new SequenceExample<>(examples);
141    }
142
143    /**
144     * This generates a sequence example with features that are unused by the training data.
145     * @return A {@link SequenceExample} which is invalid in the context of the Gorilla example data.
146     */
147    public static SequenceExample<Label> generateInvalidExample() {
148        //"invalid example"
149        List<Example<Label>> examples = new ArrayList<>();
150
151        Example<Label> invalid = new ListExample<>(new Label("O"));
152        invalid.add(new Feature("1",1.0));
153        invalid.add(new Feature("2",1.0));
154        invalid.add(new Feature("W=invalid",1.0));
155        examples.add(invalid);
156
157        Example<Label> example = new ListExample<>(new Label("O"));
158        example.add(new Feature("3",1.0));
159        example.add(new Feature("2",1.0));
160        example.add(new Feature("W=example",1.0));
161        examples.add(example);
162
163        return new SequenceExample<>(examples);
164    }
165
166    /**
167     * This generates a sequence example where the first example has no features.
168     * @return A {@link SequenceExample} which is invalid as one example contains no features.
169     */
170    public static SequenceExample<Label> generateOtherInvalidExample() {
171        //"invalid example"
172        List<Example<Label>> examples = new ArrayList<>();
173
174        Example<Label> invalid = new ListExample<>(new Label("O"));
175        examples.add(invalid);
176
177        Example<Label> example = new ListExample<>(new Label("O"));
178        example.add(new Feature("3",1.0));
179        example.add(new Feature("2",1.0));
180        example.add(new Feature("W=example",1.0));
181        examples.add(example);
182
183        return new SequenceExample<>(examples);
184    }
185
186    /**
187     * This generates a sequence example with no examples.
188     * @return A {@link SequenceExample} which is invalid as it contains no examples.
189     */
190    public static SequenceExample<Label> generateEmptyExample() {
191        List<Example<Label>> examples = new ArrayList<>();
192        return new SequenceExample<>(examples);
193    }
194}