/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.classification.sequence.example;

import org.tribuo.Example;
import org.tribuo.Feature;
import org.tribuo.classification.Label;
import org.tribuo.classification.LabelFactory;
import org.tribuo.impl.ListExample;
import org.tribuo.provenance.SimpleDataSourceProvenance;
import org.tribuo.sequence.MutableSequenceDataset;
import org.tribuo.sequence.SequenceExample;

import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.List;

/**
 * A data generator for smoke testing sequence label models.
 * <p>
 * All generated features have the value 1.0, and every element of a sequence is
 * labelled with one of "O", "Status" or "Monkey".
 */
public final class SequenceDataGenerator {

    private static final LabelFactory labelFactory = new LabelFactory();

    private SequenceDataGenerator() { }

    /**
     * Generates a simple dataset consisting of numCopies repeats of two sequences.
     * <p>
     * The sequences are added in alternating order: the output of
     * {@link #generateGorillaA()} followed by the output of {@link #generateGorillaB()}.
     * @param numCopies The number of times to repeat the two sequence examples.
     *                  A value of zero or less adds no sequences to the dataset.
     * @return The dataset.
     */
    public static MutableSequenceDataset<Label> generateGorillaDataset(int numCopies) {
        List<SequenceExample<Label>> sequences = new ArrayList<>();

        for (int i = 0; i < numCopies; i++) {
            sequences.add(generateGorillaA());
            sequences.add(generateGorillaB());
        }

        SimpleDataSourceProvenance provenance =
                new SimpleDataSourceProvenance("ExampleSequenceDataset", OffsetDateTime.now(), labelFactory);
        return new MutableSequenceDataset<>(sequences, provenance, labelFactory);
    }

    /**
     * Generates a five element sequence example with a mixture of features and
     * three labels "O", "Status" and "Monkey".
     * @return A sequence example.
     */
    public static SequenceExample<Label> generateGorillaA() {
        // "The silverback gorilla is angry"
        List<Example<Label>> tokens = new ArrayList<>();

        tokens.add(token("O", "A", "B", "W=the"));
        tokens.add(token("Monkey", "C", "D", "W=silverback"));
        tokens.add(token("Monkey", "D", "E", "W=gorilla"));
        tokens.add(token("O", "B", "W=is"));
        tokens.add(token("Status", "F", "G", "W=angry"));

        return new SequenceExample<>(tokens);
    }

    /**
     * Generates a six element sequence example with a mixture of features and
     * three labels "O", "Status" and "Monkey".
     * @return A sequence example.
     */
    public static SequenceExample<Label> generateGorillaB() {
        // "That is one angry looking gorilla"
        List<Example<Label>> tokens = new ArrayList<>();

        tokens.add(token("O", "A", "B", "W=that"));
        tokens.add(token("O", "B", "W=is"));
        tokens.add(token("O", "B", "H", "W=one"));
        tokens.add(token("Status", "F", "G", "W=angry"));
        tokens.add(token("O", "I", "J", "W=looking"));
        tokens.add(token("Monkey", "D", "E", "W=gorilla"));

        return new SequenceExample<>(tokens);
    }

    /**
     * This generates a sequence example with features that are unused by the training data.
     * <p>
     * None of the feature names used here appear in {@link #generateGorillaA()}
     * or {@link #generateGorillaB()}.
     * @return A {@link SequenceExample} which is invalid in the context of the Gorilla example data.
     */
    public static SequenceExample<Label> generateInvalidExample() {
        // "invalid example"
        List<Example<Label>> tokens = new ArrayList<>();

        tokens.add(token("O", "1", "2", "W=invalid"));
        tokens.add(token("O", "3", "2", "W=example"));

        return new SequenceExample<>(tokens);
    }

    /**
     * This generates a sequence example where the first example has no features.
     * @return A {@link SequenceExample} which is invalid as one example contains no features.
     */
    public static SequenceExample<Label> generateOtherInvalidExample() {
        // "invalid example"
        List<Example<Label>> tokens = new ArrayList<>();

        // The first element is deliberately left without any features.
        tokens.add(token("O"));
        tokens.add(token("O", "3", "2", "W=example"));

        return new SequenceExample<>(tokens);
    }

    /**
     * This generates a sequence example with no examples.
     * @return A {@link SequenceExample} which is invalid as it contains no examples.
     */
    public static SequenceExample<Label> generateEmptyExample() {
        List<Example<Label>> tokens = new ArrayList<>();
        return new SequenceExample<>(tokens);
    }

    /**
     * Builds a single sequence element with the supplied label and features.
     * <p>
     * Features are added in the order given, each with the value 1.0.
     * @param label The name of the label for this element.
     * @param featureNames The names of the features to add, may be empty.
     * @return An example carrying the label and the requested features.
     */
    private static Example<Label> token(String label, String... featureNames) {
        Example<Label> example = new ListExample<>(new Label(label));
        for (String featureName : featureNames) {
            example.add(new Feature(featureName, 1.0));
        }
        return example;
    }
}