public class LuceneNGramPFE extends LuceneFeatureExtractorBase implements PairFeatureExtractor
See also: LuceneNGramCPFE.
| Modifier and Type | Field and Description |
|---|---|
static String |
LUCENE_NGRAM_FIELD1 |
static String |
LUCENE_NGRAM_FIELD2 |
protected boolean |
markViewBlindNgramsWithLocalView |
protected boolean |
ngramBinaryFeatureValuesCombos |
protected int |
ngramMaxN1 |
protected int |
ngramMaxN2 |
protected int |
ngramMinN1 |
protected int |
ngramMinN2 |
protected int |
ngramUseTopK1 |
protected int |
ngramUseTopK2 |
static String |
PARAM_MARK_VIEWBLIND_NGRAMS_WITH_LOCAL_VIEW
This option collects a FrequencyDistribution of ngrams across both documents of all pairs,
but when writing features, the view where a particular ngram is found is recorded with the
ngram.
|
static String |
PARAM_NGRAM_BINARY_FEATURE_VALUES_COMBO
Whether features should be marked with binary (occurs, doesn't occur in this document pair)
values, versus the document count of the feature.
|
static String |
PARAM_NGRAM_MAX_N_VIEW1
Maximum size n of ngrams from View 1 documents.
|
static String |
PARAM_NGRAM_MAX_N_VIEW2
Maximum size n of ngrams from View 2 documents.
|
static String |
PARAM_NGRAM_MIN_N_VIEW1
Minimum size n of ngrams from View 1 documents.
|
static String |
PARAM_NGRAM_MIN_N_VIEW2
Minimum size n of ngrams from View 2 documents.
|
static String |
PARAM_NGRAM_USE_TOP_K_VIEW1
Use this number of most frequent ngrams from View 1 documents.
|
static String |
PARAM_NGRAM_USE_TOP_K_VIEW2
Use this number of most frequent ngrams from View 2 documents.
|
static String |
PARAM_USE_VIEW1_NGRAMS_AS_FEATURES
Each ngram from View 1 documents is added to the document pair instance as a feature.
|
static String |
PARAM_USE_VIEW2_NGRAMS_AS_FEATURES
Each ngram from View 2 documents is added to the document pair instance as a feature.
|
static String |
PARAM_USE_VIEWBLIND_NGRAMS_AS_FEATURES
All qualifying ngrams from anywhere in either document are used as features.
|
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> |
topKSetView1 |
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> |
topKSetView2 |
protected boolean |
useView1NgramsAsFeatures |
protected boolean |
useView2NgramsAsFeatures |
protected boolean |
useViewBlindNgramsAsFeatures |
forceRereadFromIndex, LUCENE_NGRAM_FIELD, luceneDir, PARAM_SOURCE_LOCATION
dfStore, filterPartialStopwordMatches, ngramFreqThreshold, ngramLowerCase, ngramMaxN, ngramMinN, ngramStopwordsFile, ngramUseTopK, PARAM_FILTER_PARTIAL_STOPWORD_MATCHES, PARAM_NGRAM_FREQ_THRESHOLD, PARAM_NGRAM_LOWER_CASE, PARAM_NGRAM_MAX_N, PARAM_NGRAM_MIN_N, PARAM_NGRAM_STOPWORDS_FILE, PARAM_NGRAM_USE_TOP_K, PARAM_TF_IDF_CALCULATION, prefix, stopwords, tfIdfCalculation, topKSet
featureExtractorName, PARAM_UNIQUE_EXTRACTOR_NAME
| Constructor and Description |
|---|
LuceneNGramPFE() |
| Modifier and Type | Method and Description |
|---|---|
protected Set<Feature> |
addToFeatureArray(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> viewNgrams,
de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> topKSet,
Set<Feature> features) |
Set<Feature> |
extract(org.apache.uima.jcas.JCas view1,
org.apache.uima.jcas.JCas view2) |
protected String |
getFeaturePrefix() |
protected String |
getFieldName() |
List<MetaCollectorConfiguration> |
getMetaCollectorClasses(Map<String,Object> parameterSettings) |
protected int |
getTopN() |
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> |
getTopNgrams() |
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> |
getTopNgramsView1() |
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> |
getTopNgramsView2() |
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> |
getViewNgrams(org.apache.uima.jcas.JCas view1,
org.apache.uima.jcas.JCas view2) |
boolean |
initialize(org.apache.uima.resource.ResourceSpecifier aSpecifier,
Map<String,Object> aAdditionalParams) |
logSelectionProcess, passesScreening
afterResourcesInitialized, getLogger, getResourceName
public static final String PARAM_NGRAM_MIN_N_VIEW1
protected int ngramMinN1
public static final String PARAM_NGRAM_MIN_N_VIEW2
protected int ngramMinN2
public static final String PARAM_NGRAM_MAX_N_VIEW1
protected int ngramMaxN1
public static final String PARAM_NGRAM_MAX_N_VIEW2
protected int ngramMaxN2
public static final String PARAM_NGRAM_USE_TOP_K_VIEW1
protected int ngramUseTopK1
public static final String PARAM_NGRAM_USE_TOP_K_VIEW2
protected int ngramUseTopK2
public static final String PARAM_USE_VIEW1_NGRAMS_AS_FEATURES
protected boolean useView1NgramsAsFeatures
public static final String PARAM_USE_VIEW2_NGRAMS_AS_FEATURES
protected boolean useView2NgramsAsFeatures
public static final String PARAM_USE_VIEWBLIND_NGRAMS_AS_FEATURES
protected boolean useViewBlindNgramsAsFeatures
public static final String PARAM_MARK_VIEWBLIND_NGRAMS_WITH_LOCAL_VIEW
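This option collects a FrequencyDistribution of ngrams across both documents of all pairs,
but when writing features, the view where a particular ngram is found is recorded with the
ngram. For example, with a PARAM_NGRAM_USE_TOP_K_ALL value of 500, 400 of the ngrams
in the top 500 might happen to be from View 2 documents; and whenever an ngram from the 500 is seen
in any document, View 1 or 2, the document's view is recorded.
To use this option, PARAM_USE_VIEWBLIND_NGRAMS_AS_FEATURES must also be set to true.
protected boolean markViewBlindNgramsWithLocalView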
public static final String PARAM_NGRAM_BINARY_FEATURE_VALUES_COMBO
protected boolean ngramBinaryFeatureValuesCombos
public static final String LUCENE_NGRAM_FIELD1
public static final String LUCENE_NGRAM_FIELD2
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> topKSetView1
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> topKSetView2
public List<MetaCollectorConfiguration> getMetaCollectorClasses(Map<String,Object> parameterSettings) throws org.apache.uima.resource.ResourceInitializationException
getMetaCollectorClasses in interface MetaDependent
Throws: org.apache.uima.resource.ResourceInitializationException
public boolean initialize(org.apache.uima.resource.ResourceSpecifier aSpecifier,
Map<String,Object> aAdditionalParams)
throws org.apache.uima.resource.ResourceInitializationException
initialize in interface org.apache.uima.resource.Resource
initialize in class NGramFeatureExtractorBase
Throws: org.apache.uima.resource.ResourceInitializationException
public Set<Feature> extract(org.apache.uima.jcas.JCas view1, org.apache.uima.jcas.JCas view2) throws org.dkpro.tc.api.exception.TextClassificationException
extract in interface PairFeatureExtractor
Throws: org.dkpro.tc.api.exception.TextClassificationException
protected Set<Feature> addToFeatureArray(de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> viewNgrams, de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> topKSet, Set<Feature> features)
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> getTopNgrams() throws org.apache.uima.resource.ResourceInitializationException
getTopNgrams in class LuceneFeatureExtractorBase
Throws: org.apache.uima.resource.ResourceInitializationException
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> getTopNgramsView1() throws org.apache.uima.resource.ResourceInitializationException
Throws: org.apache.uima.resource.ResourceInitializationException
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> getTopNgramsView2() throws org.apache.uima.resource.ResourceInitializationException
Throws: org.apache.uima.resource.ResourceInitializationException
protected de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution<String> getViewNgrams(org.apache.uima.jcas.JCas view1, org.apache.uima.jcas.JCas view2) throws org.dkpro.tc.api.exception.TextClassificationException
protected String getFieldName()
getFieldName in class LuceneFeatureExtractorBase
protected int getTopN()
getTopN in class LuceneFeatureExtractorBase
protected String getFeaturePrefix()
getFeaturePrefix in class NGramFeatureExtractorBase
Copyright © 2013–2018 Ubiquitous Knowledge Processing (UKP) Lab. All rights reserved.