package com.google.appengine.api.search.dev;

import com.google.appengine.repackaged.com.google.common.collect.ImmutableSet;
import com.google.apphosting.api.AppEngineInternal;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.Character;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/**
 * Lucene {@link Analyzer} that splits text into tokens at a fixed set of separator characters,
 * lower-casing every character and stripping combining diacritical marks.
 *
 * <p>When CJK detection is enabled (the default), the whole input is buffered first; if more than
 * {@code CJK_CHARACTER_THRESHOLD} of its UTF-16 chars belong to one of {@code CJK_BLOCKS}, the
 * input is handed to a {@link CJKTokenizer} instead of the separator-based tokenizer.
 */
@AppEngineInternal
public class WordSeparatorAnalyzer extends Analyzer {
    /** Fraction of CJK chars above which the input is tokenized with {@link CJKTokenizer}. */
    private static final float CJK_CHARACTER_THRESHOLD = 0.2f;

    /** Compiled once: {@link #removeDiacriticals} runs for every non-ASCII token character. */
    private static final Pattern DIACRITICAL_MARKS =
            Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /** Whether {@link #tokenStream} should sniff the input for CJK text before tokenizing. */
    private final boolean detectCjk;

    static final Logger LOG = Logger.getLogger(WordSeparatorAnalyzer.class.getCanonicalName());

    /** Characters that end a token; every other character is part of a token. */
    static final ImmutableSet<Character> WORD_SEPARATORS = ImmutableSet.of(
            '!', '\"', '%', '(', ')', '*', ',', '.', '/', ':', '=', '>', '?', '@', '[', '\\', ']',
            '^', '`', '{', '|', '}', '~', '\t', '\n', '\f', '\r', ' ', '&', '#', '$', ';', '_');

    /** Unicode blocks whose characters are counted as CJK by {@link #isProbablyCjk}. */
    private static final ImmutableSet<Character.UnicodeBlock> CJK_BLOCKS = ImmutableSet.of(
            Character.UnicodeBlock.BOPOMOFO,
            Character.UnicodeBlock.BOPOMOFO_EXTENDED,
            Character.UnicodeBlock.CJK_COMPATIBILITY,
            Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS,
            Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS,
            Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
            Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT,
            Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS,
            Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
            Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
            Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS,
            Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
            Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO,
            Character.UnicodeBlock.HANGUL_JAMO,
            Character.UnicodeBlock.HANGUL_SYLLABLES,
            Character.UnicodeBlock.HIRAGANA,
            Character.UnicodeBlock.IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
            Character.UnicodeBlock.KANBUN,
            Character.UnicodeBlock.KANGXI_RADICALS,
            Character.UnicodeBlock.KATAKANA,
            Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS,
            Character.UnicodeBlock.TAI_XUAN_JING_SYMBOLS,
            Character.UnicodeBlock.YI_RADICALS,
            Character.UnicodeBlock.YI_SYLLABLES,
            Character.UnicodeBlock.YIJING_HEXAGRAM_SYMBOLS);

    /**
     * Tokenizer that treats every character outside {@code WORD_SEPARATORS} as part of a token and
     * normalizes each token character to a lower-case, diacritic-free form.
     */
    public class WordSeparatorTokenizer extends LetterTokenizer {
        public WordSeparatorTokenizer(Reader reader) {
            super(reader);
        }

        /**
         * Strips combining diacritical marks from {@code c} and lower-cases the result.
         *
         * @param c the token character to normalize
         * @return the lower-cased first char of the stripped form, or lower-cased {@code c} when
         *     stripping leaves nothing
         */
        @Override // org.apache.lucene.analysis.CharTokenizer
        protected char normalize(char c) {
            // ASCII is unchanged by NFD and contains no combining marks, so skip the
            // normalizer and regex for the common case.
            if (c < 0x80) {
                return Character.toLowerCase(c);
            }
            String stripped = WordSeparatorAnalyzer.removeDiacriticals(String.valueOf(c));
            // A lone combining mark is stripped to "", which used to make charAt(0) throw;
            // keep the original char in that case.
            // NOTE(review): NFD can expand one char into several (e.g. a precomposed Hangul
            // syllable); only the first resulting char is kept, as before.
            char base = stripped.isEmpty() ? c : stripped.charAt(0);
            return Character.toLowerCase(base);
        }

        /**
         * Reports whether {@code c} belongs to a token.
         *
         * @param c the character to test
         * @return {@code true} unless {@code c} is one of {@code WORD_SEPARATORS}
         */
        @Override // org.apache.lucene.analysis.LetterTokenizer, org.apache.lucene.analysis.CharTokenizer
        public boolean isTokenChar(char c) {
            return !WordSeparatorAnalyzer.WORD_SEPARATORS.contains(Character.valueOf(c));
        }
    }

    /**
     * Drains {@code reader} into {@code sb} and reports whether the text looks like CJK.
     *
     * <p>Characters are classified one UTF-16 char at a time.
     *
     * @param reader the input; read until end of stream
     * @param sb receives everything read from {@code reader}, so the caller can replay it
     * @return {@code true} if more than {@code CJK_CHARACTER_THRESHOLD} of the chars read are in
     *     one of {@code CJK_BLOCKS}; {@code false} for empty input
     * @throws IOException if reading from {@code reader} fails
     */
    static boolean isProbablyCjk(Reader reader, StringBuilder sb) throws IOException {
        char[] buffer = new char[1024];
        long cjkChars = 0;
        long totalChars = 0;
        int read;
        while ((read = reader.read(buffer)) >= 0) {
            totalChars += read;
            sb.append(buffer, 0, read);
            for (int i = 0; i < read; i++) {
                if (CJK_BLOCKS.contains(Character.UnicodeBlock.of(buffer[i]))) {
                    cjkChars++;
                }
            }
        }
        if (totalChars == 0) {
            // Explicit instead of relying on 0f / 0f == NaN comparing false.
            return false;
        }
        return ((float) cjkChars) / ((float) totalChars) > CJK_CHARACTER_THRESHOLD;
    }

    /**
     * Creates an analyzer.
     *
     * @param z whether to detect CJK input and tokenize it with {@link CJKTokenizer}
     */
    public WordSeparatorAnalyzer(boolean z) {
        this.detectCjk = z;
    }

    /** Creates an analyzer with CJK detection enabled. */
    public WordSeparatorAnalyzer() {
        this(true);
    }

    /**
     * Builds the token stream for {@code reader}.
     *
     * @param str the field name; not used by this analyzer
     * @param reader the text to tokenize; fully consumed up front when CJK detection is enabled
     * @return a {@link CJKTokenizer} for probable CJK input, an {@link EmptyTokenStream} if the
     *     input could not be read during detection, otherwise a {@link StandardFilter} over a
     *     {@link WordSeparatorTokenizer}
     */
    @Override // org.apache.lucene.analysis.Analyzer
    public TokenStream tokenStream(String str, Reader reader) {
        Reader input = reader;
        if (this.detectCjk) {
            StringBuilder buffered = new StringBuilder();
            try {
                boolean cjk = isProbablyCjk(reader, buffered);
                // Detection consumed the original reader; replay the buffered text.
                input = new StringReader(buffered.toString());
                if (cjk) {
                    return new CJKTokenizer(input);
                }
            } catch (IOException e) {
                LOG.log(Level.SEVERE, "Failed to read stream for tokenization.", e);
                return new EmptyTokenStream();
            }
        }
        return new StandardFilter(new WordSeparatorTokenizer(input));
    }

    /**
     * Tokenizes {@code str} with a default {@link WordSeparatorAnalyzer}.
     *
     * @param str the text to tokenize
     * @return the token terms in stream order, or an empty list if tokenization fails with an
     *     {@link IOException}
     */
    public static List<String> tokenList(String str) {
        TokenStream tokenStream = new WordSeparatorAnalyzer().tokenStream("", new StringReader(str));
        TermAttribute termAttribute = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
        List<String> tokens = new ArrayList<String>();
        try {
            // incrementToken() itself throws IOException, so the whole loop must be guarded.
            while (tokenStream.incrementToken()) {
                tokens.add(termAttribute.term());
            }
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Failed to tokenize input; returning no tokens.", e);
            return new ArrayList<String>();
        }
        return tokens;
    }

    /**
     * Tokenizes {@code str} and joins the tokens with {@link ShingleFilter#TOKEN_SEPARATOR}.
     *
     * @param str the text to normalize
     * @return the joined tokens; empty if {@code str} yields no tokens
     */
    public static String normalize(String str) {
        StringBuilder joined = new StringBuilder();
        for (String token : tokenList(str)) {
            if (joined.length() > 0) {
                joined.append(ShingleFilter.TOKEN_SEPARATOR);
            }
            joined.append(token);
        }
        return joined.toString();
    }

    /**
     * Removes combining diacritical marks from {@code str}.
     *
     * @param str the text to strip
     * @return {@code str} in NFD form with every run of characters from the Combining Diacritical
     *     Marks block removed
     */
    public static String removeDiacriticals(String str) {
        String decomposed = Normalizer.normalize(str, Normalizer.Form.NFD);
        return DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");
    }
}
