package org.unlaxer.jaddress.tokenizer;

import java.lang.Character.UnicodeBlock;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Splits a phrase into runs of characters that share the same character kind.
 *
 * <p>The phrase is first cut on runs of ASCII spaces or ideographic spaces
 * (U+3000); each resulting word is then cut wherever the kind of two adjacent
 * characters differs. Characters are examined one UTF-16 {@code char} at a
 * time, so each half of a surrogate pair is classified on its own.
 */
public class StringTypeTokenizer {

	/** Regex used to cut a phrase into words: runs of ASCII or ideographic spaces. */
	private static final String WORD_SEPARATOR = " +|　+";

	/**
	 * Unicode blocks whose characters all map to one {@link CharacterType}.
	 * Filled once when the class is loaded, so it is available before any
	 * instance exists and is never replaced while in use.
	 */
	static final Map<UnicodeBlock, CharacterType> convertMap;

	static {
		Map<UnicodeBlock, CharacterType> map = new HashMap<>();
		map.put(UnicodeBlock.HIRAGANA, CharacterType.hiragana);
		map.put(UnicodeBlock.KATAKANA, CharacterType.katakana);
		map.put(UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS, CharacterType.kanji);

		map.put(UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION, CharacterType.symbol);
		map.put(UnicodeBlock.GENERAL_PUNCTUATION, CharacterType.symbol);
		map.put(UnicodeBlock.MATHEMATICAL_OPERATORS, CharacterType.symbol);
		convertMap = map;
	}

	/** Creates a tokenizer; the instance itself holds no state. */
	public StringTypeTokenizer() {
	}

	/**
	 * Classifies a single UTF-16 code unit.
	 *
	 * <p>Blocks listed in {@link #convertMap} are resolved by lookup. ASCII and
	 * full-width digits map to {@code digit}, ASCII and full-width Latin letters
	 * map to {@code alphabet}, and every remaining character is a
	 * {@code symbol}. That last rule includes {@code '_'}, which earlier
	 * versions rejected with an {@link UnsupportedOperationException} because
	 * the regex {@code \W} does not match it.
	 *
	 * @param c the character to classify
	 * @return the kind of {@code c}; never {@code null}
	 */
	private CharacterType getCharacterType(char c) {
		// UnicodeBlock.of may return null for unassigned characters; HashMap
		// accepts a null key on lookup and simply reports no mapping.
		UnicodeBlock unicodeBlock = UnicodeBlock.of(c);
		CharacterType mapped = convertMap.get(unicodeBlock);
		if (mapped != null) {
			return mapped;
		}

		if (unicodeBlock == UnicodeBlock.BASIC_LATIN) {
			if (c >= '0' && c <= '9') {
				return CharacterType.digit;
			}
			if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
				return CharacterType.alphabet;
			}
		} else if (unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
			// Full-width digits U+FF10..U+FF19.
			if (c >= '\uFF10' && c <= '\uFF19') {
				return CharacterType.digit;
			}
			// Full-width small letters U+FF41..U+FF5A and capitals U+FF21..U+FF3A.
			if ((c >= '\uFF41' && c <= '\uFF5A') || (c >= '\uFF21' && c <= '\uFF3A')) {
				return CharacterType.alphabet;
			}
		}

		// Everything that is neither a mapped block, a digit nor a Latin letter
		// is a symbol. The former "\\W" test accepted exactly this set minus '_'.
		return CharacterType.symbol;
	}

	/**
	 * Tokenizes a phrase by {@link CharacterType}.
	 *
	 * @param phrase the text to split; must not be {@code null}
	 * @return the tokens in input order, each paired with the character kind
	 *         shared by all of its characters; empty when the phrase contains
	 *         nothing but separators
	 * @throws NullPointerException if {@code phrase} is {@code null}
	 */
	public List<TokenWithCharacterKind> tokenize(String phrase) {
		List<TokenWithCharacterKind> list = new ArrayList<>();

		for (String word : phrase.split(WORD_SEPARATOR)) {
			StringBuilder sb = new StringBuilder();
			CharacterType pre = null;

			for (int i = 0; i < word.length(); i++) {
				char c = word.charAt(i);
				CharacterType cur = getCharacterType(c);

				// A change of kind closes the run collected so far.
				if (pre != cur && sb.length() > 0) {
					list.add(new TokenWithCharacterKind(sb.toString(), pre));
					sb.setLength(0);
				}
				sb.append(c);
				pre = cur;
			}
			// Flush the trailing run; a leading separator yields an empty word,
			// which leaves the buffer empty and adds nothing.
			if (sb.length() > 0) {
				list.add(new TokenWithCharacterKind(sb.toString(), pre));
			}
		}

		return list;
	}

	/**
	 * Tokenizes a phrase by raw {@link UnicodeBlock} instead of
	 * {@link CharacterType}: a new token starts wherever two adjacent
	 * characters belong to different Unicode blocks.
	 *
	 * @param phrase the text to split; must not be {@code null}
	 * @return the tokens in input order, each paired with the Unicode block
	 *         shared by all of its characters
	 * @throws NullPointerException if {@code phrase} is {@code null}
	 * @deprecated groups by Unicode block only; {@link #tokenize(String)} appears
	 *             to be the intended replacement.
	 */
	@Deprecated
	public List<TokenWithCharacterKind> tokenizeRaw(String phrase) {
		List<TokenWithCharacterKind> list = new ArrayList<>();

		for (String word : phrase.split(WORD_SEPARATOR)) {
			StringBuilder sb = new StringBuilder();
			UnicodeBlock pre = null;

			for (int i = 0; i < word.length(); i++) {
				char c = word.charAt(i);
				UnicodeBlock cur = UnicodeBlock.of(c);

				// A change of block closes the run collected so far.
				if (pre != cur && sb.length() > 0) {
					list.add(new TokenWithCharacterKind(sb.toString(), pre));
					sb.setLength(0);
				}
				sb.append(c);
				pre = cur;
			}
			if (sb.length() > 0) {
				list.add(new TokenWithCharacterKind(sb.toString(), pre));
			}
		}

		return list;
	}

}
