package org.fnlp.nlp.corpus;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import org.fnlp.nlp.cn.ChineseTrans;
import org.fnlp.nlp.cn.tag.CWSTagger;
import org.fnlp.util.exception.LoadModelException;

/* loaded from: input_file:org/fnlp/nlp/corpus/WikiClean.class */
/**
 * Line-oriented file passes over a corpus whose documents are delimited by lines
 * starting with {@code <doc} and {@code </doc>}.
 *
 * <p>Three passes are defined; only the one in {@link #main(String[])} is wired up:
 * <ul>
 *   <li>{@code toSimp()}: {@link #infile} to {@link #simpfile}, passing each body line
 *       through {@code ChineseTrans.toSimp}.</li>
 *   <li>{@code seg()}: {@link #simpfile} to {@link #segfile}, passing each body line
 *       through {@code CWSTagger.tag}.</li>
 *   <li>{@code main}: {@link #segfile} to {@link #segfile_mini}, a truncated copy.</li>
 * </ul>
 *
 * <p>All files are read and written with the {@code "utf8"} charset name, and every
 * output line is terminated with a single {@code '\n'} regardless of platform.
 */
public class WikiClean {
    /** Path read by {@code toSimp()}. */
    static String infile = "../tmp/wiki_00";
    /** Path written by {@code toSimp()} and read by {@code seg()}. */
    static String simpfile = "../tmp/wiki_simp";
    /** Path written by {@code seg()} and read by {@code main}. */
    static String segfile = "../tmp/wiki_simp_seg";
    /** Path written by {@code main}. */
    static String segfile_mini = "../tmp/wiki_mini_simp_seg";

    /** Charset name used for every reader and writer in this class. */
    private static final String ENCODING = "utf8";
    /** Prefix of a line that opens a document. */
    private static final String DOC_OPEN = "<doc";
    /** Prefix of a line that closes a document. */
    private static final String DOC_CLOSE = "</doc>";
    /** Number of closing tags after which {@code main} stops copying. */
    private static final int MAX_DOCS = 100;

    /**
     * Copies a truncated view of {@link #segfile} into {@link #segfile_mini} and prints
     * the balance of opening minus closing document tags seen.
     *
     * <p>Empty lines and lines starting with {@code <doc} or {@code </doc>} are copied;
     * every other non-empty line is skipped. Copying stops when the 100th closing tag
     * is read.
     *
     * @param strArr command-line arguments; not used
     * @throws Exception if either file cannot be opened, read, or written
     */
    public static void main(String[] strArr) throws Exception {
        // try-with-resources guarantees both streams are closed (and the writer flushed)
        // even when reading or writing fails part-way through.
        try (BufferedReader reader = openReader(segfile);
                BufferedWriter writer = openWriter(segfile_mini)) {
            int openDocs = 0;
            int closedDocs = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.isEmpty()) {
                    if (line.startsWith(DOC_OPEN)) {
                        openDocs++;
                    } else if (line.startsWith(DOC_CLOSE)) {
                        openDocs--;
                        closedDocs++;
                        if (closedDocs == MAX_DOCS) {
                            // NOTE(review): the final closing tag is not written, so the
                            // output ends with an unclosed document -- confirm intended.
                            break;
                        }
                    } else {
                        // NOTE(review): body lines are dropped, so the output holds only
                        // tag lines and blank lines -- confirm this is intended.
                        continue;
                    }
                }
                writer.append(line);
                writer.append("\n");
            }
            System.out.println(openDocs);
        }
    }

    /**
     * Copies {@link #simpfile} to {@link #segfile}, replacing every non-empty line that
     * is not a document tag with the result of {@code CWSTagger.tag} on that line, then
     * prints the balance of opening minus closing document tags.
     *
     * @throws IOException if either file cannot be opened, read, or written
     * @throws LoadModelException declared for the {@code CWSTagger} constructor
     */
    private static void seg() throws IOException, LoadModelException {
        // The tagger is built before any file is opened, as in the original ordering.
        CWSTagger tagger = new CWSTagger("../models/seg.m");
        try (BufferedReader reader = openReader(simpfile);
                BufferedWriter writer = openWriter(segfile)) {
            int openDocs = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                String out = line;
                if (!line.isEmpty()) {
                    if (line.startsWith(DOC_OPEN)) {
                        openDocs++;
                    } else if (line.startsWith(DOC_CLOSE)) {
                        openDocs--;
                    } else {
                        out = tagger.tag(line);
                    }
                }
                writer.append(out);
                writer.append("\n");
            }
            System.out.println(openDocs);
        }
    }

    /**
     * Copies {@link #infile} to {@link #simpfile}, replacing every non-empty line that
     * is not a document tag with the result of {@code ChineseTrans.toSimp} on that line,
     * then prints the balance of opening minus closing document tags.
     *
     * @throws IOException if either file cannot be opened, read, or written
     */
    private static void toSimp() throws IOException {
        ChineseTrans translator = new ChineseTrans();
        try (BufferedReader reader = openReader(infile);
                BufferedWriter writer = openWriter(simpfile)) {
            int openDocs = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                String out = line;
                if (!line.isEmpty()) {
                    if (line.startsWith(DOC_OPEN)) {
                        openDocs++;
                    } else if (line.startsWith(DOC_CLOSE)) {
                        openDocs--;
                    } else {
                        out = translator.toSimp(line);
                    }
                }
                writer.append(out);
                writer.append("\n");
            }
            System.out.println(openDocs);
        }
    }

    /** Opens {@code path} for buffered reading using {@link #ENCODING}. */
    private static BufferedReader openReader(String path) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(path), ENCODING));
    }

    /** Opens {@code path} for buffered writing using {@link #ENCODING}, truncating any existing file. */
    private static BufferedWriter openWriter(String path) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), ENCODING));
    }
}
