package org.apdplat.word.corpus;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apdplat/word/corpus/ExtractText.class */
public class ExtractText {
    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractText.class);
    private static final AtomicInteger WORD_COUNT = new AtomicInteger();
    private static final AtomicInteger CHAR_COUNT = new AtomicInteger();

    public static void main(String[] strArr) {
        String str = " ";
        String str2 = strArr.length == 1 ? strArr[0] : "target/word.txt";
        if (strArr.length == 2) {
            str2 = strArr[0];
            str = strArr[1];
        }
        extractFromCorpus(str2, str, true);
    }

    public static void extractFromCorpus(String str, String str2, boolean z) {
        LOGGER.info("开始从语料库中抽取文本");
        long currentTimeMillis = System.currentTimeMillis();
        try {
            analyzeCorpus("src/main/resources/corpus/corpora.zip", str, str2, z);
        } catch (IOException e) {
            LOGGER.info("抽取失败：" + e.getMessage());
        }
        LOGGER.info("完成抽取，耗时：" + (System.currentTimeMillis() - currentTimeMillis) + "毫秒");
        LOGGER.info("抽取出的总字符数目为：" + CHAR_COUNT.get() + "，总词数目为：" + WORD_COUNT.get());
    }

    private static void analyzeCorpus(String str, String str2, final String str3, final boolean z) throws IOException {
        FileSystem newFileSystem = FileSystems.newFileSystem(Paths.get(str, new String[0]), ExtractText.class.getClassLoader());
        Throwable th = null;
        try {
            final BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(str2), "utf-8"));
            Throwable th2 = null;
            try {
                try {
                    for (Path path : newFileSystem.getRootDirectories()) {
                        LOGGER.info("处理目录：" + path);
                        Files.walkFileTree(path, new SimpleFileVisitor<Path>() { // from class: org.apdplat.word.corpus.ExtractText.1
                            @Override // java.nio.file.SimpleFileVisitor, java.nio.file.FileVisitor
                            public FileVisitResult visitFile(Path path2, BasicFileAttributes basicFileAttributes) throws IOException {
                                ExtractText.LOGGER.info("处理文件：" + path2);
                                Path path3 = Paths.get("target/corpus-" + System.currentTimeMillis() + ".txt", new String[0]);
                                Files.copy(path2, path3, StandardCopyOption.REPLACE_EXISTING);
                                ExtractText.extractText(path3, bufferedWriter, str3, z);
                                return FileVisitResult.CONTINUE;
                            }
                        });
                    }
                    if (bufferedWriter != null) {
                        if (0 != 0) {
                            try {
                                bufferedWriter.close();
                            } catch (Throwable th3) {
                                th2.addSuppressed(th3);
                            }
                        } else {
                            bufferedWriter.close();
                        }
                    }
                    if (newFileSystem != null) {
                        if (0 == 0) {
                            newFileSystem.close();
                            return;
                        }
                        try {
                            newFileSystem.close();
                        } catch (Throwable th4) {
                            th.addSuppressed(th4);
                        }
                    }
                } catch (Throwable th5) {
                    th2 = th5;
                    throw th5;
                }
            } catch (Throwable th6) {
                if (bufferedWriter != null) {
                    if (th2 != null) {
                        try {
                            bufferedWriter.close();
                        } catch (Throwable th7) {
                            th2.addSuppressed(th7);
                        }
                    } else {
                        bufferedWriter.close();
                    }
                }
                throw th6;
            }
        } catch (Throwable th8) {
            if (newFileSystem != null) {
                if (0 != 0) {
                    try {
                        newFileSystem.close();
                    } catch (Throwable th9) {
                        th.addSuppressed(th9);
                    }
                } else {
                    newFileSystem.close();
                }
            }
            throw th8;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    public static void extractText(Path path, BufferedWriter bufferedWriter, String str, boolean z) {
        String[] split;
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(path.toFile()), "utf-8"));
            Throwable th = null;
            while (true) {
                try {
                    try {
                        String readLine = bufferedReader.readLine();
                        if (readLine == null) {
                            break;
                        }
                        String trim = readLine.trim();
                        if (!"".equals(trim) && (split = trim.split("\\s+")) != null) {
                            StringBuilder sb = new StringBuilder();
                            int i = 0;
                            boolean z2 = false;
                            for (String str2 : split) {
                                String[] split2 = str2.split("/");
                                if (split2 != null && split2.length >= 1) {
                                    if (split2[0].trim().startsWith("[")) {
                                        z2 = true;
                                    }
                                    String trim2 = split2[0].replace("[", "").replace("]", "").trim();
                                    bufferedWriter.write(trim2 + str);
                                    if (z2) {
                                        sb.append(trim2);
                                        i++;
                                    }
                                    if (i > 10) {
                                        z2 = false;
                                        i = 0;
                                        sb.setLength(0);
                                    }
                                    if (z2 && split2.length > 1 && split2[1].trim().endsWith("]")) {
                                        z2 = false;
                                        if (z) {
                                            bufferedWriter.write(sb.toString() + str);
                                        }
                                        sb.setLength(0);
                                    }
                                    WORD_COUNT.incrementAndGet();
                                    CHAR_COUNT.addAndGet(trim2.length());
                                }
                            }
                            bufferedWriter.write("\n");
                        }
                    } catch (Throwable th2) {
                        th = th2;
                        throw th2;
                    }
                } finally {
                }
            }
            if (bufferedReader != null) {
                if (0 != 0) {
                    try {
                        bufferedReader.close();
                    } catch (Throwable th3) {
                        th.addSuppressed(th3);
                    }
                } else {
                    bufferedReader.close();
                }
            }
        } catch (Exception e) {
            LOGGER.info("从语料库 " + path + " 中抽取文本失败：", e);
        }
    }
}
