package com.hankcs.hanlp.mining.word;

import com.hankcs.hanlp.algorithm.MaxHeap;
import com.hankcs.hanlp.utility.LexiconUtility;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;

/* loaded from: classes2.dex */
/**
 * Extracts candidate "new words" from raw text.
 *
 * <p>Every substring of up to {@code max_word_len} characters that does not cross a
 * delimiter is collected as a candidate. Each candidate's statistics are computed by
 * {@link WordInfo}; candidates falling below the configured thresholds are dropped and
 * the remainder is ranked through a {@link MaxHeap}.
 *
 * <p>Instances hold only immutable configuration; all working state lives in locals of
 * {@link #discover(BufferedReader, int)}.
 */
public class NewWordDiscover {
    /**
     * Runs of whitespace, digits and ASCII/CJK punctuation that separate text segments.
     * Compiled once: {@link Pattern} instances are immutable and safe to share.
     */
    private static final Pattern DELIMITER = Pattern.compile("[\\s\\d,.<>/?:;'\"\\[\\]{}()\\|~!@#$%^&*\\-_=+，。《》、？：；“”‘’｛｝【】（）…￥！—┄－]+");

    /** Replacement written in place of every delimiter run (a single NUL character). */
    private static final String BOUNDARY_REPLACEMENT = "\u0000";

    /** Sentinel character marking a segment boundary or the start/end of a line. */
    private static final char BOUNDARY = '\0';

    /** When true, candidates for which {@code LexiconUtility.getFrequency} is positive are dropped. */
    private final boolean filter;
    /** Maximum candidate length, in characters. */
    private final int max_word_len;
    /** Candidates whose {@code aggregation} is below this value are dropped. */
    private final float min_aggregation;
    /** Candidates whose {@code entropy} is below this value are dropped. */
    private final float min_entropy;
    /** Candidates whose {@code f34192p} value is below this threshold are dropped. */
    private final float min_freq;

    /**
     * Creates a discoverer with the defaults: maximum word length 4, minimum frequency
     * 5.0E-5, minimum entropy 0.4, minimum aggregation 1.2 and lexicon filtering off.
     */
    public NewWordDiscover() {
        this(4, 5.0E-5f, 0.4f, 1.2f, false);
    }

    /**
     * Creates a discoverer with explicit thresholds.
     *
     * @param i8  maximum candidate length in characters ({@code max_word_len})
     * @param f9  minimum frequency threshold ({@code min_freq})
     * @param f10 minimum entropy threshold ({@code min_entropy})
     * @param f11 minimum aggregation threshold ({@code min_aggregation})
     * @param z8  whether to drop candidates that {@link LexiconUtility#getFrequency}
     *            reports with a positive frequency ({@code filter})
     */
    public NewWordDiscover(int i8, float f9, float f10, float f11, boolean z8) {
        this.max_word_len = i8;
        this.min_freq = f9;
        this.min_entropy = f10;
        this.min_aggregation = f11;
        this.filter = z8;
    }

    /**
     * Reads the text line by line and returns the surviving candidate words.
     *
     * @param bufferedReader source of the text; read until {@code readLine()} returns
     *                       {@code null}. The reader is not closed by this method.
     * @param i8             size passed to the {@link MaxHeap} that ranks the results;
     *                       presumably the maximum number of words returned
     *                       (NOTE(review): confirm against MaxHeap).
     * @return the list produced by {@code MaxHeap.toList()} for the accepted candidates,
     *         ranked by their {@code f34192p} value
     * @throws IOException if reading from {@code bufferedReader} fails
     */
    public List<WordInfo> discover(BufferedReader bufferedReader, int i8) throws IOException {
        // TreeMap keeps candidates ordered by text, so later passes iterate deterministically.
        Map<String, WordInfo> candidates = new TreeMap<>();
        int totalLength = collectCandidates(bufferedReader, candidates);

        // Two separate passes: computeAggregation receives the whole candidate map, so
        // every candidate's probability/entropy pass is completed first.
        for (WordInfo candidate : candidates.values()) {
            candidate.computeProbabilityEntropy(totalLength);
        }
        for (WordInfo candidate : candidates.values()) {
            candidate.computeAggregation(candidates);
        }

        List<WordInfo> accepted = new LinkedList<>();
        for (WordInfo candidate : candidates.values()) {
            if (!isRejected(candidate)) {
                accepted.add(candidate);
            }
        }

        // NOTE(review): assumes MaxHeap is declared generic (MaxHeap<E>) - confirm.
        MaxHeap<WordInfo> ranked = new MaxHeap<>(i8, new Comparator<WordInfo>() {
            @Override
            public int compare(WordInfo left, WordInfo right) {
                return Float.compare(left.f34192p, right.f34192p);
            }
        });
        ranked.addAll(accepted);
        return ranked.toList();
    }

    /**
     * Convenience overload that runs discovery over an in-memory string.
     *
     * @param str the text to analyse
     * @param i8  size passed to the ranking {@link MaxHeap}
     * @return the discovered words, as for {@link #discover(BufferedReader, int)}
     * @throws RuntimeException wrapping any {@link IOException} raised while reading
     */
    public List<WordInfo> discover(String str, int i8) {
        try {
            return discover(new BufferedReader(new StringReader(str)), i8);
        } catch (IOException e9) {
            throw new RuntimeException(e9);
        }
    }

    /**
     * Registers every delimiter-free substring of each line in {@code candidates} and
     * records its left/right neighbour characters.
     *
     * @return the summed length of all lines after delimiter replacement
     */
    private int collectCandidates(BufferedReader reader, Map<String, WordInfo> candidates) throws IOException {
        int totalLength = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            String text = DELIMITER.matcher(line).replaceAll(BOUNDARY_REPLACEMENT);
            int length = text.length();
            for (int start = 0; start < length; start++) {
                char leftNeighbour = start == 0 ? BOUNDARY : text.charAt(start - 1);
                int limit = Math.min(start + 1 + this.max_word_len, length + 1);
                for (int end = start + 1; end < limit; end++) {
                    // Windows grow one character at a time, so only the newest character
                    // can introduce a boundary; once it does, every longer window from
                    // this start contains it too and can be skipped.
                    if (text.charAt(end - 1) == BOUNDARY) {
                        break;
                    }
                    String word = text.substring(start, end);
                    WordInfo info = candidates.get(word);
                    if (info == null) {
                        info = new WordInfo(word);
                        candidates.put(word, info);
                    }
                    char rightNeighbour = end < length ? text.charAt(end) : BOUNDARY;
                    info.update(leftNeighbour, rightNeighbour);
                }
            }
            totalLength += length;
        }
        return totalLength;
    }

    /**
     * Tells whether a candidate fails any configured threshold.
     *
     * <p>The comparisons are kept as "below threshold rejects" (rather than inverted)
     * so that NaN statistics behave exactly as before, and the lexicon lookup stays
     * last so it only runs for candidates that passed every cheaper check.
     */
    private boolean isRejected(WordInfo info) {
        if (info.text.trim().length() < 2) {
            return true;
        }
        if (info.f34192p < this.min_freq || info.entropy < this.min_entropy || info.aggregation < this.min_aggregation) {
            return true;
        }
        return this.filter && LexiconUtility.getFrequency(info.text) > 0;
    }
}
