利用前缀树过滤敏感词

定义一个前缀树

public class Trie {

    // 根节点
    private TrieNode root;

    public Trie() {
        root = new TrieNode();
    }

    public TrieNode getRoot() {
        return root;
    }

    public void addWord(String word) {
        TrieNode cur = root;
        for (int i = 0; i < word.length(); i++) {
            TrieNode subNode = cur.getSubNode(word.charAt(i));
            if (subNode == null) {
                subNode = new TrieNode();
                cur.addSubNode(word.charAt(i), subNode);
            }
            cur = subNode;
        }
        cur.setEnd(true);
    }

    // 把和节点直接相关的方法都封装在了TrieNode里面
    static class TrieNode {
        boolean end;
        Map<Character, TrieNode> subNodes = new HashMap<>();
        TrieNode() {}

        public boolean isEnd() {
            return end;
        }

        public void setEnd(boolean end) {
            this.end = end;
        }

        // 添加子节点
        public void addSubNode(Character c, TrieNode node) {
            subNodes.put(c, node);
        }

        // 获取子节点
        public TrieNode getSubNode(Character c) {
            return subNodes.get(c);
        }

        // 是否有某个子节点
        public boolean hasSubNode(Character c) {
            return subNodes.get(c) != null;
        }
    }
}

接下来就是定义一个过滤器来使用前缀树

/**
 * Masks sensitive words in text using a {@link Trie} built from the classpath
 * resource {@code sensitive-words.txt} (one word per line, read as UTF-8).
 *
 * <p>Loading is best-effort: if the resource is missing or cannot be read, a warning
 * is logged and the filter works with whatever words were loaded so far (possibly none).
 */
public class SensitiveFilter {
    // Classpath resource that lists the sensitive words, one per line.
    private static final String WORDS_RESOURCE = "sensitive-words.txt";
    // Character written over every character of a matched sensitive word.
    private static final char REPLACEMENT = '*';
    // Fully-qualified on purpose so this class needs no additional import lines.
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(SensitiveFilter.class.getName());

    private final Trie trie;
    private final Trie.TrieNode rootNode;


    public SensitiveFilter() {
        trie = new Trie();
        rootNode = trie.getRoot();
        init();
    }

    // Reads the sensitive words from the resource file and stores them in the trie.
    private void init() {
        try (InputStream is = this.getClass().getClassLoader().getResourceAsStream(WORDS_RESOURCE)) {
            if (is == null) {
                // getResourceAsStream returns null (it does not throw) when the resource is absent.
                LOGGER.warning("Sensitive word resource not found on classpath: " + WORDS_RESOURCE);
                return;
            }
            // Decode explicitly as UTF-8 so the word list does not depend on the platform charset.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String word = line.trim();
                    // Skip blank lines: an empty word would only flag the root node.
                    if (!word.isEmpty()) {
                        trie.addWord(word);
                    }
                }
            }
        } catch (IOException e) {
            // Best-effort load: keep the filter usable, but do not hide the failure.
            LOGGER.log(java.util.logging.Level.WARNING,
                    "Failed to load sensitive word resource: " + WORDS_RESOURCE, e);
        }
    }

    /**
     * Replaces every character of each sensitive word found in {@code text} with
     * the replacement character.
     *
     * <p>The text is scanned left to right; at each position the longest sensitive
     * word starting there is masked and scanning resumes right after it.
     *
     * @param text the text to filter; may be {@code null}
     * @return the filtered text; the same {@code text} instance if nothing was
     *         replaced; the empty string if {@code text} is {@code null} or empty
     */
    public String filter(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }

        char[] chs = text.toCharArray();
        int n = chs.length;
        boolean replaced = false; // whether any replacement happened
        int idx = 0;
        while (idx < n) {
            int lastIdx = matchLen(text, idx, n);
            if (lastIdx != -1) {
                // lastIdx is inclusive, replaceChar takes an exclusive end.
                replaceChar(chs, idx, lastIdx + 1);
                replaced = true;
                idx = lastIdx + 1;
            } else {
                idx++;
            }
        }

        return replaced ? new String(chs) : text;
    }

    /**
     * Walks the trie to find a sensitive word that starts at {@code begin}.
     *
     * @param word  the string being checked
     * @param begin the start index
     * @param n     the length of the string
     *
     * @return the index of the last character of the longest sensitive word starting
     *         at {@code begin} (e.g. if both "ab" and "abc" are sensitive words, the
     *         index of 'c' rather than 'b'), or -1 if no sensitive word starts there
     */
    private int matchLen(String word, int begin, int n) {
        Trie.TrieNode curNode = rootNode;
        int ans = -1;
        while (begin < n && (curNode = curNode.getSubNode(word.charAt(begin))) != null) {
            if (curNode.isEnd()) {
                // Remember this end but keep walking in case a longer word also matches.
                ans = begin;
            }

            begin++;
        }
        return ans;
    }

    /**
     * Overwrites the range {@code [start, end)} of {@code chs} with the replacement
     * character ('*'). Note: start is inclusive, end is exclusive. Does nothing when
     * {@code start >= end}.
     *
     * @param chs   the array to modify in place
     * @param start the first index to overwrite (inclusive)
     * @param end   the index at which to stop (exclusive)
     */
    public void replaceChar(char[] chs, int start, int end) {
        while (start < end) {
            chs[start++] = REPLACEMENT;
        }
    }


}