Java实现敏感词检测的代码

2018-07-20    来源:open-open

容器云强势上线!快速搭建集群,上万Linux镜像随意使用

[Java]代码    

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;

/**
 * Finds configured "bad words" (sensitive words) inside a piece of text.
 *
 * <p>The word library is the classpath resource {@link #BAD_WORDS_LIB_FILE_NAME},
 * UTF-8 encoded, one word per line. Words are grouped into buckets by length and
 * a per-first-character bitmask records which lengths are worth probing, so a
 * scan only builds substrings that could actually be in the library.
 *
 * <p>Matching is case-insensitive: library words and the scanned text are both
 * lower-cased character by character (which never changes the string length).
 *
 * <p>All state is held in static fields and nothing here is synchronized.
 */
public class BadWordsUtil {

    /** Longest word, in chars, that the library accepts; longer entries are skipped. */
    public static final int WORDS_MAX_LENGTH = 10;

    /** Name of the classpath resource holding the word library. */
    public static final String BAD_WORDS_LIB_FILE_NAME = "badWords.txt";

    /** Highest bit usable in an {@code int} length mask without touching the sign bit. */
    private static final int MAX_MASK_BIT = 30;

    /**
     * Word buckets: element {@code n} maps every lower-cased library word of
     * length {@code n} to an empty string. {@code null} until the library is loaded.
     */
    public static Map[] badWordsList = null;

    /**
     * Word index: maps a word's first character to a bitmask in which bit
     * {@code n} is set when at least one library word of length {@code n}
     * starts with that character.
     */
    public static Map<String, Integer> wordIndex = new HashMap<String, Integer>();

    /**
     * Loads the word library into {@link #badWordsList} and {@link #wordIndex}.
     *
     * <p>The resource is read completely before any static state is touched, so
     * a failed load leaves the fields as they were and a later call can retry.
     * Blank lines and words longer than {@link #WORDS_MAX_LENGTH} are skipped.
     *
     * @throws IOException if the resource is missing (reported as
     *         {@link FileNotFoundException}) or cannot be read
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    public static void initbadWordsList() throws IOException {
        List<String> lines = readLibraryLines();

        if (badWordsList == null) {
            // One extra slot so that a word of exactly WORDS_MAX_LENGTH chars has a bucket.
            Map[] buckets = new Map[WORDS_MAX_LENGTH + 1];
            for (int i = 0; i < buckets.length; i++) {
                buckets[i] = new HashMap<String, String>();
            }
            badWordsList = buckets;
        }

        for (String line : lines) {
            String word = toLowerCaseChars(stripWord(line));
            int length = word.length();

            if (length == 0 || length > WORDS_MAX_LENGTH || length >= badWordsList.length) {
                continue;
            }

            Map bucket = badWordsList[length];
            if (bucket == null) {
                bucket = new HashMap<String, String>();
                badWordsList[length] = bucket;
            }
            bucket.put(word, "");

            // Index by the lower-cased first char so lookups agree with the buckets.
            String firstChar = word.substring(0, 1);
            Integer mask = wordIndex.get(firstChar);
            int updated = (mask == null ? 0 : mask.intValue()) | (1 << length);
            wordIndex.put(firstChar, updated);
        }
    }

    /**
     * Returns every occurrence of a library word in {@code content}.
     *
     * <p>Loads the library on first use. Hits are reported in order of start
     * position, shorter words first, and keep the casing of the input text.
     *
     * @param content text to scan; {@code null} is treated as empty
     * @return the matched substrings, possibly empty, never {@code null}
     * @throws RuntimeException wrapping the {@link IOException} if the lazy load fails
     */
    public static List<String> searchBanWords(String content) {
        if (badWordsList == null) {
            try {
                initbadWordsList();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        List<String> result = new ArrayList<String>();
        if (content == null || content.length() == 0) {
            return result;
        }

        Map[] buckets = badWordsList;
        String lowered = toLowerCaseChars(content);
        int total = content.length();

        for (int start = 0; start < total; start++) {
            Integer mask = wordIndex.get(lowered.substring(start, start + 1));
            if (mask == null) {
                continue;
            }

            // A candidate cannot be longer than the remaining text, the bucket array or the mask.
            int longest = Math.min(total - start, Math.min(buckets.length - 1, MAX_MASK_BIT));

            for (int length = 1; length <= longest; length++) {
                if ((mask.intValue() & (1 << length)) == 0) {
                    continue;
                }
                Map bucket = buckets[length];
                int end = start + length;
                if (bucket != null && bucket.containsKey(lowered.substring(start, end))) {
                    result.add(content.substring(start, end));
                }
            }
        }

        return result;
    }

    /** Reads all lines of the word-library resource as UTF-8. */
    private static List<String> readLibraryLines() throws IOException {
        InputStream in = BadWordsUtil.class.getClassLoader()
                                           .getResourceAsStream(BAD_WORDS_LIB_FILE_NAME);
        if (in == null) {
            throw new FileNotFoundException(
                "Bad-words library not found on classpath: " + BAD_WORDS_LIB_FILE_NAME);
        }

        List<String> lines = new ArrayList<String>();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } finally {
            in.close();
        }
        return lines;
    }

    /** Removes surrounding whitespace and a leading byte-order mark from a library line. */
    private static String stripWord(String line) {
        int start = 0;
        int end = line.length();
        while (start < end
                && (Character.isWhitespace(line.charAt(start)) || line.charAt(start) == '\uFEFF')) {
            start++;
        }
        while (end > start && Character.isWhitespace(line.charAt(end - 1))) {
            end--;
        }
        return line.substring(start, end);
    }

    /** Lower-cases each char individually, so the result has the same length as the input. */
    private static String toLowerCaseChars(String text) {
        char[] chars = text.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    /**
     * Small demo: loads the library, scans a sample sentence and prints the hits.
     *
     * @param args unused
     * @throws IOException if the word library cannot be loaded
     */
    public static void main(String[] args) throws IOException {
        String content = "含有敏感词的测试";
        BadWordsUtil.initbadWordsList();
        List<String> badWordList = BadWordsUtil.searchBanWords(content);
        if (badWordList.size() == 0) {
            System.out.println("没有找到敏感词!");
        } else {
            for (String s : badWordList) {
                System.out.println("找到敏感词:" + s);
            }
        }
    }
}

标签: ssl 代码

版权声明:本站部分文章来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点!
本站所提供的图片等素材,版权归原作者所有,如需使用,请与原作者联系。

上一篇:PHP的http请求处理类

下一篇:PHP判断网络文件是否存在