敏感词DFA算法,数据库或者txt文档构建敏感词库,未对特殊符号筛选
内容导读
互联网集市收集整理的这篇技术教程文章主要介绍了敏感词DFA算法,数据库或者txt文档构建敏感词库,未对特殊符号筛选,小编现在分享给大家,供广大互联网技能从业者学习和参考。文章包含14147字,纯文字阅读大概需要21分钟。
内容图文
![敏感词DFA算法,数据库或者txt文档构建敏感词库,未对特殊符号筛选](/upload/InfoBanner/zyjiaocheng/918/eb984c1c8b0f412bb629b5fbc815a39d.jpg)
1、DFA算法过滤 import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; /** * @Description: DFA算法进行敏感词过滤 * @Author: gyw * @CreateDate: 2018/11/27 14:09 */ public class SensitiveWordUtil { /** * 敏感词匹配规则 */ public static final int MinMatchTYpe = 1; //最小匹配规则,如:敏感词库["四川","四川人"],语句:"我是四川人",匹配结果:我是[四川]人 public static final int MaxMatchType = 2; //最大匹配规则,如:敏感词库["四川","四川人"],语句:"我是四川人",匹配结果:我是[四川人] /** * 敏感词集合 */ public static HashMap sensitiveWordMap; /** * 初始化敏感词库,构建DFA算法模型 * * @param sensitiveWordSet 敏感词库 */ public static synchronized void init(Set<String> sensitiveWordSet) { initSensitiveWordMap(sensitiveWordSet); } /** * 初始化敏感词库,构建DFA算法模型--存储在txt文档中 * * @param sensitiveWordSet 敏感词库 */ private static void initSensitiveWordMap(Set<String> sensitiveWordSet) { //初始化敏感词容器,减少扩容操作 sensitiveWordMap = new HashMap(sensitiveWordSet.size()); String key; Map nowMap; Map<String, String> newWorMap = new HashMap<>(); //迭代sensitiveWordSet Iterator<String> iterator = sensitiveWordSet.iterator(); while (iterator.hasNext()) { //关键字 key = iterator.next(); nowMap = sensitiveWordMap; SensitiveWordUtil.getNowMap(key,nowMap,newWorMap); } } /** * 初始化敏感词库,构建DFA算法模型---以类的形式存储数据库 * @param list 敏感词库 * @param tClass (T自定义类) * @param fieldName 敏感词库在类中的列(注意,工具里面会利用这个fieldName生成对应的get方法,所以,该类中必须有对应的Get方法) * @return */ public static <T> void extractItemFromSet(List<T> list, Class<T> tClass, String fieldName) { //初始化敏感词容器,减少扩容操作 sensitiveWordMap = new HashMap(list.size()); Map nowMap; Map<String, String> newWorMap = new HashMap<>(); String key = ""; Method method = null; //获得方法名 getXXX String methodName = "get" + fieldName.substring(0, 1).toUpperCase() + fieldName.substring(1); try { //通过给定的类名,生成类的实例,然后取得这个实例的指定方法名的引用。 method = (tClass.newInstance()).getClass().getMethod(methodName); for(T t:list) { //执行对象t的特定方法(前面取得的方法名method) key = (String) method.invoke(t); nowMap = sensitiveWordMap; SensitiveWordUtil.getNowMap(key,nowMap,newWorMap); } } catch 
(NoSuchMethodException e) { e.printStackTrace(); } catch (InstantiationException e) { e.printStackTrace(); } catch (IllegalAccessException e){ e.printStackTrace(); }catch (InvocationTargetException e) { e.printStackTrace(); } } /** * 将当前词组改为树形 * @param key * @param nowMap * @param newWorMap */ private static void getNowMap(String key,Map nowMap,Map<String, String> newWorMap){ for (int i = 0; i < key.length(); i++) { //转换成char型 char keyChar = key.charAt(i); //库中获取关键字 Object wordMap = nowMap.get(keyChar); //如果存在该key,直接赋值,用于下一个循环获取 if (wordMap != null) { nowMap = (Map) wordMap; } else { //不存在则,则构建一个map,同时将isEnd设置为0,因为他不是最后一个 newWorMap = new HashMap<>(); //不是最后一个 newWorMap.put("isEnd", "0"); nowMap.put(keyChar, newWorMap); nowMap = newWorMap; } if (i == key.length() - 1) { //最后一个 nowMap.put("isEnd", "1"); } } } /** * 判断文字是否包含敏感字符 * * @param txt 文字 * @param matchType 匹配规则 1:最小匹配规则,2:最大匹配规则 * @return 若包含返回true,否则返回false */ public static boolean contains(String txt, int matchType) { boolean flag = false; for (int i = 0; i < txt.length(); i++) { int matchFlag = checkSensitiveWord(txt, i, matchType); //判断是否包含敏感字符 if (matchFlag > 0) { //大于0存在,返回true flag = true; } } return flag; } /** * 判断文字是否包含敏感字符 * * @param txt 文字 * @return 若包含返回true,否则返回false */ public static boolean contains(String txt) { return contains(txt, MaxMatchType); } /** * 获取文字中的敏感词 * * @param txt 文字 * @param matchType 匹配规则 1:最小匹配规则,2:最大匹配规则 * @return */ public static Set<String> getSensitiveWord(String txt, int matchType) { Set<String> sensitiveWordList = new HashSet<>(); for (int i = 0; i < txt.length(); i++) { //判断是否包含敏感字符 int length = checkSensitiveWord(txt, i, matchType); if (length > 0) {//存在,加入list中 sensitiveWordList.add(txt.substring(i, i + length)); i = i + length - 1;//减1的原因,是因为for会自增 } } return sensitiveWordList; } /** * 获取文字中的敏感词 * * @param txt 文字 * @return */ public static Set<String> getSensitiveWord(String txt) { return getSensitiveWord(txt, MaxMatchType); } /** * 替换敏感字字符 * * @param txt 文本 * 
@param replaceChar 替换的字符,匹配的敏感词以字符逐个替换,如 语句:我爱四川人 敏感词:四川人,替换字符:*, 替换结果:我爱*** * @param matchType 敏感词匹配规则 * @return */ public static String replaceSensitiveWord(String txt, char replaceChar, int matchType) { String resultTxt = txt; //获取所有的敏感词 Set<String> set = getSensitiveWord(txt, matchType); Iterator<String> iterator = set.iterator(); String word; String replaceString; while (iterator.hasNext()) { word = iterator.next(); replaceString = getReplaceChars(replaceChar, word.length()); resultTxt = resultTxt.replaceAll(word, replaceString); } return resultTxt; } /** * 替换敏感字字符 * * @param txt 文本 * @param replaceChar 替换的字符,匹配的敏感词以字符逐个替换,如 语句:我爱四川人 敏感词:四川人,替换字符:*, 替换结果:我爱*** * @return */ public static String replaceSensitiveWord(String txt, char replaceChar) { return replaceSensitiveWord(txt, replaceChar, MaxMatchType); } /** * 替换敏感字字符 * * @param txt 文本 * @param replaceStr 替换的字符串,匹配的敏感词以字符逐个替换,如 语句:我爱四川人 敏感词:四川人,替换字符串:[屏蔽],替换结果:我爱[屏蔽] * @param matchType 敏感词匹配规则 * @return */ public static String replaceSensitiveWord(String txt, String replaceStr, int matchType) { String resultTxt = txt; //获取所有的敏感词 Set<String> set = getSensitiveWord(txt, matchType); Iterator<String> iterator = set.iterator(); String word; while (iterator.hasNext()) { word = iterator.next(); resultTxt = resultTxt.replaceAll(word, replaceStr); } return resultTxt; } /** * 替换敏感字字符 * * @param txt 文本 * @param replaceStr 替换的字符串,匹配的敏感词以字符逐个替换,如 语句:我爱四川人 敏感词:四川人,替换字符串:[屏蔽],替换结果:我爱[屏蔽] * @return */ public static String replaceSensitiveWord(String txt, String replaceStr) { return replaceSensitiveWord(txt, replaceStr, MaxMatchType); } /** * 获取替换字符串 * * @param replaceChar * @param length * @return */ private static String getReplaceChars(char replaceChar, int length) { String resultReplace = String.valueOf(replaceChar); for (int i = 1; i < length; i++) { resultReplace += replaceChar; } return resultReplace; } /** * 检查文字中是否包含敏感字符,检查规则如下:<br> * * @param txt * @param beginIndex * @param matchType * @return 
如果存在,则返回敏感词字符的长度,不存在返回0 */ private static int checkSensitiveWord(String txt, int beginIndex, int matchType) { //敏感词结束标识位:用于敏感词只有1位的情况 boolean flag = false; //匹配标识数默认为0 int matchFlag = 0; char word; Map nowMap = sensitiveWordMap; for (int i = beginIndex; i < txt.length(); i++) { word = txt.charAt(i); //获取指定key nowMap = (Map) nowMap.get(word); if (nowMap != null) {//存在,则判断是否为最后一个 //找到相应key,匹配标识+1 matchFlag++; //如果为最后一个匹配规则,结束循环,返回匹配标识数 if ("1".equals(nowMap.get("isEnd"))) { //结束标志位为true flag = true; //最小规则,直接返回,最大规则还需继续查找 if (MinMatchTYpe == matchType) { break; } } } else {//不存在,直接返回 break; } } if (matchFlag < 2 || !flag) {//长度必须大于等于1,为词 matchFlag = 0; } return matchFlag; } public static void main(String[] args) { /*Set<String> sensitiveWordSet = new HashSet<>(); sensitiveWordSet.add("太多"); sensitiveWordSet.add("爱恋"); sensitiveWordSet.add("静静"); sensitiveWordSet.add("哈哈"); sensitiveWordSet.add("啦啦"); sensitiveWordSet.add("感动"); sensitiveWordSet.add("发呆");*/ Long start = System.currentTimeMillis(); //初始化敏感词库 SensitiveWordUtil.init(ReadTxtUtil.words); System.out.println("敏感词的数量:" + SensitiveWordUtil.sensitiveWordMap.size()); String string = "太多的伤感情怀也许只局限于饲养基地 荧幕中的情节。" + "然后我们的扮演的角色就是跟随着主人公的喜红客联盟 怒哀乐而过于牵强的把自己的情感也附加于银幕情节中,然后感动就流泪," + "难过就躺在某一个人的怀里尽情的阐述心扉或者手机卡复制器一个贱人一杯红酒一部电影在夜 深人静的晚上,关上电话静静的发呆着。"; System.out.println("待检测语句字数:" + string.length()); String other = "江泽民和孙中山去山上见毛泽东,商量党委书记是毛泽东还是蒋介石;胡(锦)涛和胡锦三有啥区别"; //是否含有关键字 boolean result = SensitiveWordUtil.contains(other); System.out.println(result); result = SensitiveWordUtil.contains(other, SensitiveWordUtil.MinMatchTYpe); System.out.println(result); //获取语句中的敏感词 Set<String> set = SensitiveWordUtil.getSensitiveWord(other); System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set); set = SensitiveWordUtil.getSensitiveWord(other, SensitiveWordUtil.MinMatchTYpe); System.out.println("语句中包含敏感词的个数为:" + set.size() + "。包含:" + set); //替换语句中的敏感词 String filterStr = SensitiveWordUtil.replaceSensitiveWord(other, '*'); 
System.out.println(filterStr); filterStr = SensitiveWordUtil.replaceSensitiveWord(other, '*', SensitiveWordUtil.MinMatchTYpe); System.out.println(filterStr); String filterStr2 = SensitiveWordUtil.replaceSensitiveWord(other, "[*敏感词*]"); System.out.println(filterStr2); filterStr2 = SensitiveWordUtil.replaceSensitiveWord(other, "[*敏感词*]", SensitiveWordUtil.MinMatchTYpe); System.out.println(filterStr2); System.out.println((System.currentTimeMillis()-start)/1000); } }
2、txt文档操作
import java.io.*;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Reads and maintains a plain-text dictionary file holding one word per line.
 *
 * <p>All reading and writing is done in UTF-8 so that the file stays readable by
 * {@link #readTxtByLine(String)} regardless of the platform default charset. I/O failures are
 * logged and otherwise ignored (best effort), as none of the editing methods report errors to
 * the caller.
 *
 * @author gyw
 * @since 2018/11/27
 */
public class ReadTxtUtil {

    private static final Logger LOG = Logger.getLogger(ReadTxtUtil.class.getName());

    /** Byte order mark as it appears after decoding a UTF-8 file that starts with EF BB BF. */
    private static final char BOM = '\uFEFF';

    /** Name of the dictionary file looked up in the classpath root directory. */
    private static final String DICTIONARY_FILE = "dictionary.txt";

    /** Path of the dictionary file (by default inside the classpath root / resources folder). */
    public static String filePath = defaultDictionaryPath();

    /** Words loaded from {@link #filePath} when the class is initialised. */
    public static Set<String> words;

    // Load the dictionary once, as soon as the class is first used.
    static {
        ReadTxtUtil.words = readTxtByLine(filePath);
    }

    /**
     * Resolves the default dictionary location.
     *
     * @return path of {@code dictionary.txt} in the classpath root directory, or the relative
     *         path {@code dictionary.txt} when no such directory can be resolved
     */
    private static String defaultDictionaryPath() {
        URL root = ReadTxtUtil.class.getResource("/");
        if (root == null) {
            // No classpath root directory available; fall back to the working directory
            // instead of failing class initialisation.
            return DICTIONARY_FILE;
        }
        try {
            // Going through the URI decodes escapes such as %20 in the directory name.
            return new File(new File(root.toURI()), DICTIONARY_FILE).getPath();
        } catch (URISyntaxException | IllegalArgumentException e) {
            return root.getPath() + DICTIONARY_FILE;
        }
    }

    /**
     * Guesses the encoding of a file from its first bytes.
     *
     * @param path file to inspect
     * @return {@code "UTF-16"} for a leading FF FE, {@code "Unicode"} for a leading FE FF,
     *         {@code "UTF-8"} for a leading EF BB BF, otherwise {@code "gb2312"}
     * @throws Exception if the file cannot be opened or read
     */
    public static String resolveCode(String path) throws Exception {
        byte[] head = new byte[3];
        try (InputStream inputStream = new FileInputStream(path)) {
            // A single read() may deliver fewer bytes than requested, so fill the buffer in a loop.
            int filled = 0;
            while (filled < head.length) {
                int n = inputStream.read(head, filled, head.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        }
        String code = "gb2312"; // or GBK
        if (head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            code = "UTF-16";
        } else if (head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            code = "Unicode";
        } else if (head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF) {
            code = "UTF-8";
        }
        System.out.println(code);
        return code;
    }

    /**
     * Reads a UTF-8 text file into a set, one entry per non-empty line. A leading byte order
     * mark is dropped so it does not become part of the first word.
     *
     * @param path file to read
     * @return the lines read; empty if the file does not exist or cannot be read
     */
    public static Set<String> readTxtByLine(String path) {
        Set<String> keyWordSet = new HashSet<String>();
        File file = new File(path);
        if (!file.exists()) {
            return keyWordSet;
        }
        // An explicit charset is required: relying on the platform default garbles the words
        // when the application runs in a web container.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    if (!line.isEmpty() && line.charAt(0) == BOM) {
                        line = line.substring(1);
                    }
                }
                if (!line.isEmpty()) {
                    keyWordSet.add(line);
                }
            }
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Cannot read " + path, e);
        }
        return keyWordSet;
    }

    /**
     * Appends one line to a file, creating the file if it does not exist.
     *
     * @param filePath file to append to
     * @param txt      text of the new line
     */
    public static void addTxtByLine(String filePath, String txt) {
        try (Writer writer = new OutputStreamWriter(
                new FileOutputStream(filePath, true), StandardCharsets.UTF_8)) {
            writer.write(String.valueOf((Object) txt));
            writer.write(System.getProperty("line.separator"));
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Cannot append to " + filePath, e);
        }
    }

    /**
     * Appends a bare line separator to a file.
     *
     * @param filePath file to append to
     * @throws IOException declared for compatibility; failures are logged, not propagated
     */
    public static void addTxtLine(String filePath) throws IOException {
        // The second constructor argument opens the file in append mode.
        try (FileWriter writer = new FileWriter(filePath, true)) {
            writer.write(System.getProperty("line.separator"));
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Cannot append to " + filePath, e);
        }
    }

    /**
     * Replaces every line that equals {@code oldStr} with {@code replaceStr}; all other lines
     * are kept, each on its own line.
     *
     * @param oldStr     line content to look for
     * @param replaceStr new content for the matching lines
     * @param filePath   file to rewrite
     */
    public static void replaceTxtByStr(String oldStr, String replaceStr, String filePath) {
        rewriteMatchingLines(filePath, oldStr, replaceStr, false);
    }

    /**
     * Removes every line that equals {@code oldStr}; all other lines are kept.
     *
     * @param oldStr   line content to look for
     * @param filePath file to rewrite
     */
    public static void delTxtByStr(String oldStr, String filePath) {
        rewriteMatchingLines(filePath, oldStr, null, true);
    }

    /**
     * Reads the whole file, replaces or drops the lines equal to {@code oldStr}, and writes the
     * result back. A leading byte order mark is ignored for the comparison and written back
     * unchanged.
     *
     * @param filePath    file to rewrite
     * @param oldStr      line content to look for
     * @param replacement new content for matching lines (ignored when {@code delete} is set)
     * @param delete      {@code true} to drop matching lines instead of replacing them
     */
    private static void rewriteMatchingLines(
            String filePath, String oldStr, String replacement, boolean delete) {
        File file = new File(filePath);
        String lineSeparator = System.getProperty("line.separator");
        StringBuilder buf = new StringBuilder();
        try {
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
                boolean firstLine = true;
                String line;
                while ((line = br.readLine()) != null) {
                    if (firstLine) {
                        firstLine = false;
                        if (!line.isEmpty() && line.charAt(0) == BOM) {
                            buf.append(BOM);
                            line = line.substring(1);
                        }
                    }
                    if (!line.equals(oldStr)) {
                        buf.append(line).append(lineSeparator);
                    } else if (!delete) {
                        // The separator keeps the replacement from merging with the next line.
                        buf.append(replacement).append(lineSeparator);
                    }
                }
            }
            // The reader is closed before the same file is opened for writing.
            try (Writer writer = new OutputStreamWriter(
                    new FileOutputStream(file), StandardCharsets.UTF_8)) {
                writer.write(buf.toString());
            }
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Cannot rewrite " + filePath, e);
        }
    }

    /**
     * Manual playground for the editing methods.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // ReadTxtUtil.addTxtByLine(filePath, "word");               // add
        // ReadTxtUtil.replaceTxtByStr("word", "other", filePath);   // modify
        // ReadTxtUtil.delTxtByStr("other", filePath);               // delete
    }
}
内容总结
以上是互联网集市为您收集整理的《敏感词DFA算法,数据库或者txt文档构建敏感词库,未对特殊符号筛选》的全部内容,希望本文能够帮您解决在实现敏感词DFA过滤、构建敏感词库时所遇到的程序开发问题。如果觉得互联网集市技术教程内容还不错,欢迎将互联网集市网站推荐给程序员好友。
内容备注
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容,请发送邮件至 gblab@vip.qq.com 举报,一经查实,本站将立刻删除。
内容手机端
扫描二维码推送至手机访问。