A robust search system needs a sensitive-word filtering module. There are many ways to implement one: the most primitive scans the query from start to end for each sensitive word, and a slightly more advanced variant relies on regular-expression matching, but both are crude and slow. The classic design is built on the DFA algorithm.
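For comparison, here is a minimal sketch of the naive approach (the class and method names are hypothetical): every query is checked against the full word list, so the cost grows with the number of sensitive words.

import java.util.List;

final class NaiveFilter {
    // Naive filtering: test every sensitive word against the query with String.contains.
    // Simple, but each query pays for the entire word list, which is why it does not scale.
    static boolean containsSensitiveWord(String query, List<String> sensitiveWords) {
        for (String word : sensitiveWords) {
            if (query.contains(word)) {
                return true;
            }
        }
        return false;
    }
}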
The DFA Algorithm
Among algorithms for text filtering, DFA is about the only one that performs well. A DFA (Deterministic Finite Automaton) derives the next state from the current state and an incoming event, i.e. event + state = next_state.
Put simply, the system consists of a finite number of nodes, and each incoming event decides which edge to follow to the next node.
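In the implementation below, the DFA is stored as nested HashMaps: each character maps to its child map, and an "isEnd" entry marks whether a complete sensitive word terminates at that node. As an illustration, assuming the two sensitive words "abc" and "abd" (placeholders, not entries from a real word table), the structure would look roughly like:

{
  'a': {
    "isEnd": "0",
    'b': {
      "isEnd": "0",
      'c': { "isEnd": "1" },
      'd': { "isEnd": "1" }
    }
  }
}

Checking a query then walks this structure one character at a time, so the cost per position is independent of the number of sensitive words.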
Java Implementation
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.soyoung.as.dao.search.entity.TbSensitiveWord;
import com.soyoung.as.dao.search.mapper.TbSensitiveWordMapper;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
* Sensitive word manager: loads the word table and exposes DFA-based matching.
**/
@Component
@Slf4j
public class SensitiveWordManager implements InitializingBean {
// Sensitive word dictionary: nested HashMaps, one level per character
private Map<Character, Object> sensitiveWordMap = new HashMap<>();
private Lock lock = new ReentrantLock();
@Resource
private TbSensitiveWordMapper sensitiveWordMapper;
@Override
public void afterPropertiesSet() throws Exception {
loadSensitiveWordData();
}
/**
* Load the sensitive word data (refreshed every 10 minutes by the scheduler below)
*/
@Scheduled(cron = "0 0/10 * * * ?")
public void loadSensitiveWordData() {
lock.lock();
try {
int count = 0;
try {
count = sensitiveWordMapper.count();
} catch (Throwable e) {
log.error("loadSensitiveWordData count error", e);
}
if (count <= 0) {
log.warn("loadSensitiveWordData count le 0");
return;
}
List<SensitiveWord> allList = Lists.newArrayListWithExpectedSize(count);
int cursor = 0;
int fetchSize = 1000;
boolean loop = true;
while (loop) {
try {
List<TbSensitiveWord> list = sensitiveWordMapper.findListByCursor(cursor, fetchSize);
if (CollectionUtils.isNotEmpty(list)) {
int size = list.size();
if (size < fetchSize) {
loop = false;
}
cursor = list.get(size - 1).getId();
for (TbSensitiveWord item : list) {
String word = item.getWord();
if (StringUtils.isBlank(word)) {
continue;
}
allList.add(new SensitiveWord(word));
}
} else {
loop = false;
}
} catch (Throwable throwable) {
log.error("loadSensitiveWordData error", throwable);
return;
}
}
if (CollectionUtils.isEmpty(allList)) {
log.warn("loadSensitiveWordData count le 0");
return;
}
try {
buildSensitiveWord2HashMap(allList);
} catch (Exception e) {
log.error("buildSensitiveWord2HashMap error", e);
}
} finally {
lock.unlock();
}
}
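/**
* Builds the nested-HashMap DFA from the word list, then swaps it in with a single
* reference assignment so readers never observe a half-built dictionary.
*/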
private void buildSensitiveWord2HashMap(List<SensitiveWord> sensitiveWordList) {
if (CollectionUtils.isEmpty(sensitiveWordList)) {
return;
}
Map<Character, Object> map = Maps.newHashMapWithExpectedSize(sensitiveWordList.size());
// Walks the nested map that stores the dictionary
Map nowMap = null;
// Helper map used to build new branches
Map newWordMap = null;
Iterator<SensitiveWord> iterator = sensitiveWordList.iterator();
while (iterator.hasNext()) {
String word = iterator.next().getWord();
// Start each word from the root; nowMap and map reference the same HashMap, so changes made through nowMap are visible in map
nowMap = map;
int querySize = word.length();
for (int i = 0; i < querySize; i++) {
char keyChar = word.charAt(i);
// Check whether this character already exists at the current level of the dictionary
Object wordMap = nowMap.get(keyChar);
if (wordMap != null) {
nowMap = (Map) wordMap;
} else {
// Not present: create a new child map
newWordMap = new HashMap<>();
// Tentatively mark it as not the end of a word
newWordMap.put("isEnd", "0");
nowMap.put(keyChar, newWordMap);
nowMap = newWordMap;
}
// If this is the last character of the word, mark the node as an end node
if (i == querySize - 1) {
nowMap.put("isEnd", "1");
}
}
}
// Reference assignment is atomic: swap in the newly built dictionary
sensitiveWordMap = map;
}
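/**
* Checks whether a sensitive word starts at beginIndex in txt.
*
* @param txt        the text to scan
* @param beginIndex the index to start matching from
* @param matchType  1 = minimum match (stop at the first complete word), 2 = maximum match (keep extending)
* @return the number of characters matched when a sensitive word of length >= 2 is found, otherwise 0
*/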
private int checkSensitiveWord(String txt, int beginIndex, int matchType) {
// End-of-word flag: set once a node marked isEnd is reached
boolean flag = false;
// Number of characters matched so far, 0 by default
int matchFlag = 0;
char word;
Map nowMap = sensitiveWordMap;
for (int i = beginIndex; i < txt.length(); i++) {
word = txt.charAt(i);
// Look up the child node for this character
nowMap = (Map) nowMap.get(word);
if (nowMap != null) { // the character exists in the dictionary; check whether it ends a word
// Matched one more character, increment the match count
matchFlag++;
// If a sensitive word ends at this node, we may stop and return the match count
if ("1".equals(nowMap.get("isEnd"))) {
// A complete word has been matched
flag = true;
// Minimum match rule: return immediately; maximum match rule: keep extending
if (1 == matchType) {
break;
}
}
} else {
// No child for this character: stop matching
break;
}
}
if (matchFlag < 2 || !flag) { // require at least two matched characters to count as a word
matchFlag = 0;
}
return matchFlag;
}
/**
* Checks whether the text contains any sensitive word.
*
* @param query     the text to check
* @param matchType match rule: 1 = minimum match, 2 = maximum match
* @return true if a sensitive word is found, false otherwise
*/
public boolean contains(String query, int matchType) {
if (StringUtils.isBlank(query)) {
return false;
}
for (int i = 0; i < query.length(); i++) {
// Check whether a sensitive word starts at position i
int matchFlag = checkSensitiveWord(query, i, matchType);
if (matchFlag > 0) { // a positive match length means a sensitive word was found
return true;
}
}
return false;
}
@Data
@NoArgsConstructor
public static class SensitiveWord {
private String word;
public SensitiveWord(String word) {
this.word = word;
}
}
}
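A minimal usage sketch, assuming the manager is injected by Spring; SearchService and doSearch are hypothetical names, not part of the code above:

import javax.annotation.Resource;
import org.springframework.stereotype.Service;

@Service
public class SearchService {

    @Resource
    private SensitiveWordManager sensitiveWordManager;

    public String search(String query) {
        // A yes/no check only needs the minimum match rule (matchType = 1)
        if (sensitiveWordManager.contains(query, 1)) {
            // Reject the query, or fall back to a sanitized version
            return "";
        }
        return doSearch(query);
    }

    // Hypothetical placeholder for the normal search pipeline
    private String doSearch(String query) {
        return "results for " + query;
    }
}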