/***
* Wu-Manber多模式关键词快速检测(过滤)算法
* @author Ivan Ji 2011.08.18
*/
import java.util.Vector;
public class WuManber {
private int B=2;//块字符X的长度(模式串后缀字符的个数)
private boolean initFlag = false;//是否初始化
private UnionPatternSet unionPatternSet = new UnionPatternSet();
private int maxIndex = (int) java.lang.Math.pow(2, 16);
private int shiftTable[] = new int[maxIndex];
public Vector <AtomicPattern> hashTable[] = new Vector[maxIndex];
private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet();
public WuManber() {
}
public static void main(String args[]){//测试
WuManber objWM=new WuManber();
Vector<String> vKey=new Vector<String>();
vKey.add("法轮功");
vKey.add("中国");
vKey.add("共产党");
vKey.add("ABC");
vKey.add("B52");
if(objWM.addFilterKeyWord(vKey, 0)){
String sResult=objWM.macth("我的法 轮功是否在中国的共产党中的ABC是不是革命党中的A B C是不是革命党中的ABC是不是革命B52",new Vector(0));
System.out.println(sResult);
}
objWM.clear();
}
//匹配
public String macth(String content, Vector <Integer> levelSet) {
StringBuffer sResult=new StringBuffer();
if (initFlag == false)
init();
Vector <AtomicPattern> aps = new Vector <AtomicPattern>();
String preContent = content;//preConvert(content);
int iSameCount=0;
for (int i = 0; i < preContent.length();) {
char checkChar = preContent.charAt(i);
if (shiftTable[checkChar] == 0) {
Vector <AtomicPattern> tmpAps = new Vector <AtomicPattern>();
tmpAps = findMathAps(preContent.substring(0, i + 1),hashTable[checkChar]);
aps.addAll(tmpAps);
if(tmpAps.size()>0){
sResult.append(tmpAps.get(0).getPattern().str).append("|");
iSameCount++;
}
i++;
} else
i = i + shiftTable[checkChar];
}
parseAtomicPatternSet(aps, levelSet);
//sResult.append(" 共有:").append(iSameCount).append("处不合格!");
return sResult.toString();
}
//加入关键词
public boolean addFilterKeyWord(Vector<String> keyWord, int level) {
if (initFlag == true)
return false;
UnionPattern unionPattern = new UnionPattern();
Object[] strArray = keyWord.toArray();
for (int i = 0; i < strArray.length; i++) {
String sPattern=(String)strArray[i];
Pattern pattern = new Pattern(sPattern);
AtomicPattern atomicPattern = new AtomicPattern(pattern);
unionPattern.addNewAtomicPattrn(atomicPattern);
unionPattern.setLevel(level);
atomicPattern.setBelongUnionPattern(unionPattern);
}
tmpUnionPatternSet.addNewUnionPattrn(unionPattern);
return true;
}
//验证字符
private boolean isValidChar(char ch) {
if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
return true;
if ((ch >= 0x4e00 && ch <= 0x7fff) || (ch >= 0x8000 && ch <= 0x952f))
return true;// 简体中文汉字编码
return false;
}
//封装原子模式集
private void parseAtomicPatternSet(Vector <AtomicPattern> aps,Vector <Integer> levelSet) {
while (aps.size() > 0) {
AtomicPattern ap = aps.get(0);
UnionPattern up = ap.belongUnionPattern;
if (up.isIncludeAllAp(aps) == true) {
levelSet.add(new Integer(up.getLevel()));
}
aps.remove(0);
}
}
//查找原子模式
private Vector <AtomicPattern> findMathAps(String src,Vector <AtomicPattern> destAps) {
Vector <AtomicPattern> aps = new Vector <AtomicPattern>();
for (int i = 0; i < destAps.size(); i++) {
AtomicPattern ap = destAps.get(i);
if (ap.findMatchInString(src) == true)
aps.add(ap);
}
return aps;
}
//预转换内容(除掉特殊字符)
private String preConvert(String content) {
String retStr = new String();
for (int i = 0; i < content.length(); i++) {
char ch = content.charAt(i);
if (this.isValidChar(ch) == true) {
retStr = retStr + ch;
}
}
return retStr;
}
// shift table and hash table of initialize
private void init() {
initFlag = true;
for (int i = 0; i < maxIndex; i++)
hashTable[i] = new Vector <AtomicPattern>();
shiftTableInit();
hashTableInit();
}
//清除
public void clear() {
tmpUnionPatternSet.clear();
initFlag = false;
}
//初始化跳跃表
private void shiftTableInit() {
for (int i = 0; i < maxIndex; i++)
shiftTable[i] = B;
Vector <UnionPattern> upSet = tmpUnionPatternSet.getSet();
for (int i = 0; i < upSet.size(); i++) {
Vector <AtomicPattern> apSet = upSet.get(i).getSet();
for (int j = 0; j < apSet.size(); j++) {
AtomicPattern ap = apSet.get(j);
Pattern pattern = ap.getPattern();
//System.out.print(pattern.charAtEnd(1)+"\t");
if (shiftTable[pattern.charAtEnd(1)] != 0)
shiftTable[pattern.charAtEnd(1)] = 1;
if (shiftTable[pattern.charAtEnd(0)] != 0)
shiftTable[pattern.charAtEnd(0)] = 0;
}
}
}
//初始化HASH表
private void hashTableInit() {
Vector <UnionPattern> upSet = tmpUnionPatternSet.getSet();
for (int i = 0; i < upSet.size(); i++) {
Vector <AtomicPattern> apSet = upSet.get(i).getSet();
for (int j = 0; j < apSet.size(); j++) {
AtomicPattern ap = apSet.get(j);
Pattern pattern = ap.getPattern();
//System.out.println(pattern.charAtEnd(0));
if (pattern.charAtEnd(0) != 0) {
hashTable[pattern.charAtEnd(0)].add(ap);
}
}
}
}
}
//模式类
class Pattern {
public String str;
Pattern(String str) {
this.str = str;
}
public char charAtEnd(int index) {
if (str.length() > index) {
return str.charAt(str.length() - index - 1);
} else
return 0;
}
public String getStr() {
return str;
};
}
//原子模式类
class AtomicPattern {
public boolean findMatchInString(String str) {
if (this.pattern.str.length() > str.length())
return false;
int beginIndex = str.length() - this.pattern.str.length();
String eqaulLengthStr = str.substring(beginIndex);
if (this.pattern.str.equalsIgnoreCase(eqaulLengthStr))
return true;
return false;
}
AtomicPattern(Pattern pattern) {
this.pattern = pattern;
};
private Pattern pattern;
public UnionPattern belongUnionPattern;
public UnionPattern getBelongUnionPattern() {
return belongUnionPattern;
}
public void setBelongUnionPattern(UnionPattern belongUnionPattern) {
this.belongUnionPattern = belongUnionPattern;
}
public Pattern getPattern() {
return pattern;
}
public void setPattern(Pattern pattern) {
this.pattern = pattern;
}
}
//相同的原子模式集类
class SameAtomicPatternSet {
SameAtomicPatternSet() {
SAPS = new Vector <AtomicPattern>();
};
public Vector <AtomicPattern> SAPS;
}
//合并的模式类
class UnionPattern {
// union string
UnionPattern() {
this.apSet = new Vector <AtomicPattern>();
}
public Vector <AtomicPattern> apSet;
public void addNewAtomicPattrn(AtomicPattern ap) {
this.apSet.add(ap);
}
public Vector <AtomicPattern> getSet() {
return apSet;
}
public boolean isIncludeAllAp(Vector <AtomicPattern> inAps) {
if (apSet.size() > inAps.size())
return false;
for (int i = 0; i < apSet.size(); i++) {
AtomicPattern ap = apSet.get(i);
- 1
- 2
- 3
- 4
前往页