AC自动机算法(Aho-Corasick算法),是在Trie树之上,加了类似 KMP 的 next 数组。
class ACNode //AC自动机的Trie树节点类,假设只有26个字母的数据集
{
public:
char data;
ACNode *children[charNum];
size_t count; //记录这个节点被多少个单词占用
bool isEndOfWord;//是否是一个单词的结束字符
size_t freq; //单词插入的频次
int length; //当isEndOFWord为True时,记录模式串长度
ACNode *fail; //失败指针
ACNode(char ch = '/'):data(ch), isEndOfWord(false),
count(0), freq(0),length(-1),fail(NULL)
{
memset(children,0,sizeof(ACNode*) * charNum);
}
~ACNode(){}
};
void buildFailPointer()
{
queue<ACNode*> ACNode_queue;
ACNode_queue.push(root);
ACNode *p, *pchild, *q, *qchild;
int i;
while(!ACNode_queue.empty())//用队列按层遍历
{
p = ACNode_queue.front();//队首的节点p
ACNode_queue.pop();
for(i = 0; i < charNum; ++i)
{
pchild = p->children[i];//找到p的非空子节点pc
if(pchild == NULL)
continue;
if(p == root)
pchild->fail = root;
else
{
q = p->fail; //q为p的失效指针
while(q != NULL) //q不为空
{
qchild = q->children[pchild->data-'a'];//字符等于pc的qc
if(qchild != NULL)//qc存在
{
pchild->fail = qchild;//链接pc失败指针到qc
break;//找到了就跳出循环
}
q = q->fail;//qc不存在,就再去上一个失效点找
}
if(q == NULL)//最后找到root处还没找到
pchild->fail = root;//pc的失效指针指向root
}
ACNode_queue.push(pchild);//把p的非空子节点pc入队
}
}
}
void match(const string &maintext) //maintext是主串
{
int n = maintext.size();
ACNode *p = root, *temp;//模式串从root开始
int index, pos;
for(int i = 0; i < n; ++i)//主串从i=0开始
{
index = maintext[i]-'a';//子节点下标
while(p->children[index] == NULL && p != root)
{//p不为root,且 子节点为空(找不到那个i对应的字符)
p = p->fail; //失败指针发挥作用的地方
}
p = p->children[index];
if(p == NULL)
p = root; //如果没有匹配的,从root开始重新匹配
temp = p;
while(temp != root)//打印出可以匹配的模式串
{
if(temp->isEndOfWord == true)
{
pos = i-temp->length+1;
cout << "Found " << maintext.substr(pos,temp->length) << " at "
"position(start from 0) "<< pos << " at " << maintext << endl;
}
temp = temp->fail;
}
}
}
主程序
Trie textlib;
string a("abcd"), b("bcd"), c("c");
textlib.insert(a);
textlib.insert(a);
textlib.insert(b);
textlib.insert(c);
textlib.buildFailPointer();
textlib.match("abcdc");