字符串-AC自动机（详细图解）

唔仄lo咚锵

发布于 2020-10-09 10:05:55

1.3K0

发布于 2020-10-09 10:05:55

文章被收录于专栏：blog(为什么会重名，真的醉了)

AC自动机

原理

Fail指针 同KMP的next一样，Fail指针是AC自动机的核心，是在树上指出失配后下一个跳转的位置，而不用全部回溯，大大减少时间。那么Fail是怎么跳转的？以HDU-2222的样例为例说明，模式串P={“she”,“he”,“say”,“shr”,“her”}，文本串S=“yasherhs”。

1.构建字典树

构造fail指针 2.1 用bfs实现，将root子节点入队（第二层），并将其fail指向root。

2.2 h出队，父节点h的fail指针所指节点是root；此时root没有对应为e的子节点，匹配失败，则e的fail指针指向root，表示没有匹配序列，然后入队e；同样的s出队，其子节点a同理。

2.3 此时循环到s的子节点h，父节点s的fail指针所指节点也是root，但与前面不同的是：root有值为h的子节点，匹配成功，此时fail应指向匹配节点。

2.4 以此类推，求出所有fail指针，右侧e的父节点h的fail指针所指节点是左侧h，而左侧h有值为e的子节点，匹配成功，即右侧e的fail指向左侧e（蓝线），如图。

搜索待处理文本 ①首先根结点下无y和a，第1、2条线还是指向根结点； ②从she开始一直可以匹配，即线3、4、5，到节点e(绿底)，更新答案； ③下一个字符是r，匹配失败，到fail指针所指节点(蓝线所指)，即线6； ④此时匹配到了r(线7)，发现模式串末尾标记，更新答案； ⑤下一个字符h，失配，回到fail所指(线8) ⑥然后继续匹配，成功(线9) ⑦继续下一个字符s，失配回溯(线10) ⑧继续匹配，成功(线11)。最后一个字符结束，退出

模板

void insert(char* p) { //构建字典树
    int u = 0;
    int ls = strlen(p);
    for (int i = 0; i < ls; i++) {
        int v = p[i] - 'a';
        if (trie[u][v]==0)
            trie[u][v] = ++pos;
        u = trie[u][v];
    }
    cnt[u]++; //当前节点单词数+1
}
void getFail() { //求fail
    queue <int>q;
    for (int i = 0; i < 26; i++) { //入队root子节点(第二层)
        if (trie[0][i]) {
            fail[trie[0][i]] = 0;
            q.push(trie[0][i]);
        }
    }
    while (!q.empty()) {
        int cur = q.front();//当前父节点
        q.pop();
        for (int i = 0; i < 26; i++) { //26个字母
            if (trie[cur][i]) { //存在子节点，将其fail指向对应匹配节点(父节点fail所指节点的对应子节点)
                fail[trie[cur][i]] = trie[fail[cur]][i];
                q.push(trie[cur][i]);
            }
            else//若不存在相关子节点，字典树中赋值为fail所指节点
                trie[cur][i] = trie[fail[cur]][i];
        }
    }
}
int query(char* s) { //查询s中出现几个p
    int cur = 0, ans = 0, ls = strlen(s);
    for (int i = 0; i < ls; i++) { 
        cur = trie[cur][s[i] - 'a']; 
        for (int j = cur; j && cnt[j]; j = fail[j]) {//一直向下寻找,直到匹配失败
            ans += cnt[j]; //更新答案
            cnt[j] = 0; //防止重复计算
        }
    }
    return ans;
}

例题

HDU-2222Keywords Search

Problem Description In the modern time, Search engine came into the life of everybody like Google, Baidu, etc. Wiskey also wants to bring this feature to his image retrieval system. Every image have a long description, when users type some keywords to find the image, the system will match the keywords with description of image and show the image which the most keywords be matched. To simplify the problem, giving you a description of image, and some keywords, you should tell me how many keywords will be match. Input First line will contain one integer means how many cases will follow by. Each case will contain two integers N means the number of keywords and N keywords follow. (N <= 10000) Each keyword will only contains characters ‘a’-‘z’, and the length will be not longer than 50. The last line is the description, and the length will be not longer than 1000000. Output Print how many keywords are contained in the description. Sample Input 1 5 she he say shr her yasherhs Sample Output 3

分析：模板题，分析同上

#include<bits/stdc++.h>
using namespace std;
typedef long long ll;
const int maxn = 1000006;
int trie[maxn][26]; //字典树
int cnt[maxn];  //记录单词出现次数
int fail[maxn]; //失败时的回溯指针
int pos;
void insert(char* p) {
    int u = 0;
    int ls = strlen(p);
    for (int i = 0; i < ls; i++) {
        int v = p[i] - 'a';
        if (trie[u][v] == 0)
            trie[u][v] = ++pos;
        u = trie[u][v];
    }
    cnt[u]++;
}
void getFail() {
    queue <int>q;
    for (int i = 0; i < 26; i++) {
        if (trie[0][i]) {
            fail[trie[0][i]] = 0;
            q.push(trie[0][i]);
        }
    }
    while (!q.empty()) {
        int cur = q.front();
        q.pop();
        for (int i = 0; i < 26; i++) {
            if (trie[cur][i]) {
                fail[trie[cur][i]] = trie[fail[cur]][i];
                q.push(trie[cur][i]);
            }
            else
                trie[cur][i] = trie[fail[cur]][i];
        }
    }
}
int query(char* s) {
    int cur = 0, ans = 0, ls = strlen(s);
    for (int i = 0; i < ls; i++) {
        cur = trie[cur][s[i] - 'a'];
        for (int j = cur; j && cnt[j]; j = fail[j]) {
            ans += cnt[j];
            cnt[j] = 0;
        }
    }
    return ans;
}
int main() {
    int n, t;
    char s[maxn], p[maxn];//不喜欢传参，全局也行
    scanf("%d", &t);
    while (t--) {
        scanf("%d", &n);
        memset(trie, 0, sizeof(trie));
        memset(cnt, 0, sizeof(cnt));
        fail[0] = pos = 0;
        for (int i = 0; i < n; i++) {
            scanf("%s", p);
            insert(p);
        }
        getFail();
        scanf("%s", s);
        printf("%d\n", query(s));
    }
    return 0;
}

HDU-2896病毒侵袭

Problem Description 当太阳的光辉逐渐被月亮遮蔽，世界失去了光明，大地迎来最黑暗的时刻。。。。在这样的时刻，人们却异常兴奋——我们能在有生之年看到500年一遇的世界奇观，那是多么幸福的事儿啊~~ 但网路上总有那么些网站，开始借着民众的好奇心，打着介绍日食的旗号，大肆传播病毒。小t不幸成为受害者之一。小t如此生气，他决定要把世界上所有带病毒的网站都找出来。当然，谁都知道这是不可能的。小t却执意要完成这不能的任务，他说：“子子孙孙无穷匮也！”（愚公后继有人了）。万事开头难，小t收集了好多病毒的特征码，又收集了一批诡异网站的源码，他想知道这些网站中哪些是有病毒的，又是带了怎样的病毒呢？顺便还想知道他到底收集了多少带病毒的网站。这时候他却不知道何从下手了。所以想请大家帮帮忙。小t又是个急性子哦，所以解决问题越快越好哦~~ Input 第一行，一个整数N（1<=N<=500），表示病毒特征码的个数。接下来N行，每行表示一个病毒特征码，特征码字符串长度在20—200之间。每个病毒都有一个编号，依此为1—N。不同编号的病毒特征码不会相同。在这之后一行，有一个整数M（1<=M<=1000），表示网站数。接下来M行，每行表示一个网站源码，源码字符串长度在7000—10000之间。每个网站都有一个编号，依此为1—M。以上字符串中字符都是ASCII码可见字符（不包括回车）。 Output 依次按如下格式输出按网站编号从小到大输出，带病毒的网站编号和包含病毒编号，每行一个含毒网站信息。 web 网站编号: 病毒编号病毒编号 … 冒号后有一个空格，病毒编号按从小到大排列，两个病毒编号之间用一个空格隔开，如果一个网站包含病毒，病毒数不会超过3个。最后一行输出统计信息，如下格式 total: 带病毒网站数冒号后有一个空格。 Sample Input 3 aaa bbb ccc 2 aaabbbccc bbaacc Sample Output web 1: 1 2 3 total: 1

分析：套AC自动机，把P构建字典树，查询时用vector记录经过哪些单词，注意初始化。注意有坑最后输出total要带换行，字符串是ASCLL码范围0-126，还有要排序。

#include<bits/stdc++.h>
using namespace std;
const int maxn = 100005;
int trie[maxn][130];
int fail[maxn], cnt[maxn];
int vis[maxn], tag[maxn];
int n, m, pos = 0, total = 0;
char s[maxn], p[202];
queue<int>q;
vector<int>ans;
void insert(int idx) {
	int u = 0, lp = strlen(p);
	for (int i = 0; i < lp; i++) {
		int v = p[i];
		if (trie[u][v] == 0)
			trie[u][v] = ++pos;
		u = trie[u][v];
	}
	cnt[u]++;
	tag[u] = idx;
}
void getfail() {
	for (int i = 0; i < 130; i++) {
		if (trie[0][i]) {
			fail[trie[0][i]] = 0;
			q.push(trie[0][i]);
		}
	}
	while (!q.empty()) {
		int cur = q.front();
		q.pop();
		for (int i = 0; i < 130; i++) {
			if (trie[cur][i]) {
				fail[trie[cur][i]] = trie[fail[cur]][i];
				q.push(trie[cur][i]);
			}
			else trie[cur][i] = trie[fail[cur]][i];
		}
	}
}
void query() {
	int ls = strlen(s), u = 0;
	for (int i = 0; i < ls; i++) {
		u = trie[u][s[i]];
		for (int j = u; j && !vis[j]&&cnt[j]; j = fail[j]) {
			ans.push_back(tag[j]);
			vis[j] = 1;
		}
	}
}
int main() {
	scanf("%d", &n);
	for (int i = 1; i <= n; i++) {
		scanf("%s", p);
		insert(i);
	}
	scanf("%d", &m);
	getfail();
	for (int i = 1; i <= m; i++) {
		ans.clear();
		memset(vis, 0, sizeof(vis));
		scanf("%s", s);
		query();
		if (!ans.empty()) {
			total++;
			sort(ans.begin(), ans.end());
			printf("web %d:", i);
			for (int j = 0; j < ans.size(); j++)
				printf(" %d", ans[j]);
			printf("\n");
		}
	}
	printf("total: %d\n", total);
	return 0;
}

HDU-3065病毒侵袭持续中

Problem Description 小t非常感谢大家帮忙解决了他的上一个问题。然而病毒侵袭持续中。在小t的不懈努力下，他发现了网路中的“万恶之源”。这是一个庞大的病毒网站，他有着好多好多的病毒，但是这个网站包含的病毒很奇怪，这些病毒的特征码很短，而且只包含“英文大写字符”。当然小t好想好想为民除害，但是小t从来不打没有准备的战争。知己知彼，百战不殆，小t首先要做的是知道这个病毒网站特征：包含多少不同的病毒，每种病毒出现了多少次。大家能再帮帮他吗？ Input 第一行，一个整数N（1<=N<=1000），表示病毒特征码的个数。接下来N行，每行表示一个病毒特征码，特征码字符串长度在1—50之间，并且只包含“英文大写字符”。任意两个病毒特征码，不会完全相同。在这之后一行，表示“万恶之源”网站源码，源码字符串长度在2000000之内。字符串中字符都是ASCII码可见字符（不包括回车）。 Output 按以下格式每行一个，输出每个病毒出现次数。未出现的病毒不需要输出。病毒特征码: 出现次数冒号后有一个空格，按病毒特征码的输入顺序进行输出。 Sample Input 3 AA BB CC ooxxCC%dAAAoen…END Sample Output AA: 2 CC: 1 Hint Hit：题目描述中没有被提及的所有情况都应该进行考虑。比如两个病毒特征码可能有相互包含或者有重叠的特征码段。计数策略也可一定程度上从Sample中推测。

分析：统计各单词出现次数，记录各单词末尾节点对应单词编号，查询时若经过则维护更新对应单词数量。注意数组大小和初始化，还有神坑多组数据！

#include<bits/stdc++.h>
using namespace std;
const int maxn = 50004;
int trie[maxn][130];
int fail[maxn], num[maxn], cnt[maxn];
char p[1003][55], s[2000006];
int n, pos;
void insert(int idx) {
	int lp = strlen(p[idx]), u = 0;
	for (int i = 0; i < lp; i++) {
		int v = p[idx][i];
		if (trie[u][v] == 0)
			trie[u][v] = ++pos;
		u = trie[u][v];
	}
	num[u] = idx; //对应单词编号
}
void getfail() {
	queue<int>q;
	for (int i = 0; i < 130; i++) {
		if (trie[0][i]) {
			fail[trie[0][i]] = 0;
			q.push(trie[0][i]);
		}
	}
	while (!q.empty()) {
		int cur = q.front();
		q.pop();
		for (int i = 0; i < 130; i++) {
			if (trie[cur][i]) {
				fail[trie[cur][i]] = trie[fail[cur]][i];
				q.push(trie[cur][i]);
			}
			else trie[cur][i] = trie[fail[cur]][i];
		}
	}
}
void query() {
	int u = 0, ls = strlen(s);
	for (int i = 0; i < ls; i++) {
		u = trie[u][s[i]];
		for (int j = u; j; j = fail[j]) {
			cnt[num[j]]++; //经过时更新数量++
		}
	}
}
int main() {
	while (~scanf("%d", &n)) {
		memset(trie, 0, sizeof(trie));
		memset(cnt, 0, sizeof(cnt));
		memset(num, 0, sizeof(num));
		pos = 0;
		for (int i = 1; i <= n; i++) {
			scanf("%s", p[i]);
			insert(i);
		}
		getfail();
		scanf("%s", s);
		query();
		for (int i = 1; i <= n; i++)
			if (cnt[i])
				printf("%s: %d\n", p[i], cnt[i]);
	}
	return 0;
}

POJ-2778DNA Sequence

Description It’s well known that DNA Sequence is a sequence only contains A, C, T and G, and it’s very useful to analyze a segment of DNA Sequence，For example, if a animal’s DNA sequence contains segment ATC then it may mean that the animal may have a genetic disease. Until now scientists have found several those segments, the problem is how many kinds of DNA sequences of a species don’t contain those segments. Suppose that DNA sequences of a species is a sequence that consist of A, C, T and G，and the length of sequences is a given integer n. Input First line contains two integer m (0 <= m <= 10), n (1 <= n <=2000000000). Here, m is the number of genetic disease segment, and n is the length of sequences. Next m lines each line contain a DNA genetic disease segment, and length of these segments is not larger than 10. Output An integer, the number of DNA sequences, mod 100000. Sample Input 4 3 AT AC AG AA Sample Output 36

分析：

#include<cstdio>
#include<cstring>
#include<queue>
#include<map>
using namespace std;
const int maxn = 102;
const int mod = 100000;
int trie[maxn][4], fail[maxn], tail[maxn];
int  n, m, pos;
char s[15];
map<char, int>idx;
void insert() {
	int ls = strlen(s), u = 0;
	for (int i = 0; i < ls; i++) {
		int v = idx[s[i]];
		if (trie[u][v] == 0)
			trie[u][v] = ++pos;
		u = trie[u][v];
	}
	tail[u] = 1;
}
void getfail() {
	queue<int>q;
	for (int i = 0; i < 4; i++) {
		if (trie[0][i]) {
			fail[trie[0][i]] = 0;
			q.push(trie[0][i]);
		}
	}
	while (!q.empty()) {
		int cur = q.front();
		q.pop();
		for (int i = 0; i < 4; i++) {
			if (trie[cur][i]) {
				fail[trie[cur][i]] = trie[fail[cur]][i];
				q.push(trie[cur][i]);
			}
			else trie[cur][i] = trie[fail[cur]][i];
			tail[trie[cur][i]] |= tail[trie[fail[cur]][i]]; //注意是或，只要包含病毒就不行
		}
	}
}
struct matrix {
	long long a[maxn][maxn];
	matrix() {
		memset(a, 0, sizeof(a));
	}
};
matrix  operator*(const matrix& x, const matrix& y) {
	matrix  m;
	for (int i = 0; i <= pos; ++i) 
		for (int j = 0; j <= pos; ++j) 
			for (int k = 0; k <= pos; ++k) 
				m.a[i][j] = (m.a[i][j] + x.a[i][k] * y.a[k][j]) % mod;
	return m;
}
matrix fastm(matrix a, int n) {
	matrix res;
	for (int i = 0; i <= pos; ++i) res.a[i][i] = 1;
	while (n) {
		if (n & 1) res = res * a;
		a = a * a;
		n >>= 1;
	}
	return res;
}
int main() {
	idx['A'] = 0, idx['C'] = 1;
	idx['T'] = 2, idx['G'] = 3;
	while (~scanf("%d%d", &m, &n)) {
		pos = 0;
		memset(trie, 0, sizeof(trie));
		memset(tail, 0, sizeof(tail));
		while (m--) {
			scanf("%s", s);
			insert();
		}
		getfail();
		matrix x;
		for (int i = 0; i <= pos; ++i)  //构建初始矩阵
			if (!tail[i]) //如果本身不含病毒
				for (int j = 0; j < 4; ++j)
					if (!tail[trie[i][j]]) //其子节点也不含病毒
						x.a[i][trie[i][j]]++; //那么节点i到该子节点是可行方案+1
		x = fastm(x, n);
		int ans = 0;
		for (int i = 0; i <= pos; ++i)	//统计
			ans = (ans + x.a[0][i]) % mod;
		printf("%d\n", ans);
	}
	return 0;
}

HDU-2296Ring

Problem Description For the hope of a forever love, Steven is planning to send a ring to Jane with a romantic string engraved on. The string’s length should not exceed N. The careful Steven knows Jane so deeply that he knows her favorite words, such as “love”, “forever”. Also, he knows the value of each word. The higher value a word has the more joy Jane will get when see it. The weight of a word is defined as its appeared times in the romantic string multiply by its value, while the weight of the romantic string is defined as the sum of all words’ weight. You should output the string making its weight maximal. Input The input consists of several test cases. The first line of input consists of an integer T, indicating the number of test cases. Each test case starts with a line consisting of two integers: N, M, indicating the string’s length and the number of Jane’s favorite words. Each of the following M lines consists of a favorite word Si. The last line of each test case consists of M integers, while the i-th number indicates the value of Si. Technical Specification

T ≤ 15
0 < N ≤ 50, 0 < M ≤ 100.
The length of each word is less than 11 and bigger than 0.
1 ≤ Hi ≤ 100.
All the words in the input are different.
All the words just consist of ‘a’ - ‘z’.

Output For each test case, output the string to engrave on a single line. If there’s more than one possible answer, first output the shortest one. If there are still multiple solutions, output the smallest in lexicographically order. The answer may be an empty string. Sample Input 2 7 2 love ever 5 5 5 1 ab 5 Sample Output lovever abab Hint Sample 1: weight(love) = 5, weight(ever) = 5, so weight(lovever) = 5 + 5 = 10 Sample 2: weight(ab) = 2 * 5 = 10, so weight(abab) = 10

#include<bits/stdc++.h>
using namespace std;
const int maxn = 1003;
int trie[maxn][26], fail[maxn];
int val[maxn], cost[102], dp[55][maxn];
char h[105][15];
string path[55][maxn];
int t, n, m, pos;
void insert(char* s, int idx) {
	int ls = strlen(s), u = 0;
	for (int i = 0; i < ls; ++i) {
		int v = s[i] - 'a';
		if (trie[u][v] == 0)
			trie[u][v] = ++pos;
		u = trie[u][v];
	}
	val[u] = cost[idx];
}
void getfail() {
	queue<int>q;
	fail[0] = 0;
	for (int i = 0; i < 26; ++i) {
		if (trie[0][i]) {
			fail[trie[0][i]] = 0;
			q.push(trie[0][i]);
		}
	}
	while (!q.empty()) {
		int cur = q.front();
		q.pop();
		for (int i = 0; i < 26; ++i) {
			if (trie[cur][i]) {
				fail[trie[cur][i]] = trie[fail[cur]][i];
				q.push(trie[cur][i]);
			}
			else
				trie[cur][i] = trie[fail[cur]][i];
		}
	}
}
bool cmp(string s, string t) {
	if (t == "") return true;
	if (s.size() < t.size()) return true;
	if (s.size() > t.size()) return false;
	return s < t;
}
string solve() {
	for (int i = 0; i <= n; ++i)
		for (int j = 0; j <= pos; ++j)
			path[i][j] = "";
	int mx = 0;
	for (int i = 0; i <= n; ++i) {
		for (int j = 0; j <= pos; ++j) {
			if (dp[i][j] == -1) continue;
			for (int k = 0; k < 26; ++k) {
				if (dp[i + 1][trie[j][k]] < dp[i][j] + val[trie[j][k]]) {
					dp[i + 1][trie[j][k]] = dp[i][j] + val[trie[j][k]];
					path[i + 1][trie[j][k]] = path[i][j] + (char)('a' + k);
				}
				else if (dp[i + 1][trie[j][k]] == dp[i][j] + val[trie[j][k]]) {
					if (cmp(path[i][j] + (char)('a' + k), path[i + 1][trie[j][k]]))
						path[i + 1][trie[j][k]] = path[i][j] + (char)('a' + k);
				}
			}
			if (i > 0) mx = max(mx, dp[i][j]);
		}
	}
	if (mx == 0) return "";
	string res = "";
	for (int i = 1; i <= n; ++i) for (int j = 0; j <= pos; ++j) {
		if (dp[i][j] == mx && cmp(path[i][j], res)) {
			res = path[i][j];
		}
	}
	return res;
}
int main() {
	scanf("%d", &t);
	while (t--) {
		memset(trie, 0, sizeof(trie));
		memset(val, 0, sizeof(val));
		memset(dp, -1, sizeof(dp));
		pos = cost[0] = dp[0][0] = 0;
		scanf("%d%d", &n, &m);
		for (int i = 0; i < m; ++i)
			scanf("%s", h[i]);
		for (int i = 1; i <= m; ++i)
			scanf("%d", &cost[i]);
		for (int i = 0; i < m; ++i)
			insert(h[i], i + 1);
		getfail();
		cout << solve() << "\n";
	}
	return 0;
}

本文参与腾讯云自媒体同步曝光计划，分享自作者个人站点/博客。

原始发表：2020/09/28 ，如有侵权请联系 cloudcommunity@tencent.com 删除

网络安全

编程算法

本文分享自作者个人站点/博客前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

网络安全

编程算法

登录后参与评论

0 条评论

热度