字符串是IO竞赛中非常重要的数据类型,在各类算法题中都有广泛的应用。掌握字符串处理的高效算法和技巧,对于解决IO竞赛中的问题至关重要。本文将深入解析IO竞赛中常用的字符串算法,包括字符串的基础操作、字符串匹配算法、字符串哈希、字典树、后缀数组等,并通过具体的代码实现和实例分析,帮助读者理解和掌握这些算法。
在C++中,字符串可以用字符数组或std::string类来表示。字符数组是C语言的遗留物,而std::string是C++标准库提供的字符串类,提供了丰富的操作函数,使用更加方便。
字符数组的表示与初始化:
char str1[] = "Hello, World!";
char str2[20]; // 声明一个长度为20的字符数组
strcpy(str2, "Hello, World!"); // 复制字符串std::string的表示与初始化:
#include <string>
#include <iostream>
using namespace std;
int main() {
string str1 = "Hello, World!";
string str2(str1); // 复制构造
string str3(10, 'a'); // 构造包含10个'a'的字符串
string str4 = str1 + str2; // 字符串拼接
cout << str1 << endl;
cout << str2 << endl;
cout << str3 << endl;
cout << str4 << endl;
return 0;
}字符串的基本操作包括字符串的长度获取、访问单个字符、字符串拼接、字符串比较、子串获取等。
std::string的常用操作:
#include <string>
#include <iostream>
using namespace std;
int main() {
string str = "Hello, World!";
// 获取字符串长度
cout << "Length: " << str.length() << endl; // 输出:13
cout << "Size: " << str.size() << endl; // 输出:13
// 访问单个字符
cout << "First character: " << str[0] << endl; // 输出:H
cout << "Last character: " << str[str.length() - 1] << endl; // 输出:!
// 字符串拼接
str += " Welcome";
cout << str << endl; // 输出:Hello, World! Welcome
// 字符串比较
string str2 = "Hello, World!";
if (str == str2) {
cout << "Equal" << endl;
} else if (str > str2) {
cout << "Greater" << endl;
} else {
cout << "Less" << endl;
}
// 获取子串
string sub = str.substr(7, 5); // 从索引7开始,长度为5的子串
cout << "Substring: " << sub << endl; // 输出:World
// 查找子串
size_t pos = str.find("World");
if (pos != string::npos) {
cout << "Found at position: " << pos << endl; // 输出:7
}
// 替换子串
str.replace(pos, 5, "C++");
cout << str << endl; // 输出:Hello, C++! Welcome
return 0;
}在IO竞赛中,经常需要对字符串进行比较和排序。std::string类重载了比较运算符,可以直接进行字符串比较。字符串的排序可以使用标准库中的sort函数。
字符串的比较:
#include <string>
#include <iostream>
using namespace std;
int main() {
string str1 = "apple";
string str2 = "banana";
string str3 = "apple";
// 使用比较运算符
if (str1 < str2) {
cout << str1 << " < " << str2 << endl; // 输出:apple < banana
}
if (str1 == str3) {
cout << str1 << " == " << str3 << endl; // 输出:apple == apple
}
// 使用compare函数
int result = str1.compare(str2);
if (result < 0) {
cout << str1 << " < " << str2 << endl; // 输出:apple < banana
} else if (result == 0) {
cout << str1 << " == " << str2 << endl;
} else {
cout << str1 << " > " << str2 << endl;
}
return 0;
}字符串的排序:
#include <string>
#include <vector>
#include <algorithm>
#include <iostream>
using namespace std;
int main() {
vector<string> fruits = {"banana", "apple", "orange", "grape", "pear"};
// 按字典序排序
sort(fruits.begin(), fruits.end());
// 输出排序后的结果
for (const string& fruit : fruits) {
cout << fruit << " " << endl;
}
// 输出:
// apple
// banana
// grape
// orange
// pear
// 按长度排序
sort(fruits.begin(), fruits.end(), [](const string& a, const string& b) {
return a.length() < b.length();
});
// 输出按长度排序后的结果
cout << "Sorted by length: " << endl;
for (const string& fruit : fruits) {
cout << fruit << " (length: " << fruit.length() << ")" << endl;
}
// 输出:
// pear (length: 4)
// apple (length: 5)
// grape (length: 5)
// banana (length: 6)
// orange (length: 6)
return 0;
}朴素字符串匹配算法(也称为暴力匹配算法)是最基本的字符串匹配算法,它的思想是:对于主串中的每个可能的起始位置,都尝试与模式串进行匹配,直到找到匹配或者遍历完整个主串。
朴素字符串匹配算法的C++实现:
#include <iostream>
#include <string>
using namespace std;
// 在主串s中查找模式串p,返回第一次出现的位置,若不存在则返回-1
int naiveSearch(const string& s, const string& p) {
int n = s.length();
int m = p.length();
if (m == 0) return 0;
if (n < m) return -1;
for (int i = 0; i <= n - m; i++) {
int j;
for (j = 0; j < m; j++) {
if (s[i + j] != p[j]) {
break;
}
}
if (j == m) {
return i; // 找到匹配
}
}
return -1; // 未找到匹配
}
int main() {
string s = "abababcababcabcabababd";
string p = "ababcabcaba";
int pos = naiveSearch(s, p);
if (pos != -1) {
cout << "Found at position: " << pos << endl;
} else {
cout << "Not found" << endl;
}
return 0;
}朴素字符串匹配算法的时间复杂度为O(nm),其中n是主串的长度,m是模式串的长度。在最坏情况下,需要进行nm次比较。虽然朴素算法的效率不高,但它简单易懂,对于小规模的字符串匹配问题仍然适用。
KMP算法(Knuth-Morris-Pratt算法)是一种高效的字符串匹配算法,它通过预处理模式串,避免了不必要的比较,从而提高了匹配效率。KMP算法的核心是构建一个部分匹配表(也称为next数组),用于在匹配失败时快速移动模式串的位置。
KMP算法的基本思想:
部分匹配表(next数组)的含义:对于模式串p的前i个字符组成的子串,其最长相等前后缀的长度。
KMP算法的C++实现:
#include <iostream>
#include <string>
#include <vector>
using namespace std;
// 构建部分匹配表(next数组)
vector<int> buildNext(const string& p) {
int m = p.length();
vector<int> next(m, 0);
for (int i = 1, j = 0; i < m; i++) {
while (j > 0 && p[i] != p[j]) {
j = next[j - 1];
}
if (p[i] == p[j]) {
j++;
}
next[i] = j;
}
return next;
}
// KMP算法,在主串s中查找模式串p,返回第一次出现的位置,若不存在则返回-1
int kmpSearch(const string& s, const string& p) {
int n = s.length();
int m = p.length();
if (m == 0) return 0;
if (n < m) return -1;
vector<int> next = buildNext(p);
for (int i = 0, j = 0; i < n; i++) {
while (j > 0 && s[i] != p[j]) {
j = next[j - 1];
}
if (s[i] == p[j]) {
j++;
}
if (j == m) {
return i - m + 1; // 找到匹配
}
}
return -1; // 未找到匹配
}
int main() {
string s = "abababcababcabcabababd";
string p = "ababcabcaba";
int pos = kmpSearch(s, p);
if (pos != -1) {
cout << "Found at position: " << pos << endl;
} else {
cout << "Not found" << endl;
}
return 0;
}KMP算法的时间复杂度为O(n+m),其中n是主串的长度,m是模式串的长度。预处理模式串的时间复杂度为O(m),匹配过程的时间复杂度为O(n),因此总的时间复杂度为O(n+m)。KMP算法的空间复杂度为O(m),用于存储部分匹配表。
BM算法(Boyer-Moore算法)是一种高效的字符串匹配算法,它通过从模式串的尾部开始匹配,利用坏字符规则和好后缀规则来加速匹配过程。BM算法在实践中比KMP算法更快,特别是对于较长的模式串。
BM算法的基本思想:
坏字符规则:当遇到坏字符时,将模式串向右移动,使得模式串中最右边的与坏字符匹配的字符与主串中的坏字符对齐;如果模式串中不存在这样的字符,则将模式串移动到坏字符的后面。
好后缀规则:当遇到坏字符时,已经匹配的部分称为好后缀,将模式串向右移动,使得模式串中另一个与好后缀匹配的子串与主串中的好后缀对齐;如果模式串中不存在这样的子串,则将模式串移动到好后缀的后面。
BM算法的C++实现:
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
// 构建坏字符表
vector<int> buildBadChar(const string& p) {
int m = p.length();
vector<int> badChar(256, -1); // 假设字符集为ASCII
for (int i = 0; i < m; i++) {
badChar[p[i]] = i;
}
return badChar;
}
// 构建好后缀表
vector<int> buildGoodSuffix(const string& p) {
int m = p.length();
vector<int> goodSuffix(m, 0);
// 初始化
vector<int> suffix = computeSuffix(p);
// 规则1:模式串中存在与好后缀匹配的子串
fill(goodSuffix.begin(), goodSuffix.end(), m);
// 规则2:模式串的前缀与好后缀的后缀匹配
for (int i = m - 1, j = 0; i >= 0; i--) {
if (suffix[i] == i + 1) {
for (; j < m - 1 - i; j++) {
if (goodSuffix[j] == m) {
goodSuffix[j] = m - 1 - i;
}
}
}
}
// 规则1的具体实现
for (int i = 0; i < m - 1; i++) {
goodSuffix[m - 1 - suffix[i]] = m - 1 - i;
}
return goodSuffix;
}
// 计算后缀数组
vector<int> computeSuffix(const string& p) {
int m = p.length();
vector<int> suffix(m, 0);
suffix[m - 1] = m;
int g = m - 1, f = 0;
for (int i = m - 2; i >= 0; i--) {
if (i > g && suffix[i + m - 1 - f] < i - g) {
suffix[i] = suffix[i + m - 1 - f];
} else {
if (i < g) {
g = i;
}
f = i;
while (g >= 0 && p[g] == p[g + m - 1 - f]) {
g--;
}
suffix[i] = f - g;
}
}
return suffix;
}
// BM算法,在主串s中查找模式串p,返回第一次出现的位置,若不存在则返回-1
int bmSearch(const string& s, const string& p) {
int n = s.length();
int m = p.length();
if (m == 0) return 0;
if (n < m) return -1;
vector<int> badChar = buildBadChar(p);
vector<int> goodSuffix = buildGoodSuffix(p);
int i = 0;
while (i <= n - m) {
int j = m - 1;
while (j >= 0 && p[j] == s[i + j]) {
j--;
}
if (j < 0) {
return i; // 找到匹配
}
// 计算坏字符规则的移动距离
int badCharSkip = j - badChar[s[i + j]];
// 计算好后缀规则的移动距离
int goodSuffixSkip = goodSuffix[j];
// 选择较大的移动距离
i += max(1, max(badCharSkip, goodSuffixSkip));
}
return -1; // 未找到匹配
}
int main() {
string s = "abababcababcabcabababd";
string p = "ababcabcaba";
int pos = bmSearch(s, p);
if (pos != -1) {
cout << "Found at position: " << pos << endl;
} else {
cout << "Not found" << endl;
}
return 0;
}BM算法的时间复杂度在最坏情况下为O(n*m),但在平均情况下性能很好,通常比KMP算法更快。BM算法的空间复杂度为O(k),其中k是字符集的大小,用于存储坏字符表。
Sunday算法是一种简单高效的字符串匹配算法,它由Daniel M. Sunday于1990年提出。Sunday算法的基本思想与BM算法类似,但它只使用了坏字符规则,并且是从模式串的尾部开始向前匹配。
Sunday算法的基本思想:
Sunday算法的C++实现:
#include <iostream>
#include <string>
#include <vector>
using namespace std;
// 构建字符在模式串中最右出现的位置表
vector<int> buildShiftTable(const string& p) {
int m = p.length();
vector<int> shift(256, m + 1); // 假设字符集为ASCII
for (int i = 0; i < m; i++) {
shift[p[i]] = m - i;
}
return shift;
}
// Sunday算法,在主串s中查找模式串p,返回第一次出现的位置,若不存在则返回-1
int sundaySearch(const string& s, const string& p) {
int n = s.length();
int m = p.length();
if (m == 0) return 0;
if (n < m) return -1;
vector<int> shift = buildShiftTable(p);
int i = 0;
while (i <= n - m) {
int j = 0;
while (j < m && s[i + j] == p[j]) {
j++;
}
if (j == m) {
return i; // 找到匹配
}
// 查看主串中位于模式串后面的第一个字符
if (i + m < n) {
i += shift[s[i + m]];
} else {
break;
}
}
return -1; // 未找到匹配
}
int main() {
string s = "abababcababcabcabababd";
string p = "ababcabcaba";
int pos = sundaySearch(s, p);
if (pos != -1) {
cout << "Found at position: " << pos << endl;
} else {
cout << "Not found" << endl;
}
return 0;
}Sunday算法的时间复杂度在最坏情况下为O(n*m),但在平均情况下性能很好,实现也比较简单。Sunday算法的空间复杂度为O(k),其中k是字符集的大小,用于存储字符在模式串中最右出现的位置表。
哈希是一种将任意长度的输入映射到固定长度输出的函数。在字符串处理中,哈希函数可以将字符串映射为一个整数,这个整数称为哈希值。通过比较两个字符串的哈希值,可以快速判断它们是否相等。
哈希函数的基本要求:
多项式滚动哈希是一种常用的字符串哈希方法,它将字符串看作一个多项式,然后计算这个多项式在某个基数下的值。
多项式滚动哈希的基本思想: 对于字符串s = s[0]s[1]…s[n-1],其哈希值可以表示为:
hash(s) = (s[0] * base^(n-1) + s[1] * base^(n-2) + … + s[n-1] * base^0) mod mod_value
其中,base是一个基数(通常选择一个较大的质数,如911382629),mod_value是一个大质数(通常选择1e9+7或1e9+9)。
多项式滚动哈希的C++实现:
#include <iostream>
#include <string>
#include <vector>
using namespace std;
typedef long long ll;
const ll BASE = 911382629;
const ll MOD = 1e9 + 7;
// 计算字符串的哈希值
ll computeHash(const string& s) {
ll hash = 0;
for (char c : s) {
hash = (hash * BASE + c) % MOD;
}
return hash;
}
// 预处理前缀哈希和基数的幂次
pair<vector<ll>, vector<ll>> precomputeHash(const string& s) {
int n = s.length();
vector<ll> prefixHash(n + 1, 0);
vector<ll> power(n + 1, 1);
for (int i = 0; i < n; i++) {
prefixHash[i + 1] = (prefixHash[i] * BASE + s[i]) % MOD;
power[i + 1] = (power[i] * BASE) % MOD;
}
return {prefixHash, power};
}
// 获取子串的哈希值\lll getSubHash(const vector<ll>& prefixHash, const vector<ll>& power, int l, int r) {
// 子串s[l..r]的哈希值
ll res = (prefixHash[r + 1] - prefixHash[l] * power[r - l + 1]) % MOD;
return res < 0 ? res + MOD : res;
}
int main() {
string s = "abababcababcabcabababd";
// 计算整个字符串的哈希值
ll hash = computeHash(s);
cout << "Hash of the whole string: " << hash << endl;
// 预处理前缀哈希和基数的幂次
auto [prefixHash, power] = precomputeHash(s);
// 获取子串的哈希值
int l = 2, r = 7; // 子串s[2..7] = "ababca"
ll subHash = getSubHash(prefixHash, power, l, r);
cout << "Hash of substring s[" << l << ".." << r << "]: " << subHash << endl;
// 验证子串的哈希值是否正确
string sub = s.substr(l, r - l + 1);
ll manualHash = computeHash(sub);
cout << "Manual hash of substring: " << manualHash << endl;
cout << "Are they equal? " << (subHash == manualHash ? "Yes" : "No") << endl;
return 0;
}多项式滚动哈希的时间复杂度为O(n),其中n是字符串的长度。预处理前缀哈希和基数的幂次的时间复杂度为O(n),获取子串的哈希值的时间复杂度为O(1)。多项式滚动哈希的空间复杂度为O(n),用于存储前缀哈希和基数的幂次。
为了降低哈希碰撞的概率,可以使用双哈希技术,即同时使用两个不同的哈希函数,只有当两个哈希函数的哈希值都相等时,才认为两个字符串相等。
双哈希技术的C++实现:
#include <iostream>
#include <string>
#include <vector>
using namespace std;
typedef long long ll;
const ll BASE1 = 911382629;
const ll MOD1 = 1e9 + 7;
const ll BASE2 = 3571428571;
const ll MOD2 = 1e9 + 9;
// 计算字符串的双哈希值
pair<ll, ll> computeDoubleHash(const string& s) {
ll hash1 = 0, hash2 = 0;
for (char c : s) {
hash1 = (hash1 * BASE1 + c) % MOD1;
hash2 = (hash2 * BASE2 + c) % MOD2;
}
return {hash1, hash2};
}
// 预处理双哈希的前缀哈希和基数的幂次
struct DoubleHashInfo {
vector<ll> prefixHash1;
vector<ll> prefixHash2;
vector<ll> power1;
vector<ll> power2;
};
DoubleHashInfo precomputeDoubleHash(const string& s) {
int n = s.length();
DoubleHashInfo info;
info.prefixHash1.resize(n + 1, 0);
info.prefixHash2.resize(n + 1, 0);
info.power1.resize(n + 1, 1);
info.power2.resize(n + 1, 1);
for (int i = 0; i < n; i++) {
info.prefixHash1[i + 1] = (info.prefixHash1[i] * BASE1 + s[i]) % MOD1;
info.prefixHash2[i + 1] = (info.prefixHash2[i] * BASE2 + s[i]) % MOD2;
info.power1[i + 1] = (info.power1[i] * BASE1) % MOD1;
info.power2[i + 1] = (info.power2[i] * BASE2) % MOD2;
}
return info;
}
// 获取子串的双哈希值
pair<ll, ll> getSubDoubleHash(const DoubleHashInfo& info, int l, int r) {
// 子串s[l..r]的哈希值
ll res1 = (info.prefixHash1[r + 1] - info.prefixHash1[l] * info.power1[r - l + 1]) % MOD1;
res1 = res1 < 0 ? res1 + MOD1 : res1;
ll res2 = (info.prefixHash2[r + 1] - info.prefixHash2[l] * info.power2[r - l + 1]) % MOD2;
res2 = res2 < 0 ? res2 + MOD2 : res2;
return {res1, res2};
}
int main() {
string s = "abababcababcabcabababd";
// 计算整个字符串的双哈希值
auto [hash1, hash2] = computeDoubleHash(s);
cout << "Hash1 of the whole string: " << hash1 << endl;
cout << "Hash2 of the whole string: " << hash2 << endl;
// 预处理双哈希的前缀哈希和基数的幂次
DoubleHashInfo info = precomputeDoubleHash(s);
// 获取子串的双哈希值
int l = 2, r = 7; // 子串s[2..7] = "ababca"
auto [subHash1, subHash2] = getSubDoubleHash(info, l, r);
cout << "Hash1 of substring s[" << l << ".." << r << "]: " << subHash1 << endl;
cout << "Hash2 of substring s[" << l << ".." << r << "]: " << subHash2 << endl;
// 验证子串的双哈希值是否正确
string sub = s.substr(l, r - l + 1);
auto [manualHash1, manualHash2] = computeDoubleHash(sub);
cout << "Manual hash1 of substring: " << manualHash1 << endl;
cout << "Manual hash2 of substring: " << manualHash2 << endl;
cout << "Are they equal? " << ((subHash1 == manualHash1 && subHash2 == manualHash2) ? "Yes" : "No") << endl;
return 0;
}双哈希技术的时间复杂度和空间复杂度与多项式滚动哈希相同,但可以显著降低哈希碰撞的概率。在IO竞赛中,双哈希技术被广泛应用于需要高精度字符串比较的场景。
哈希在字符串处理中有广泛的应用,主要包括:
Rabin-Karp算法的C++实现:
#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
using namespace std;
typedef long long ll;
const ll BASE = 911382629;
const ll MOD = 1e9 + 7;
// 预处理前缀哈希和基数的幂次
pair<vector<ll>, vector<ll>> precomputeHash(const string& s) {
int n = s.length();
vector<ll> prefixHash(n + 1, 0);
vector<ll> power(n + 1, 1);
for (int i = 0; i < n; i++) {
prefixHash[i + 1] = (prefixHash[i] * BASE + s[i]) % MOD;
power[i + 1] = (power[i] * BASE) % MOD;
}
return {prefixHash, power};
}
// 获取子串的哈希值\lll getSubHash(const vector<ll>& prefixHash, const vector<ll>& power, int l, int r) {
ll res = (prefixHash[r + 1] - prefixHash[l] * power[r - l + 1]) % MOD;
return res < 0 ? res + MOD : res;
}
// Rabin-Karp算法,在主串s中查找模式串p,返回所有匹配的起始位置
vector<int> rabinKarpSearch(const string& s, const string& p) {
int n = s.length();
int m = p.length();
vector<int> positions;
if (m == 0) {
for (int i = 0; i <= n; i++) {
positions.push_back(i);
}
return positions;
}
if (n < m) return positions;
// 预处理主串的前缀哈希和基数的幂次
auto [prefixHashS, power] = precomputeHash(s);
// 计算模式串的哈希值
auto [prefixHashP, _] = precomputeHash(p);
ll pHash = prefixHashP[m];
// 遍历主串,查找所有匹配的位置
for (int i = 0; i <= n - m; i++) {
ll sHash = getSubHash(prefixHashS, power, i, i + m - 1);
if (sHash == pHash) {
// 为了避免哈希碰撞,这里可以进行一次字符串比较
if (s.substr(i, m) == p) {
positions.push_back(i);
}
}
}
return positions;
}
int main() {
string s = "abababcababcabcabababd";
string p = "ababc";
vector<int> positions = rabinKarpSearch(s, p);
if (positions.empty()) {
cout << "Not found" << endl;
} else {
cout << "Found at positions: ";
for (int pos : positions) {
cout << pos << " ";
}
cout << endl;
}
return 0;
}Rabin-Karp算法的时间复杂度在最坏情况下为O(n*m),但在平均情况下性能很好,时间复杂度为O(n+m)。Rabin-Karp算法的空间复杂度为O(n),用于存储前缀哈希和基数的幂次。
字典树(Trie)是一种树形数据结构,用于高效地存储和检索字符串。字典树的每个节点代表一个字符,从根节点到任意节点的路径上的字符连接起来,形成一个字符串。字典树特别适合用于字符串的前缀匹配和自动补全。
字典树的基本特点:
字典树的实现可以使用数组或指针。对于字符集较小的情况(如仅包含小写字母),可以使用数组来存储子节点;对于字符集较大的情况,可以使用指针或哈希表来存储子节点。
使用数组实现的字典树:
#include <iostream>
#include <string>
#include <vector>
using namespace std;
const int ALPHABET_SIZE = 26; // 假设只包含小写字母
struct TrieNode {
TrieNode* children[ALPHABET_SIZE];
bool isEndOfWord;
int count; // 记录该字符串的出现次数
TrieNode() {
isEndOfWord = false;
count = 0;
for (int i = 0; i < ALPHABET_SIZE; i++) {
children[i] = nullptr;
}
}
};
class Trie {
private:
TrieNode* root;
// 递归删除节点
void deleteNode(TrieNode* node) {
if (node == nullptr) return;
for (int i = 0; i < ALPHABET_SIZE; i++) {
if (node->children[i] != nullptr) {
deleteNode(node->children[i]);
}
}
delete node;
}
// 递归查找以某个前缀开头的所有单词
void findWordsWithPrefix(TrieNode* node, string prefix, vector<string>& words) {
if (node == nullptr) return;
if (node->isEndOfWord) {
words.push_back(prefix);
}
for (int i = 0; i < ALPHABET_SIZE; i++) {
if (node->children[i] != nullptr) {
findWordsWithPrefix(node->children[i], prefix + (char)('a' + i), words);
}
}
}
public:
Trie() {
root = new TrieNode();
}
~Trie() {
deleteNode(root);
}
// 插入字符串
void insert(const string& word) {
TrieNode* current = root;
for (char c : word) {
int index = c - 'a';
if (current->children[index] == nullptr) {
current->children[index] = new TrieNode();
}
current = current->children[index];
}
current->isEndOfWord = true;
current->count++;
}
// 查找字符串
bool search(const string& word) {
TrieNode* node = searchPrefix(word);
return node != nullptr && node->isEndOfWord;
}
// 查找前缀
bool startsWith(const string& prefix) {
return searchPrefix(prefix) != nullptr;
}
// 获取字符串的出现次数
int getCount(const string& word) {
TrieNode* node = searchPrefix(word);
if (node != nullptr && node->isEndOfWord) {
return node->count;
}
return 0;
}
// 查找以某个前缀开头的所有单词
vector<string> getWordsWithPrefix(const string& prefix) {
vector<string> words;
TrieNode* node = searchPrefix(prefix);
if (node != nullptr) {
findWordsWithPrefix(node, prefix, words);
}
return words;
}
private:
// 查找前缀对应的节点
TrieNode* searchPrefix(const string& prefix) {
TrieNode* current = root;
for (char c : prefix) {
int index = c - 'a';
if (current->children[index] == nullptr) {
return nullptr;
}
current = current->children[index];
}
return current;
}
};
int main() {
Trie trie;
// 插入字符串
trie.insert("apple");
trie.insert("app");
trie.insert("application");
trie.insert("banana");
trie.insert("orange");
trie.insert("apple"); // 重复插入
// 查找字符串
cout << "Search 'apple': " << (trie.search("apple") ? "Found" : "Not found") << endl;
cout << "Search 'app': " << (trie.search("app") ? "Found" : "Not found") << endl;
cout << "Search 'appl': " << (trie.search("appl") ? "Found" : "Not found") << endl;
// 查找前缀
cout << "Starts with 'app': " << (trie.startsWith("app") ? "Yes" : "No") << endl;
cout << "Starts with 'ban': " << (trie.startsWith("ban") ? "Yes" : "No") << endl;
cout << "Starts with 'ora': " << (trie.startsWith("ora") ? "Yes" : "No") << endl;
cout << "Starts with 'orn': " << (trie.startsWith("orn") ? "Yes" : "No") << endl;
// 获取字符串的出现次数
cout << "Count of 'apple': " << trie.getCount("apple") << endl;
// 查找以某个前缀开头的所有单词
vector<string> words = trie.getWordsWithPrefix("app");
cout << "Words with prefix 'app': ";
for (const string& word : words) {
cout << word << " ";
}
cout << endl;
return 0;
}使用哈希表实现的字典树:
#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
using namespace std;
struct TrieNode {
unordered_map<char, TrieNode*> children;
bool isEndOfWord;
int count;
TrieNode() {
isEndOfWord = false;
count = 0;
}
};
class Trie {
private:
TrieNode* root;
// 递归删除节点
void deleteNode(TrieNode* node) {
if (node == nullptr) return;
for (auto& [c, child] : node->children) {
deleteNode(child);
}
delete node;
}
// 递归查找以某个前缀开头的所有单词
void findWordsWithPrefix(TrieNode* node, string prefix, vector<string>& words) {
if (node == nullptr) return;
if (node->isEndOfWord) {
words.push_back(prefix);
}
for (auto& [c, child] : node->children) {
findWordsWithPrefix(child, prefix + c, words);
}
}
public:
Trie() {
root = new TrieNode();
}
~Trie() {
deleteNode(root);
}
// 插入字符串
void insert(const string& word) {
TrieNode* current = root;
for (char c : word) {
if (current->children.find(c) == current->children.end()) {
current->children[c] = new TrieNode();
}
current = current->children[c];
}
current->isEndOfWord = true;
current->count++;
}
// 查找字符串
bool search(const string& word) {
TrieNode* node = searchPrefix(word);
return node != nullptr && node->isEndOfWord;
}
// 查找前缀
bool startsWith(const string& prefix) {
return searchPrefix(prefix) != nullptr;
}
// 获取字符串的出现次数
int getCount(const string& word) {
TrieNode* node = searchPrefix(word);
if (node != nullptr && node->isEndOfWord) {
return node->count;
}
return 0;
}
// 查找以某个前缀开头的所有单词
vector<string> getWordsWithPrefix(const string& prefix) {
vector<string> words;
TrieNode* node = searchPrefix(prefix);
if (node != nullptr) {
findWordsWithPrefix(node, prefix, words);
}
return words;
}
private:
// 查找前缀对应的节点
TrieNode* searchPrefix(const string& prefix) {
TrieNode* current = root;
for (char c : prefix) {
if (current->children.find(c) == current->children.end()) {
return nullptr;
}
current = current->children[c];
}
return current;
}
};
int main() {
Trie trie;
// 插入字符串
trie.insert("apple");
trie.insert("app");
trie.insert("application");
trie.insert("banana");
trie.insert("orange");
trie.insert("apple"); // 重复插入
// 查找字符串
cout << "Search 'apple': " << (trie.search("apple") ? "Found" : "Not found") << endl;
cout << "Search 'app': " << (trie.search("app") ? "Found" : "Not found") << endl;
cout << "Search 'appl': " << (trie.search("appl") ? "Found" : "Not found") << endl;
// 查找前缀
cout << "Starts with 'app': " << (trie.startsWith("app") ? "Yes" : "No") << endl;
cout << "Starts with 'ban': " << (trie.startsWith("ban") ? "Yes" : "No") << endl;
cout << "Starts with 'ora': " << (trie.startsWith("ora") ? "Yes" : "No") << endl;
cout << "Starts with 'orn': " << (trie.startsWith("orn") ? "Yes" : "No") << endl;
// 获取字符串的出现次数
cout << "Count of 'apple': " << trie.getCount("apple") << endl;
// 查找以某个前缀开头的所有单词
vector<string> words = trie.getWordsWithPrefix("app");
cout << "Words with prefix 'app': ";
for (const string& word : words) {
cout << word << " ";
}
cout << endl;
return 0;
}字典树的插入、查找和前缀匹配的时间复杂度均为O(L),其中L是字符串的长度。字典树的空间复杂度为O(N*L),其中N是插入的字符串数量,L是平均字符串长度。
字典树在字符串处理中有广泛的应用,主要包括:
使用字典树查找最长公共前缀:
#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
using namespace std;
struct TrieNode {
unordered_map<char, TrieNode*> children;
int count;
TrieNode() {
count = 0;
}
};
class Trie {
private:
TrieNode* root;
public:
Trie() {
root = new TrieNode();
}
~Trie() {
// 析构函数实现(省略)
}
// 插入字符串
void insert(const string& word) {
TrieNode* current = root;
for (char c : word) {
if (current->children.find(c) == current->children.end()) {
current->children[c] = new TrieNode();
}
current = current->children[c];
current->count++;
}
}
// 查找最长公共前缀
string findLongestCommonPrefix(int n) {
string prefix;
TrieNode* current = root;
while (true) {
// 检查当前节点的子节点数量
int childCount = 0;
char nextChar = '\0';
for (auto& [c, child] : current->children) {
if (child->count == n) { // 该子节点被所有n个字符串使用
childCount++;
nextChar = c;
}
}
if (childCount == 1) { // 只有一个子节点被所有字符串使用
prefix += nextChar;
current = current->children[nextChar];
} else {
break;
}
}
return prefix;
}
};
// 查找多个字符串的最长公共前缀
string longestCommonPrefix(const vector<string>& strs) {
if (strs.empty()) return "";
if (strs.size() == 1) return strs[0];
Trie trie;
for (const string& str : strs) {
trie.insert(str);
}
return trie.findLongestCommonPrefix(strs.size());
}
int main() {
vector<string> strs1 = {"flower", "flow", "flight"};
cout << "Longest common prefix: " << longestCommonPrefix(strs1) << endl; // 输出:fl
vector<string> strs2 = {"dog", "racecar", "car"};
cout << "Longest common prefix: " << longestCommonPrefix(strs2) << endl; // 输出:
vector<string> strs3 = {"apple", "app", "application"};
cout << "Longest common prefix: " << longestCommonPrefix(strs3) << endl; // 输出:app
return 0;
}字典树的主要缺点是空间消耗较大,特别是当字符集较大或字符串较长时。为了优化字典树的空间使用,可以采用以下方法:
压缩字典树的C++实现:
#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
using namespace std;
struct CompressedTrieNode {
unordered_map<string, CompressedTrieNode*> children;
bool isEndOfWord;
int count;
CompressedTrieNode() {
isEndOfWord = false;
count = 0;
}
};
class CompressedTrie {
private:
CompressedTrieNode* root;
// 递归删除节点
void deleteNode(CompressedTrieNode* node) {
if (node == nullptr) return;
for (auto& [s, child] : node->children) {
deleteNode(child);
}
delete node;
}
public:
CompressedTrie() {
root = new CompressedTrieNode();
}
~CompressedTrie() {
deleteNode(root);
}
// 插入字符串
void insert(const string& word) {
CompressedTrieNode* current = root;
int i = 0;
while (i < word.length()) {
bool found = false;
for (auto& [s, child] : current->children) {
int j = 0;
// 找到共同前缀的长度
while (i + j < word.length() && j < s.length() && word[i + j] == s[j]) {
j++;
}
if (j == 0) continue; // 没有共同前缀
if (j == s.length()) { // s是word的前缀
current = child;
i += j;
found = true;
break;
} else if (i + j == word.length()) { // word是s的前缀
// 分裂节点
CompressedTrieNode* newNode = new CompressedTrieNode();
newNode->children[s.substr(j)] = child;
newNode->isEndOfWord = true;
newNode->count++;
current->children.erase(s);
current->children[word.substr(i)] = newNode;
return;
} else { // 有共同前缀,但都不是对方的前缀
// 分裂节点
CompressedTrieNode* newNode = new CompressedTrieNode();
CompressedTrieNode* childNode = new CompressedTrieNode();
childNode->children[s.substr(j)] = child;
childNode->isEndOfWord = child->isEndOfWord;
childNode->count = child->count;
newNode->children[s.substr(j)] = childNode;
newNode->children[word.substr(i + j)] = new CompressedTrieNode();
newNode->children[word.substr(i + j)]->isEndOfWord = true;
newNode->children[word.substr(i + j)]->count++;
current->children.erase(s);
current->children[s.substr(0, j)] = newNode;
return;
}
}
if (!found) {
// 没有共同前缀,直接插入剩余部分
current->children[word.substr(i)] = new CompressedTrieNode();
current->children[word.substr(i)]->isEndOfWord = true;
current->children[word.substr(i)]->count++;
return;
}
}
// 已经存在该单词
current->isEndOfWord = true;
current->count++;
}
// 查找字符串
bool search(const string& word) {
CompressedTrieNode* current = root;
int i = 0;
while (i < word.length()) {
bool found = false;
for (auto& [s, child] : current->children) {
if (i + s.length() <= word.length() && word.substr(i, s.length()) == s) {
current = child;
i += s.length();
found = true;
break;
}
}
if (!found) {
return false;
}
}
return current->isEndOfWord;
}
// 查找前缀
bool startsWith(const string& prefix) {
CompressedTrieNode* current = root;
int i = 0;
while (i < prefix.length()) {
bool found = false;
for (auto& [s, child] : current->children) {
if (i + s.length() <= prefix.length() && prefix.substr(i, s.length()) == s) {
current = child;
i += s.length();
found = true;
break;
} else if (s.length() > prefix.length() - i && s.substr(0, prefix.length() - i) == prefix.substr(i)) {
return true;
}
}
if (!found) {
return false;
}
}
return true;
}
};
int main() {
CompressedTrie trie;
// 插入字符串
trie.insert("apple");
trie.insert("app");
trie.insert("application");
trie.insert("banana");
// 查找字符串
cout << "Search 'apple': " << (trie.search("apple") ? "Found" : "Not found") << endl;
cout << "Search 'app': " << (trie.search("app") ? "Found" : "Not found") << endl;
cout << "Search 'appl': " << (trie.search("appl") ? "Found" : "Not found") << endl;
// 查找前缀
cout << "Starts with 'app': " << (trie.startsWith("app") ? "Yes" : "No") << endl;
cout << "Starts with 'ban': " << (trie.startsWith("ban") ? "Yes" : "No") << endl;
cout << "Starts with 'ora': " << (trie.startsWith("ora") ? "Yes" : "No") << endl;
return 0;
}压缩字典树可以显著减少节点数量,特别是当插入的字符串有很多共同前缀时。但压缩字典树的实现较为复杂,且插入和查找操作的时间复杂度可能会增加,因为需要比较字符串的前缀。
后缀数组(Suffix Array)是一种用于字符串处理的数据结构,它包含了一个字符串的所有后缀的起始位置,并按照字典序排序。后缀数组可以用于快速解决许多字符串问题,如查找子串、查找最长公共前缀、查找最长重复子串等。
后缀数组的基本概念:
构建后缀数组的算法有很多种,包括朴素算法、倍增算法、DC3算法等。其中,倍增算法是一种较为常用的算法,它的时间复杂度为O(n log n)。
倍增算法的基本思想:
后缀数组的C++实现(倍增算法):
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
// 构建后缀数组
vector<int> buildSuffixArray(const string& s) {
int n = s.length();
vector<int> sa(n);
vector<int> rank(n);
vector<int> tmp(n);
// 初始排名
for (int i = 0; i < n; i++) {
sa[i] = i;
rank[i] = s[i];
}
// 倍增排序
for (int k = 1; k < n; k *= 2) {
// 按前k个字符排序
auto compare = [&](int i, int j) {
if (rank[i] != rank[j]) {
return rank[i] < rank[j];
}
int ri = (i + k < n) ? rank[i + k] : -1;
int rj = (j + k < n) ? rank[j + k] : -1;
return ri < rj;
};
sort(sa.begin(), sa.end(), compare);
// 更新排名
tmp[sa[0]] = 0;
for (int i = 1; i < n; i++) {
tmp[sa[i]] = tmp[sa[i - 1]] + (compare(sa[i - 1], sa[i]) ? 1 : 0);
}
rank.swap(tmp);
// 如果所有排名都不同,提前结束
bool allUnique = true;
for (int i = 1; i < n; i++) {
if (rank[sa[i]] == rank[sa[i - 1]]) {
allUnique = false;
break;
}
}
if (allUnique) break;
}
return sa;
}
// 构建最长公共前缀数组
vector<int> buildLCPArray(const string& s, const vector<int>& sa) {
int n = s.length();
vector<int> rank(n);
for (int i = 0; i < n; i++) {
rank[sa[i]] = i;
}
vector<int> lcp(n - 1, 0);
int h = 0;
for (int i = 0; i < n; i++) {
if (rank[i] == n - 1) {
h = 0;
continue;
}
int j = sa[rank[i] + 1];
while (i + h < n && j + h < n && s[i + h] == s[j + h]) {
h++;
}
lcp[rank[i]] = h;
if (h > 0) {
h--;
}
}
return lcp;
}
int main() {
string s = "banana";
// 构建后缀数组
vector<int> sa = buildSuffixArray(s);
// 输出后缀数组
cout << "Suffix Array: " << endl;
for (int i = 0; i < sa.size(); i++) {
cout << "SA[" << i << "] = " << sa[i] << ", suffix: " << s.substr(sa[i]) << endl;
}
// 构建最长公共前缀数组
vector<int> lcp = buildLCPArray(s, sa);
// 输出最长公共前缀数组
cout << "LCP Array: " << endl;
for (int i = 0; i < lcp.size(); i++) {
cout << "LCP[" << i << "] = " << lcp[i] << ", between " << s.substr(sa[i]) << " and " << s.substr(sa[i+1]) << endl;
}
return 0;
}后缀数组的构建时间复杂度为O(n log n),其中n是字符串的长度。最长公共前缀数组的构建时间复杂度为O(n)。后缀数组的空间复杂度为O(n)。
后缀数组在字符串处理中有广泛的应用,主要包括:
使用后缀数组查找最长重复子串:
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
// 构建后缀数组
vector<int> buildSuffixArray(const string& s) {
int n = s.length();
vector<int> sa(n);
vector<int> rank(n);
vector<int> tmp(n);
// 初始排名
for (int i = 0; i < n; i++) {
sa[i] = i;
rank[i] = s[i];
}
// 倍增排序
for (int k = 1; k < n; k *= 2) {
// 按前k个字符排序
auto compare = [&](int i, int j) {
if (rank[i] != rank[j]) {
return rank[i] < rank[j];
}
int ri = (i + k < n) ? rank[i + k] : -1;
int rj = (j + k < n) ? rank[j + k] : -1;
return ri < rj;
};
sort(sa.begin(), sa.end(), compare);
// 更新排名
tmp[sa[0]] = 0;
for (int i = 1; i < n; i++) {
tmp[sa[i]] = tmp[sa[i - 1]] + (compare(sa[i - 1], sa[i]) ? 1 : 0);
}
rank.swap(tmp);
// 如果所有排名都不同,提前结束
bool allUnique = true;
for (int i = 1; i < n; i++) {
if (rank[sa[i]] == rank[sa[i - 1]]) {
allUnique = false;
break;
}
}
if (allUnique) break;
}
return sa;
}
// 构建最长公共前缀数组
vector<int> buildLCPArray(const string& s, const vector<int>& sa) {
int n = s.length();
vector<int> rank(n);
for (int i = 0; i < n; i++) {
rank[sa[i]] = i;
}
vector<int> lcp(n - 1, 0);
int h = 0;
for (int i = 0; i < n; i++) {
if (rank[i] == n - 1) {
h = 0;
continue;
}
int j = sa[rank[i] + 1];
while (i + h < n && j + h < n && s[i + h] == s[j + h]) {
h++;
}
lcp[rank[i]] = h;
if (h > 0) {
h--;
}
}
return lcp;
}
// 查找最长重复子串
string findLongestRepeatedSubstring(const string& s) {
int n = s.length();
if (n == 0) return "";
// 构建后缀数组和最长公共前缀数组
vector<int> sa = buildSuffixArray(s);
vector<int> lcp = buildLCPArray(s, sa);
// 查找最长公共前缀
int maxLen = 0;
int maxIndex = 0;
for (int i = 0; i < lcp.size(); i++) {
if (lcp[i] > maxLen) {
maxLen = lcp[i];
maxIndex = i;
}
}
if (maxLen == 0) return "";
// 返回最长重复子串
return s.substr(sa[maxIndex], maxLen);
}
int main() {
string s1 = "banana";
cout << "Longest repeated substring of '" << s1 << "': '" << findLongestRepeatedSubstring(s1) << "'" << endl; // 输出:ana
string s2 = "abababcababcabcabababd";
cout << "Longest repeated substring of '" << s2 << "': '" << findLongestRepeatedSubstring(s2) << "'" << endl;
string s3 = "abcdefg";
cout << "Longest repeated substring of '" << s3 << "': '" << findLongestRepeatedSubstring(s3) << "'" << endl; // 输出:(空字符串)
return 0;
}使用后缀数组查找子串:
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
// 构建后缀数组
vector<int> buildSuffixArray(const string& s) {
// 实现与前面相同(省略)
int n = s.length();
vector<int> sa(n);
vector<int> rank(n);
vector<int> tmp(n);
// 初始排名
for (int i = 0; i < n; i++) {
sa[i] = i;
rank[i] = s[i];
}
// 倍增排序
for (int k = 1; k < n; k *= 2) {
// 按前k个字符排序
auto compare = [&](int i, int j) {
if (rank[i] != rank[j]) {
return rank[i] < rank[j];
}
int ri = (i + k < n) ? rank[i + k] : -1;
int rj = (j + k < n) ? rank[j + k] : -1;
return ri < rj;
};
sort(sa.begin(), sa.end(), compare);
// 更新排名
tmp[sa[0]] = 0;
for (int i = 1; i < n; i++) {
tmp[sa[i]] = tmp[sa[i - 1]] + (compare(sa[i - 1], sa[i]) ? 1 : 0);
}
rank.swap(tmp);
// 如果所有排名都不同,提前结束
bool allUnique = true;
for (int i = 1; i < n; i++) {
if (rank[sa[i]] == rank[sa[i - 1]]) {
allUnique = false;
break;
}
}
if (allUnique) break;
}
return sa;
}
// 在后缀数组中查找子串
bool searchSubstring(const string& s, const vector<int>& sa, const string& p) {
int n = s.length();
int m = p.length();
if (m == 0) return true;
if (n < m) return false;
// 二分查找
int left = 0, right = n - 1;
while (left <= right) {
int mid = (left + right) / 2;
int pos = sa[mid];
// 比较子串
int i = 0;
while (i < m && pos + i < n && s[pos + i] == p[i]) {
i++;
}
if (i == m) {
return true; // 找到子串
} else if (i < m && pos + i < n && s[pos + i] < p[i]) {
left = mid + 1; // 子串在右半部分
} else {
right = mid - 1; // 子串在左半部分
}
}
return false; // 未找到子串
}
int main() {
string s = "banana";
vector<int> sa = buildSuffixArray(s);
string p1 = "ana";
cout << "Is '" << p1 << "' a substring of '" << s << "'? " << (searchSubstring(s, sa, p1) ? "Yes" : "No") << endl; // 输出:Yes
string p2 = "nan";
cout << "Is '" << p2 << "' a substring of '" << s << "'? " << (searchSubstring(s, sa, p2) ? "Yes" : "No") << endl; // 输出:Yes
string p3 = "xyz";
cout << "Is '" << p3 << "' a substring of '" << s << "'? " << (searchSubstring(s, sa, p3) ? "Yes" : "No") << endl; // 输出:No
return 0;
}最长回文子串问题是字符串处理中的经典问题,它要求找出一个字符串中最长的回文子串。回文串是指从左到右读和从右到左读都一样的字符串。
朴素算法:枚举所有可能的回文中心,然后向两边扩展,时间复杂度为O(n^2)。
Manacher算法:由Glenn K. Manacher于1975年提出的一种高效算法,它可以在O(n)时间内找到最长回文子串。
Manacher算法的核心思想是利用已知的回文信息来避免重复计算。具体来说,它通过预处理字符串,将所有的回文子串(包括奇数长度和偶数长度)都转化为奇数长度的回文子串,然后利用回文的对称性来加速计算。
Manacher算法的基本步骤:
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;
// Manacher算法,返回最长回文子串
string manacher(const string& s) {
int n = s.length();
if (n == 0) return "";
// 预处理字符串,在每个字符之间插入'#'
string t = "#";
for (char c : s) {
t += c;
t += "#";
}
int m = t.length();
vector<int> P(m, 0); // P[i]表示以i为中心的最长回文子串的半径
int C = 0; // 当前已知的最长回文子串的中心位置
int R = 0; // 当前已知的最长回文子串的右边界位置
int maxLen = 0; // 最长回文子串的长度
int maxCenter = 0; // 最长回文子串的中心位置
for (int i = 0; i < m; i++) {
// 利用对称性,计算P[i]的初始值
int mirror = 2 * C - i;
if (i < R) {
P[i] = min(P[mirror], R - i);
}
// 向两边扩展
int left = i - (P[i] + 1);
int right = i + (P[i] + 1);
while (left >= 0 && right < m && t[left] == t[right]) {
P[i]++;
left--;
right++;
}
// 更新C和R
if (i + P[i] > R) {
C = i;
R = i + P[i];
}
// 更新最长回文子串
if (P[i] > maxLen) {
maxLen = P[i];
maxCenter = i;
}
}
// 计算原字符串中的最长回文子串
int start = (maxCenter - maxLen) / 2;
return s.substr(start, maxLen);
}
int main() {
string s1 = "babad";
cout << "Longest palindromic substring of '" << s1 << "': '" << manacher(s1) << "'" << endl; // 输出:bab或aba
string s2 = "cbbd";
cout << "Longest palindromic substring of '" << s2 << "': '" << manacher(s2) << "'" << endl; // 输出:bb
string s3 = "a";
cout << "Longest palindromic substring of '" << s3 << "': '" << manacher(s3) << "'" << endl; // 输出:a
string s4 = "";
cout << "Longest palindromic substring of '" << s4 << "': '" << manacher(s4) << "'" << endl; // 输出:(空字符串)
string s5 = "abababcababcabcabababd";
cout << "Longest palindromic substring of '" << s5 << "': '" << manacher(s5) << "'" << endl;
return 0;
}Manacher算法的时间复杂度为O(n),空间复杂度为O(n)。Manacher算法是目前解决最长回文子串问题的最有效算法之一。
AC自动机(Aho-Corasick automaton)是一种用于多模式字符串匹配的高效算法,它由Alfred V. Aho和Margaret J. Corasick于1975年提出。AC自动机结合了字典树和KMP算法的思想,可以在O(n + m + z)的时间内完成多模式字符串匹配,其中n是主串的长度,m是所有模式串的长度之和,z是匹配的数量。
AC自动机的基本思想:
AC自动机的构建主要包括两个步骤:构建字典树和构建失败指针。
构建字典树:与普通字典树的构建方法相同,将所有的模式串插入到字典树中。
构建失败指针:使用广度优先搜索(BFS)的方式构建失败指针。对于字典树中的每个节点,其失败指针指向的节点表示当前节点的最长后缀,该后缀是某个模式串的前缀。
AC自动机的C++实现:
#include <iostream>
#include <string>
#include <vector>
#include <queue>
#include <unordered_map>
#include <unordered_set>
using namespace std;
const int ALPHABET_SIZE = 26; // 假设只包含小写字母
struct ACNode {
ACNode* children[ALPHABET_SIZE];
ACNode* fail; // 失败指针
bool isEndOfWord; // 是否是某个模式串的结尾
vector<int> patternIndices; // 存储以该节点结尾的所有模式串的索引
ACNode() {
fail = nullptr;
isEndOfWord = false;
for (int i = 0; i < ALPHABET_SIZE; i++) {
children[i] = nullptr;
}
}
};
class ACAutomaton {
private:
ACNode* root;
vector<string> patterns; // 存储所有的模式串
// 递归删除节点
void deleteNode(ACNode* node) {
if (node == nullptr) return;
for (int i = 0; i < ALPHABET_SIZE; i++) {
if (node->children[i] != nullptr) {
deleteNode(node->children[i]);
}
}
delete node;
}
public:
ACAutomaton() {
root = new ACNode();
}
~ACAutomaton() {
deleteNode(root);
}
// 插入模式串
void insert(const string& pattern) {
ACNode* current = root;
for (char c : pattern) {
int index = c - 'a';
if (current->children[index] == nullptr) {
current->children[index] = new ACNode();
}
current = current->children[index];
}
current->isEndOfWord = true;
current->patternIndices.push_back(patterns.size());
patterns.push_back(pattern);
}
// 构建失败指针
void buildFailurePointers() {
queue<ACNode*> q;
// 根节点的所有直接子节点的失败指针指向根节点
for (int i = 0; i < ALPHABET_SIZE; i++) {
if (root->children[i] != nullptr) {
root->children[i]->fail = root;
q.push(root->children[i]);
} else {
root->children[i] = root; // 简化处理,将空指针指向根节点
}
}
// 使用BFS构建失败指针
while (!q.empty()) {
ACNode* current = q.front();
q.pop();
for (int i = 0; i < ALPHABET_SIZE; i++) {
ACNode* child = current->children[i];
if (child != nullptr && child != root) { // 确保不是根节点(因为有些空指针被指向了根节点)
ACNode* failNode = current->fail;
while (failNode->children[i] == nullptr || failNode->children[i] == root) { // 查找失败指针
failNode = failNode->fail;
if (failNode == nullptr) { // 如果到达根节点,则失败指针指向根节点
failNode = root;
break;
}
}
child->fail = failNode->children[i];
// 合并以该节点结尾的所有模式串的索引
for (int idx : child->fail->patternIndices) {
child->patternIndices.push_back(idx);
}
q.push(child);
}
}
}
}
// 在主串中查找所有模式串的出现位置
vector<pair<int, int>> search(const string& text) {
vector<pair<int, int>> matches; // 存储匹配的模式串索引和位置
ACNode* current = root;
for (int i = 0; i < text.length(); i++) {
int index = text[i] - 'a';
// 查找可以转移的节点
while (current->children[index] == nullptr || current->children[index] == root) {
current = current->fail;
if (current == nullptr) { // 如果到达根节点,则转移到根节点
current = root;
break;
}
}
current = current->children[index];
// 检查是否有匹配的模式串
if (current->isEndOfWord) {
for (int idx : current->patternIndices) {
int startPos = i - patterns[idx].length() + 1;
matches.emplace_back(idx, startPos);
}
}
}
return matches;
}
// 获取模式串
string getPattern(int index) {
if (index >= 0 && index < patterns.size()) {
return patterns[index];
}
return "";
}
};
int main() {
ACAutomaton ac;
// 插入模式串
ac.insert("he");
ac.insert("she");
ac.insert("his");
ac.insert("hers");
// 构建失败指针
ac.buildFailurePointers();
// 在主串中查找所有模式串的出现位置
string text = "ushershisthe";
vector<pair<int, int>> matches = ac.search(text);
// 输出匹配结果
cout << "Matches in '" << text << "':" << endl;
for (auto [patternIndex, pos] : matches) {
string pattern = ac.getPattern(patternIndex);
cout << "Pattern '" << pattern << "' found at position " << pos << endl;
}
return 0;
}AC自动机的构建时间复杂度为O(m),其中m是所有模式串的长度之和。在主串中查找所有模式串的出现位置的时间复杂度为O(n + z),其中n是主串的长度,z是匹配的数量。AC自动机的空间复杂度为O(m)。
AC自动机在多模式字符串匹配中有广泛的应用,主要包括:
使用AC自动机进行敏感词过滤:
#include <iostream>
#include <string>
#include <vector>
#include <queue>
#include <unordered_map>
#include <unordered_set>
using namespace std;
const int ALPHABET_SIZE = 26; // 假设只包含小写字母
struct ACNode {
ACNode* children[ALPHABET_SIZE];
ACNode* fail;
bool isEndOfWord;
ACNode() {
fail = nullptr;
isEndOfWord = false;
for (int i = 0; i < ALPHABET_SIZE; i++) {
children[i] = nullptr;
}
}
};
class ACAutomaton {
private:
ACNode* root;
// 递归删除节点
void deleteNode(ACNode* node) {
if (node == nullptr) return;
for (int i = 0; i < ALPHABET_SIZE; i++) {
if (node->children[i] != nullptr) {
deleteNode(node->children[i]);
}
}
delete node;
}
public:
ACAutomaton() {
root = new ACNode();
}
~ACAutomaton() {
deleteNode(root);
}
// 插入模式串
void insert(const string& pattern) {
ACNode* current = root;
for (char c : pattern) {
int index = c - 'a';
if (current->children[index] == nullptr) {
current->children[index] = new ACNode();
}
current = current->children[index];
}
current->isEndOfWord = true;
}
// 构建失败指针
void buildFailurePointers() {
queue<ACNode*> q;
// 根节点的所有直接子节点的失败指针指向根节点
for (int i = 0; i < ALPHABET_SIZE; i++) {
if (root->children[i] != nullptr) {
root->children[i]->fail = root;
q.push(root->children[i]);
} else {
root->children[i] = root; // 简化处理,将空指针指向根节点
}
}
// 使用BFS构建失败指针
while (!q.empty()) {
ACNode* current = q.front();
q.pop();
for (int i = 0; i < ALPHABET_SIZE; i++) {
ACNode* child = current->children[i];
if (child != nullptr && child != root) { // 确保不是根节点(因为有些空指针被指向了根节点)
ACNode* failNode = current->fail;
while (failNode->children[i] == nullptr || failNode->children[i] == root) { // 查找失败指针
failNode = failNode->fail;
if (failNode == nullptr) { // 如果到达根节点,则失败指针指向根节点
failNode = root;
break;
}
}
child->fail = failNode->children[i];
// 合并结束标记
if (child->fail->isEndOfWord) {
child->isEndOfWord = true;
}
q.push(child);
}
}
}
}
// 过滤敏感词,将敏感词替换为*号
string filterSensitiveWords(const string& text) {
string result = text;
ACNode* current = root;
for (int i = 0; i < text.length(); i++) {
int index = text[i] - 'a';
// 查找可以转移的节点
while (current->children[index] == nullptr || current->children[index] == root) {
current = current->fail;
if (current == nullptr) { // 如果到达根节点,则转移到根节点
current = root;
break;
}
}
current = current->children[index];
// 检查是否有敏感词
if (current->isEndOfWord) {
// 查找敏感词的起始位置
ACNode* temp = current;
int len = 0;
while (temp != root && temp->isEndOfWord) {
len++;
// 这里简化处理,实际应用中需要记录敏感词的长度
// 或者在AC自动机中存储每个敏感词的长度
temp = temp->fail;
}
// 替换敏感词为*号
for (int j = 0; j < len; j++) {
if (i - j >= 0) {
result[i - j] = '*';
}
}
}
}
return result;
}
};
int main() {
ACAutomaton ac;
// 插入敏感词
ac.insert("bad");
ac.insert("evil");
ac.insert("ugly");
// 构建失败指针
ac.buildFailurePointers();
// 过滤敏感词
string text = "This is a bad example, it's evil and ugly.";
string filteredText = ac.filterSensitiveWords(text);
cout << "Original text: " << text << endl;
cout << "Filtered text: " << filteredText << endl;
return 0;
}后缀自动机(Suffix Automaton,SAM)是一种用于字符串处理的强大数据结构,它可以在线性时间内解决许多字符串问题。后缀自动机是一个能够表示字符串的所有后缀的确定有限状态自动机(DFA)。
后缀自动机的基本概念:
后缀自动机的构建是一个在线过程,可以逐个字符地构建。每次添加一个字符时,需要创建一个新的状态,并更新相关的转移和后缀链接。
后缀自动机的C++实现:
#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
using namespace std;
struct State {
unordered_map<char, int> next; // 转移函数
int len; // 该状态对应的最长子串的长度
int link; // 后缀链接
State(int _len = 0, int _link = -1) : len(_len), link(_link) {}
};
class SuffixAutomaton {
private:
vector<State> states;
int last; // 最后一个状态的索引
public:
SuffixAutomaton() {
states.emplace_back(); // 添加初始状态
last = 0;
}
// 添加一个字符
void extend(char c) {
int cur = states.size();
states.emplace_back(states[last].len + 1);
int p = last;
// 更新转移函数
while (p != -1 && states[p].next.find(c) == states[p].next.end()) {
states[p].next[c] = cur;
p = states[p].link;
}
if (p == -1) {
// 没有找到相同的转移,后缀链接指向初始状态
states[cur].link = 0;
} else {
int q = states[p].next[c];
if (states[p].len + 1 == states[q].len) {
// q已经是p通过c转移后的正确状态
states[cur].link = q;
} else {
// 需要分裂状态q
int clone = states.size();
states.push_back(states[q]);
states[clone].len = states[p].len + 1;
// 更新q和cur的后缀链接
while (p != -1 && states[p].next[c] == q) {
states[p].next[c] = clone;
p = states[p].link;
}
states[q].link = clone;
states[cur].link = clone;
}
}
last = cur;
}
// 构建后缀自动机
void build(const string& s) {
for (char c : s) {
extend(c);
}
}
// 检查子串是否存在
bool contains(const string& t) {
int state = 0;
for (char c : t) {
if (states[state].next.find(c) == states[state].next.end()) {
return false;
}
state = states[state].next[c];
}
return true;
}
// 统计不同子串的数量
long long countDistinctSubstrings() {
long long count = 0;
for (int i = 1; i < states.size(); i++) {
count += states[i].len - states[states[i].link].len;
}
return count;
}
// 获取状态数量
int size() const {
return states.size();
}
};
int main() {
string s = "banana";
SuffixAutomaton sam;
sam.build(s);
cout << "Number of distinct substrings: " << sam.countDistinctSubstrings() << endl;
// 检查子串是否存在
vector<string> testCases = {"ana", "nan", "ban", "xyz", "an", "a", ""};
for (const string& t : testCases) {
cout << "'" << t << "' is " << (sam.contains(t) ? "a substring" : "not a substring") << " of '" << s << "'" << endl;
}
return 0;
}后缀自动机的构建时间复杂度为O(n),其中n是字符串的长度。后缀自动机的空间复杂度为O(n)。后缀自动机是一种非常高效的数据结构,可以在线性时间内解决许多字符串问题。
后缀自动机在字符串处理中有广泛的应用,主要包括:
使用后缀自动机查找最长公共子串:
#include <iostream>
#include <string>
#include <vector>
#include <unordered_map>
using namespace std;
struct State {
unordered_map<char, int> next;
int len;
int link;
State(int _len = 0, int _link = -1) : len(_len), link(_link) {}
};
class SuffixAutomaton {
private:
vector<State> states;
int last;
public:
SuffixAutomaton() {
states.emplace_back();
last = 0;
}
void extend(char c) {
int cur = states.size();
states.emplace_back(states[last].len + 1);
int p = last;
while (p != -1 && states[p].next.find(c) == states[p].next.end()) {
states[p].next[c] = cur;
p = states[p].link;
}
if (p == -1) {
states[cur].link = 0;
} else {
int q = states[p].next[c];
if (states[p].len + 1 == states[q].len) {
states[cur].link = q;
} else {
int clone = states.size();
states.push_back(states[q]);
states[clone].len = states[p].len + 1;
while (p != -1 && states[p].next[c] == q) {
states[p].next[c] = clone;
p = states[p].link;
}
states[q].link = clone;
states[cur].link = clone;
}
}
last = cur;
}
void build(const string& s) {
for (char c : s) {
extend(c);
}
}
// 查找最长公共子串
pair<string, int> findLongestCommonSubstring(const string& t) {
int maxLen = 0;
int endPos = -1;
int state = 0;
int len = 0;
for (int i = 0; i < t.length(); i++) {
char c = t[i];
while (state != 0 && states[state].next.find(c) == states[state].next.end()) {
state = states[state].link;
len = states[state].len;
}
if (states[state].next.find(c) != states[state].next.end()) {
state = states[state].next[c];
len++;
}
if (len > maxLen) {
maxLen = len;
endPos = i;
}
}
if (maxLen == 0) {
return {"", 0};
}
return {t.substr(endPos - maxLen + 1, maxLen), maxLen};
}
};
// 查找两个字符串的最长公共子串
pair<string, int> longestCommonSubstring(const string& s, const string& t) {
SuffixAutomaton sam;
sam.build(s);
return sam.findLongestCommonSubstring(t);
}
int main() {
string s1 = "banana";
string s2 = "ananas";
auto [lcs, length] = longestCommonSubstring(s1, s2);
cout << "Longest common substring between '" << s1 << "' and '" << s2 << "': '" << lcs << "' (length: " << length << ")" << endl;
string s3 = "abababcababcabcabababd";
string s4 = "abcabababcabab";
auto [lcs2, length2] = longestCommonSubstring(s3, s4);
cout << "Longest common substring between '" << s3 << "' and '" << s4 << "': '" << lcs2 << "' (length: " << length2 << ")" << endl;
return 0;
}在IO竞赛中,字符串算法的应用非常广泛,常见的题型包括:
在IO竞赛中,掌握一些字符串算法的实战技巧可以帮助我们更高效地解决问题:
要想熟练掌握字符串算法,需要进行大量的实战训练: