
  • 后缀树
  • 后缀数组
    • 概念
    • sa[]
    • rk[]
    • height[]
  • 例题
    • HDU-1403最长公共子串
    • 洛谷P2408 不同子串个数
    • HDU-5769Substring




后缀树(suffix tree)就是把所有的后缀子串用字典树的方法建立的一棵树,如图:



#include <cstdio>
#include <cstring>
using namespace std;

const int maxn = 100005;
// Child table of the suffix trie: trie[p][c] is the node reached from node p
// on letter 'a' + c, or 0 when that edge does not exist. Node 0 is the root.
int trie[maxn][26];
int pos = 1, n;         // pos: next unused node index; n: length of s
char s[maxn], t[maxn];  // s: the indexed text; t: the pattern looked up by find()

// Inserts the suffix s[idx..n) into the trie, creating nodes as needed.
// Only lowercase letters are supported; insertion stops at the first other
// character. A text of length n can need up to n*(n+1)/2 nodes, far more than
// maxn for long inputs, so insertion also stops once the node pool is
// exhausted instead of writing past the end of trie[].
void insert(int idx) {
    int p = 0;
    for (int i = idx; i < n; i++) {
        int u = s[i] - 'a';
        if (u < 0 || u >= 26)
            return;
        if (trie[p][u] == 0) {
            if (pos >= maxn)
                return;  // node pool exhausted
            trie[p][u] = pos++;
        }
        p = trie[p][u];
    }
}

// Returns true when the pattern t is a substring of the indexed text, i.e.
// when t can be spelled by walking down from the root of the suffix trie.
// The walk follows t (not s): every substring is a prefix of some suffix.
bool find() {
    int p = 0;
    for (int i = 0; t[i]; i++) {
        int u = t[i] - 'a';
        if (u < 0 || u >= 26)
            return false;  // characters outside 'a'..'z' are never stored
        if (trie[p][u] == 0)
            return false;
        p = trie[p][u];
    }
    return true;
}
int main() {scanf("%s%s", s,t);n = strlen(s);for (int i = 0; i < n; i++) {//枚举起点insert(i);}printf("%s子串", find() ? "是" : "不是");return 0;




直接对后缀树构造和编程不太方便,而后缀数组(suffix array)就是更简单的替代方法。

下标i 后缀s[i] 下标j 字典序 后缀数组sa[j]
0 aabab 0 aabab 0
1 abab 1 ab 3
2 bab 2 abab 1
3 ab 3 b 4
4 b 4 bab 2

后缀数组就是字典序对应的后缀下标,即 sa(suffix array 缩写)数组。比如 sa[1]=3,表示字典序排名为 1 的后缀,是原字符串中从下标 3 开始的后缀,即 ab。


#include <iostream>
#include <string>
using namespace std;

string s, t;
int sa[] = { 0,3,1,4,2 };  // suffix array of "aabab", assumed to be precomputed

// Returns a start position of t inside s, or -1 when t does not occur.
// Binary-searches the suffixes in lexicographic order (through sa[]),
// comparing t against the first t.length() characters of each suffix.
// sa[] must be the suffix array of the current s.
int find() {
    if (s.empty())
        return -1;
    // r starts at the last valid rank; starting at s.size() would let the
    // final checks read sa[] one past its end when t is above every suffix.
    int l = 0, r = static_cast<int>(s.size()) - 1;
    while (r > l + 1) {
        int mid = (l + r) / 2;
        if (s.compare(sa[mid], t.length(), t) < 0)
            l = mid;  // suffix prefix sorts before t: answer is to the right
        else
            r = mid;  // suffix prefix is >= t: answer is here or to the left
    }
    if (s.compare(sa[r], t.length(), t) == 0)
        return sa[r];  // map the rank back to a position in s
    if (s.compare(sa[l], t.length(), t) == 0)
        return sa[l];
    return -1;
}
// Demo: looks up "ba" in "aabab" using the precomputed suffix array and
// prints the position that find() returns.
int main() {
    s = "aabab";
    t = "ba";
    cout << find();
    return 0;
}




  1. 用数字替代字母,如a=0,b=1。
  2. 连续两个数字组合,如00代表aa,01代表ab,最后一个1没有后续,在尾部加上0,组成10,并不影响字符的比较。
  3. 连续4个数字组合,如0010代表aaba,同样的01和10没有后续,补0。
  4. 得到5个完全不一样的数字,可以区分大小了,进行排序,得到rk数组={0,2,4,1,3}。
  5. 最后通过排名得到后缀数组sa[]={0,3,1,4,2}。
步骤 a a b a b
第一步 0 0 1 0 1
第二步 00 01 10 01 10
第三步 0010 0101 1010 0100 1000
下标i 0 1 2 3 4
排序rk[i] 0 2 4 1 3
转换sa[i] sa[0]=0 sa[2]=1 sa[4]=2 sa[1]=3 sa[3]=4
sa[i] 0 3 1 4 2


步骤 a a b a b
第一步 0 0 1 0 1
第二步 00 01 10 01 10
排序rk[] 0 1 2 1 2
第三步 02 11 22 10 20
下标i 0 1 2 3 4
排序rk[i] 0 2 4 1 3
转换sa[i] sa[0]=0 sa[2]=1 sa[4]=2 sa[1]=3 sa[3]=4
sa[i] 0 3 1 4 2



  • sa[]后缀数组,suffix array缩写,记录的是位置,是字典序排名第i的是谁。
  • rk[]排名数组,rank array缩写,记录的是排名,是第i个后缀子串排名第几。


#include <algorithm>
#include <cstring>
#include <iostream>
using namespace std;

const int maxn = 200005;
char s[maxn];
int sa[maxn], rk[maxn], tmp[maxn + 1];
int n, k;  // n: length of s; k: current half-length used by cmp_sa

// Orders suffixes i and j by their first 2k characters, using the ranks of
// the length-k blocks stored in rk[]: the pair (rk[i], rk[i + k]) is compared
// lexicographically. A second block that starts at or past the end of the
// string is empty and gets key -1, below every real rank, so a suffix that is
// a proper prefix of another always sorts first.
bool cmp_sa(int i, int j) {
    if (rk[i] != rk[j])  // high part of the combined key
        return rk[i] < rk[j];
    int ri = i + k < n ? rk[i + k] : -1;  // low part of the combined key
    int rj = j + k < n ? rk[j + k] : -1;
    return ri < rj;
}

// Builds the suffix array of s[0..n) into sa[0..n) by prefix doubling with
// std::sort. After the call sa[r] is the start index of the suffix of rank r
// (0-based) and rk[i] is the rank of suffix i.
void calc_sa() {
    for (int i = 0; i < n; i++) {
        rk[i] = static_cast<unsigned char>(s[i]);  // initial rank: the character itself
        sa[i] = i;
    }
    for (k = 1; k <= n; k *= 2) {  // block length doubles every round
        sort(sa, sa + n, cmp_sa);
        // cmp_sa still needs the previous ranks while the new ones are being
        // derived, so the new ranks are staged in tmp[] first.
        tmp[sa[0]] = 0;
        for (int i = 1; i < n; i++)
            tmp[sa[i]] = tmp[sa[i - 1]] + (cmp_sa(sa[i - 1], sa[i]) ? 1 : 0);
        for (int i = 0; i < n; i++)
            rk[i] = tmp[i];
    }
}
// Demo: builds the suffix array of "aabab" and prints it.
int main() {
    memcpy(s, "aabab", 6);
    n = strlen(s);
    calc_sa();
    for (int i = 0; i < n; i++)
        cout << sa[i] << " ";
    // expected output: 0 3 1 4 2
    return 0;
}



格子 0 1 2 3 4 5 6 7 8 9
个位 40 91 82,52 43 67
十位 40,43 52 67 82 91


#include <algorithm>
#include <cstring>
#include <iostream>
using namespace std;

const int maxn = 200005;
char s[maxn];
int sa[maxn], rk[maxn];
int cnt[maxn], t1[maxn], t2[maxn];  // cnt: radix buckets; t1/t2: rank buffers
int n, k;

// Builds the suffix array of s[0..n) into sa[0..n) by prefix doubling with
// radix (counting) sort. x[] holds the rank of each suffix by its first k
// characters, y[] the suffix order by the second k-block.
// A second block that starts at or past n is empty and compares as -1, so no
// slot beyond n is ever read; the result is therefore correct with or without
// a sentinel character and does not depend on earlier contents of t1/t2.
void calc_sa() {
    int m = 256;  // number of distinct keys; initially every byte value
    int i, *x = t1, *y = t2;
    // Round 0: counting sort of the suffixes by their first character.
    for (i = 0; i < m; i++) cnt[i] = 0;
    for (i = 0; i < n; i++) cnt[x[i] = static_cast<unsigned char>(s[i])]++;
    for (i = 1; i < m; i++) cnt[i] += cnt[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--cnt[x[i]]] = i;

    for (k = 1; k <= n; k *= 2) {
        // Use the order by k characters to get the order by 2k characters.
        int p = 0;
        // Suffixes with an empty second block come first, ...
        for (i = n - k; i < n; i++) y[p++] = i;
        // ... the rest follow in the order of their second block.
        for (i = 0; i < n; i++)
            if (sa[i] >= k) y[p++] = sa[i] - k;
        // Stable counting sort on the first block keeps that secondary order.
        for (i = 0; i < m; i++) cnt[i] = 0;
        for (i = 0; i < n; i++) cnt[x[y[i]]]++;
        for (i = 1; i < m; i++) cnt[i] += cnt[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--cnt[x[y[i]]]] = y[i];

        // Re-rank: neighbours get the same rank only if both blocks match.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for (i = 1; i < n; i++) {
            int a = sa[i - 1], b = sa[i];
            int a2 = a + k < n ? y[a + k] : -1;
            int b2 = b + k < n ? y[b + k] : -1;
            x[b] = (y[a] == y[b] && a2 == b2) ? p - 1 : p++;
        }
        if (p >= n) break;  // all ranks distinct: the order is final
        m = p;
    }
}
// Demo: builds the suffix array of "aabab" with the radix-sort version and
// prints it.
int main() {
    memcpy(s, "aabab", 6);
    n = strlen(s);
    calc_sa();
    for (int i = 0; i < n; i++)
        cout << sa[i] << " ";
    // expected output: 0 3 1 4 2
    return 0;
}


height[] 是一个辅助数组,和最长公共前缀(Longest Common Prefix,LCP)相关。



void getheight(int n) { //n是字符串长度int k = 0;for (int i = 0; i < n; i++)rk[sa[i]] = i;for (int i = 0; i < n; i++) {if (k)k--;int j = sa[rk[i] - 1];while (i + k < n && j + k < n && s[i + k] == s[j + k])k++;height[rk[i]] = k;}

  1. 在串 s 中查找子串 t
  2. 在串 s 中找最长重复子串
  3. 找串 s1 和串 s2 的最长公共子串
  4. 找串s的最大回文子串


HDU-1403 Longest Common Substring

Given two strings, you have to tell the length of the Longest Common Substring of them.
For example:
str1 = banana
str2 = cianaic
So the Longest Common Substring is “ana”, and the length is 3.
The input contains several test cases. Each test case contains two strings, each string will have at most 100000 characters. All the characters are in lower-case.
Process to the end of file.
For each test case, you have to tell the length of the Longest Common Substring of them.
Sample Input
Sample Output

#include <algorithm>
#include <cstdio>
#include <cstring>
using namespace std;

const int maxn = 200005;
char s[maxn];
int sa[maxn], rk[maxn], height[maxn];
int cnt[maxn], t1[maxn], t2[maxn];  // cnt: radix buckets; t1/t2: rank buffers
int n, k;

// Builds the suffix array of s[0..n) into sa[0..n) by prefix doubling with
// radix (counting) sort. A second block that starts at or past n compares as
// -1, so no slot beyond n is read: the result does not depend on what an
// earlier, longer test case left behind in t1/t2.
void calc_sa() {
    int m = 256;  // number of distinct keys; initially every byte value
    int i, *x = t1, *y = t2;
    for (i = 0; i < m; i++) cnt[i] = 0;
    for (i = 0; i < n; i++) cnt[x[i] = static_cast<unsigned char>(s[i])]++;
    for (i = 1; i < m; i++) cnt[i] += cnt[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--cnt[x[i]]] = i;

    for (k = 1; k <= n; k *= 2) {
        int p = 0;
        // Order by second block: suffixes with an empty second block first.
        for (i = n - k; i < n; i++) y[p++] = i;
        for (i = 0; i < n; i++)
            if (sa[i] >= k) y[p++] = sa[i] - k;
        // Stable counting sort on the first block.
        for (i = 0; i < m; i++) cnt[i] = 0;
        for (i = 0; i < n; i++) cnt[x[y[i]]]++;
        for (i = 1; i < m; i++) cnt[i] += cnt[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--cnt[x[y[i]]]] = y[i];

        // Re-rank: neighbours share a rank only if both blocks match.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for (i = 1; i < n; i++) {
            int a = sa[i - 1], b = sa[i];
            int a2 = a + k < n ? y[a + k] : -1;
            int b2 = b + k < n ? y[b + k] : -1;
            x[b] = (y[a] == y[b] && a2 == b2) ? p - 1 : p++;
        }
        if (p >= n) break;  // all ranks distinct: the order is final
        m = p;
    }
}

// Fills rk[] and height[] from sa[] for the string s of length n.
// height[r] is the longest common prefix of the suffixes of rank r and r - 1;
// height[0] is 0.
void getheight(int n) {
    int k = 0;
    for (int i = 0; i < n; i++)
        rk[sa[i]] = i;
    for (int i = 0; i < n; i++) {
        if (rk[i] == 0) {
            // The smallest suffix has no predecessor; never read sa[-1].
            height[0] = 0;
            k = 0;
            continue;
        }
        if (k)
            k--;
        int j = sa[rk[i] - 1];  // suffix ranked just before suffix i
        while (i + k < n && j + k < n && s[i + k] == s[j + k])
            k++;
        height[rk[i]] = k;
    }
}
// HDU-1403: for every pair of input strings prints the length of their
// longest common substring. The two strings are joined as s1 + '$' + s2; the
// answer is the largest height[] between two rank-adjacent suffixes that
// start on opposite sides of the separator.
int main() {
    while (scanf("%s", s) == 1) {
        int len1 = static_cast<int>(strlen(s));
        s[len1] = '$';  // separator, smaller than every lowercase letter
        if (scanf("%s", s + len1 + 1) != 1)
            s[len1 + 1] = '\0';  // missing second string: treat it as empty
        n = static_cast<int>(strlen(s));
        calc_sa();
        getheight(n);
        int ans = 0;
        for (int i = 1; i < n; i++) {
            // Positions < len1 belong to the first string, the rest
            // (separator included) to the second.
            bool curInFirst = sa[i] < len1;
            bool prevInFirst = sa[i - 1] < len1;
            if (curInFirst != prevInFirst && height[i] > ans)
                ans = height[i];
        }
        printf("%d\n", ans);
    }
    return 0;
}


洛谷P2408 不同子串个数

P2408 不同子串个数

我们定义两个子串不同,当且仅当有这两个子串长度不一样 或者长度一样且有任意一位不一样。
输入 #1
输出 #1
输入 #2
输出 #2
(具体来说,C++和C选手请使用long long 类型,pascal选手请使用Int64)
由于输入文件过大,请使用 高效的读入方法(具体的,c++和c选手请不要使用cin,pascal选手不需要管)


#include <algorithm>
#include <cstdio>
#include <cstring>
using namespace std;

typedef long long ll;
const int maxn = 200005;
char s[maxn];
int sa[maxn], rk[maxn], height[maxn];
int cnt[maxn], t1[maxn], t2[maxn];  // cnt: radix buckets; t1/t2: rank buffers
int n, k;

// Builds the suffix array of s[0..n) into sa[0..n) by prefix doubling with
// radix (counting) sort. A second block that starts at or past n compares as
// -1, so no slot beyond n is read and the result is correct whether or not
// the caller counts a terminating sentinel in n.
void calc_sa() {
    int m = 256;  // number of distinct keys; initially every byte value
    int i, *x = t1, *y = t2;
    for (i = 0; i < m; i++) cnt[i] = 0;
    for (i = 0; i < n; i++) cnt[x[i] = static_cast<unsigned char>(s[i])]++;
    for (i = 1; i < m; i++) cnt[i] += cnt[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--cnt[x[i]]] = i;

    for (k = 1; k <= n; k *= 2) {
        int p = 0;
        // Order by second block: suffixes with an empty second block first.
        for (i = n - k; i < n; i++) y[p++] = i;
        for (i = 0; i < n; i++)
            if (sa[i] >= k) y[p++] = sa[i] - k;
        // Stable counting sort on the first block.
        for (i = 0; i < m; i++) cnt[i] = 0;
        for (i = 0; i < n; i++) cnt[x[y[i]]]++;
        for (i = 1; i < m; i++) cnt[i] += cnt[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--cnt[x[y[i]]]] = y[i];

        // Re-rank: neighbours share a rank only if both blocks match.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for (i = 1; i < n; i++) {
            int a = sa[i - 1], b = sa[i];
            int a2 = a + k < n ? y[a + k] : -1;
            int b2 = b + k < n ? y[b + k] : -1;
            x[b] = (y[a] == y[b] && a2 == b2) ? p - 1 : p++;
        }
        if (p >= n) break;  // all ranks distinct: the order is final
        m = p;
    }
}

// Fills rk[] and height[] from sa[] for the string s of length n.
// height[r] is the longest common prefix of the suffixes of rank r and r - 1;
// height[0] is 0.
void getheight(int n) {
    int k = 0;
    for (int i = 0; i < n; i++)
        rk[sa[i]] = i;
    for (int i = 0; i < n; i++) {
        if (rk[i] == 0) {
            // The smallest suffix has no predecessor; never read sa[-1].
            height[0] = 0;
            k = 0;
            continue;
        }
        if (k)
            k--;
        int j = sa[rk[i] - 1];  // suffix ranked just before suffix i
        while (i + k < n && j + k < n && s[i + k] == s[j + k])
            k++;
        height[rk[i]] = k;
    }
}
// Luogu P2408: prints the number of distinct substrings of s.
// The suffix array is built over s plus its terminating '\0' (n + 1
// characters), so rank 0 is the empty suffix and ranks 1..n are the real
// ones. The suffix of rank i adds n - sa[i] prefixes, of which height[i] were
// already counted with the suffix of rank i - 1.
int main() {
    scanf("%d%s", &n, s);
    n++;  // include the terminating '\0' as a sentinel
    calc_sa();
    getheight(n);
    n--;
    ll ans = 0;
    for (int i = 1; i <= n; i++)
        ans += n - sa[i] - height[i];
    printf("%lld", ans);
    return 0;
}


HDU-5769 Substring

?? is practicing his program skill, and now he is given a string, he has to calculate the total number of its distinct substrings.
But ?? thinks that is too easy, he wants to make this problem more interesting.
?? likes a character X very much, so he wants to know the number of distinct substrings which contains at least one X.
However, ?? is unable to solve it, please help him.
The first line of the input gives the number of test cases T;T test cases follow.
Each test case is consist of 2 lines:
First line is a character X, and second line is a string S.
X is a lowercase letter, and S contains lowercase letters(‘a’-‘z’) only.
The sum of |S| in all the test cases is no more than 700,000.
For each test case, output one line containing “Case #x: y”(without quotes), where x is the test case number(starting from 1) and y is the answer you get for that case.
Sample Input
Sample Output
Case #1: 3
Case #2: 3
In first case, all distinct substrings containing at least one a: a, ab, abc.
In second case, all distinct substrings containing at least one b: b, bb, bbb.


#include <algorithm>
#include <cstdio>
#include <cstring>
using namespace std;

typedef long long ll;
const int maxn = 200005;
char s[maxn];
int sa[maxn], rk[maxn], height[maxn];
int cnt[maxn], t1[maxn], t2[maxn];  // cnt: radix buckets; t1/t2: rank buffers
int n, k;

// Builds the suffix array of s[0..n) into sa[0..n) by prefix doubling with
// radix (counting) sort. A second block that starts at or past n compares as
// -1, so no slot beyond n is read: the result does not depend on what an
// earlier, longer test case left behind in t1/t2.
void calc_sa() {
    int m = 256;  // number of distinct keys; initially every byte value
    int i, *x = t1, *y = t2;
    for (i = 0; i < m; i++) cnt[i] = 0;
    for (i = 0; i < n; i++) cnt[x[i] = static_cast<unsigned char>(s[i])]++;
    for (i = 1; i < m; i++) cnt[i] += cnt[i - 1];
    for (i = n - 1; i >= 0; i--) sa[--cnt[x[i]]] = i;

    for (k = 1; k <= n; k *= 2) {
        int p = 0;
        // Order by second block: suffixes with an empty second block first.
        for (i = n - k; i < n; i++) y[p++] = i;
        for (i = 0; i < n; i++)
            if (sa[i] >= k) y[p++] = sa[i] - k;
        // Stable counting sort on the first block.
        for (i = 0; i < m; i++) cnt[i] = 0;
        for (i = 0; i < n; i++) cnt[x[y[i]]]++;
        for (i = 1; i < m; i++) cnt[i] += cnt[i - 1];
        for (i = n - 1; i >= 0; i--) sa[--cnt[x[y[i]]]] = y[i];

        // Re-rank: neighbours share a rank only if both blocks match.
        swap(x, y);
        p = 1;
        x[sa[0]] = 0;
        for (i = 1; i < n; i++) {
            int a = sa[i - 1], b = sa[i];
            int a2 = a + k < n ? y[a + k] : -1;
            int b2 = b + k < n ? y[b + k] : -1;
            x[b] = (y[a] == y[b] && a2 == b2) ? p - 1 : p++;
        }
        if (p >= n) break;  // all ranks distinct: the order is final
        m = p;
    }
}

// Fills rk[] and height[] from sa[] for the string s of length n.
// height[r] is the longest common prefix of the suffixes of rank r and r - 1;
// height[0] is 0.
void getheight(int n) {
    int k = 0;
    for (int i = 0; i < n; i++)
        rk[sa[i]] = i;
    for (int i = 0; i < n; i++) {
        if (rk[i] == 0) {
            // The smallest suffix has no predecessor; never read sa[-1].
            height[0] = 0;
            k = 0;
            continue;
        }
        if (k)
            k--;
        int j = sa[rk[i] - 1];  // suffix ranked just before suffix i
        while (i + k < n && j + k < n && s[i + k] == s[j + k])
            k++;
        height[rk[i]] = k;
    }
}
// HDU-5769: for each test case prints the number of distinct substrings of s
// that contain the character X at least once.
// The suffix array is built over s plus its terminating '\0', so ranks 1..n
// are the real suffixes. For the suffix starting at p = sa[i], a prefix is new
// (not shared with rank i - 1) when it ends at or after p + height[i], and it
// contains X when it ends at or after nex[p]; both bounds must hold.
// NOTE(review): assumes every S fits in maxn - 1 characters; confirm against
// the per-case limit of the original problem statement.
int main() {
    // nex[i]: first position >= i holding X, or n when there is none. Static
    // so the large array lives outside the stack.
    static int nex[maxn];
    int t;
    scanf("%d", &t);
    for (int cs = 1; cs <= t; cs++) {
        char x[3];
        scanf("%2s%s", x, s);  // x[0] is the required character X
        n = strlen(s);
        n++;  // include the terminating '\0' as a sentinel
        calc_sa();
        getheight(n);
        n--;
        int pos = n;
        for (int i = n - 1; i >= 0; i--) {
            if (s[i] == x[0])
                pos = i;
            nex[i] = pos;
        }
        ll ans = 0;
        for (int i = 1; i <= n; i++) {
            ans += n - max(nex[sa[i]], sa[i] + height[i]);
        }
        printf("Case #%d: %lld\n", cs, ans);
    }
    return 0;
}

