Suffix Array Part 3 — Longest Common Substring (LCS)

Suffix array can be used to find longest common substring (LCS) among multiple strings. This post illustrate the algorithm by an example of finding LCS among two strings.

Algorithm

The idea is that LCS for two strings is the longest common prefix of some suffices of the two string. Suppose we have two strings abc and bcd, then the suffix of the two strings will be {abc, bc, c} and {bcd, cd, d} respectively. If we sort them according to alphabetical order, it will be rearranged as below,

Figure 1. Sorted Suffices of Both abc and bcd

It is not difficult to tell that the LCS is bc, which is longest common prefix of the two suffices from different strings.

This observation leads to the algorithm below,

1. combine two input strings into one single string
2. compute the suffix array
3. find the longest common prefix of two neighboring suffix that start with suffix of different input strings

Using the same example abc and bcd, we combine the strings to get a single string abcbcd, then the algorithm can be carried out as figure below,

Figure 2. LCS Algorithm using Suffix Array

There are two prefix of the neighboring suffices that start with suffix of different input strings, bc of {bcbcd, bcd} and c of {cbcd, cd}. The longer one is bc. Therefore the LCS of abc and bcd is bc.

Implementation

Below is a sample implementation of the algorithm described above in C,

/**

find longest common substring of two input texts using suffix array

*/

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

 

int pstrcmp(const void *a, const void *b) {

    //printf("pstrcmp: %s %sn", (const char*)*(char **)a, (const char*)*(char **)b);

    return strcmp((const char *)*(char **)a, (const char *)*(char **)b);

}

 

int lcp(char *a, char *b) {

    int len1, len2;

    int i;

    len1 = strlen(a);

    len2 = strlen(b);

    for (i = 0; (i < len1) && (i < len2); ++i) {

        if (a[i] != b[i]) break;

    }

    return i;

}

 

int main(int argc, char **argv) {

    int i;

    char **ap;      //suffix pointers array

    int len1, len2;

    char *cstr;

    int lcslen = 0, lcplen, lcssufpos = -1;

    len1 = strlen(argv[1]);

    len2 = strlen(argv[2]);

    ap = (char**)malloc((len1+len2)*sizeof(char*));

    cstr = (char*)malloc((len1+len2)*sizeof(char));

    cstr = strcat(argv[1], argv[2]);

    for (i = 0; i < len1+len2; ++i) {

        ap[i] = &(cstr[i]);

    }

    for (i = 0; i < len1+len2; ++i) {

        printf("%sn", ap[i]);

    }

    printf("sort the sufficesn");

    qsort(ap, len1+len2, sizeof(char *), pstrcmp);

    for (i = 0; i < len1+len2; ++i) {

        printf("%sn", ap[i]);

    }

    printf("finding the lcsn");

    for (i = 0; i < len1 + len2 - 1; ++i) {

        if ((ap[i] - cstr >= len1) && (ap[i+1] - cstr >= len1)) {

            //both starts with suffix of second string

            continue;

        } else if ((ap[i] - cstr < len1) && (ap[i+1] - cstr < len1)) {

            //both starts with suffix of first string

            continue;

        } else {

            lcplen = lcp(ap[i], ap[i+1]);

            if (lcplen > lcslen) {

                lcslen = lcplen;

                lcssufpos = i;

            }

        }

    }

    if (lcssufpos == -1) {

        printf("no lcsn");

    } else {

        printf("%.*sn", lcslen, ap[lcssufpos]);

    }

}

Compile the code using command below,

gcc -o suflcs suflcs.c

Then a sample execution,

Figure 3. Execution of LCS Program

Note that the algorithm can be extended to more than two input strings, say K strings. Then the problem becomes finding the longest common prefix of neighboring K suffices which start at suffix of K input strings.

References:
1. http://algs4.cs.princeton.edu/63suffix/LCS.java.html

Suffix Array Part 2–String Matching/Searching

This is part 2 of the post “suffix array”, which covers the string matching/searching problem that can be solved by suffix array. You may want to read part 1 first.

String Matching/Searching

Once we have the suffix array, it’s easy to check if another string appears in the input string/text or not. If the string appears in the input text, if will be the prefix of one or more its suffix. Since the suffix array are sorted suffix of the input string, binary search can be applied.

Below is an implementation that finds all occurrence of a string in the input text,

/**

this program demonstrate how to use suffix array to search for a particular substring 

*/

 

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

 

 

int pstrcmp(const void *a, const void *b) {

    //printf("pstrcmp: %s %sn", (const char*)*(char **)a, (const char*)*(char **)b);

    return strcmp((const char *)*(char **)a, (const char *)*(char **)b);

}

 

int mybsearch(const char *target, char **ap, int len) {

    int l, r, m;

    int tsize = strlen(target);

    for (l = 0, r = len - 1; l < len && r >= 0 && l <= r;){

        m = (l+r)/2;

        if (strncmp(target, ap[m], tsize) < 0) {

            r = m - 1;

        } else if (strncmp(target, ap[m], tsize) > 0) {

            l = m + 1;

        } else {

            return m;

        }

    }

    return -1;

}

 

int main(int argc, char **argv) {

    int i;

    char **ap;      //suffix pointers array

    int len;

    int bpos;

    len = strlen(argv[1]);

    ap = (char**)malloc(len*sizeof(char*));

    for (i = 0; i < len; ++i) {

        ap[i] = &(argv[1][i]);

    }

    for (i = 0; i < len; ++i) {

        printf("%sn", ap[i]);

    }

    printf("sort the sufficesn");

    qsort(ap, len, sizeof(char *), pstrcmp);

    for (i = 0; i < len; ++i) {

        printf("%sn", ap[i]);

    }

    printf("searching for string:%sn", argv[2]);

    bpos = mybsearch(argv[2], ap, len);

    printf("found at suffix array pos, original string pos: %d, %dn", bpos, (ap[bpos]-argv[1])/sizeof(char));

    if (bpos > 0) {

        printf("found %sn", ap[bpos]);

        //after binary search, we searching in both left directions and right directions for suffix

        //that also has the target string as prefix

        for (i = bpos-1; i >= 0; --i){

            if (strncmp(argv[2], ap[i], strlen(argv[2])) == 0) {

                printf("found at suffix array pos, original string pos: %d, %dn", i, (ap[i]-argv[1])/sizeof(char));

            } else {

                break;

            }

        }

        for (i = bpos + 1; i < len; ++i) {

            if (strncmp(argv[2], ap[i], strlen(argv[2])) == 0) {

                printf("found at suffix array pos, original string pos: %d, %dn", i, (ap[i]-argv[1])/sizeof(char));

            } else {

                break;

            }

        }

    }

}

The program accepts two input parameters, the input text and the string to search for.
Compile the code using the command below,

gcc -o sufsearch sufsearch.c

Below is a screenshot of the execution,

Figure 1. Searching Strings using Suffix Array

Suppose the length of the string is m, the input text is n. Note that every comparison of the string with suffix of the input text consumes O(m) and O(logn) comparisons is needed to perform the binary search. Therefore, the run time for the algorithm above is O(mlogn), excluding the cost of building the suffix array.

There’re are better algorithms that use the longest common prefix to help the search process. The run time is O(m + logn). Interested users can refer to reference 6 for details.

Note that the input text can be preprocessed to build the suffix array, and the search target string can be given later. This differs from KMP algorithm that the pattern must be given before the computation can begin. This difference makes them suitable for different applications.

References:
1. Suffix Array, Wikipedia. http://en.wikipedia.org/wiki/Suffix_array
2. Brief Introduction to Suffix Array: http://sary.sourceforge.net/docs/suffix-array.html
3. Algorithms, 4th edition website: http://algs4.cs.princeton.edu/63suffix/
4. Programming Perls. 2nd Edition.
5. libdivsufsort, http://code.google.com/p/libdivsufsort/
6. Suffix Arrays: a new method for On-Line String Searches: http://delivery.acm.org/10.1145/330000/320218/p319-manber.pdf?ip=137.132.250.14&acc=ACTIVE%20SERVICE&CFID=67974067&CFTOKEN=52233823&__acm__=1330495809_b1bcb53c53d3d7a276f870cb1e0fdf89

Suffix Array–Part 1

Suffix Array

Suffix array is an array which contains the suffix of a string sorted in lexicographical (alphabetical) order. It’s used in several string matching/searching problems. This post briefly explains suffix array, its applications and provides C implementations to some of them.

To compute the suffix array, we first find all the suffix. In C, this is simply finding the start position of each suffix and use a pointer pointing to it. We then sort the array of suffix. This is implemented as code below,

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

 

 

int pstrcmp(const void *a, const void *b) {

    //printf("pstrcmp: %s %sn", (const char*)*(char **)a, (const char*)*(char **)b);

    return strcmp((const char *)*(char **)a, (const char *)*(char **)b);

}

 

int main(int argc, char **argv) {

    int i;

    char **ap;      //suffix pointers array

    int len;

    len = strlen(argv[1]);

    ap = (char**)malloc(len*sizeof(char*));

    for (i = 0; i < len; ++i) {

        ap[i] = &(argv[1][i]);

    }

    for (i = 0; i < len; ++i) {

        printf("%sn", ap[i]);

    }

    printf("sort the sufficesn");

    qsort(ap, len, sizeof(char *), pstrcmp);

    for (i = 0; i < len; ++i) {

        printf("%sn", ap[i]);

    }

}

The code accepts an input string and compute the suffix array according to the description above. The C standard library qsort is used to sort the suffix.

Compile the code using the command below,

gcc -o suffix suffix.c

Below is a screenshot of the code executed,

Figure 1. Running Suffix Array Computation

As shown above, the program first output all suffix for the input array “sewfds”, then sort it and output the suffix array.

Note that get the suffix consumes O(n), sort them consumes O(nlogn) of suffix comparison. But the suffix comparison (which is string comparison) requires O(n) time. So the overall runtime is O(n) + O(n^2logn) = O(n^2logn). There’re algorithms can compute suffix array in faster ways, but they’re not discussed in this post.

Longest Repeated String

The first problem that suffix array can be used is the longest repeated string/substring: given an input array, find the longest duplicated substring in it. For example, given the input string “Ask not what your country can do for you, but what you can do for your country”, the longest duplicated substring is “ can do for you “, and “ your country “ is a close second place. (This is an example given in reference 4.)

If we can compute the suffix array for the input string, then all the substrings of the input string become the prefix of some suffix. If the substring is duplicated, then it appears in multiple suffix. Since the suffix array is sorted, then finding the duplicated repeated string becomes finding the prefix of the neighboring suffix. The longest duplicated repeated string is simply the longest common prefix of neighboring suffix.

The C implementation is given as below,

/**

find longest repeated substring using suffix array

*/

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

 

 

int pstrcmp(const void *a, const void *b) {

    //printf("pstrcmp: %s %sn", (const char*)*(char **)a, (const char*)*(char **)b);

    return strcmp((const char *)*(char **)a, (const char *)*(char **)b);

}

 

int comlen(const char *a, const char *b) {

    int i = 0;

    while ((a[i]!='') && (b[i] != '') && (a[i] == b[i])) {

        ++i;

    }

    return i;

}

 

int lrs(char **ap, int len, char **lrsp) {

    int i;

    int maxlen = 0;

    int clen;

    for (i = 0; i < len-1; ++i) {

        if ((clen = comlen(ap[i], ap[i+1])) > maxlen) {

            *lrsp = ap[i];

            maxlen = clen;

        }

    }

    return maxlen;

}

 

int main(int argc, char **argv) {

    int i;

    char **ap;      //suffix pointers array

    int len, lrslen;

    char *lrsp;

    len = strlen(argv[1]);

    ap = (char**)malloc(len*sizeof(char*));

    for (i = 0; i < len; ++i) {

        ap[i] = &(argv[1][i]);

    }

    for (i = 0; i < len; ++i) {

        printf("%sn", ap[i]);

    }

    printf("sort the sufficesn");

    qsort(ap, len, sizeof(char *), pstrcmp);

    for (i = 0; i < len; ++i) {

        printf("%sn", ap[i]);

    }

    printf("find the longest repeated subtringn");

    lrslen = lrs(ap, len, &lrsp);

    printf("%.*sn", lrslen, lrsp); 

}

The above code is written according to the description above. Save the code to suflrs.c and compile the code using the command below,

gcc -o suflrs suflrs.c

The code accepts an input string and outputs the longest repeated substring. A simple execution (with some printf disabled) is as below,

Figure 2. Finding Longest Repeated Substring using Suffix Array

Note that building the suffix array consumes O(n^2logn). Finding the longest commong substring takes O(n^2). So the total time is O(n^2logn).

For a fast suffix array construction library, please refer to reference 5.

More applications of suffix array are covered in part 2.

References:
1. Suffix Array, Wikipedia. http://en.wikipedia.org/wiki/Suffix_array
2. Brief Introduction to Suffix Array: http://sary.sourceforge.net/docs/suffix-array.html
3. Algorithms, 4th edition website: http://algs4.cs.princeton.edu/63suffix/
4. Programming Perls. 2nd Edition.
5. libdivsufsort, http://code.google.com/p/libdivsufsort/

Knuth-Morris-Pratt Algorithm (KMP)

I’ve learned Knuth-Morris-Pratt Algorithm (KMP) several times. Several weeks after I learned it, if I am asked to solve the string/pattern matching problem, I only know there’s a KMP algorithm that does it efficiently at the time complexity of O(n+m). How to do it? I need to read it again. So I know I’ll need to write a post for it.

0. The Algorithm

Knuth-Morris-Pratt algorithm is named after the three computer scientists who discovered the algorithm. It is an efficient algorithm to solve pattern matching problems. In this post, we’ll just use string matching to illustrate.

Let’s say you have a string “ababaca” of length M, and you want to find whether the string appears in a text “bacbababaabcbab” of length N.

0.1 The Naive Method

It’s easy to come out a naive method with two nested loops. We simply start search at each position of the text, and see if the next M characters match the pattern. If there’s a mismatch, we simply move to next position.

You can refer to 1.1 for C implementation of the naive method.

0.2 KMP

The KMP algorithm makes use of the information in a matching sub-pattern of a mismatch. This is illustrated as below,

Pattern   a b a b a c a
Text bacb a b a b a a abcbab

When the second mismatch in the example above happens, we have matched the first 5 chars and reached the 10th char of the text. We know the first 5 chars of the pattern is “ababa” and the last 6 chars we scanned in the text is “ababaX”, where X is not “a”. KMP made use of this information to move the pattern.

Pattern   a b a b a c a
Text bacbab a b a a*     abcbab

It can be observed that after the move, the suffix “aba” of the previous matched sub-pattern “ababa” matches the prefix of the same sub-pattern (Note that “a” also satisfies the condition, but “aba” is the longest). And the next matching search can start at “a*” position. 

In fact, KMP preprocess the pattern to get the length of the longest prefix in the sub-pattern that matches a suffix in the same sub-pattern. The number of steps to jump is then decided according to the following

No. of steps to jump = max(1, length of matched sub-pattern in mismatch – preprocessed
value corresponding to the last matched position)

For a detailed discussion of the preprocessing, please refer here.

In our example, the preprocessing will have the following values,

Position 0 1 2 3 4 5 6
Pattern a b a b a c a
Pre-process values 0 0 1 2 3 0 1
No. of Steps to jump max(0,1) max(0,1) max(2-0, 1) max(3-1, 1) max(4-2, 1) max(5-3, 1) max(6-0, 1)

If we use the pre-computed table above to search for the string “ababaca” in text “bcaababacababaca”, it will be like below,
ab                           
acaabababaca                Jump step = 1
a
acaabababaca                Jump step = 1
   ab                                                
acaabababaca                Jump step = 1
     ababac                   
acaabababaca                Jump step = 1
     xxababaca
acaabababaca                Jump step = 2
=> a match is found

1. C Code

1.1 C Implementation for Naive String Matching

#include <stdio.h>

#include <string.h>

 

int main(void) {

    char *pattern = "ababcab";

    char *text = "cabdabadcdefababcabef";

    int i = 0, j = 0;

    int m, n;

 

    m = strlen(pattern);

    n = strlen(text);

    printf("%d:%dn", m, n);

 

    for (; i < n - m + 1; ++i) {

        for (j = 0; j < m; ++j) {

            if (pattern[j] != text[i + j]) {

                break;

            }

        }

        if (j == m) {

            //found a match

            printf("there's a match starting at %d: %sn", i, &(text[i]));

            break;

        }

    }

    if (i == n - m + 1) {

        printf("no matchn");

    }

    return 0;

}

Compile the code using

gcc -o naive naive.c

1.2 C Implementation for KMP

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

 

void precompute(char *pattern, int *pre) {

    int plen = strlen(pattern);

    int i;

    memset(pre, 0, sizeof(int)*plen);

    for (i = 1; i < plen; ++i) {

        pre[i] = pre[i-1];

        while (pattern[i] != pattern[pre[i]]) {

            if (pre[i] == 0) {

                pre[i] = -1;

                break;

            } else {

                pre[i] = pre[pre[i]-1];

            }

        }

        pre[i] = pre[i] + 1;

    }

}

 

int main(int argc, char **argv) {

    char *pattern = argv[1];

    char *text = argv[2];

    int *pre;

    int m = strlen(pattern);

    int n = strlen(text);

    int i, j, overlap;

    pre = (int*) malloc((m+1)*sizeof(int));

    precompute(pattern, pre);

    for (i = 0, j = 0; i < n; ) {

        while (text[i+j] == pattern[j]) {

            ++j;

            if (j == m) {

               printf("found a match at pos: %dn", i);

               break;

            }

        }

        if (j > 0) {

            i += (j - pre[j-1] > 1) ? (j - pre[j-1]):1;

            j = pre[j-1];

        } else {

            ++i;

        }

    }

    return 0;

}

Compile the code using

gcc -o kmp kmp.c

2. Sample Run
You can use the following scripts to execute naive implementation and kmp implementation side by side,

#!/bin/sh

pattern="dsfads"

text="sfdkjfldskjalfkdsadsfadsdsfadsfdaklfdjsla;fjdsafkjfdlsajfoiewjfksafdasfdsa"

./kmp $pattern $text

./naive $pattern $text

 

pattern="fdsfadsa"

text="fdasklfjdlsajfdsfdsfadsdsafdsaffdsfadsafdsakljfdlsakjflkdsajflkdsajflkdsjalkfjdslkajfkd;lsajfklds;jalkfjd;sa"

./kmp $pattern $text

./naive $pattern $text

 

pattern="ababab"

text="ababababababfdasjklabababafdkslajabababafdafabababafdsafababab"

./kmp $pattern $text

./naive $pattern $text

 

pattern="ababaca"

text="abababacaabacaabababaabcacafslkfjdslaabacacababacaacaads"

./kmp $pattern $text

./naive $pattern $text

 

pattern="ababacaab"

text="abababacaabacaabababaabcacafslkfjdslaabacacababacaacaads"

./kmp $pattern $text

./naive $pattern $text

 

pattern="ABCDABD"

text="ABCABCDABABCDABCDABDE"

./kmp $pattern $text

./naive $pattern $text

 

pattern="ABCDABD"

text="ABccDAFDSA"

./kmp $pattern $text

./naive $pattern $text

3. Naïve VS. KMP

It can be proved that Naive implementation has the worst case O(mn), while KMP has the worst case O(m+n). Here we add a counting variable to each implementation and run a test case to demonstrate that it’s indeed the case and our KMP implementation runs in O(m+n) time.

The modified code for naive implementation is as below,

#include <stdio.h>

#include <string.h>

 

int main(int argc, char **argv) {

    char *pattern = argv[1];

    char *text = argv[2];

    int i = 0, j = 0;

    int m, n;

    int cmpcnt = 0;

 

    m = strlen(pattern);

    n = strlen(text);

    //printf("%d:%dn", m, n);

 

    for (; i < n - m + 1; ++i) {

        for (j = 0; j < m; ++j) {

            ++cmpcnt;

//            printf("%d:%d:%dn", i, j, cmpcnt);

            if (pattern[j] != text[i + j]) {

                break;

            }

        }

        if (j == m) {

            //found a match

            printf("naive: there's a match starting at %d: %sn", i, &(text[i]));

            //break;

        }

    }

    if (i == n - m + 1) {

        //printf("no matchn");

    }

    printf ("naive (m = %d;  n = %d): No. of comparisons performed: %dn", m , n, cmpcnt);

    return 0;

}

The modified KMP implementation is as below,

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

 

int cmpcnt = 0;

 

void precompute(char *pattern, int *pre) {

    int plen = strlen(pattern);

    int i;

    memset(pre, 0, sizeof(int)*plen);

    for (i = 1; i < plen; ++i) {

        pre[i] = pre[i-1];

        ++cmpcnt;

        while (pattern[i] != pattern[pre[i]]) {

            if (pre[i] == 0) {

                pre[i] = -1;

                break;

            } else {

                pre[i] = pre[pre[i]-1];

            }

            ++cmpcnt;

        }

        pre[i] = pre[i] + 1;

    }

}

 

int main(int argc, char **argv) {

    char *pattern = argv[1];

    char *text = argv[2];

    int *pre;

    int m = strlen(pattern);

    int n = strlen(text);

    int i, j, overlap;

    pre = (int*) malloc((m+1)*sizeof(int));

    precompute(pattern, pre);

    for (i = 0, j = 0; i < n; ) {

        ++cmpcnt;

        while (text[i+j] == pattern[j]) {

            ++j;

            if (j == m) {

               printf("found a match at pos: %dn", i);

               break;

            }

            ++cmpcnt;            

        }

        if (j > 0) {

            i += (j - pre[j-1] > 1) ? (j - pre[j-1]):1;

            j = pre[j-1];

        } else {

            ++i;

        }

    }

    printf("kmp (m=%d; n=%d): total No. of comparisons performed: %dn", m, n, cmpcnt);

    return 0;

}

The test script is as below,

#!/bin/sh

pattern="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"

text="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"

./kmp $pattern $text

./naive $pattern $text

Now the results,

found a match at pos: 786
kmp (m=34; n=820): total No. of comparisons performed: 1671
naive: there’s a match starting at 786: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab
naive (m = 34;  n = 820): No. of comparisons performed: 26758

This post doesn’t discuss much of the pre-processing of KMP, I’ve written a separate post for it. You can find it here.

References:
1. Introduction to Algorithms, 3rd Edition.
2. Wikipedia, Knuth-Morris-Pratt algorithm: http://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm
3. The Knuth-Morris-Pratt Algorithm in my own words: http://jboxer.com/blog/2009/12/13/the-knuth-morris-pratt-algorithm-in-my-own-words/

KMP (Knuth-Morris-Pratt) Algorithm Pre-computation

A More General Problem

The pre-computation of KMP finds the longest suffix of a sub-pattern P[0..j] (where j is [0, m-1], m is the length of the pattern) that is also the prefix of this sub-pattern.

A more general problem is to find the longest suffix of a pattern X[0..lx-1], which is also the prefix of a pattern Y[0..ly-1]. If we have solutions for this more general problem, then we simply set both X and Y to P[0..j], and compute for all P[0..j], where j is in the range of [0, m-1]. So we start by solving the more general problem.

Tow Key Observations

The key to solve the problem is to notice that if there’s a pattern Z1 which is suffix of pattern X and prefix of pattern Y, but not the longest one, then Z1 is also a suffix of the longest one Zmax. For example, let X be “abefdef  “, Y be “efdefg”, then “ef” is suffix of X and prefix of Y, but not the longest one. The longest one Zmax should be “efdef”. Then “ef” is a suffix of “efdef”.

In other words, if we have found the longest suffix of X and prefix of Y “efdef”, we let “efdef” be X, and repeat the finding process for new X “efdef” and Y “efdefg”, then we find “ef” (note that “efdef” itself doesn’t count). The process and go on and on, and it’s actually recursive.

The description above can be summarized as below,

If we can find the longest suffix of X and prefix of Y, using method sufpre(X, Y), we can find all suffix (longest, 2nd longest, 3nd longest, in this order…) of X and prefix of Y by applying the same method recursively X = sufpre(X, Y), until X is empty.

In pseudocode, the above can be expressed as,

while (x ! =empty) {
   x = sufpre(x, y);
   record x
}

Another important observation is that to find the longest suffix of X[0..lx-1] and prefix of Y[0..ly-1],  we can find the same for X[0..lx-2] and Y[0..ly-1] first, say it’s Y[0..a]. And if the last character of X matches with the character at a+1 of Y. The longest suffix of X[0..lx-1] and prefix of Y[0..ly-1] is simply Y[0..a+1].

For example, let X be “abcbcf”, Y be “bcbcfg. First we find that the longest suffix of X[0..lx-2] “abcbc” and prefix of Y[0..ly-1] “bcbcfg” is “bcbc”. Then the last character of X is “f”, which matches with the a+1 character of Y, which is also “f”. We say that the longest suffix of X[0..lx-1] and prefix of Y[0..ly-1] is Y[0..a+1], which is “bcbcf”.

If the last character of X doesn’t match with the a+1 character of Y, we use the first observation to find a shorter suffix of X[0..lx-2] and Y[0..ly-1], say Y[0..b], where b < a. Then try adding the last character of X again by compare last character of X with b+1 character of Y. The process repeats until we found next longest suffix that can produce a valid match with the adding of last character of X or until there’s no more suffix of X and prefix of Y.

For example, we change the X to “abcbcb”, and let Y still be “bcbcfg”. First we find the longest suffix of X[0..lx-2] “abcbc” and prefix of Y[0..ly-1] “bcbcfg” is “bcbc” (Y[0..a]). Then we add the last character of X, let X be “abcbcb”, and the last character “b” doesn’t match with the a+1 character of Y “f”. Then we find the shorter suffix of X[0..lx-2] and Y[0..ly-1] is “bc” (Y[0..b]). And the last character of X “b” matches with the b+1 character of Y “b”. So the longest suffix of X and prefix of y is “bcb”.

The description above can be summarized as below,

The longest suffix of X[0..lx-1] and prefix Y[0..ly-1], say Zmax, can be found based on the longest suffix of X[0..lx-2] and prefix Y[0..ly-1], say Za=Y[0..a].

  • If X[lx-1] is same as Y[a+1], then Zmax = Y[0..a+1].
  • Otherwise, the next longest suffix of X[0..lx-2] and prefix Y[0..ly-1], say Za’ = Y[0..a’] (a’ < a) is checked. The process repeats until Za’ is empty.

The description above can be expressed as pseudocode below,

Z = sufpre(X[0..lx-2], Y[0..ly-1])                       //Z = Y[0..a]
while (X[lx-1] != Y[a+1]) {
   if (Z == “”) break;                                         //a = -1;
   else Z = sufpre(Z, Y[0..ly-1])
}
return Y[0..a+1];

The method is recursive. However, dynamic programming can be applied here to to remember the sufpre value for all X[0..j], where j is [0..lx-2]. Then the computation of sufpre for X[0..lx-1] doesn’t require recursive calls.

Now we’re ready to implement the pre-computation for KMP.

C Implementation

Follow the pseudo code above and replace the recursive call with dynamic programming, we can have the code below,

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

 

int main(int argc, char **argv) {

    char *pattern = argv[1];

    int plen = strlen(pattern);

    int i;

    int *pre;

    pre = (int*) malloc(sizeof(int)*plen);

    pre[0] = 0;

    for (i = 1; i < plen; ++i) {

        pre[i] = pre[i-1];

        while (pattern[i] != pattern[pre[i]]) {

            if (pre[i] == 0) {

                pre[i] = -1;

                break;

            } else {

                pre[i] = pre[pre[i]-1];                //pre[i] is the length, pre[i]-1 is the right index to use

            }

        }

        pre[i] = pre[i] + 1;

    }

    for (i = 0; i < plen; ++i) {

        printf("%d:%dn", i, pre[i]);

    }

    return 0;

}

Note that this implementation accepts an input string and calculate the length of longest suffix of its substring which is also the prefix of the same substring.

I also found another implementation from reference 5 and modified it to do KMP pre-computation only. The code is as below,

#include<stdio.h>

#include<string.h>

 

void kmp_table(char *ptr_i, int word_length);

int T[50];

 

int main(int argc, char **argv)

{

char* ptr_i = argv[1];

kmp_table(ptr_i,strlen(ptr_i));

}

 

 

void kmp_table(char *ptr_i,int word_length)

{

 int i,j;

  i=1;

  j=0;

  T[0]=0;

  for(;i<word_length; )

  {

        if(*(ptr_i+i) == *(ptr_i+j))

           {

            T[i]=j+1;

            i=i+1;

            j=j+1;

           }

 

        else if(j>0)

            {

             j=T[j-1];

            }

 

        else

            {

             T[i]=0;

             i=i+1;

            }

 }

 for (i = 0; i < word_length; ++i) {

    printf("%d:%dn", i, T[i]);

}

 

}

Save my implementation to prekmp.c and modified reference 5 implementation to kk.c and compile them.

gcc -o prekmp prekmp.c
gcc -o kk kk.c

Verification
If you’re using Linux, I also have this batch file for run the two programs and compare the results of the two. 

#!/bin/sh

rm pre1.txt

rm pre2.txt

input="ababab"

./prekmp $input >> pre1.txt

./kk $input >> pre2.txt

 

input="ababcabc"

./prekmp $input >> pre1.txt

./kk $input >> pre2.txt

 

input="abcdefgh"

./prekmp $input >> pre1.txt

./kk $input >> pre2.txt

 

input="aaaaaaaaa"

./prekmp $input >> pre1.txt

./kk $input >> pre2.txt

 

diff pre1.txt pre2.txt

You can add as many test cases as you want. As long as the diff command at last line doesn’t detect different between two output files, it means the two programs produces same results.

I’ve written a separate post here for KMP.

References:
1. Knuth-Morris-Pratt String Matching: http://www.ics.uci.edu/~eppstein/161/960227.html
2. Introduction to Algorithms, 3rd Edition.
3. Wikipedia, Knuth-Morris-Pratt algorithm: http://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm
4. The Knuth-Morris-Pratt Algorithm in my own words: http://jboxer.com/blog/2009/12/13/the-knuth-morris-pratt-algorithm-in-my-own-words/
5. KMP C++ Implementation: http://www.oneous.com/Tutorial-Content.php?id=24