c - 快速字符串搜索以查找与给定模式匹配的字符串数组元素

Question

我有一个常量字符串数组，我遍历它以查找元素的索引，该索引是包含搜索模式的字符串。我应该选择哪种搜索算法来提高查找此元素的速度？如果有必要，我在运行用于准备查找表的应用程序之前的时间不受限制。

我更正了一个问题 - 我没有进行精确的字符串匹配 - 我正在搜索元素内的模式，该元素位于数组中

array of strings:
[0] Red fox jumps over the fence
[1] Blue table
[2] Red flowers on the fence

我需要找到一个包含单词'table'的元素 - 在这种情况下它的元素 1

我确实喜欢一组 30 个数组的 50000 次迭代，它可以包含多达 30000 个不少于 128 个字符的字符串。现在我正在使用旧的strstr蛮力，这太慢了......

好的，发布我的函数的一部分，第一个strstr- 如果有任何出现，则在未切割的行数组中查找，然后进行粗略搜索。我知道我可以加快这部分的速度，但我没有对这种方法进行优化......

// sheets[i].buffer - contains a page of a text which is split into lines
// fullfunccall - is the pattern
// sheets[i].cells[k] - are the pointers to lines in a buffer

for( i=0; i<sheetcount; i++) {
  if( i!= currsheet && sheets[i].name && sheets[i].name[0] != '\0') {
    if( strstr(sheets[i].buffer, fullfunccall )) {
      usedexternally = 1;

      int foundatleastone = 0;
      for( k=0; k<sheets[i].numcells; k++ ) {
        strncpy_s(testline, MAX_LINE_SIZE, sheets[i].cells[k]->line, sheets[i].cells[k]->linesize);
        testline[sheets[i].cells[k]->linesize] = '\0';

        if( strstr(testline, fullfunccall )) {
          dependency_num++;

          if( dependency_num >= MAX_CELL_DEPENDENCIES-1) {
            printf("allocation for sheet cell dependencies is insuficcient\n");
            return;
          }

          sheets[currsheet].cells[currcellposinsheet]->numdeps = dependency_num+1;
          foundatleastone++;
          sheets[currsheet].cells[currcellposinsheet]->deps[dependency_num] = &sheets[i].cells[k];
        }
      }

      if( foundatleastone == 0 ) {
        printf("error locating dependency for external func: %s\n", fullfunccall );
        return;
      }
    }
  };
}

score 2 · Accepted Answer

对于您正在处理的每个工作表，您可以构建一个后缀数组，如本文所述。在开始搜索之前，请阅读工作表，找到行首（作为工作表缓冲区的整数索引），创建后缀数组并按照文章中的说明对其进行排序。

现在，如果您正在查找出现模式（例如“table”）的行，您可以搜索“table”之后的下一个条目和“tablf”之后的下一个条目，这是第一个不匹配的，其中你已经移动了最右边的字母，里程表样式。

如果两个索引相同，则不存在匹配项。如果它们不同，您将获得工作表中的指针列表：

"tab. And now ..."
----------------------------------------------------------------
"table and ..."                0x0100ab30
"table water for ..."          0x0100132b
"tablet computer ..."          0x01000208
----------------------------------------------------------------
"tabloid reporter ..."

这将为您提供一个指针列表，通过减去工作表缓冲区的基指针，您可以获得整数偏移量。与行开头的比较将为您提供与这些指针相对应的行号。（行号已排序，因此您可以在此处进行二进制搜索。）

内存开销是与工作表缓冲区大小相同的指针数组，因此对于 30,000 个 200 个字符的字符串，在 64 位机器上大约为 48MB。（行索引的开销可以忽略不计。）

对数组进行排序需要很长时间，但每张纸只进行一次。

编辑：这个想法似乎运作良好。我已经实现了它，并且可以在不到一秒的时间内扫描近 600,000 个文本文件中约 130,000 个单词的字典：

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define die(...) exit((fprintf(stderr, "Fatal: " __VA_ARGS__), \
    putc(10, stderr), 1))



typedef struct Sheet Sheet;    

struct Sheet {
    size_t size;    /* Number of chars */
    char *buf;      /* Null-terminated char buffer */
    char **ptr;     /* Pointers into char buffer */
    size_t nline;   /* number of lines */
    int *line;      /* array of offset of line beginnings */
    size_t naux;    /* size of scratch array */
    char **aux;     /* scratch array */
};


/*
 *      Count occurrence of c in zero-terminated string p.
 */
size_t strcount(const char *p, int c)
{
    size_t n = 0;

    for (;;) {
        p = strchr(p, c);
        if (p == NULL) return n;
        p++;
        n++;        
    }

    return 0;
}

/*
 *      String comparison via pointers to strings.
 */
int pstrcmp(const void *a, const void *b)
{
    const char *const *aa = a;
    const char *const *bb = b;

    return strcmp(*aa, *bb);
}

/*
 *      Pointer comparison.
 */
int ptrcmp(const void *a, const void *b)
{
    const char *const *aa = a;
    const char *const *bb = b;

    if (*aa == *bb) return 0;   
    return (*aa < *bb) ? -1 : 1;
}

/*
 *      Create and prepare a sheet, i.e. a text file to search.
 */
Sheet *sheet_new(const char *fn)
{
    Sheet *sheet;
    FILE *f = fopen(fn, "r");
    size_t n;
    int last;
    char *p;
    char **pp;

    if (f == NULL) die("Couldn't open %s", fn);

    sheet = malloc(sizeof(*sheet));
    if (sheet == NULL) die("Allocation failed");

    fseek(f, 0, SEEK_END);
    sheet->size = ftell(f);
    fseek(f, 0, SEEK_SET);

    sheet->buf = malloc(sheet->size + 1);
    sheet->ptr = malloc(sheet->size * sizeof(*sheet->ptr));

    if (sheet->buf == NULL) die("Allocation failed");
    if (sheet->ptr == NULL) die("Allocation failed");

    fread(sheet->buf, 1, sheet->size, f);
    sheet->buf[sheet->size] = '\0';
    fclose(f);

    sheet->nline = strcount(sheet->buf, '\n');
    sheet->line = malloc(sheet->nline * sizeof(*sheet->line));

    sheet->aux = NULL;
    sheet->naux = 0;

    n = 0;
    last = 0;
    p = sheet->buf;
    pp = sheet->ptr;
    while (*p) {
        *pp++ = p;
        if (*p == '\n') {
            sheet->line[n++] = last;
            last = p - sheet->buf + 1;
        }
        p++;
    }

    qsort(sheet->ptr, sheet->size, sizeof(*sheet->ptr), pstrcmp);

    return sheet;
}

/*
 *      Clean up sheet.
 */
void sheet_delete(Sheet *sheet)
{
    free(sheet->buf);
    free(sheet->ptr);
    free(sheet->line);
    free(sheet->aux);
    free(sheet);
}

/*
 *      Binary range search for string pointers.
 */
static char **pstr_bsearch(const char *key,
    char **arr, size_t high)
{
    size_t low = 0;

    while (low < high) {
        size_t mid = (low + high) / 2;
        int diff = strcmp(key, arr[mid]);

        if (diff < 0) high = mid;
        else low = mid + 1;
    }

    return arr + low;
}

/*
 *      Binary range search for line offsets.
 */
static const int *int_bsearch(int key, const int *arr, size_t high)
{
    size_t low = 0;

    while (low < high) {
        size_t mid = (low + high) / 2;
        int diff = key - arr[mid];

        if (diff < 0) high = mid;
        else low = mid + 1;
    }

    if (low < 1) return NULL;
    return arr + low - 1;
}

/*
 *      Find occurrences of the string key in the sheet. Returns the
 *      number of lines in which the key occurs and assigns up to
 *      max lines to the line array. (If max is 0, line may be NULL.)
 */
int sheet_find(Sheet *sheet, char *key,
    int line[], int max)
{
    char **begin, **end;
    int n = 0;
    size_t i, m;
    size_t last;

    begin = pstr_bsearch(key, sheet->ptr, sheet->size);
    if (begin == NULL) return 0;

    key[strlen(key) - 1]++;
    end = pstr_bsearch(key, sheet->ptr, sheet->size);
    key[strlen(key) - 1]--;
    if (end == NULL) return 0;
    if (end == begin) return 0;

    m = end - begin;
    if (m > sheet->naux) {
        if (sheet->naux == 0) sheet->naux = 0x100;
        while (sheet->naux < m) sheet->naux *= 2;
        sheet->aux = realloc(sheet->aux, sheet->naux * sizeof(*sheet->aux));
        if (sheet->aux == NULL) die("Re-allocation failed");        
    }

    memcpy(sheet->aux, begin, m * sizeof(*begin));
    qsort(sheet->aux, m, sizeof(*begin), ptrcmp);

    last = 0;
    for (i = 0; i < m; i++) {
        int offset = sheet->aux[i] - sheet->buf;
        const int *p;

        p = int_bsearch(offset, sheet->line + last, sheet->nline - last);

        if (p) {
            if (n < max) line[n] = p - sheet->line;
            last = p - sheet->line + 1;
            n++;
        }
    }

    return n;
}

/*
 *      Example client code
 */
int main(int argc, char **argv)
{
    Sheet *sheet;
    FILE *f;

    if (argc != 3) die("Usage: %s patterns corpus", *argv);

    sheet = sheet_new(argv[2]);

    f = fopen(argv[1], "r");
    if (f == NULL) die("Can't open %s.", argv[1]);
    for (;;) {
        char str[80];
        int line[50];
        int i, n;

        if (fgets(str, sizeof(str), f) == NULL) break;
        strtok(str, "\n");
        n = sheet_find(sheet, str, line, 50);
        printf("%8d %s\n", n, str);

        if (n > 50) n = 50;
        for (i = 0; i < n; i++) printf("    [%d] %d\n", i, line[i] + 1);
    }
    fclose(f);

    sheet_delete(sheet);

    return 0;
}

该实现有其粗糙的边缘，但它有效。我不是特别喜欢临时数组和对找到的指针范围的额外排序，但事实证明，即使对大后缀数组进行排序也不会花费太长时间。

如果您愿意，可以将此解决方案扩展到更多工作表。

score 2 · Accepted Answer

您写道，您的“干草堆”（要搜索的字符串集）大约是 30000 个字符串，其中大约有 30000 个字符串。每个 200 个字符。您还写道，“needle”（要搜索的术语）是 5 或 20 个字符的字符串。

基于此，您可以预先计算一个哈希表，它将任何 5 个字符的子序列映射到它所在的大海捞针中的字符串。对于 30000 个字符串（每个 200 个字符），最多有 30000 * (200 - 5) = 5.850 .000 个不同的 5 字符子字符串。如果将其中的每一个散列为 16 位校验和，则需要至少 11MB 的内存（用于散列键）加上一些指向子字符串出现的字符串的指针。

例如，给定一个简化的干草堆

static const char *haystack[] = { "12345", "123456", "23456", "345678" };

您预先计算一个哈希映射，该映射映射任何可能的 5 个字符的字符串，使得

12345 => haystack[0], haystack[1]
23456 => haystack[1], haystack[2]
34567 => haystack[3]
45678 => haystack[4]

有了这个，您可以获取给定键的前五个字符（5 或 20 个字符长），对其进行散列，然后strstr对键通过散列映射到的所有字符串进行正常处理。

score 0 · Accepted Answer

我相信最实用的是 DFA，因为它最多读取输入的每个字符一次 - 更准确地说，它读取每个输入字符一次，并在模式不完全匹配时立即停止（如果设置正确）。使用 DFA，您还可以同时检查多个模式。

在实践中经过充分测试的 DFA 算法的两个好的（但不同的）实现是

除非您提供更多信息，否则无法说出适合您的任务的内容。

编辑：DFA 保留“确定性有限自动机”

编辑：正如您所指出的，您的模式是精确的子字符串，最常见的解决方案是KMP算法（Knuth-Morris-Pratt）

c - 快速字符串搜索以查找与给定模式匹配的字符串数组元素

3 回答 3

Related

Reference