最近需要改进一个字符串查找的算法。
我用了类似于 KMP 的 Sunday 算法。相比于逐个字符比较的朴素算法,效率提高了大约 25 倍。
代码
#include <stdio.h>
#include <time.h>
/* Table-size constant (a count of entries). Note that in C a `const int`
   object is not an integer constant expression, so an array declared with
   this as its bound is a variable-length array. */
const int maxNum = 1005;
/*
 * Build a heap-allocated random alphanumeric C string.
 *
 * length: total buffer size in bytes INCLUDING the terminating '\0', so the
 *         returned string holds length - 1 characters.
 *
 * Returns the new string (the caller must free() it), or NULL when
 * length < 1 or the allocation fails.
 */
char* genRandomString(int length)
{
    /* Seed the generator only once: reseeding with time(NULL) on every call
       makes two calls within the same second replay the same random stream. */
    static int seeded = 0;
    char* string;
    int flag, i;

    /* A buffer needs at least one byte for the terminator; anything smaller
       would make the final store write before the allocation. */
    if (length < 1)
    {
        return NULL;
    }
    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = 1;
    }
    if ((string = (char*) malloc((size_t) length)) == NULL)
    {
        printf("Malloc failed!flag:14\n");
        return NULL;
    }
    for (i = 0; i < length - 1; i++)
    {
        /* First draw picks the character class, second draw picks the
           character.  The length offset is reduced modulo the class size
           before adding so the sum cannot overflow a signed int. */
        flag = rand() % 3;
        switch (flag)
        {
        case 0:
            string[i] = (char) ('A' + (rand() % 26 + length % 26) % 26);
            break;
        case 1:
            string[i] = (char) ('a' + rand() % 26);
            break;
        case 2:
            string[i] = (char) ('0' + (rand() % 10 + length % 10) % 10);
            break;
        default:
            string[i] = 'x';
            break;
        }
    }
    string[length - 1] = '\0';
    return string;
}
/*
 * Copy a randomly positioned substring out of src.
 *
 * src:    NUL-terminated source string.
 * length: size of the result buffer INCLUDING the terminating '\0', so the
 *         substring holds length - 1 characters.
 *
 * Returns a newly allocated string (the caller must free() it), or NULL when
 * src is NULL, length < 1, src is shorter than length - 1 characters, or the
 * allocation fails.
 */
char* getString(char* src, int length)
{
    /* Seed only once; reseeding per call repeats the same stream for every
       call made within one second. */
    static int seeded = 0;
    char* string;
    int srcLen;
    int copyLen;
    int startNum;

    if (src == NULL || length < 1)
    {
        return NULL;
    }
    srcLen = (int) strlen(src);
    copyLen = length - 1;
    /* Not enough source characters to fill the requested substring. */
    if (copyLen > srcLen)
    {
        return NULL;
    }
    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = 1;
    }
    if ((string = (char*) malloc((size_t) length)) == NULL)
    {
        printf("Malloc failed!flag:14\n");
        return NULL;
    }
    /* Valid start offsets are 0 .. srcLen - copyLen inclusive; the modulus
       is therefore always >= 1. */
    startNum = rand() % (srcLen - copyLen + 1);
    /* Copy from the ADDRESS src + startNum (not the character stored there). */
    memcpy(string, src + startNum, (size_t) copyLen);
    string[copyLen] = '\0';
    return string;
}
/*
 * Sunday (Quick Search) substring search.
 *
 * sr, srclen:    text to search and its length in bytes.
 * ta, targetlen: pattern to look for and its length in bytes.
 *
 * Neither buffer needs to be NUL-terminated; only the first srclen /
 * targetlen bytes are read.
 *
 * Returns the offset of the first occurrence of the pattern in the text,
 * 0 for an empty pattern, or -1 when there is no match or an argument is
 * invalid (NULL pointer, negative length).
 */
int Sunday(char * sr, int srclen, char * ta, int targetlen)
{
    /* One slot per possible byte value (assumes 8-bit bytes). */
    enum { SHIFT_TABLE_SIZE = 256 };
    int shift[SHIFT_TABLE_SIZE];
    /* Bytes are viewed as unsigned so values >= 0x80 index the table
       correctly instead of producing a negative subscript. */
    const unsigned char* src = (const unsigned char*) sr;
    const unsigned char* target = (const unsigned char*) ta;
    int i;
    int s;
    int j;

    if (sr == NULL || ta == NULL || srclen < 0 || targetlen < 0) {
        return -1;
    }
    if (targetlen == 0) {
        return 0;
    }
    if (targetlen > srclen) {
        return -1;
    }

    /* Default: a byte that does not occur in the pattern lets the window
       jump past it entirely, i.e. by targetlen + 1. */
    for (i = 0; i < SHIFT_TABLE_SIZE; i++) {
        shift[i] = targetlen + 1;
    }
    /* For bytes in the pattern, the shift aligns their LAST occurrence with
       the text byte just after the current window. */
    for (i = 0; i < targetlen; i++) {
        shift[target[i]] = targetlen - i;
    }

    /* s: offset in the text where the pattern window currently starts. */
    s = 0;
    while (s <= srclen - targetlen) {
        /* j: number of pattern bytes matched so far in this window. */
        j = 0;
        while (j < targetlen && src[s + j] == target[j]) {
            j++;
        }
        if (j == targetlen) {
            return s;
        }
        /* The byte right after the window decides the shift; if the window
           already ends at the end of the text there is no such byte. */
        if (s + targetlen >= srclen) {
            break;
        }
        s += shift[src[s + targetlen]];
    }
    return -1;
}
/*
 * Convert the leading run of decimal digits in s to an int.
 *
 * Parsing stops at the first non-digit character; no sign or leading
 * whitespace is accepted.  Returns 0 when s is NULL or does not start with
 * a digit.  Overflow of very long digit runs is not checked.
 */
int atoi(char *s)
{
    int n = 0;

    if (!s)
    {
        return 0;
    }
    /* Advance the pointer itself on every digit; bumping a separate counter
       would re-read the same character forever. */
    for (; *s >= '0' && *s <= '9'; ++s)
    {
        n = 10 * n + (*s - '0');
    }
    return n;
}
/*
 * Naive substring search: try every start offset in a and compare the
 * pattern b byte by byte.
 *
 * a, alen: text and its length in bytes.
 * b, blen: pattern and its length in bytes.
 *
 * Returns the offset of the first match in a, or -1 if there is none.
 */
int myMemmem(char * a, int alen, char * b, int blen)
{
    int start;

    for (start = 0; start <= (alen - blen); start++)
    {
        /* Count how many leading pattern bytes agree at this offset. */
        int matched = 0;
        while (matched < blen && a[start + matched] == b[matched])
        {
            matched++;
        }
        /* Every pattern byte agreed: this offset is the answer. */
        if (matched >= blen)
        {
            return start;
        }
    }
    return -1;
}
/**
IN
at the thought of
though
OUT
7
**/
/*
 * Interactive benchmark: repeatedly reads two buffer lengths from stdin,
 * generates a random text T and a random pattern P of those sizes, and
 * times Sunday() against myMemmem() on the same input.
 *
 * The loop ends (returning 0) on end-of-input or unparsable input, and
 * returns 1 if a string could not be generated.
 */
int main() {
    /* Text and pattern. */
    char* T;
    char* P;
    int a = 0;
    int b = 0;
    int res;
    clock_t start;
    clock_t finish;
    double duration;

    while (1) {
        /* Read both lengths; stop on EOF or non-numeric input, otherwise the
           loop would spin forever re-using the previous values. */
        printf("put T length\n");
        if (scanf("%d", &a) != 1) {
            break;
        }
        printf("put P length\n");
        if (scanf("%d", &b) != 1) {
            break;
        }
        /* Each length counts the terminating '\0', so it must be >= 1. */
        if (a < 1 || b < 1) {
            printf("lengths must be at least 1\n");
            continue;
        }

        T = genRandomString(a);
        P = genRandomString(b);
        if (T == NULL || P == NULL) {
            free(T);
            free(P);
            return 1;
        }

        /* The search lengths exclude the terminating '\0'. */
        start = clock();
        res = Sunday(T, a-1, P, b-1);
        finish = clock();
        duration = (double)(finish - start) / CLOCKS_PER_SEC;
        printf( "--Sunday time is %f seconds\n", duration );
        if(res == -1) {
            printf("Sunday主串和模式串不匹配\n");
        } else {
            printf("Sunday模式串在主串的位置为:%d\n", res);
        }

        start = clock();
        res = myMemmem(T, a-1, P, b-1);
        finish = clock();
        duration = (double)(finish - start) / CLOCKS_PER_SEC;
        printf( "--myMemmem time is %f seconds\n", duration );
        if(res == -1) {
            printf("myMemmem主串和模式串不匹配\n");
        } else {
            printf("myMemmem模式串在主串的位置为:%d\n", res);
        }

        free(T);
        free(P);
    }
    return 0;
}
后续
新版本 glibc 中的 memmem 函数采用了效率更高的算法,比 KMP 更快。
代码如下
/* Copyright (C) 1991-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* This particular implementation was written by Eric Blake, 2008. */
#ifndef _LIBC
# include <config.h>
#endif
/* Specification of memmem. */
#include <string.h>
#ifndef _LIBC
# define __builtin_expect(expr, val) (expr)
#endif
#define RETURN_TYPE void *
#define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
#include "str-two-way.h"
#undef memmem
/* Find the first occurrence of the NEEDLE_LEN bytes at NEEDLE_START inside
   the HAYSTACK_LEN bytes at HAYSTACK_START.  Yields a pointer to the match,
   HAYSTACK_START itself when NEEDLE_LEN is 0, or NULL when the needle does
   not occur.  */
void *
memmem (const void *haystack_start, size_t haystack_len,
        const void *needle_start, size_t needle_len)
{
  /* Memory is inspected as 'unsigned char' objects rather than 'char'
     (ISO C 99 section 6.2.6.1).  */
  const unsigned char *const hay_begin
    = (const unsigned char *) haystack_start;
  const unsigned char *const pattern = (const unsigned char *) needle_start;
  const unsigned char *hay = hay_begin;

  /* An empty needle matches at the very start of the haystack.  */
  if (needle_len == 0)
    return (void *) hay;

  /* A needle longer than the haystack cannot match; bail out before any
     scanning is attempted.  */
  if (__builtin_expect (haystack_len < needle_len, 0))
    return NULL;

  /* Long needles go straight to the long-needle search and skip the memchr
     pre-scan below.  */
  if (needle_len >= LONG_NEEDLE_THRESHOLD)
    return two_way_long_needle (hay, haystack_len, pattern, needle_len);

  /* Short needle: let memchr jump to the first byte equal to the needle's
     first byte, shrinking the region that still has to be searched.  */
  hay = memchr (hay, *pattern, haystack_len);
  if (hay == NULL || __builtin_expect (needle_len == 1, 0))
    /* Either no candidate exists, or a one-byte needle is already found.  */
    return (void *) hay;

  /* Discount the bytes memchr skipped, then re-check that the needle can
     still fit in what remains.  */
  haystack_len -= hay - hay_begin;
  if (haystack_len < needle_len)
    return NULL;

  return two_way_short_needle (hay, haystack_len, pattern, needle_len);
}
libc_hidden_def (memmem)
#undef LONG_NEEDLE_THRESHOLD
这个 memmem 函数使用的是 Two-Way 算法:模式串较短时先用 memchr 缩小查找范围,再调用 two_way_short_needle;模式串较长时调用 two_way_long_needle。已经是比较快的算法了。