/*
* WM.cpp
*
* Created on: 2010-2-20
* Author: qingxuan.chenqx
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "wm.h"
#include <time.h>
extern int nline = 1;
extern int nfound = 0;
#define MAXN 0x100001 //模式串的最大长度MAXN - 1
#define MAXM 1001//单词最大长度为MAXM - 1
WM_STRUCT * wmNew()
{
WM_STRUCT *p = (WM_STRUCT *) malloc(sizeof(WM_STRUCT));
if (!p)
return 0;
p->msNumPatterns = 0;
p->msSmallest = 1000;
return p;
}
void wmFree(WM_STRUCT *ps)
{
if (ps->msPatArray)
{
if (ps->msPatArray->psPat)
free(ps->msPatArray->psPat);
free(ps->msPatArray);
}
if (ps->msNumArray)
free(ps->msNumArray);
if (ps->msHash)
free(ps->msHash);
if (ps->msPrefix)
free(ps->msPrefix);
if (ps->msShift)
free(ps->msShift);
free(ps);
}
int wmAddPattern(WM_STRUCT *ps, unsigned char *q, int m)//m字符串长度
{
WM_PATTERN_STRUCT *p;
p = (WM_PATTERN_STRUCT *) malloc(sizeof(WM_PATTERN_STRUCT));
if (!p)
return -1;
p->psPat = (unsigned char*) malloc(m + 1);
memset(p->psPat + m, 0, 1);
memcpy(p->psPat, q, m);
p->psLen = m;
p->psFound = 0;//initialise
ps->msNumPatterns++;
if (p->psLen < (unsigned) ps->msSmallest)
ps->msSmallest = p->psLen;
p->next = ps->plist;
ps->plist = p;
return 0;
}
static unsigned HASH16(unsigned char *T)
{
return (unsigned short) (((*T) << 8) | *(T + 1));
}
void sort(WM_STRUCT *ps)//字符串哈希值从小到大排列
{
int m = ps->msSmallest;
int i, j;
unsigned char *temp;
int flag;
for (i = ps->msNumPatterns - 1, flag = 1; i > 0 && flag; i--)
{
flag = 0;
for (j = 0; j < i; j++)
{
if (HASH16(&(ps->msPatArray[j + 1].psPat[m - 2])) < HASH16(
&(ps->msPatArray[j].psPat[m - 2])))
{
flag = 1;
temp = ps->msPatArray[j + 1].psPat;
ps->msPatArray[j + 1].psPat = ps->msPatArray[j].psPat;
ps->msPatArray[j].psPat = temp;
}
}
}
}
static void wmPrepHashedPatternGroups(WM_STRUCT *ps)//计算有多少个不同哈希值,且从小到大
{
unsigned sindex, hindex, ningroup;
int i;
int m = ps->msSmallest;
ps->msNumHashEntries = HASHTABLESIZE;
ps->msHash = (HASH_TYPE*) malloc(sizeof(HASH_TYPE) * ps->msNumHashEntries);
if (!ps->msHash)
{
printf("No memory in wmPrepHashedPatternGroups()\n");
return;
}
for (i = 0; i < (int) ps->msNumHashEntries; i++)
{
ps->msHash[i] = (HASH_TYPE) -1;
}
for (i = 0; i < ps->msNumPatterns; i++)
{
hindex = HASH16(&ps->msPatArray[i].psPat[m - 2]);
sindex = ps->msHash[hindex] = i;
ningroup = 1;
while ((++i < ps->msNumPatterns) && (hindex == HASH16(
&ps->msPatArray[i].psPat[m - 2])))
ningroup++;
ps->msNumArray[sindex] = ningroup;
i--;
}
}
static void wmPrepShiftTable(WM_STRUCT *ps)//建立shift表
{
int i;
unsigned short m, k, cindex;
unsigned shift;
m = (unsigned short) ps->msSmallest;
ps->msShift = (unsigned char*) malloc(SHIFTTABLESIZE * sizeof(char));
if (!ps->msShift)
return;
for (i = 0; i < SHIFTTABLESIZE; i++)
{
ps->msShift[i] = (unsigned) (m - 2 + 1);
}
for (i = 0; i < ps->msNumPatterns; i++)
{
for (k = 0; k < m - 1; k++)
{
shift = (unsigned short) (m - 2 - k);
cindex = ((ps->msPatArray[i].psPat[k] << 8)
| (ps->msPatArray[i].psPat[k + 1]));//B为2
if (shift < ps->msShift[cindex])
ps->msShift[cindex] = shift;//k=m-2时,shift=0,
}
}
}
static void wmPrepPrefixTable(WM_STRUCT *ps)//建立Prefix表
{
int i;
ps->msPrefix = (HASH_TYPE*) malloc(sizeof(HASH_TYPE) * ps->msNumPatterns);
if (!ps->msPrefix)
{
printf("No memory in wmPrepPrefixTable()\n");
return;
}
for (i = 0; i < ps->msNumPatterns; i++)
{
ps->msPrefix[i] = HASH16(ps->msPatArray[i].psPat);
}
}
void wmGroupMatch(WM_STRUCT *ps,//后缀哈希值相同,比较前缀以及整个字符匹配
int lindex, unsigned char *Tx, unsigned char *T)
{
WM_PATTERN_STRUCT *patrn;
WM_PATTERN_STRUCT *patrnEnd;
int text_prefix;
unsigned char *px, *qx;
patrn = &ps->msPatArray[lindex];
patrnEnd = patrn + ps->msNumArray[lindex];
text_prefix = HASH16(T);
for (; patrn < patrnEnd; patrn++)
{
if (ps->msPrefix[lindex++] != text_prefix)
continue;
else
{
px = patrn->psPat;
qx = T;
while (*(px++) == *(qx++) && *(qx - 1) != '\0')
;
if (*(px - 1) == '\0')
{
//printf("Match pattern \"%s\" at line %d column %d\n",
//patrn->psPat, nline, T - Tx + 1);
patrn->psFound++;
nfound++;
}
}
}
}
int wmPrepPatterns(WM_STRUCT *ps)//由plist得到msPatArray
{
int kk;
WM_PATTERN_STRUCT *plist;
ps->msPatArray = (WM_PATTERN_STRUCT*) malloc(sizeof(WM_PATTERN_STRUCT)
* ps->msNumPatterns);
if (!ps->msPatArray)
return -1;
ps->msNumArray
= (unsigned short*) malloc(sizeof(short) * ps->msNumPatterns);
if (!ps->msNumArray)
return -1;
for (kk = 0, plist = ps->plist; plist != NULL && kk < ps->msNumPatterns; plist
= plist->next)
{
memcpy(&ps->msPatArray[kk++], plist, sizeof(WM_PATTERN_STRUCT));
}
sort(ps);
wmPrepHashedPatternGroups(ps);
wmPrepShiftTable(ps);
wmPrepPrefixTable(ps);
return 0;
}
void wmSearch(WM_STRUCT *ps, unsigned char *Tx, int n)//字符串查找
{
int Tleft, lindex, tshift;
unsigned char *T, *Tend, *window;
Tleft = n;
Tend = Tx + n;
if (n < ps->msSmallest)
return;
for (T = Tx, window = Tx + ps->msSmallest - 1; window < Tend; T++, window++, Tleft--)
{
tshift = ps->msShift[(*(window - 1) << 8) | *window];
while (tshift)
{
window += tshift;
T += tshift;
Tleft -= tshift;
if (window > Tend)
return;
tshift = ps->msShift[(*(window - 1) << 8) | *window];
}
if ((lindex = ps->msHash[(*(window - 1) << 8) | *window])
== (HASH_TYPE) -1)
continue;
lindex = ps->msHash[(*(window - 1) << 8) | *window];
wmGroupMatch(ps, lindex, Tx, T);
}
}
int main()
{
int length, n, nline=0, nkey = 0;
WM_STRUCT *p;
char keyword[MAXM]; //单词
char *str = new char[MAXN]; //模式串
FILE *fd, *key;
clock_t start, finish;
double Total_time;
p = wmNew();
//printf("scanf the number of words-->\n");
//scanf("%d", &n);
//printf("scanf the words-->\n");
//while (n--)
//{
fd = fopen("3.txt","rb+");
if(fd == NULL)
{
fprintf(stderr,"Open file error!\n");
exit(1);
}
//for (i = 1; i < argc; i++)
// if (strcmp ((const char *)argv[i], "-nocase") == 0)
// nocase = 1;
key = fopen("keyword_c.txt","r");
if(key == NULL)
{
fprintf(stderr,"Open file error!\n");
exit(1);
}
int count_key = 0;
while ( fgets(keyword,100,key) ){
keyword[strlen(keyword)-1] = '\0';
length = strlen(keyword);
if(length > 3){
wmAddPattern(p, (unsigned char *) keyword, length);
nkey++;
}
else
{
printf("%d:%s\n", count_key, keyword);
}
count_key++;
}
start = clock();
wmPrepPatterns(p);
finish = clock();
Total_time = (double)(finish-start) / CLOCKS_PER_SEC;
printf( "Construction time: %f seconds\n", Total_time);
start = clock();
//printf("scanf the text string-->\n");
//scanf("%s", str);
//length = strlen(str);
/*Search Pattern*/
while ( fgets(str,MAXN,fd) )
{
length = strlen(str);
wmSearch(p, (unsigned char *) str, length);
nline++;
}
finish = clock();
Total_time = (double)(finish-start) / CLOCKS_PER_SEC;
printf( "Search time: %f seconds\n", Total_time);
//Summary
int kk = 0;
int jj = 0;
WM_PATTERN_STRUCT *plist;
for (kk = 0; kk < p->msNumPatterns; kk++)
{
if(p->msPatArray[kk].psFound != 0){
printf("%s:%d\t", p->msPatArray[kk].psPat, p->msPatArray[kk].psFound);
jj++;
if (jj%4 == 0)
{
printf("\n");
}
}
}
wmFree(p);
printf("nFound = %d, nline = %d, nkey = %d\n",nfound, nline, nkey);
return (0);
}
- 1
- 2
前往页