/*
* Copyright (c) 2006, 403
* All rights reserved.
*
* 文件名称: ac_bm.c
* 摘 要: 对较长的文本进行关键字的快速搜索匹配
*/
// 头文件
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "ac_bm.h"
// 宏定义
//#define CASE_SENSITIVE
//#define DEBUG_SEARCH
unsigned char mychar[100];
/*--------------------------------------------------------------------------
* 函数名:
* ACTree_bulid
*
* 函数功能:
* 创建ACtree模式树,包括各个节点,字符索引,及其他的信息,ptree指向根节点
*
* 参数说明:
* pattern_tree *ptree : 指向ACTree的指针
* pattern_data *patterns : 关键字字串数组
* int npattern : 关键字字串个数
*
* 返回值类型: int
* 0 : 创建成功
* -1 : 出错
----------------------------------------------------------------------------*/
int ACtree_build (pattern_tree *ptree,
pattern_data *patterns,
int npattern)
{
int i ;
pattern_tree_node *root = NULL, *parent = NULL ;
unsigned char ch ;
int max_pattern_len = 0, min_pattern_len = PATTERN_LEN ;
if (NULL == ptree || NULL == patterns || npattern < 0)
{
goto err ;
}
root = (pattern_tree_node *) malloc (sizeof (pattern_tree_node)) ;
if (NULL == root)
{
goto err ;
}
memset (root, 0, sizeof (pattern_tree_node)) ;
root->label = -2 ; // 树根标志
root->depth = 0 ; // 树根深度
// 对输入的字串循环处理添加进ACTree
for (i = 0 ; i < npattern ; i++)
{
int pat_len ;
int ch_i ;
pat_len = (patterns+i)->len ;
if (pat_len == 0)
{
continue ;
}
else
{
if (pat_len > PATTERN_LEN)
{
pat_len = PATTERN_LEN ;
}
if (pat_len > max_pattern_len)
{
max_pattern_len = pat_len ;
}
if (pat_len < min_pattern_len)
{
min_pattern_len = pat_len ;
}
parent = root ;
for (ch_i = 0 ; ch_i < pat_len ; ch_i++)
{
ch = ((patterns+i)->data)[ch_i] ;
#ifndef CASE_SENSITIVE
ch = tolower(ch) ;
#endif
// 对应的字符索引为NULL
if (NULL == parent->childs[ch])
{
break ;
}
parent = parent->childs[ch] ;
}
if (ch_i < pat_len)
{
// 在父节点下添加新节点
for (; ch_i < pat_len ; ch_i++)
{
pattern_tree_node *node = NULL ;
ch = ((patterns+i)->data)[ch_i] ;
#ifndef CASE_SENSITIVE
ch = tolower(ch) ;
#endif
node = (pattern_tree_node *) malloc (sizeof (pattern_tree_node)) ;
if (node == NULL)
{
goto err ;
}
memset (node, 0, sizeof(pattern_tree_node)) ;
node->depth = ch_i + 1 ;
node->ch = ch ;
node->label = -1 ;
// 将新节点添加到父节点的对应字符索引指针
parent->childs[ch] = node ;
// 不考虑大小写的分别
#ifndef CASE_SENSITIVE
if ((ch >= 'a') && (ch <= 'z'))
{
parent->childs[ch-32] = node ;
}
#endif
parent->nchild++ ;
parent->one_child = ch ;
node->parent = parent ;
parent = node ;
}
}
}
// lable 记录字串来自于第几个输入字串
parent->label = i ;
}
ptree->pattern_list = patterns ;
ptree->pattern_count = npattern ;
ptree->root = root ;
ptree->max_depth = max_pattern_len ;
ptree->min_pattern_size = min_pattern_len ;
return 0 ;
err:
// 出错处理,释放申请的空间
if (ptree->root != NULL)
{
_clean_tree (ptree->root) ;
free (ptree->root) ;
ptree->root = NULL ;
}
return -1 ;
}
/*--------------------------------------------------------------------------
* 函数名:
* _print_tree
*
* 函数功能:
* 打印当前节点及其所有后缀节点
*
* 参数说明:
* pattern_tree_node *root : 当前的节点的指针
*
* 返回值类型: void
* 无
----------------------------------------------------------------------------*/
void _print_tree (pattern_tree_node *root)
{
int i ;
if (NULL == root)
return ;
printf("ch:%2c GSshift:%2d label:%2d depth:%2d childs:", root->ch, root->GSshift, root->label, root->depth) ;
for (i = 0 ; i < 256 ; i++)
{
#ifndef CASE_SENSITIVE
if ((i >= 'A') && (i <= 'Z'))
{
continue ;
}
#endif
if (NULL != root->childs[i])
{
printf("%c ", (root->childs[i])->ch) ;
}
}
printf("\n") ;
// 递归打印本节点的所有后缀节点信息
for (i = 0 ; i < 256 ; i++)
{
if (NULL != root->childs[i])
{
_print_tree (root->childs[i]) ;
}
}
return ;
}
/*--------------------------------------------------------------------------
* 函数名:
* ACtree_print_tree
*
* 函数功能:
* 打印整个树ACTree的所有节点字符信息
*
* 参数说明:
* pattern_tree *ptree : 指向ACTree模式树的指针
*
* 返回值类型: void
* 无
----------------------------------------------------------------------------*/
//#if 0
void ACtree_print_tree (pattern_tree *ptree)
{
printf ("--- ACTree information---\n") ;
if (NULL == ptree)
{
return ;
}
if (NULL != ptree->root)
{
_print_tree (ptree->root) ;
}
return ;
}
//#endif
/*--------------------------------------------------------------------------
* 函数名:
* ACtree_compute_BCshifts
*
* 函数功能:
* 设置ptree的BCshift
*
* 参数说明:
* pattern_tree *ptree : 指向ACTree模式树的指针
*
* 返回值: int
* 0 : 处理成功
----------------------------------------------------------------------------*/
int ACtree_compute_BCshifts (pattern_tree *ptree)
{
int i, j = 0 ;
for (i = 0 ; i < 256 ; i++)
ptree->BCshift[i] = ptree->min_pattern_size ;
for (i = ptree->min_pattern_size - 1 ; i > 0 ; i--)
{
for (j = 0 ; j < ptree->pattern_count ; j++)
{
unsigned char ch ;
ch = (ptree->pattern_list+j)->data[i] ;
#ifndef CASE_SENSITIVE
ch = tolower(ch) ;
#endif
ptree->BCshift[ch] = i ;
#ifndef CASE_SENSITIVE
if ((ch > 'a') && (ch <'z'))
{
ptree->BCshift[ch-32] = i ;
}
#endif
}
}
return 0 ;
}
/*--------------------------------------------------------------------------
* 函数名:
* set_GSshift
*
* 函数功能:
* 设置ACTree中关键字pat1的GCshift
*
* 参数说明:
* pattern_tree *ptree : 指向ACTree模式树的指针
* unsigned char *pat : 关键字pat1
* int depth : 要改变GSshift字符的深度为depth
* int shift : 拟进行的变化
*
* 返回值: int
* 0 : 处理成功
* -1 : 出错
----------------------------------------------------------------------------*/
int set_GSshift (pattern_tree *ptree, unsigned char *pat, int depth, int shift)
{
int i ;
pattern_tree_node *node ;
if (NULL == ptree || NULL == ptree->root)
{
goto err ;
}
node = ptree->root ;
for (i = 0 ; i < depth ; i++)
{
node = node->childs[pat[i]] ;
if (NULL == node)
{
goto err ;
}
}
// 取小位移
node->GSshift = node->GSshift < shift ? node->GSshift : shift ;
return 0 ;
err:
return -1 ;
}
/*--------------------------------------------------------------------------
* 函数名:
* compute_GSshift
*
* 函数功能:
* 调整ACTree中关键字pat1因pat2而发生变化的GSshift
*
* 参数说明:
* pattern_tree *ptree : 指向ACTree的指针
* unsigned char *pat1 : 关键字pat1
* int pat1_len : 关键字pat1的字串长度
* unsigned char *pat2 : 关键字pat2
* int pat2_len : 关键字pat2的字串长度
*
* 返回值: int
* 0 : 处理成功 或 未出错
* -1 : 出错
----------------------------------------------------------------------------*/
int compute_GSshift (pattern_tree *ptree,
unsigned char *pat1,
int pat1_len,
unsigned char *pat2,
int pat2_len)
{
unsigned char first_char ;
int i ;
int pat1_index, pat2_index, offset ;
if (NULL == pat1 || NULL == pat2 || pat1_len < 0 || pat2_len < 0)
{
goto err ;
}
if (pat1_len == 0 || pat2_len == 0)
{
return 0 ;
}
first_char = pat1[0] ;
#ifndef CASE_SENSITIVE
first_char = tolower(first_char) ;
#endif
- 1
- 2
前往页