// RegularExpression.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
typedef struct _st_RE_Element
{
char ch;
bool isEscape;
}st_RE_Element, *pst_RE_Element;
enum MATCH_STATE
{
MATCH_OK,
MATCH_NEXT_POSITION,
MATCH_FAIL,
MATCH_ERROR, //regular expression syntax error, like "a**b"
};
// c 匹配任意的字母(大小写均可)
// . 任意字符
// * 重复前一字符0次或多次,但尽量多次
// ^ 匹配输入串的开头
// $ 匹配输入串的结尾
// \ 转义字符(\c表示一个字母'c',\\表示一个字母'\')
char g_RegMetaSet[] = {'c', '.', '*', '^', '$', '\\'};
void TestRegExpr();
void TestCheckRegExpr();
const char* RegExprFind(const char *pText, const char *pReg);
MATCH_STATE Match(const char *pText, const char *pReg);
int GetReElement(pst_RE_Element pElement, const char *pReg);
bool CheckRegExpr(const char *pReg);
bool IsRegMetacharacter(char ch);
bool IsAlpha(char ch);
bool MatchAt(st_RE_Element regChar, char ch);
void GetPreChar(st_RE_Element *pPreChar, const char *pReg);
void MyOutput(const char* pReg, const char *p);
const char *g_pReg = NULL;
const char *g_pText= NULL;
const char *g_pBeg = NULL;
const char *g_pEnd = NULL;
int main(int argc, char* argv[])
{
TestRegExpr();
TestCheckRegExpr();
return 0;
}
void TestRegExpr()
{
const char *p = NULL;
const char *pText = "Let's ggggo abcdeccc.^$*\\\\\\fg 223333332122222333322222";
const char *pReg[] = {
"Let",
"^Let.*$",
"c*",
"g*o*",
"g*a*",
"k*2*3*k*",
"k*2*k*3*",
"\\$",
"\\.",
"\\\\*",
"\\..*23",
"c*",
"\\c*",
"2*$",
"3*",
"2*223",
"2*23",
"g*o.*3*21",
"g*o.*3*2",
"g*o.*3*",
"g*o.*3",
"Below are Not Found!------------------------------------------",
"k*",
"g*o.2*3",
"3*$",
"^3*",
"Below are Syntax Error!---------------------------------------",
"^*abc",
".*a$bc",
};
printf("\t\t\t\t\t0 1 2 3 4 5\r\n");
printf("\t\t\t\t\t012345678901234567890123456789012345678901234567890\r\n");
printf("Text is \t\t\t\t%s\r\n", pText);
g_pText = pText;
for (int i = 0; i < sizeof(pReg)/sizeof(pReg[0]); i++)
{
g_pReg = pReg[i];
p = RegExprFind(pText, pReg[i]);
MyOutput(pReg[i], p);
}
}
void MyOutput(const char* pReg, const char *p)
{
if (NULL == p)
printf("Not Found!\tRegExpr = \"%s\"\r\n", pReg);
else if ((char*)-1 == p)
printf("Syntax error!\tRegExpr = \"%s\"\r\n", pReg);
else
{
printf("Find at %02d \tRegExpr = \"%s\"\t", p-g_pText, pReg);
while (p < g_pEnd)
printf("%c", *p++);
printf("\r\n");
}
}
const char* RegExprFind(const char *pText, const char *pReg)
{
const char *pCur = pText;
if (!CheckRegExpr(pReg))
return (char*)-1;
do
{
g_pBeg = pCur;
MATCH_STATE eResult = Match(pCur, pReg);
if (MATCH_OK == eResult)
return pCur;
else if (MATCH_ERROR == eResult)
return (char*)-1;
else if (MATCH_FAIL == eResult)
return NULL;
}
while (*pCur++);
return NULL;
}
MATCH_STATE Match(const char *pCur, const char *pReg)
{
g_pEnd = pCur;
if ('\0' == *pReg)
return (g_pEnd != g_pBeg) ? MATCH_OK : MATCH_NEXT_POSITION;
if ('$' == *pReg)
return ('\0' == *pCur) ? MATCH_OK : MATCH_FAIL;
if ('^' == *pReg)
{
// return Match(pCur, pReg+1);
if (MATCH_OK == Match(pCur, pReg+1))
return MATCH_OK;
else
return MATCH_FAIL;
}
st_RE_Element elementCur;// 更复杂的情况要借助结构体
st_RE_Element elementNext;
int nElementLen1 = 0;
int nElementLen2 = 0;
nElementLen1 = GetReElement(&elementCur, pReg);
nElementLen2 = GetReElement(&elementNext, pReg+nElementLen1);
if (!elementNext.isEscape && elementNext.ch == '*')// 贪婪匹配比较麻烦
{
const char *pStart = pCur;
while(*pCur && MatchAt(elementCur, *pCur))
pCur++;
while (pCur >= pStart)
{
if (MATCH_OK == Match(pCur, pReg+nElementLen1+nElementLen2))
return MATCH_OK;
pCur--;
}
}
else
{
if (MatchAt(elementCur, *pCur))
return Match(pCur+1, pReg+nElementLen1);
}
return MATCH_NEXT_POSITION;
}
int GetReElement(pst_RE_Element pElement, const char *pReg)
{
if (*pReg == '\\')
{
pElement->isEscape = true;
pElement->ch = pReg[1];
return 2;
}
else
{
pElement->isEscape = false;
pElement->ch = *pReg;
return 1;
}
}
bool MatchAt(st_RE_Element regChar, char ch)
{
if (regChar.isEscape) // \c \\ etc.
{
return ch == regChar.ch;
}
else // a . c
{
if ('.' == regChar.ch || ('c' == regChar.ch && IsAlpha(ch)) )
{
return true;
}
return ch == regChar.ch;
}
}
bool IsAlpha(char ch)
{
if (ch > 'a' && ch < 'z' ||
ch > 'A' && ch < 'Z')
{
return true;
}
return false;
}
void GetPreChar(st_RE_Element *pPreChar, const char *pReg)
{
pPreChar->ch = pReg[-1];
if (&pReg[-2] >= g_pReg && pReg[-2] == '\\')
pPreChar->isEscape = true;
else
pPreChar->isEscape = false;
}
void TestCheckRegExpr()
{
const char *regArray[]= {
"^^now err",
"^*abcd",
"ababcd\\",
"^^*bcd",
"a^*bcd",
"^ab$cd",
"*abcd",
"a**abcd",
"\\a*bcd",
"now ok",
"\\\\a*bcd",
"\\**bcd",
".*abcd",
"^a*bcd",
"^abcd$",
".abc d",
".*abcd",
"\\c*abcd",
"\\*abcd",
"c*abcd",
"\\.c*abcd",
"abc\\.c*abcd",
"abc d",
"\\^*abcd"
};
for (int i = 0; i < sizeof(regArray)/sizeof(regArray[0]); i++)
{
if (CheckRegExpr(regArray[i]))
printf("\"%s\" \tis correct regular expression!\r\n", regArray[i]);
else
printf("\"%s\" \tis error regular expression!\r\n", regArray[i]);
}
}
bool CheckRegExpr(const char *pReg)
{
const char *pBegin = pReg;
if ('*' == *pBegin)
return false;
while (*pReg)
{
if ( ('^' == *pReg && pReg != pBegin) ||
('^' == *pReg && pReg[1] == '*') ||
('$' == *pReg && pReg[1] != '\0') )
return false;
if ('*' == *pReg && '*' == pReg[1])
return false;
if ('\\' == *pReg)
{
if (!IsRegMetacharacter(pReg[1]))
return false;
else
pReg++;
}
pReg++;
}
return true;
}
bool IsRegMetacharacter(char ch)
{
for (int i = 0; i < sizeof(g_RegMetaSet)/sizeof(g_RegMetaSet[0]); i++)
{
if (ch == g_RegMetaSet[i])
return true;
}
return false;
}