// GetWebLinkInfo.cpp: implementation of the CGetWebLinkInfo class.
//
//Author:Li Junhui
//E-MAIL: ljh_0110@163.com
#include "stdafx.h"
#include "GetWebLinkInfo.h"
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <wininet.h>
#include <map>
#include "PageDownloader.h"
CGetWebLinkInfo::CGetWebLinkInfo()
{
m_pLinkInfo = NULL;
}
CGetWebLinkInfo::~CGetWebLinkInfo()
{
}
int CGetWebLinkInfo::AnalyzeBulidTree(LinkInfo * pLinkInfo)
{
m_pLinkInfo = pLinkInfo;
//1、排重
FilterRepeat();
//2、分类
//BulidTree();
BulidTreeFolder();
//3、过滤掉无规律的链接
FilterNoChildNode();
//4、二级目录从大到小排序
Sort4ChildCount();
//5、按照树形结构输出
OutputBulidTree(m_pLinkInfo);
return 0;
}
int CGetWebLinkInfo::GetUrlDeep(LPTSTR pUrl)
{
int iDeep = -1;
if( pUrl==NULL || *pUrl==_T('\0') )
{
return iDeep;
}
if( _tcsnicmp(pUrl,_T("http://"),7) != 0 )
{
return iDeep;
}
TCHAR* pTmp = pUrl+7;
iDeep = 1;
while(*pTmp != _T('\0'))
{
pTmp = _tcschr(pTmp,_T('/'));
if( pTmp == NULL )
{
break;
}
else
{
if( ++pTmp && *pTmp != _T('\0') )
{
iDeep++;
}
}
}
return iDeep;
}
void CGetWebLinkInfo::FilterRepeat()
{
map<CString, LinkInfo*> urlMap;
map<CString, LinkInfo*>::iterator it;
//前两个节点不排重
LinkInfo* pLinkNode = m_pLinkInfo->pNext->pNext;
while( pLinkNode )
{
if( (it=urlMap.find(pLinkNode->szUrl)) == urlMap.end() )
{
pLinkNode->nValid = 1;
urlMap[pLinkNode->szUrl] = pLinkNode;
}
else
{
//如果发现链接已经存在。则比较标题长度
pLinkNode->nDeep = GetUrlDeep(pLinkNode->szUrl);
int nLength1 = _tcslen(it->second->szTitle);
int nLength2 = _tcslen(pLinkNode->szTitle);
//深度小于3的保留短标题
if( (pLinkNode->nDeep<3) && (nLength2<nLength1) )
{
pLinkNode->nValid = 1;
it->second->nValid = 0;
it->second = pLinkNode;
}
else if( (pLinkNode->nDeep>=3) && (nLength2>nLength1) )
{
pLinkNode->nValid = 1;
it->second->nValid = 0;
it->second = pLinkNode;
}
}
pLinkNode = pLinkNode->pNext;
}
return;
}
void CGetWebLinkInfo::BulidTree()
{
for(int nDeep=1; nDeep<=4; nDeep++)
{
LinkInfo* pLinkNode = m_pLinkInfo;
while (pLinkNode != NULL)
{
//判断是否有效,判断是否处理过
if( (pLinkNode->nValid==1) && (pLinkNode->bHandled==0) )
{
//判断是否已经计算过深度,如果没有计算过则先计算深度
if(pLinkNode->nDeep == 0)
{
pLinkNode->nDeep = GetUrlDeep(pLinkNode->szUrl);
if(pLinkNode->nDeep == -1)
{
pLinkNode->nValid = 0;
pLinkNode = pLinkNode->pNext;
continue;
}
}
if(pLinkNode->nDeep == nDeep)
{
LinkInfo* pParent = FindParentNode(pLinkNode, m_pLinkInfo);
if( pParent )
{
InsertChildNode(pLinkNode, pParent);
}
else
{
InsertChildNode(pLinkNode, m_pLinkInfo);
}
}
}
pLinkNode = pLinkNode->pNext;
}
}
//把剩余的插入到构造的树上
LinkInfo* pLinkNode = m_pLinkInfo;
while (pLinkNode != NULL)
{
if( (pLinkNode->nValid==1) && (pLinkNode->bHandled==0) )
{
LinkInfo* pParent = FindParentNode(pLinkNode, m_pLinkInfo);
if( pParent )
{
InsertChildNode(pLinkNode, pParent);
}
else
{
//没有找到合适父节点的放到其他里面去.
InsertChildNode(pLinkNode, m_pLinkInfo->pBrother);
}
}
pLinkNode = pLinkNode->pNext;
}
}
//二级目录下过滤掉单个的
void CGetWebLinkInfo::FilterNoChildNode()
{
LinkInfo* pChildNode = m_pLinkInfo->pChild;
LinkInfo* pOtherNode = m_pLinkInfo->pBrother;
if(pChildNode == NULL) return;
LinkInfo* pChildHead = NULL;
LinkInfo* pPerChildNode = NULL;
LinkInfo* pTmpNode = NULL;
while(pChildNode)
{
if(pChildNode->nChildNodeCnt==0)
{
if(pPerChildNode)
{
pPerChildNode->pBrother = pChildNode->pBrother;
}
pTmpNode = pChildNode;
pChildNode = pChildNode->pBrother;
pTmpNode->pBrother = NULL;
InsertChildNode(pTmpNode, pOtherNode);
}
else
{
if(pChildHead==NULL)
{
pChildHead = pChildNode;
}
if(pPerChildNode==NULL)
{
pPerChildNode = pChildNode;
}
else
{
pPerChildNode = pPerChildNode->pBrother;
}
pChildNode = pChildNode->pBrother;
}
}
m_pLinkInfo->pChild = pChildHead;
}
//构造目录节点
void CGetWebLinkInfo::BulidTreeFolder()
{
LinkInfo *pLinkNode = m_pLinkInfo->pNext->pNext;
LinkInfo *pPreLinkNode = NULL;
BOOL bBreak = FALSE;
for(int iIndex=1; iIndex<3; iIndex++)
{
while (pLinkNode != NULL)
{
//判断是否有效,判断是否处理过
if( (pLinkNode->nValid==1) && (pLinkNode->bHandled==0) )
{
//判断是否已经计算过深度,如果没有计算过则先计算深度
if(pLinkNode->nDeep == 0)
{
pLinkNode->nDeep = GetUrlDeep(pLinkNode->szUrl);
if(pLinkNode->nDeep == -1)
{
pLinkNode->nValid = 0;
pLinkNode = pLinkNode->pNext;
continue;
}
}
if(pLinkNode->nDeep == iIndex)
{
bBreak = TRUE;
pLinkNode->bHandled = 1;
if (pPreLinkNode != NULL)
{
pPreLinkNode->pBrother = pLinkNode;
m_pLinkInfo->nChildNodeCnt++;
}
pPreLinkNode = pLinkNode;
if (m_pLinkInfo->pChild == NULL)
{
m_pLinkInfo->pChild = pLinkNode;
m_pLinkInfo->nChildNodeCnt = 1;
}
BulidSubTreeFolder(pLinkNode,iIndex+1);
}
}
pLinkNode = pLinkNode->pNext;
}
if(bBreak) break;
}
//把剩余的插入到构造的树上
pLinkNode = m_pLinkInfo->pNext->pNext;
while (pLinkNode != NULL)
{
if( (pLinkNode->nValid==1) && (pLinkNode->bHandled==0) )
{
if( !InsertTreeNode(m_pLinkInfo, pLinkNode,TRUE) )
{
//没有找到合适父节点的放到其他里面去.
InsertChildNode(pLinkNode, m_pLinkInfo->pBrother);
}
}
pLinkNode = pLinkNode->pNext;
}
}
void CGetWebLinkInfo::BulidSubTreeFolder(LinkInfo *pParentNode,int level)
{
LinkInfo *pLinkNode = m_pLinkInfo->pNext->pNext;
LinkInfo *pChildNodePos = NULL;
while (pLinkNode != NULL)
{
//判断是否有效,判断是否处理过
if( (pLinkNode->nValid==1) && (pLinkNode->bHandled==0) )
{
//判断是否已经计算过深度,如果没有计算过则先计算深度
if(pLinkNode->nDeep == 0)
{
pLinkNode->nDeep = GetUrlDeep(pLinkNode->szUrl);
if(pLinkNode->nDeep == -1)
{
pLinkNode->nValid = 0;
pLinkNode = pLinkNode->pNext;
continue;
}
}
if(pLinkNode->nDeep == level)
{
if( _tcsstr(pLinkNode->szUrl,pParentNode->szUrl)!=NULL )
{
if( *(pLinkNode->szUrl+_tcslen(pLinkNode->szUrl)-1) == '/' )
{
pParentNode->nChildNodeCnt++;
pLinkNode->bHandled = 1;
if (pChildNodePos != NULL)
{
pChildNodePos->pBrother = pLinkNode;
}
pChildNodePos = pLinkNode;
if(pParentNode->pChild == NULL)
{
pParentNode->pChild = pChildNodePos;
}
BulidSubTreeFolder(pLinkNode,level+1);
}
}
}
}
pLinkNode = pLinkNode->pNext;
}
}
BOOL CGetWebLinkInfo::InsertTreeNode(LinkInfo* pParentNode, LinkInfo* pLinkNode, BOOL IsRoot)
{
BOOL bRet = FALSE;
LinkInfo* pPerChildNode = NULL;
LinkInfo* pChildNode = pParentNode->pChild;
while (pChildNode != NULL)
{
if( *(pChildNode->szUrl+_tcslen(pChildNode->szUrl)-1) == '/' )
{
if( _tcsstr(pLinkNode->szUrl, pChildNode->szUrl)!= NULL)
{
if( InsertTreeNode(pChildNode, pLinkNode) )
{
return TRUE;
}
}
}
pPerChildNode = pChildNode;
pChildNode = pChildNode->pBrother;
}
if(!IsRoot && !bRet)
{
if(pPerChildNode==NULL)
{
pParentNode->pChild = pLinkNode;
pParentNode->nChildNodeCnt=1;
pLinkNode->bHandled=1;
}
else
{
pPerChildNode->pBrother = pLinkNode;
pParentNode->nChildNodeCnt++;
pLinkNode->bHandled=1;
}
bRet = TRUE;
- 1
- 2
- 3
前往页