#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "url.h"
#include "strfunc.h"
/* GetDir
 * Page -> dir <-
 * Page = "/dir1/dir2/page.htm" => dir = "/dir1/dir2/"
 *
 * Copies the directory part of Page into dir, always leaving a trailing
 * '/'.  A query string ("?...") is ignored.  dir must be large enough to
 * hold the directory part plus a trailing '/' and NUL (caller-supplied;
 * NOTE(review): no size is passed, so the caller must guarantee this).
 * Returns 1 on success, -1 on allocation failure.
 */
int GetDir(char* Page,char* dir)
{
int i;
int last=0;    /* index of the last '/' seen (0 == none found) */
char* tmpPage;
char* tmpP;
tmpPage = malloc(strlen(Page)+5);
if(tmpPage==NULL)    /* malloc can fail: never dereference NULL */
	return -1;
strcpy(tmpPage, Page);
tmpP = strchr(tmpPage,'?');
if(tmpP != NULL)     /* cut the query string; the original compared a
                        possibly-NULL pointer with '>', which is UB */
	*tmpP = 0;
for(i=0;tmpPage[i]!=0;i++)   /* strlen() hoisted out of the loop condition */
	if(tmpPage[i]=='/')
		last=i;
/* no '/' found => directory is the root "/" */
strncpy(dir,(last==0) ? "/" : tmpPage,(last==0) ? 1 : last);
dir[(last==0) ? 1 : last]=0;
if(dir[strlen(dir)-1]!='/')
	strcat(dir,"/");
free(tmpPage);
return 1;
}
/* CheckPage
 * page -><-
 * Normalizes a page path in place: strips a leading "./", removes "./"
 * components, then resolves "../" components against the path built so
 * far.  page must hold fewer than MAXPAGESIZE characters.
 * Returns 1 on success (page rewritten), -1 when the path is invalid
 * (component starting with a space).
 */
int CheckPage(char* page)
{
int c=0;
int b;
char tmpPage[MAXPAGESIZE+2000];  /* rebuilt path (".." resolution) */
char rTmpPage[MAXPAGESIZE];      /* working copy of page */
char *rPos;
char *slash;
int i;
memset(rTmpPage,0,sizeof(rTmpPage));
strncpy(rTmpPage,page,MIN(strlen(page),MAXPAGESIZE-1));
if(strlen(page)<2)
	return 1;
if(page[0]==' ')
	return -1;
if(page[0]=='.' && page[1]=='/')
	/* shift in place: strcpy() on overlapping buffers is UB,
	   memmove() is the defined way to do this */
	memmove(rTmpPage,rTmpPage+2,strlen(rTmpPage+2)+1);
for(i=1;rTmpPage[i]!=0 && i<MAXPAGESIZE-1;i++)
{
	/* drop "./" components: a '.' not preceded by another '.'
	   and followed by '/' */
	if(rTmpPage[i-1] != '.' && rTmpPage[i] == '.' && rTmpPage[i+1]=='/')
	{
		/* was: rTmpPage[i]=0; strcat(rTmpPage,rTmpPage+i+2);
		   strcat on overlapping buffers is UB */
		memmove(rTmpPage+i,rTmpPage+i+2,strlen(rTmpPage+i+2)+1);
		i-=2;
		if(i<0)     /* keep i>=1 after the loop increment so the    */
			i=0;    /* rTmpPage[i-1] test never reads index -1      */
	}
}
/* trim a trailing '.' ("/dir/." => "/dir/") */
if(rTmpPage[i-1]=='.')
	rTmpPage[i-1]=0;
else
	rTmpPage[i]=0;
if(strstr(rTmpPage,"..")==0)
{
	/* no ".." left: nothing more to resolve */
	strcpy(page,rTmpPage);
	return 1;
}
c=0;
rPos=rTmpPage;
if(page[0]=='/')   /* keep the path absolute */
{
	tmpPage[0]='/';
	tmpPage[1]=0;
}
else
	tmpPage[0]=0;
while(rPos[0]!=0)
{
	slash = strchr(rPos,'/');
	if(slash == NULL)   /* last component (the original computed
	                       NULL-rPos, which is UB, and detected it
	                       via an unsigned comparison) */
	{
		strcat(tmpPage,rPos);
		break;
	}
	c = (int)(slash - rPos);
	if(rPos[0]==' ')
		return -1;
	/* NOTE(review): strncmp with len c also matches "." (c==1) and the
	   empty component of "//" (c==0) as ".."; kept as in the original
	   since "./" components were already removed above -- confirm */
	if(strncmp(rPos,"..",c)!=0)
	{
		strncat(tmpPage,rPos,c+1);  /* copy component incl. its '/';
		                               strncat NUL-terminates, the
		                               original's extra store past the
		                               end was removed */
	}
	else
	{
		/* "..": drop the last component already in tmpPage */
		for(b=strlen(tmpPage)-2;b>0;b--)
		{
			if(tmpPage[b]=='/')
			{
				tmpPage[b+1]=0;
				break;
			}
		}
		if(b<=0)   /* went above the root (b may be negative when
		              tmpPage is shorter than 2 chars) */
		{
			tmpPage[0] = (tmpPage[0]=='/') ? '/' : '\0';
			tmpPage[1] = '\0';
		}
	}
	rPos+=c+1;
}
if(tmpPage[0]==0)   /* everything cancelled out: fall back to root */
{
	tmpPage[0]='/';
	tmpPage[1]=0;
}
strcpy(page,tmpPage);
return 1;
}
/* PageType
 * Host <-
 * Host->Page = "/test.htm" Host->type = 1 (type htm/html)
 *
 * Classifies Host->Page by its extension:
 *   type 1 = html, 2 = plain text, 4 = custom extension, 3 = discard.
 * A page ending in '/' or without a '.' is treated as a directory (html).
 * Returns 1 on success, -1 on error (NULL Host, oversized page).
 */

/* Case-insensitive suffix match.  Guards against ext being longer than
 * page: the original formed a pointer before the buffer in that case,
 * which is undefined behavior. */
static int ExtMatch(const char* page, const char* ext)
{
	size_t lp = strlen(page);
	size_t le = strlen(ext);
	if(le > lp)
		return 0;
	return strcasecmp(page + lp - le, ext) == 0;
}

int PageType(struct sHost* Host)
{
int i;
char rPage[MAXPAGESIZE];
int bArgs=0; /*bArgs=1 == the page contains a '?'*/
int slHP;
char *q;
if(Host==NULL)
	return -1;
if(Host->Page[0]==0)
{
	/* empty page: the original read Page[-1] below; treat as root */
	Host->type = 1;
	return 1;
}
memset(rPage,0,MAXPAGESIZE);
strncpy(rPage,Host->Page,MAXPAGESIZE-1);
q = strchr(rPage,'?');   /* does this page contain a '?' */
if(q != NULL)            /* the original's 'strchr(...)>rPage' was UB on
                            NULL and missed a '?' at index 0 */
{
	*q = 0;              /* cut the query string */
	bArgs=1;
}
if(Host->Page[strlen(Host->Page)-1]=='/')
{
	Host->type = 1; // Html file
	return 1;
}
slHP = MIN(strlen(Host->Page),MAXPAGESIZE);
for(i=0;i<slHP;i++)
{
	if(Host->Page[i]=='.')
		break;
}
if(i==(signed)strlen(Host->Page) && bArgs==0) //Maybe a directory (no '.' found)
{
	if(strlen(Host->Page)>=MAXPAGESIZE-1)
		return -1;
	strcat(Host->Page,"/");
	Host->type = 1; // Html file
	return 1;
}
for(i=0;HtmlExtensions[i][0]!=0;i++)
{
	if(ExtMatch(rPage,(const char*)HtmlExtensions[i]))
	{
		Host->type = 1; // Html file
		return 1;
	}
}
for(i=0;PlainTextExtension[i][0]!=0;i++)
{
	if(ExtMatch(rPage,(const char*)PlainTextExtension[i]))
	{
		Host->type = 2; // plain text
		return 1;
	}
}
/*Support for custom extensions*/ /*TO TEST*/
for(i=0;CustomExtensions[i][0]!=0;i++)
{
	if(ExtMatch(rPage,(const char*)CustomExtensions[i]))
	{
		Host->type = 4; // custom extension
		return 1;
	}
}
if(bArgs==1)
	Host->type = 1;   /* dynamic page with arguments: assume html */
else
	Host->type = 3;   //discard it
return 1;
}
/* PortNumFromHostname
 * hostname -><-
 * hostname="www.auuuu.com:90" => hostname="www.auuuu.com"; return 90;
 *
 * Splits an optional ":port" suffix off hostname (in place) and returns
 * the port number; returns the default PORT when no ':' is present.
 * NOTE(review): a non-numeric suffix yields atoi()==0 -- confirm callers
 * tolerate port 0, as the original behaved the same way.
 */
unsigned int PortNumFromHostname(char* hostname)
{
	/* single O(n) scan; the original re-evaluated strlen() in the loop
	   condition, making it O(n^2) */
	char* colon = strchr(hostname,':');
	if(colon != NULL)
	{
		*colon = 0;   /* terminate the host part before the ':' */
		return (unsigned)atoi(colon+1);
	}
	return PORT;
}
/* GenerateURL
 * Host -> URL <-
 * Builds "http://<host>:<port><page>" into URL from the sHost fields.
 * URL must be large enough for the result (caller-supplied; NOTE(review):
 * no size is passed, so the caller must guarantee this).
 * Returns 1.
 */
int GenerateURL(struct sHost Host,char* URL)
{
	/* the original's char port[5] overflowed for 5-digit ports:
	   "65535" needs 6 bytes including the NUL; 12 holds any int */
	char port[12];
	snprintf(port,sizeof(port),"%d",Host.port);
	strcpy(URL,"http://");
	strcat(URL,Host.Host);
	strcat(URL,":");
	strcat(URL,port);
	strcat(URL,Host.Page);
	return 1;
}
/* ParseUrl
* Url <- sHost
* Url: "http://www.test.com/page.htm" ==>
* ==> sHost.Url = Url && sHost.Host = "www.test.com" && sHost.Page = "page.htm"
*/
int ParseUrl(char* url,struct sHost* sh,struct sHost* currentHost)
{
char tUrl[MAXURLSIZE];
char BaseDir[MAXPAGESIZE];
unsigned int offset=0,i;
char* token1=NULL;
char* tmpPage;
if(url==NULL || sh==NULL)
return -1;
if(strlen(url)>MAXURLSIZE-1)
return -1;
if( strncasecmp(url,"ftp://",6)==0 ||
strncasecmp(url,"mailto:",7)==0 ||
strncasecmp(url,"about:",6)==0 ||
strncasecmp(url,"irc://",6)==0 ||
strncasecmp(url,"news://",7)==0 ||
strncasecmp(url,"https://",8)==0) //protocols not supported
return -1;
memset(sh,0,sizeof(struct sHost));
memset(tUrl,0,MAXURLSIZE);
for(i=0;i<strlen(url);i++)
{
if(url[i]=='#')
{
url[i]=0;
break;
}
}
if(url[0]==0)
return -1;
if(strncasecmp(url,"http://",7)==0)
{
if(strlen(url)==7)
return -1;
else
offset=7;
}
if(strncmp(url,"//",2)==0)
{
if(strlen(url)==7)
return -1;
else
offset=2;
}
strncpy(tUrl,url+offset,strlen(url)-offset);
tUrl[strlen(url)-offset]=0;
if(offset>0) //url with prefix: "http://" || "//"
{
for(i=0;i<strlen(tUrl);i++)
{
if(tUrl[i]=='/' || tUrl[i]=='?')
{
token1=tUrl+i;
break;
}
}
if(token1>tUrl) //is there a '/'?
{
strncpy(sh->Host,tUrl,token1-tUrl); //yes: the host is the part of the string before '/' and the page the rest
strncpy(sh->Page,token1,MAXPAGESIZE-1);
if(strncasecmp(sh->Page,"mailto:",7)==0)
return -1;
}
else //no: the host is the url and the page is the index
{
strncpy(sh->Host,tUrl,MAXHOSTSIZE-1);
strcpy(sh->Page,"/");
}
sh->port = PortNumFromHostname(sh->Host);
strtrim(sh->Host,sh->Host);
tmpPage = (char *)malloc(MAXPAGESIZE);
strtrim(sh->Page,tmpPage);
ReplaceStr(tmpPage,sh->Page,(char *)"&",(char *)"&");
free(tmpPage);
/* currentHost has the same hostname and port and has an host_id */
if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )
{
/* yes: this page is from the same domain: use currentHost host_id */
sh->host_id = currentHost->host_id;
}
if(CheckPage(sh->Page)==-1)
return -1;
return PageType(sh);
}
else //now we expect a relative url
{
if(strlen(url)>MAXPAGESIZE-1)
return -1;
if(currentHost==NULL) //if we haven't a reference host we can't continue
return -1;
strncpy(sh->Host,currentHost->Host,MIN(MAXHOSTSIZE-1,strlen(currentHost->Host)));
if(tUrl[0]!='/') //if the first char is not '/' we must consider the current directory
GetDir(currentHost->Page,BaseDir);
else
BaseDir[0]=0;
for(i=strlen(tUrl);i>0;i--) //is there a '.' before last '/'?
if(tUrl[i]=='/')
break;
else if(tUrl[i]=='.') //yes: this is a page Ex. "/sources.html"
{
if(strlen(BaseDir)+strlen(tUrl)>=MAXPAGESIZE)
return -1;
strcpy(sh->Page,BaseDir);
strcat(sh->Page,tUrl);
//get the port from the current Host
sh->port = currentHost->port;
strtrim(sh->Host,sh->Host);
tmpPage = (char *)malloc(MAXPAGESIZE);
strtrim(sh->Page,tmpPage);
ReplaceStr(tmpPage,sh->Page,(char *)"&",(char *)"&");
free(tmpPage);
/* currentHost has the same hostname and port and has an host_id */
if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )
{
/* yes: this page is from the same domain: use curren
没有合适的资源?快使用搜索试试~ 我知道了~
网络蜘蛛webspider开源系统
3星 · 超过75%的资源 需积分: 10 22 下载量 14 浏览量
2012-12-30
10:11:58
上传
评论
收藏 14KB GZ 举报
温馨提示
共16个文件
h:8个
c:5个
cpp:2个
稳定的网络蜘蛛,可以并行抓取多个站点,BS架构控制系统,使用httpsqs 和淘宝的tair可以作为分布式爬虫基础,去重能力强,存储系统为mysql,可以用sphinx等做索引,原始版本
资源推荐
资源详情
资源评论
收起资源包目录
webspider-1.0.0.0.tar.gz (16个子文件)
webspider
misc.c 895B
url.c 10KB
Makefile 806B
options.h 7KB
url.h 594B
options.c 8KB
stdhead.h 391B
strfunc.c 3KB
strfunc.h 456B
webspider.cpp 700B
html.h 928B
httpsqs_client.cpp 9KB
httpsqs_client.h 750B
html.c 10KB
misc.h 383B
webspider.h 83B
共 16 条
- 1
资源评论
- yssxyssx2014-09-24谢谢楼主的资源。
- edwardcsdn2013-07-24很有参考性。感谢楼主
成竹在线
- 粉丝: 1
- 资源: 16
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功