#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "url.h"
#include "strfunc.h"
/* GetDir
 * Page -> dir <-
 * Page = "/dir1/dir2/page.htm" => dir = "/dir1/dir2/"
 *
 * Copies the directory part of Page into dir, always leaving a trailing
 * '/'.  A query string ("?...") is ignored.  dir must be large enough to
 * hold the directory part plus a trailing '/' and NUL (caller-supplied;
 * NOTE(review): no size is passed, so the caller must guarantee this).
 * Returns 1 on success, -1 on allocation failure.
 */
int GetDir(char* Page,char* dir)
{
int i;
int last=0;    /* index of the last '/' seen (0 == none found) */
char* tmpPage;
char* tmpP;
tmpPage = malloc(strlen(Page)+5);
if(tmpPage==NULL)    /* malloc can fail: never dereference NULL */
	return -1;
strcpy(tmpPage, Page);
tmpP = strchr(tmpPage,'?');
if(tmpP != NULL)     /* cut the query string; the original compared a
                        possibly-NULL pointer with '>', which is UB */
	*tmpP = 0;
for(i=0;tmpPage[i]!=0;i++)   /* strlen() hoisted out of the loop condition */
	if(tmpPage[i]=='/')
		last=i;
/* no '/' found => directory is the root "/" */
strncpy(dir,(last==0) ? "/" : tmpPage,(last==0) ? 1 : last);
dir[(last==0) ? 1 : last]=0;
if(dir[strlen(dir)-1]!='/')
	strcat(dir,"/");
free(tmpPage);
return 1;
}
/* CheckPage
 * page -><-
 * Normalizes a page path in place: strips a leading "./", removes "./"
 * components, then resolves "../" components against the path built so
 * far.  page must hold fewer than MAXPAGESIZE characters.
 * Returns 1 on success (page rewritten), -1 when the path is invalid
 * (component starting with a space).
 */
int CheckPage(char* page)
{
int c=0;
int b;
char tmpPage[MAXPAGESIZE+2000];  /* rebuilt path (".." resolution) */
char rTmpPage[MAXPAGESIZE];      /* working copy of page */
char *rPos;
char *slash;
int i;
memset(rTmpPage,0,sizeof(rTmpPage));
strncpy(rTmpPage,page,MIN(strlen(page),MAXPAGESIZE-1));
if(strlen(page)<2)
	return 1;
if(page[0]==' ')
	return -1;
if(page[0]=='.' && page[1]=='/')
	/* shift in place: strcpy() on overlapping buffers is UB,
	   memmove() is the defined way to do this */
	memmove(rTmpPage,rTmpPage+2,strlen(rTmpPage+2)+1);
for(i=1;rTmpPage[i]!=0 && i<MAXPAGESIZE-1;i++)
{
	/* drop "./" components: a '.' not preceded by another '.'
	   and followed by '/' */
	if(rTmpPage[i-1] != '.' && rTmpPage[i] == '.' && rTmpPage[i+1]=='/')
	{
		/* was: rTmpPage[i]=0; strcat(rTmpPage,rTmpPage+i+2);
		   strcat on overlapping buffers is UB */
		memmove(rTmpPage+i,rTmpPage+i+2,strlen(rTmpPage+i+2)+1);
		i-=2;
		if(i<0)     /* keep i>=1 after the loop increment so the    */
			i=0;    /* rTmpPage[i-1] test never reads index -1      */
	}
}
/* trim a trailing '.' ("/dir/." => "/dir/") */
if(rTmpPage[i-1]=='.')
	rTmpPage[i-1]=0;
else
	rTmpPage[i]=0;
if(strstr(rTmpPage,"..")==0)
{
	/* no ".." left: nothing more to resolve */
	strcpy(page,rTmpPage);
	return 1;
}
c=0;
rPos=rTmpPage;
if(page[0]=='/')   /* keep the path absolute */
{
	tmpPage[0]='/';
	tmpPage[1]=0;
}
else
	tmpPage[0]=0;
while(rPos[0]!=0)
{
	slash = strchr(rPos,'/');
	if(slash == NULL)   /* last component (the original computed
	                       NULL-rPos, which is UB, and detected it
	                       via an unsigned comparison) */
	{
		strcat(tmpPage,rPos);
		break;
	}
	c = (int)(slash - rPos);
	if(rPos[0]==' ')
		return -1;
	/* NOTE(review): strncmp with len c also matches "." (c==1) and the
	   empty component of "//" (c==0) as ".."; kept as in the original
	   since "./" components were already removed above -- confirm */
	if(strncmp(rPos,"..",c)!=0)
	{
		strncat(tmpPage,rPos,c+1);  /* copy component incl. its '/';
		                               strncat NUL-terminates, the
		                               original's extra store past the
		                               end was removed */
	}
	else
	{
		/* "..": drop the last component already in tmpPage */
		for(b=strlen(tmpPage)-2;b>0;b--)
		{
			if(tmpPage[b]=='/')
			{
				tmpPage[b+1]=0;
				break;
			}
		}
		if(b<=0)   /* went above the root (b may be negative when
		              tmpPage is shorter than 2 chars) */
		{
			tmpPage[0] = (tmpPage[0]=='/') ? '/' : '\0';
			tmpPage[1] = '\0';
		}
	}
	rPos+=c+1;
}
if(tmpPage[0]==0)   /* everything cancelled out: fall back to root */
{
	tmpPage[0]='/';
	tmpPage[1]=0;
}
strcpy(page,tmpPage);
return 1;
}
/* PageType
 * Host <-
 * Host->Page = "/test.htm" Host->type = 1 (type htm/html)
 *
 * Classifies Host->Page by its extension:
 *   type 1 = html, 2 = plain text, 4 = custom extension, 3 = discard.
 * A page ending in '/' or without a '.' is treated as a directory (html).
 * Returns 1 on success, -1 on error (NULL Host, oversized page).
 */

/* Case-insensitive suffix match.  Guards against ext being longer than
 * page: the original formed a pointer before the buffer in that case,
 * which is undefined behavior. */
static int ExtMatch(const char* page, const char* ext)
{
	size_t lp = strlen(page);
	size_t le = strlen(ext);
	if(le > lp)
		return 0;
	return strcasecmp(page + lp - le, ext) == 0;
}

int PageType(struct sHost* Host)
{
int i;
char rPage[MAXPAGESIZE];
int bArgs=0; /*bArgs=1 == the page contains a '?'*/
int slHP;
char *q;
if(Host==NULL)
	return -1;
if(Host->Page[0]==0)
{
	/* empty page: the original read Page[-1] below; treat as root */
	Host->type = 1;
	return 1;
}
memset(rPage,0,MAXPAGESIZE);
strncpy(rPage,Host->Page,MAXPAGESIZE-1);
q = strchr(rPage,'?');   /* does this page contain a '?' */
if(q != NULL)            /* the original's 'strchr(...)>rPage' was UB on
                            NULL and missed a '?' at index 0 */
{
	*q = 0;              /* cut the query string */
	bArgs=1;
}
if(Host->Page[strlen(Host->Page)-1]=='/')
{
	Host->type = 1; // Html file
	return 1;
}
slHP = MIN(strlen(Host->Page),MAXPAGESIZE);
for(i=0;i<slHP;i++)
{
	if(Host->Page[i]=='.')
		break;
}
if(i==(signed)strlen(Host->Page) && bArgs==0) //Maybe a directory (no '.' found)
{
	if(strlen(Host->Page)>=MAXPAGESIZE-1)
		return -1;
	strcat(Host->Page,"/");
	Host->type = 1; // Html file
	return 1;
}
for(i=0;HtmlExtensions[i][0]!=0;i++)
{
	if(ExtMatch(rPage,(const char*)HtmlExtensions[i]))
	{
		Host->type = 1; // Html file
		return 1;
	}
}
for(i=0;PlainTextExtension[i][0]!=0;i++)
{
	if(ExtMatch(rPage,(const char*)PlainTextExtension[i]))
	{
		Host->type = 2; // plain text
		return 1;
	}
}
/*Support for custom extensions*/ /*TO TEST*/
for(i=0;CustomExtensions[i][0]!=0;i++)
{
	if(ExtMatch(rPage,(const char*)CustomExtensions[i]))
	{
		Host->type = 4; // custom extension
		return 1;
	}
}
if(bArgs==1)
	Host->type = 1;   /* dynamic page with arguments: assume html */
else
	Host->type = 3;   //discard it
return 1;
}
/* PortNumFromHostname
 * hostname -><-
 * hostname="www.auuuu.com:90" => hostname="www.auuuu.com"; return 90;
 *
 * Splits an optional ":port" suffix off hostname (in place) and returns
 * the port number; returns the default PORT when no ':' is present.
 * NOTE(review): a non-numeric suffix yields atoi()==0 -- confirm callers
 * tolerate port 0, as the original behaved the same way.
 */
unsigned int PortNumFromHostname(char* hostname)
{
	/* single O(n) scan; the original re-evaluated strlen() in the loop
	   condition, making it O(n^2) */
	char* colon = strchr(hostname,':');
	if(colon != NULL)
	{
		*colon = 0;   /* terminate the host part before the ':' */
		return (unsigned)atoi(colon+1);
	}
	return PORT;
}
/* GenerateURL
 * Host -> URL <-
 * Builds "http://<host>:<port><page>" into URL from the sHost fields.
 * URL must be large enough for the result (caller-supplied; NOTE(review):
 * no size is passed, so the caller must guarantee this).
 * Returns 1.
 */
int GenerateURL(struct sHost Host,char* URL)
{
	/* the original's char port[5] overflowed for 5-digit ports:
	   "65535" needs 6 bytes including the NUL; 12 holds any int */
	char port[12];
	snprintf(port,sizeof(port),"%d",Host.port);
	strcpy(URL,"http://");
	strcat(URL,Host.Host);
	strcat(URL,":");
	strcat(URL,port);
	strcat(URL,Host.Page);
	return 1;
}
/* ParseUrl
* Url <- sHost
* Url: "http://www.test.com/page.htm" ==>
* ==> sHost.Url = Url && sHost.Host = "www.test.com" && sHost.Page = "page.htm"
*/
int ParseUrl(char* url,struct sHost* sh,struct sHost* currentHost)
{
char tUrl[MAXURLSIZE];
char BaseDir[MAXPAGESIZE];
unsigned int offset=0,i;
char* token1=NULL;
char* tmpPage;
if(url==NULL || sh==NULL)
return -1;
if(strlen(url)>MAXURLSIZE-1)
return -1;
if( strncasecmp(url,"ftp://",6)==0 ||
strncasecmp(url,"mailto:",7)==0 ||
strncasecmp(url,"about:",6)==0 ||
strncasecmp(url,"irc://",6)==0 ||
strncasecmp(url,"news://",7)==0 ||
strncasecmp(url,"https://",8)==0) //protocols not supported
return -1;
memset(sh,0,sizeof(struct sHost));
memset(tUrl,0,MAXURLSIZE);
for(i=0;i<strlen(url);i++)
{
if(url[i]=='#')
{
url[i]=0;
break;
}
}
if(url[0]==0)
return -1;
if(strncasecmp(url,"http://",7)==0)
{
if(strlen(url)==7)
return -1;
else
offset=7;
}
if(strncmp(url,"//",2)==0)
{
if(strlen(url)==7)
return -1;
else
offset=2;
}
strncpy(tUrl,url+offset,strlen(url)-offset);
tUrl[strlen(url)-offset]=0;
if(offset>0) //url with prefix: "http://" || "//"
{
for(i=0;i<strlen(tUrl);i++)
{
if(tUrl[i]=='/' || tUrl[i]=='?')
{
token1=tUrl+i;
break;
}
}
if(token1>tUrl) //is there a '/'?
{
strncpy(sh->Host,tUrl,token1-tUrl); //yes: the host is the part of the string before '/' and the page the rest
strncpy(sh->Page,token1,MAXPAGESIZE-1);
if(strncasecmp(sh->Page,"mailto:",7)==0)
return -1;
}
else //no: the host is the url and the page is the index
{
strncpy(sh->Host,tUrl,MAXHOSTSIZE-1);
strcpy(sh->Page,"/");
}
sh->port = PortNumFromHostname(sh->Host);
strtrim(sh->Host,sh->Host);
tmpPage = (char *)malloc(MAXPAGESIZE);
strtrim(sh->Page,tmpPage);
ReplaceStr(tmpPage,sh->Page,(char *)"&",(char *)"&");
free(tmpPage);
/* currentHost has the same hostname and port and has an host_id */
if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )
{
/* yes: this page is from the same domain: use currentHost host_id */
sh->host_id = currentHost->host_id;
}
if(CheckPage(sh->Page)==-1)
return -1;
return PageType(sh);
}
else //now we expect a relative url
{
if(strlen(url)>MAXPAGESIZE-1)
return -1;
if(currentHost==NULL) //if we haven't a reference host we can't continue
return -1;
strncpy(sh->Host,currentHost->Host,MIN(MAXHOSTSIZE-1,strlen(currentHost->Host)));
if(tUrl[0]!='/') //if the first char is not '/' we must consider the current directory
GetDir(currentHost->Page,BaseDir);
else
BaseDir[0]=0;
for(i=strlen(tUrl);i>0;i--) //is there a '.' before last '/'?
if(tUrl[i]=='/')
break;
else if(tUrl[i]=='.') //yes: this is a page Ex. "/sources.html"
{
if(strlen(BaseDir)+strlen(tUrl)>=MAXPAGESIZE)
return -1;
strcpy(sh->Page,BaseDir);
strcat(sh->Page,tUrl);
//get the port from the current Host
sh->port = currentHost->port;
strtrim(sh->Host,sh->Host);
tmpPage = (char *)malloc(MAXPAGESIZE);
strtrim(sh->Page,tmpPage);
ReplaceStr(tmpPage,sh->Page,(char *)"&",(char *)"&");
free(tmpPage);
/* currentHost has the same hostname and port and has an host_id */
if( currentHost && currentHost->host_id!=0 && strcmp( currentHost->Host, sh->Host ) == 0 && currentHost->port == sh->port )
{
/* yes: this page is from the same domain: use curren
没有合适的资源?快使用搜索试试~ 我知道了~
网络蜘蛛webspider开源系统
3星 · 超过75%的资源 需积分: 10 22 下载量 14 浏览量
2012-12-30
10:11:58
上传
评论
收藏 14KB GZ 举报
温馨提示
共16个文件
h:8个
c:5个
cpp:2个
稳定的网络蜘蛛,可以并行抓取多个站点,BS架构控制系统,使用httpsqs 和淘宝的tair可以作为分布式爬虫基础,去重能力强,存储系统为mysql,可以用sphinx等做索引,原始版本
资源推荐
资源详情
资源评论
收起资源包目录
webspider-1.0.0.0.tar.gz (16个子文件)
webspider
misc.c 895B
url.c 10KB
Makefile 806B
options.h 7KB
url.h 594B
options.c 8KB
stdhead.h 391B
strfunc.c 3KB
strfunc.h 456B
webspider.cpp 700B
html.h 928B
httpsqs_client.cpp 9KB
httpsqs_client.h 750B
html.c 10KB
misc.h 383B
webspider.h 83B
共 16 条
- 1
资源评论
- yssxyssx2014-09-24谢谢楼主的资源。
- edwardcsdn2013-07-24很有参考性。感谢楼主
成竹在线
- 粉丝: 1
- 资源: 16
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功