/*
* libsimspider - Web Spider Engine Library
* author : calvin
* email : calvinwilliams.c@gmail.com
*
* Licensed under the LGPL v2.1, see the file LICENSE in base directory.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include "libsimspider.h"
#include "libsimspider-queue.h"
#include "LOGC.h"
#include "libsimspider.h"
#include <curl/curl.h>
/*
 * Library version string.  The array name carries the version number
 * itself; __SIMSPIDER_VERSION is a pointer to the first character of
 * that array.
 */
char	__SIMSPIDER_VERSION_2_6_6[] = "2.6.6" ;
char	*__SIMSPIDER_VERSION = & __SIMSPIDER_VERSION_2_6_6[0] ;
/*
 * Engine-wide state of one spider instance: libcurl handles, integer and
 * string settings, user callback pointers and the request/done queue hooks.
 * AllocSimSpiderEnv() returns instances zero-filled.
 */
struct SimSpiderEnv
{
/*
 * Build note (fjs, 2016-01-21): CURLM and CURLSH are libcurl types.
 * Install the libcurl development package (for example
 * "sudo apt-get install libcurl4-openssl-dev"), include <curl/curl.h>
 * and add "-lcurl" when linking.
 */
/* libcurl multi handle */
CURLM *curls ;
/* libcurl share handle */
CURLSH *share_curls ;
/* NOTE(review): presumably the number of transfers still in flight and the
 * number already finished; their use is outside this chunk - confirm */
int still_running ;
int finished_count ;
/* Fixed-size character buffers; the "+ 1" presumably reserves room for a
 * terminating NUL.  NOTE(review): by name these hold file-extension
 * allow-lists - confirm against the code that fills them */
char valid_file_extname_set[ SIMSPIDER_VALID_FILE_EXTNAME_SET + 1 ] ;
char valid_html_file_extname_set[ SIMSPIDER_VALID_HTML_FILE_EXTNAME_SET + 1 ] ;
int allow_empty_file_extname ;
/* NOTE(review): by name a certificate path/file name - confirm */
char cert_pathfilename[ SIMSPIDER_MAXLEN_FILENAME + 1 ] ;
/* Integer settings; their exact semantics are defined by code outside
 * this chunk */
int allow_runoutof_website ;
int max_recursive_depth ;
int request_delay ;
int concurrent_count_automode ;
int max_concurrent_count ;
int adjust_concurrent_count ;
int max_retry_count ;
/* String settings; ownership of the pointed-to memory is not visible here */
char *accept_encoding ;
char *transfer_encoding ;
int html_linker_parser_enable ;
/* User callback pointers.  NOTE(review): by name they are invoked at the
 * begin / request / response / finish stages of a task - confirm */
funcBeginTaskProc *pfuncBeginTaskProc ;
funcRequestHeaderProc *pfuncRequestHeaderProc ;
funcRequestBodyProc *pfuncRequestBodyProc ;
funcResponseHeaderProc *pfuncResponseHeaderProc ;
funcResponseBodyProc *pfuncResponseBodyProc ;
funcFinishTaskProc *pfuncFinishTaskProc ;
/* Opaque request-queue handle and the functions operating on it;
 * ResizeRequestQueue() forwards to pfuncResizeRequestQueueProc */
void *request_queue_handler ;
funcResetRequestQueueProc *pfuncResetRequestQueueProc ;
funcResizeRequestQueueProc *pfuncResizeRequestQueueProc ;
funcPushRequestQueueUnitProc *pfuncPushRequestQueueUnitProc ;
funcPopupRequestQueueUnitProc *pfuncPopupRequestQueueUnitProc ;
/* Opaque done-queue handle and the functions operating on it;
 * ResizeDoneQueue() forwards to pfuncResizeDoneQueueProc */
void *done_queue_handler ;
funcResetDoneQueueProc *pfuncResetDoneQueueProc ;
funcResizeDoneQueueProc *pfuncResizeDoneQueueProc ;
funcQueryDoneQueueUnitProc *pfuncQueryDoneQueueUnitProc ;
funcAddDoneQueueUnitProc *pfuncAddDoneQueueUnitProc ;
funcUpdateDoneQueueUnitProc *pfuncUpdateDoneQueueUnitProc ;
/* Opaque pointer; NOTE(review): presumably caller-supplied data shared by
 * every task of this environment - confirm */
void *public_data ;
} ;
/*
 * One crawl task / done-queue entry.  Created by AllocDoneQueueUnit(),
 * emptied by CleanDoneQueueUnit() and destroyed by FreeDoneQueueUnit().
 */
struct DoneQueueUnit
{
/* Heap strings: duplicated with strdup() in AllocDoneQueueUnit() and
 * released with free() in CleanDoneQueueUnit() */
char *referer_url ;
char *url ;
/* Set from the caller's argument in AllocDoneQueueUnit() */
int recursive_depth ;
/* Both start at 0 in AllocDoneQueueUnit() */
int retry_count ;
int status ;
/* NOTE(review): not assigned anywhere in the visible code, although
 * AllocDoneQueueUnit() receives a penv argument - confirm where it is set */
struct SimSpiderEnv *penv ;
/* libcurl easy handle, exposed through GetDoneQueueUnitCurl() */
CURL *curl ;
/* NOTE(review): CleanDoneQueueUnit() does not release this pointer;
 * ownership is not visible here - confirm */
char *post_url ;
/* curl_slist lists kept until CleanDoneQueueUnit() releases them with
 * curl_slist_free_all() */
struct curl_slist *free_curlheadlist_later ;
struct curl_slist *free_curllist1_later ;
struct curl_slist *free_curllist2_later ;
struct curl_slist *free_curllist3_later ;
/* Buffers grown by ReallocHeaderBuffer() / ReallocBodyBuffer(); their
 * base pointers are freed by CleanDoneQueueUnit() */
struct SimSpiderBuf header ;
struct SimSpiderBuf body ;
/* Opaque pointer; NOTE(review): presumably per-task caller data - confirm */
void *private_data ;
} ;
/*
 * Release the heap resources held by a done-queue unit and reset the
 * corresponding fields, leaving the unit structure itself allocated.
 *
 * Released: referer_url, url, header/body buffer storage and the four
 * deferred curl_slist lists.  The curl, post_url, penv and private_data
 * members are not touched here.
 */
static void CleanDoneQueueUnit( struct DoneQueueUnit *pdqu )
{
	/* free(NULL) is a no-op, so the strings need no guard */
	free( pdqu->referer_url );
	pdqu->referer_url = NULL ;

	free( pdqu->url );
	pdqu->url = NULL ;

	/* buffer bookkeeping is only reset when storage was actually attached */
	if( pdqu->header.base != NULL )
	{
		free( pdqu->header.base );
		pdqu->header.base = NULL ;
		pdqu->header.bufsize = 0 ;
		pdqu->header.len = 0 ;
	}

	if( pdqu->body.base != NULL )
	{
		free( pdqu->body.base );
		pdqu->body.base = NULL ;
		pdqu->body.bufsize = 0 ;
		pdqu->body.len = 0 ;
	}

	/* curl lists whose release was postponed until the unit is cleaned */
	if( pdqu->free_curlheadlist_later != NULL )
	{
		curl_slist_free_all( pdqu->free_curlheadlist_later );
		pdqu->free_curlheadlist_later = NULL ;
	}

	if( pdqu->free_curllist1_later != NULL )
	{
		curl_slist_free_all( pdqu->free_curllist1_later );
		pdqu->free_curllist1_later = NULL ;
	}

	if( pdqu->free_curllist2_later != NULL )
	{
		curl_slist_free_all( pdqu->free_curllist2_later );
		pdqu->free_curllist2_later = NULL ;
	}

	if( pdqu->free_curllist3_later != NULL )
	{
		curl_slist_free_all( pdqu->free_curllist3_later );
		pdqu->free_curllist3_later = NULL ;
	}
}
/*
 * Destroy a done-queue unit: release what it owns through
 * CleanDoneQueueUnit() and then free the unit itself.
 * Takes a void pointer and does nothing when pv is NULL.
 */
void FreeDoneQueueUnit( void *pv )
{
	struct DoneQueueUnit	*unit = (struct DoneQueueUnit *)pv ;

	if( unit == NULL )
		return;

	CleanDoneQueueUnit( unit );
	free( unit );
}
/*
 * Allocate a zero-filled done-queue unit and fill in its URLs and depth.
 *
 * referer_url and url are each duplicated when non-NULL, so the caller
 * keeps ownership of its own strings.  status and retry_count start at 0.
 * Returns the new unit, or NULL when any allocation fails (nothing is
 * leaked in that case).
 *
 * NOTE(review): penv is accepted but not stored in the unit here.
 */
struct DoneQueueUnit *AllocDoneQueueUnit( struct SimSpiderEnv *penv , char *referer_url , char *url , int recursive_depth )
{
	/* calloc leaves every member zero / NULL, including status and retry_count */
	struct DoneQueueUnit	*unit = (struct DoneQueueUnit *)calloc( 1 , sizeof(struct DoneQueueUnit) ) ;

	if( unit == NULL )
		return NULL;

	if( referer_url != NULL )
	{
		unit->referer_url = strdup( referer_url ) ;
		if( unit->referer_url == NULL )
			goto alloc_failed;
	}

	if( url != NULL )
	{
		unit->url = strdup( url ) ;
		if( unit->url == NULL )
			goto alloc_failed;
	}

	unit->recursive_depth = recursive_depth ;

	return unit;

alloc_failed :
	/* releases whichever strings were already duplicated, then the unit */
	FreeDoneQueueUnit( unit );
	return NULL;
}
/*
 * Return the unit's referer URL pointer (may be NULL).  The string stays
 * owned by the unit.
 */
char *GetDoneQueueUnitRefererUrl( struct DoneQueueUnit *pdqu )
{
	char	*referer = pdqu->referer_url ;

	return referer;
}
/*
 * Replace the unit's referer URL with a private copy of referer_url.
 *
 * The previously stored string (allocated by AllocDoneQueueUnit() or an
 * earlier call) is freed, so repeated calls no longer leak.  Passing
 * NULL clears the stored URL.
 *
 * Returns 0 on success, or SIMSPIDER_ERROR_ALLOC when the copy cannot be
 * allocated; in that case the previously stored URL is left unchanged.
 */
int SetDoneQueueUnitRefererUrl( struct DoneQueueUnit *pdqu , char *referer_url )
{
	char	*copy = NULL ;

	if( referer_url != NULL )
	{
		/* duplicate before freeing: safe even if referer_url is the stored pointer */
		copy = strdup( referer_url ) ;
		if( copy == NULL )
			return SIMSPIDER_ERROR_ALLOC;
	}

	free( pdqu->referer_url );
	pdqu->referer_url = copy ;

	return 0;
}
/*
 * Return the unit's URL pointer (may be NULL).  The string stays owned
 * by the unit.
 */
char *GetDoneQueueUnitUrl( struct DoneQueueUnit *pdqu )
{
	char	*target = pdqu->url ;

	return target;
}
/*
 * Replace the unit's URL with a private copy of url.
 *
 * The previously stored string (allocated by AllocDoneQueueUnit() or an
 * earlier call) is freed, so repeated calls no longer leak.  Passing
 * NULL clears the stored URL.
 *
 * Returns 0 on success, or SIMSPIDER_ERROR_ALLOC when the copy cannot be
 * allocated; in that case the previously stored URL is left unchanged.
 */
int SetDoneQueueUnitUrl( struct DoneQueueUnit *pdqu , char *url )
{
	char	*copy = NULL ;

	if( url != NULL )
	{
		/* duplicate before freeing: safe even if url is the stored pointer */
		copy = strdup( url ) ;
		if( copy == NULL )
			return SIMSPIDER_ERROR_ALLOC;
	}

	free( pdqu->url );
	pdqu->url = copy ;

	return 0;
}
/* Return the recursive depth recorded in the unit. */
int GetDoneQueueUnitRecursiveDepth( struct DoneQueueUnit *pdqu )
{
	int	depth = pdqu->recursive_depth ;

	return depth;
}
/* Store recursive_depth in the unit, overwriting the previous value. */
void SetDoneQueueUnitRecursiveDepth( struct DoneQueueUnit *pdqu , int recursive_depth )
{
	(*pdqu).recursive_depth = recursive_depth ;
}
/* Return the retry counter recorded in the unit. */
int GetDoneQueueUnitRetryCount( struct DoneQueueUnit *pdqu )
{
	int	retries = pdqu->retry_count ;

	return retries;
}
/* Store retry_count in the unit, overwriting the previous value. */
void SetDoneQueueUnitRetryCount( struct DoneQueueUnit *pdqu , int retry_count )
{
	(*pdqu).retry_count = retry_count ;
}
/* Return the status value recorded in the unit. */
int GetDoneQueueUnitStatus( struct DoneQueueUnit *pdqu )
{
	int	current = pdqu->status ;

	return current;
}
/* Store status in the unit, overwriting the previous value. */
void SetDoneQueueUnitStatus( struct DoneQueueUnit *pdqu , int status )
{
	(*pdqu).status = status ;
}
/*
 * Return the libcurl easy handle stored in the unit (whatever value the
 * curl member currently holds, possibly NULL).
 */
CURL *GetDoneQueueUnitCurl( struct DoneQueueUnit *pdqu )
{
	CURL	*handle = pdqu->curl ;

	return handle;
}
/*
 * Return the address of the header buffer embedded in the unit.  The
 * pointer refers into *pdqu and is valid only as long as the unit is.
 */
struct SimSpiderBuf *GetDoneQueueUnitHeaderBuffer( struct DoneQueueUnit *pdqu )
{
	struct SimSpiderBuf	*header_buf = &pdqu->header ;

	return header_buf;
}
/*
 * Return the address of the body buffer embedded in the unit.  The
 * pointer refers into *pdqu and is valid only as long as the unit is.
 */
struct SimSpiderBuf *GetDoneQueueUnitBodyBuffer( struct DoneQueueUnit *pdqu )
{
	struct SimSpiderBuf	*body_buf = &pdqu->body ;

	return body_buf;
}
/*
 * Ask the environment's request-queue implementation to resize itself by
 * calling penv->pfuncResizeRequestQueueProc( penv , new_size ).
 *
 * Returns 0 when the callback returns 0; otherwise logs the callback's
 * return value together with errno and returns -1.
 */
int ResizeRequestQueue( struct SimSpiderEnv *penv , long new_size )
{
	int	rc = penv->pfuncResizeRequestQueueProc( penv , new_size ) ;

	if( rc == 0 )
		return 0;

	ErrorLog( __FILE__ , __LINE__ , "pfuncResizeRequestQueueProc failed[%d] errno[%d]" , rc , errno );
	return -1;
}
/*
 * Ask the environment's done-queue implementation to resize itself by
 * calling penv->pfuncResizeDoneQueueProc( penv , new_size ).
 *
 * Returns 0 when the callback returns 0; otherwise logs the callback's
 * return value together with errno and returns -1.
 */
int ResizeDoneQueue( struct SimSpiderEnv *penv , long new_size )
{
	int	rc = penv->pfuncResizeDoneQueueProc( penv , new_size ) ;

	if( rc == 0 )
		return 0;

	ErrorLog( __FILE__ , __LINE__ , "pfuncResizeDoneQueueProc failed[%d] errno[%d]" , rc , errno );
	return -1;
}
/*
 * Allocate a spider environment with every byte set to zero.
 * Returns NULL when the allocation fails.
 */
static struct SimSpiderEnv *AllocSimSpiderEnv()
{
	/* calloc returns zero-filled storage, or NULL when out of memory */
	return (struct SimSpiderEnv *)calloc( 1 , sizeof(struct SimSpiderEnv) );
}
int ReallocHeaderBuffer( struct DoneQueueUnit *pdqu , long new_bufsize )
{
char *new_base = NULL ;
if( new_bufsize <= pdqu->header.bufsize )
return 0;
new_base = (char*)realloc( pdqu->header.base , new_bufsize ) ;
if( new_base == NULL )
return SIMSPIDER_ERROR_ALLOC;
memset( new_base + pdqu->header.len , 0x00 , new_bufsize - pdqu->header.len );
pdqu->header.base = new_base ;
pdqu->header.bufsize = new_bufsize ;
return 0;
}
int ReallocBodyBuffer( struct DoneQueueUnit *pdqu , long new_bufsize )
{
char *new_base = NULL ;
if( new_bufsize <= pdqu->body.bufsize )
return 0;
new_base = (char*)realloc( pdqu->body.base , new_bufsize ) ;
if( new_base == NULL )
return SIMSPIDER_ERROR_ALLOC;
memset( new_base + pdqu->body.len , 0x00 , new_bufsize - pdqu->body.len );
pdqu->body.base = new_base ;
pdqu->body.bufsize = new_bufsize ;
return 0;
}
int CleanSimSpiderBuffer( struct DoneQueueUnit *pdqu )
{
memset( pdqu->header.base , 0x00