#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* Node of the doubly linked list that holds the URLs collected from a page. */
typedef struct _img_url_lnk {
char *url; /* heap-allocated URL string; released by img_url_free() */
struct _img_url_lnk *pre; /* previous node (towards the list head), or NULL */
struct _img_url_lnk *next; /* next node (towards the list tail), or NULL */
} img_url_lnk;
/*
 * Table of short strings, each stored in a fixed 32-byte slot.
 * NOTE(review): these look like file extensions used to filter downloads;
 * the consumer is not visible in this part of the file - confirm.
 */
static const char g_filter[][32] = {
"html",
"htm"
};
/* Head of the global URL list; NULL while the list is empty. */
static img_url_lnk *g_first_img_url = NULL;
/* Tail of the global URL list; img_url_push() appends here and img_url_pop() removes from here. */
static img_url_lnk *g_last_img_url = NULL;
/*
 * Allocate a new, unlinked list node.
 *
 * Returns a node whose url, pre and next members are all NULL, or NULL when
 * the allocation fails.
 */
static img_url_lnk* img_url_malloc(void)
{
    img_url_lnk *ret = malloc(sizeof *ret);

    if (!ret)
        return NULL;

    ret->url = NULL;
    ret->pre = NULL;
    ret->next = NULL;

    /* The original fell off the end of the function here, so callers read
     * an undefined return value. */
    return ret;
}
/*
 * Release the URL string owned by a node and clear its links.
 *
 * A NULL node is ignored. Calling this twice on the same node is safe
 * because the url pointer is reset after it is freed.
 *
 * NOTE(review): the node itself is not freed here; presumably the caller
 * releases it - confirm against the call sites.
 */
static void img_url_free(img_url_lnk *img)
{
    if (!img)
        return;

    free(img->url);   /* free(NULL) is a no-op, so no guard is needed */
    img->url = NULL;  /* avoid leaving a dangling pointer behind */
    img->pre = NULL;
    img->next = NULL;
}
/*
 * Append a node to the tail of the global URL list.
 *
 * Returns 0 on success and -1 when img is NULL.
 */
static int img_url_push(img_url_lnk *img)
{
    if (img == NULL)
        return -1;

    if (g_first_img_url != NULL) {
        /* Non-empty list: hook the node behind the current tail. */
        img_url_lnk *old_tail = g_last_img_url;

        img->pre = old_tail;
        old_tail->next = img;
    } else {
        /* Empty list: the node becomes the head as well as the tail. */
        g_first_img_url = img;
    }

    g_last_img_url = img;
    return 0;
}
static img_url_lnk* img_url_pop(void)
{
img_url_lnk *tmp = g_last_img_url;
if (!tmp)
return tmp;
if (g_last_img_url == g_first_img_url) {
g_first_img_url = g_last_img_url = NULL;
} else {
g_last_img_url = g_last_img_url->pre;
}
return tmp;
}
/*
 * Copy url into html_name, replacing every '/' with '_'.
 *
 * url        source string; may be NULL.
 * html_name  destination buffer; must have room for strlen(url) + 1 bytes.
 *
 * The result is always NUL-terminated and is logged to stdout.
 * Returns the number of characters written (excluding the terminator), or 0
 * when either argument is NULL.
 */
static int update_url_name(const char *url, char *html_name)
{
    size_t len;
    size_t k;

    if (!url || !html_name)
        return 0;

    len = strlen(url);
    for (k = 0; k < len; k++)
        html_name[k] = (url[k] == '/') ? '_' : url[k];

    /* The original relied on the caller having zeroed the buffer; terminate
     * explicitly so the "%s" below never reads past the copied text. */
    html_name[len] = '\0';

    printf("[%s] name = %s\r\n", __func__, html_name);
    return (int)len;
}
/*
 * Derive a short file name from the tail of url and store it in html_name.
 *
 * The URL is scanned backwards from its last character. The name starts:
 *   - right after the last '/' or ' ' that is followed by at least one
 *     character, or
 *   - right after a three-character "%xx" escape that is followed by at
 *     least one character, or
 *   - 16 characters before the end when none of the above is met first.
 * If no rule fires the whole URL is used. Any '/' left in the name is
 * replaced with '_'.
 *
 * url        source string; may be NULL.
 * html_name  destination buffer; must have room for the name plus a NUL.
 *
 * The result is always NUL-terminated and is logged to stdout.
 * Returns the length of the generated name, or 0 when either argument is
 * NULL.
 */
static int gen_html_name(const char *url, char *html_name)
{
    const char *find = NULL;
    int total_len;
    int name_len;
    int i;
    int j;

    if (!url || !html_name)
        return 0;

    total_len = (int)strlen(url);
    name_len = total_len;

    for (i = total_len - 1; i >= 0; i--) {
        if (url[i] == '/' || url[i] == ' ') {
            find = url + i + 1;
            name_len = (int)strlen(find);
            if (name_len > 0)
                break;
        } else if (url[i] == '%' && i + 3 <= total_len) {
            /* Only skip a complete "%xx" escape. A '%' within the last two
             * characters would otherwise move find past the terminator,
             * which the original then passed to strlen(). */
            find = url + i + 3;
            name_len = (int)strlen(find);
            if (name_len > 0)
                break;
        } else if (name_len - i > 16) {
            /* name_len is 0 here once an empty tail has been seen, which
             * disables this cap exactly as in the original. */
            find = url + i + 1;
            name_len = (int)strlen(find);
            break;
        }
    }

    if (!find)
        find = url;

    for (j = 0; j < name_len; j++)
        html_name[j] = (find[j] == '/') ? '_' : find[j];

    /* Terminate explicitly instead of relying on a pre-zeroed buffer. */
    html_name[name_len] = '\0';

    printf("[%s] name = %s\r\n", __func__, html_name);
    return name_len;
}
/*
 * Scan buf for href="..." and src="..." attributes (attribute name matched
 * case-insensitively) and append each quoted value to the global URL list.
 *
 * buf  NUL-terminated text to scan; may be NULL. It is not modified.
 *
 * Values that do not fit the internal 512-byte buffer are skipped rather
 * than overflowing it. Scanning stops early if a node or its string cannot
 * be allocated.
 *
 * NOTE(review): the function returns -1 on every path, including after a
 * successful scan, exactly as the original did. That looks unintended, but
 * it is kept so existing callers see the same value - confirm and fix
 * together with the call sites.
 */
static int get_html_inner_urls_ex(char *buf)
{
    int ret = -1;
    int buf_len = 0;
    int push_flag = 0;      /* 0: idle, 1: collecting a value, 2: value done */
    char url[512] = { '\0' };
    img_url_lnk *tmp = NULL;
    int i = 0;
    size_t j = 0;

    if (!buf)
        return ret;

    buf_len = (int)strlen(buf);
    for (i = 0; i < buf_len; i++) {
        if (i < buf_len - 6
            && (buf[i] == 'h' || buf[i] == 'H')
            && (buf[i + 1] == 'r' || buf[i + 1] == 'R')
            && (buf[i + 2] == 'e' || buf[i + 2] == 'E')
            && (buf[i + 3] == 'f' || buf[i + 3] == 'F')
            && buf[i + 4] == '='
            && buf[i + 5] == '\"') {
            /* Skip over href=" ; the loop increment lands on the value. */
            push_flag = 1;
            i += 5;
            j = 0;
            memset(url, '\0', sizeof url);
            continue;
        }
        if (i < buf_len - 5
            && (buf[i] == 's' || buf[i] == 'S')
            && (buf[i + 1] == 'r' || buf[i + 1] == 'R')
            && (buf[i + 2] == 'c' || buf[i + 2] == 'C')
            && buf[i + 3] == '='
            && buf[i + 4] == '\"') {
            /* Skip over src=" ; the loop increment lands on the value. */
            push_flag = 1;
            i += 4;
            j = 0;
            memset(url, '\0', sizeof url);
            continue;
        }

        if (push_flag != 1)
            continue;

        if (buf[i] == '\"') {
            /* Closing quote: store the collected value in the list. */
            push_flag = 2;
            tmp = img_url_malloc();
            if (!tmp)
                return ret;
            tmp->url = strdup(url);
            if (!tmp->url) {
                free(tmp);
                return ret;
            }
            img_url_push(tmp);
            continue;
        }

        if (j >= sizeof url - 1) {
            /* Value longer than the buffer: drop it instead of writing past
             * the end of url (the original had no bound here). */
            push_flag = 0;
            continue;
        }
        url[j++] = buf[i];
    }
    return ret;
}
/*
 * Copy the url_len characters starting at url_start into a new list node and
 * append it to the global URL list.
 *
 * Returns the stored string, or NULL if an allocation fails.
 */
static const char *inner_url_record(const char *url_start, size_t url_len)
{
    img_url_lnk *node = img_url_malloc();

    if (!node)
        return NULL;

    node->url = malloc(url_len + 1);
    if (!node->url) {
        free(node);
        return NULL;
    }
    memcpy(node->url, url_start, url_len);
    node->url[url_len] = '\0';

    img_url_push(node);
    return node->url;
}

/*
 * Overwrite the directory part of a quoted URL with spaces, in place.
 *
 * Walking backwards from the closing quote, everything from the last '/'
 * down to url_start is blanked, which leaves only the final path component
 * in the buffer.
 */
static void inner_url_strip_path(char *url_start, const char *closing_quote)
{
    size_t remaining = (size_t)(closing_quote - url_start) + 1;
    int seen_slash = 0;

    while (remaining-- > 0) {
        if (url_start[remaining] == '/')
            seen_slash = 1;
        if (seen_slash)
            url_start[remaining] = ' ';
    }
}

/*
 * Collect the lazy-src="..." or src="..." value for every occurrence of tag
 * in buf, recording each value and blanking its directory part.
 *
 * lazy-src is used only when it appears before the first '>' after the tag;
 * otherwise the next src=" after the tag is used (this search is not limited
 * to the current tag, as in the original).
 *
 * When trace_pixel is non-zero, every recorded URL is printed from the first
 * one containing "pixel.png" onwards.
 *
 * Returns 0 on success, -1 if an allocation fails.
 */
static int inner_url_scan_tag(char *buf, const char *tag, int trace_pixel)
{
    static const char src_attr[] = "src=\"";
    static const char lazy_attr[] = "lazy-src=\"";
    int tracing = 0;
    char *cursor = strstr(buf, tag);

    while (cursor) {
        char *lazy;
        char *tag_end;
        char *url_start;
        char *closing;
        const char *url;

        cursor += strlen(tag);
        lazy = strstr(cursor, lazy_attr);
        tag_end = strchr(cursor, '>');

        if (lazy && tag_end && lazy <= tag_end) {
            url_start = lazy + strlen(lazy_attr);
        } else {
            url_start = strstr(cursor, src_attr);
            if (!url_start) {
                /* No src=" remains anywhere after this point, so no later
                 * tag can yield a URL either. */
                break;
            }
            url_start += strlen(src_attr);
        }

        closing = strchr(url_start, '"');
        if (!closing) {
            /* Unterminated value: nothing after it can be parsed. The
             * original kept advancing past the end of the buffer here. */
            break;
        }

        url = inner_url_record(url_start, (size_t)(closing - url_start));
        if (!url)
            return -1;

        if (trace_pixel) {
            if (strstr(url, "pixel.png"))
                tracing = 1;
            if (tracing)
                printf("url = %s\r\n", url);
        }

        inner_url_strip_path(url_start, closing);
        cursor = strstr(cursor, tag);
    }
    return 0;
}

/*
 * Collect the URLs referenced by an HTML document and rewrite them in place.
 *
 * Three passes are made over buf:
 *   1. every href="..." value,
 *   2. the lazy-src / src value of every "<img " tag,
 *   3. the lazy-src / src value of every "<script " tag.
 * Each value is appended to the global URL list, and its directory part in
 * buf is then overwritten with spaces so that only the last path component
 * remains.
 *
 * buf  NUL-terminated, writable HTML text; may be NULL.
 *
 * Returns 0 on success (also for a NULL buf) and -1 if an allocation fails.
 */
static int get_html_inner_urls(char *buf)
{
    static const char href_tag[] = "href=\"";
    char *pos;

    if (!buf)
        return 0;

    pos = strstr(buf, href_tag);
    while (pos) {
        char *closing;

        pos += strlen(href_tag);
        closing = strchr(pos, '"');
        if (!closing) {
            /* Unterminated value: without another quote there can be no
             * further href=" either. The original looped forever here. */
            break;
        }

        if (!inner_url_record(pos, (size_t)(closing - pos)))
            return -1;

        inner_url_strip_path(pos, closing);
        pos = strstr(pos, href_tag);
    }

    if (inner_url_scan_tag(buf, "<img ", 1) < 0)
        return -1;
    if (inner_url_scan_tag(buf, "<script ", 0) < 0)
        return -1;

    return 0;
}
/*
 * Create a directory by running "mkdir <dir>" through system().
 *
 * dir  directory name; may be NULL.
 *
 * Returns -1 when dir is NULL or when the command does not fit the 512-byte
 * command buffer; otherwise returns whatever system() returns.
 *
 * NOTE(review): dir is pasted into a shell command unquoted, so a name
 * containing spaces or shell metacharacters is interpreted by the shell.
 * If dir can come from untrusted input (e.g. a downloaded URL), replace
 * this with a direct mkdir() call.
 */
static int create_dir(const char *dir)
{
    char buf[512] = { '\0' };
    int written;

    if (!dir)
        return -1;

    /* Bounded formatting: the original sprintf() overflowed buf for long
     * directory names. */
    written = snprintf(buf, sizeof buf, "mkdir %s", dir);
    if (written < 0 || (size_t)written >= sizeof buf)
        return -1;

    printf("buf = %s\r\n", buf);
    return system(buf);
}
static int download_filter(const char *_url)
{
int ret = -1;
char *find = NULL;
char *url = NULL;
int len = 0;
int tmp_len = 0;
if (!_url)
return 0;
if (_url[strlen(_url)-1] == '/')
return 0;
url = strdup(_url);
len = strlen(url);
int i = -1;
while(url[i++])
url[i] = tolower(url[i]);