写了挺多网站的爬虫,一直想把所有网站整合成一个爬虫程序,不过没这个时间
下面简单展示掘金网站的文章爬取
> 本爬虫会将爬取的内容直接上传到wordpress网站的数据库,所以运行爬虫前请填写好数据库的相关信息
如果不想传到数据库,请自行改动代码,将数据写到本地!
```python
#pytest
#coding=utf-8
#!/usr/bin/python
# IT网站文章爬虫
# 作者:小鸡 flyphp@outlook.com
import pymysql
import time
from bs4 import BeautifulSoup
import requests,sys
import datetime
import re
import json
import urllib
import urllib2
import types
# Database connection: host, user, password, database name.
# NOTE(review): credentials are hard-coded and the connection is opened at
# import time; consider moving to a config file / connect-on-demand.
db = pymysql.connect(host='localhost', user="root", passwd='your passwd', db='www_idealli_com')
# Ad snippet appended to every article body. NOTE(review): get_post() also
# reads a global "ads2" that is never defined in this file -- verify where
# it is supposed to come from.
ads = ""
cur_tag = ''  # name of the tag currently being crawled
today = '2019-02-18 18:09:06'  # fixed fallback timestamp used for posts
urls = []  # queue of article URLs to fetch
tar = []
link_head = "https://juejin.im"  # site base URL used to build absolute links
otar = []
ourls = []
ii = 0  # count of posts successfully inserted this run
it = 0
cur_page_num = 0  # current listing-page index
cur_tag_num = 2  # index of the tag currently being processed
last_post_id = 0  # highest wp_posts.ID written so far; incremented per insert
check_title_time = 1
# url_fo = open("mysql_urls.log", "w")
# car_fo = open("mysql_cars.log", "w")
#################################################
# Article insertion -- database layer
#################################################
def tag_id(tag_name):
    """Look up (or create) the WordPress taxonomy rows for *tag_name*.

    The tag name is deliberately inserted TWICE into ``wp_terms``: the first
    term_id is registered as a ``post_tag`` taxonomy and the second as a
    ``category``, so each crawled tag doubles as a category.

    All queries are parameterized -- tag names come from scraped web pages
    and must be treated as untrusted input.

    Returns the rows of ``term_taxonomy_id`` values (tuple of 1-tuples) for
    both taxonomies, for use by insert_post().
    """
    cursor = db.cursor()
    check_tag = "SELECT `term_id` FROM `wp_terms` WHERE `name` = %s"
    insert_taxonomy = ("INSERT INTO `wp_term_taxonomy` (`term_taxonomy_id`, `term_id`, "
                       "`taxonomy`, `description`, `parent`, `count`) "
                       "VALUES (NULL, %s, %s, '', '0', '1')")
    try:
        cursor.execute(check_tag, (tag_name,))
    except Exception as e:
        print(e)
    first_term = cursor.fetchone()  # renamed: original shadowed the function name
    cursor.fetchall()  # drain any remaining rows
    if first_term is None:
        try:
            print(tag_name)
            insert_tag = ("INSERT INTO `wp_terms` (`term_id`, `name`, `slug`, `term_group`) "
                          "VALUES (NULL, %s, %s, '0')")
            # Insert twice on purpose: first copy -> post_tag, second -> category.
            cursor.execute(insert_tag, (tag_name, tag_name))
            cursor.execute(insert_tag, (tag_name, tag_name))
            cursor.execute(check_tag, (tag_name,))
            tag_data = cursor.fetchall()
            cursor.execute(insert_taxonomy, (str(tag_data[0][0]), 'post_tag'))
            cursor.execute(insert_taxonomy, (str(tag_data[1][0]), 'category'))
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
    # Re-read both term ids (whether pre-existing or just created).
    try:
        cursor.execute(check_tag, (tag_name,))
    except Exception as e:
        print(e)
    tag_data = cursor.fetchall()
    check_taxo = ("SELECT `term_taxonomy_id` FROM `wp_term_taxonomy` "
                  "WHERE `term_id` = %s OR `term_id` = %s")
    taxo_args = (str(tag_data[0][0]), str(tag_data[1][0]))
    try:
        cursor.execute(check_taxo, taxo_args)
    except Exception as e:
        print(e)
    taxo_data = cursor.fetchall()
    if len(taxo_data) == 0:
        # Terms exist but their taxonomy rows are missing -- repair them.
        cursor.execute(insert_taxonomy, (taxo_args[0], 'post_tag'))
        cursor.execute(insert_taxonomy, (taxo_args[1], 'category'))
        db.commit()
        cursor.execute(check_taxo, taxo_args)
        taxo_data = cursor.fetchall()
    return taxo_data
def insert_post(ti, au, con, tag_idd, tim, db):
    """Insert one scraped article into the WordPress database.

    Parameters:
        ti: post title (str).
        au: author display name (str).
        con: HTML content of the post.
        tag_idd: term_taxonomy rows from tag_id() -- at least two rows,
            row 0 = post_tag taxonomy, row 1 = category taxonomy.
        tim: timestamp string used for post_date/post_modified.
        db: open pymysql connection.

    Returns 1 on success, 0 if the title already exists or any step fails.

    All SQL is parameterized: titles, authors and content come from scraped
    pages (untrusted input).  This also removes the original workaround of
    replacing "'" with "`" in the content, so apostrophes are stored intact,
    and the py2-only ``.encode('utf-8')`` calls that crashed str-concat on
    Python 3.
    """
    title = ti
    cursor = db.cursor()
    # Dedup: only the ID range this crawler writes into is checked.
    cursor.execute("SELECT ID FROM wp_posts WHERE ID > 263100 AND post_title = %s", (title,))
    if cursor.fetchone() is not None:
        return 0
    author = au
    global last_post_id
    global check_title_time
    last_post_id = int(last_post_id) + 1
    content = con
    post_time = tim  # renamed: original local `time` shadowed the time module
    taxo_data = tag_idd
    try:
        # Look up the author; create a stub wp_users row when unknown.
        check_author = "SELECT `ID` FROM `wp_users` WHERE `display_name` LIKE %s"
        cursor = db.cursor()
        cursor.execute(check_author, (author,))
        author_id = cursor.fetchone()
        if author_id is None:
            insert_author = ("INSERT INTO `wp_users` (`ID`, `user_login`, `user_pass`, "
                             "`user_nicename`, `user_email`, `user_url`, `user_registered`, "
                             "`user_activation_key`, `user_status`, `display_name`) "
                             "VALUES (NULL, '', '', '', '', '', "
                             "'0000-00-00 00:00:00.000000', '', '0', %s)")
            try:
                cursor.execute(insert_author, (author,))
                db.commit()
            except Exception as e:
                db.rollback()
            cursor.execute(check_author, (author,))
            author_id = cursor.fetchone()
        # Create the new post row with an explicit ID.
        post_id = str(last_post_id)
        insert_sql = ("INSERT INTO `wp_posts` (`ID`, `post_author`, `post_date`, "
                      "`post_date_gmt`, `post_content`, `post_title`, `post_excerpt`, "
                      "`post_status`, `comment_status`, `ping_status`, `post_password`, "
                      "`post_name`, `to_ping`, `pinged`, `post_modified`, "
                      "`post_modified_gmt`, `post_content_filtered`, `post_parent`, "
                      "`guid`, `menu_order`, `post_type`, `post_mime_type`, "
                      "`comment_count`) VALUES (%s, %s, %s, %s, %s, %s, '', 'publish', "
                      "'open', 'open', '', '', '', '', %s, %s, '', '0', '', '0', "
                      "'post', '', '0')")
        try:
            cursor.execute(insert_sql, (post_id, str(author_id[0]), str(post_time),
                                        str(post_time), str(content), str(title),
                                        str(post_time), str(post_time)))
            # Link the post to both taxonomies and bump their usage counts.
            for taxo_id in (str(taxo_data[0][0]), str(taxo_data[1][0])):
                cursor.execute("INSERT INTO `wp_term_relationships` "
                               "(`object_id`, `term_taxonomy_id`, `term_order`) "
                               "VALUES (%s, %s, '0')", (post_id, taxo_id))
                cursor.execute("UPDATE `wp_term_taxonomy` SET `count` = `count`+1 "
                               "WHERE `wp_term_taxonomy`.`term_taxonomy_id` = %s",
                               (taxo_id,))
            db.commit()
            global ii
            tip = "\tsucceed catch post" + str(title) + "\n\t this is the" + str(ii) + "post and tag " + str(cur_tag_num)
            print(tip)
            print("\thttps://flycode.co/archives/" + str(post_id))
            ii = ii + 1
            return 1
        except Exception as e:
            db.rollback()
            print(e)
            return 0
    except Exception as e:
        db.rollback()
        print(e)
        return 0
#################################################
# 链接获取,爬虫部分
#################################################
def get_post(lin, tim, tag_idd):
    """Fetch one juejin article page and store it via insert_post().

    Parameters:
        lin: absolute URL of the article.
        tim: timestamp string forwarded to insert_post().
        tag_idd: taxonomy rows from tag_id().

    Returns 1 on success, 0 when the post already exists or any step fails,
    and None when the page has no article body.
    """
    link = lin
    requests.adapters.DEFAULT_RETRIES = 5
    global ads
    # BUG FIX: the original declared `global ads2`, but ads2 is never defined
    # at module level, so every successful fetch raised NameError.  Fall back
    # to an empty string when it is absent.
    head_ads = globals().get('ads2', '')
    try:
        print('\n\ttry to get post ' + link)
        req = requests.get(url=link, timeout=10)
        bf = BeautifulSoup(req.text, "html.parser")
        req.close()  # original wrote `req.close` (no call): connection leaked
        texts = bf.find('div', class_='article-content')
        if texts is None:
            print(lin)
            return
        author = bf.find_all('meta', itemprop='name')[0]['content']
        title = bf.find_all('h1', class_='article-title')[0].text
        # Source attribution appended after the article body.  The original
        # encoded this to bytes, which crashes str concatenation on Python 3.
        origin = u"<p>来源:<a target=_blank href="
        # Block hotlink referrers, prepend/append ads, rewrite lazy-load srcs.
        texts = ("<meta name=\"referrer\" content=\"never\">" + head_ads
                 + str(texts).replace('data-src=', 'src=') + ads + origin
                 + str(link) + " rel=noopener>" + str(link) + "</a></p>")
        # tag = bf.find('a', class_='tag').get('data-original-title')
    except Exception as e:
        print(e)
        print("\t######################\n\n\terror!\n\n\t#######################")
        time.sleep(3)
        # BUG FIX: the original fell through here with title/author/texts
        # undefined and relied on a bare except below to mask the NameError.
        return 0
    try:
        if insert_post(title, author, texts, tag_idd, tim, db) == 0:
            print('\t post ' + title + ' have exit !!!')
            return 0
        else:
            return 1
    except Exception as e:
        print(e)
        return 0
def get_url(begin):
# global tar
# global
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
软件开发设计:应用软件开发、系统软件开发、移动应用开发、网站开发C++、Java、python、web、C#等语言的项目开发与学习资料 硬件与设备:单片机、EDA、proteus、RTOS、包括计算机硬件、服务器、网络设备、存储设备、移动设备等 操作系统:LInux、树莓派、安卓开发、微机操作系统、网络操作系统、分布式操作系统等。此外,还有嵌入式操作系统、智能操作系统等。 网络与通信:数据传输、信号处理、网络协议、网络与通信硬件、网络安全网络与通信是一个非常广泛的领域,它涉及到计算机科学、电子工程、数学等多个学科的知识。 云计算与大数据:包括云计算平台、大数据分析、人工智能、机器学习等,云计算是一种基于互联网的计算方式,通过这种方式,共享的软硬件资源和信息可以按需提供给计算机和其他设备。
资源推荐
资源详情
资源评论
收起资源包目录
python杂七杂八小项目合集.zip (162个子文件)
mmap_address.bin 8B
mmap_address.bin 8B
mmap_address.bin 8B
mmap_address.bin 8B
test.c 467B
test2.c 283B
error1.c 170B
error2.c 116B
刘金明语法分析器报告.docx 140KB
.gitignore 37B
settings.json 45B
settings.json 44B
settings.json 44B
settings.json 44B
README.md 10KB
README.md 9KB
LICENSE.md 1KB
README.md 1KB
README.md 352B
刘金明语法分析器报告.pdf 234KB
mnist.pkl 52.4MB
params.pkl 3.31MB
deep_convnet_params.pkl 966KB
sample_weight.pkl 178KB
test.png 242KB
lena.png 115KB
lena_gray.png 42KB
cnblog.py 11KB
juejin2.py 11KB
aliyun.py 10KB
juejin3.2.py 9KB
segment.py 9KB
aliyun.1.py 9KB
juejin3.1.py 9KB
layers.py 8KB
multi_layer_net_extend.py 7KB
get_predict_table.py 6KB
lexer.py 6KB
deep_convnet.py 6KB
simple_convnet.py 5KB
multi_layer_net.py 5KB
meizitu.py 4KB
parser.py 4KB
optimizer.py 4KB
generate.py 4KB
mnist.py 3KB
LL.py 3KB
trainer.py 3KB
function.py 3KB
batch_norm_test.py 3KB
hyperparameter_optimization.py 3KB
8_myAtoi.py 3KB
util.py 3KB
two_layer_net.py 2KB
two_layer_net.py 2KB
22_generateParenthesis.py 2KB
csdn.py 2KB
longestPalindrome.py 2KB
get_url.py 2KB
overfit_weight_decay.py 2KB
weight_init_compare.py 2KB
optimizer_compare_mnist.py 2KB
lanzhou.py 2KB
18_fourSum.py 2KB
12_intToRoman.py 2KB
grad_descent.py 2KB
10_isMatch.py 2KB
train_neuralnet.py 2KB
fish_game.py 2KB
13_romanToInt.py 2KB
two_list_sum.py 2KB
misclassified_mnist.py 2KB
apply_filter.py 2KB
gradient_2d.py 2KB
baidu.py 2KB
tianqi.py 2KB
download.py 2KB
16_threeSumClosest.py 1KB
optimizer_compare_naive.py 1KB
19_removeNthFromEnd.py 1KB
overfit_dropout.py 1KB
train_convnet.py 1KB
15_threeSum.py 1KB
train_neuralnet.py 1KB
17_letterCombinations.py 1KB
weight_init_activation_histogram.py 1KB
gradient.py 1KB
gradient.py 1KB
21_mergeTwoLists.py 1KB
csdn_test.py 1KB
11_maxArea.py 1KB
functions.py 1KB
functions.py 1KB
neuralnet_mnist_batch.py 1KB
TwoLayerNet.py 1KB
train_neuralnet2.py 1KB
neuralnet_mnist.py 1KB
two_sum.py 1KB
buy_apple_orange.py 988B
14_longestCommonPrefix.py 986B
共 162 条
- 1
- 2
资源评论
妄北y
- 粉丝: 1w+
- 资源: 1万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功