#!/usr/bin/env python
# -*- coding: utf-8 -*-
from random import shuffle
import os, sys, re, multiprocessing, argparse, sqlite3, atexit
from time import time, sleep
from mako.lookup import TemplateLookup
from bottle import route, run, static_file, default_app, request
from Daemo import Daemon, DaemonError
from TechParser import get_conf, recommend, parser
from TechParser.py2x import unicode_, range, pickle, urlencode
if get_conf.config is None:
    get_conf.set_config_auto()
    get_conf.auto_fix_config()
running_as_daemon = False
module_path = os.path.dirname(os.path.realpath(__file__))
template_dir_path = os.path.join(module_path, "templates")
static_dir_path = os.path.join(module_path, "static")
logdir = os.path.expanduser("~")
logdir = os.path.join(logdir, ".tech-parser")
if not os.path.exists(logdir):
    os.mkdir(logdir)

if not os.path.exists(os.path.join(logdir, "__init__.py")):
    open(os.path.join(logdir, "__init__.py"), "w").close()

if not os.path.exists(os.path.join(logdir, "user_parser_config.py")):
    f = open(os.path.join(logdir, "user_parser_config.py"), "w")
    default_config = open(os.path.join(module_path, "parser_config.py"))
    f.write(default_config.read())
    default_config.close()
    f.close()
mylookup = TemplateLookup(directories=template_dir_path,
                          default_filters=["decode.utf8"],
                          input_encoding="utf-8", output_encoding="utf-8")
liked = recommend.get_interesting_articles(db=get_conf.config.db)
disliked = recommend.get_blacklist(db=get_conf.config.db)
liked_links = [i['link'] for i in liked]
disliked_links = [i['link'] for i in disliked]
def encoded_dict(in_dict):
    """Return a copy of in_dict with all text values encoded as UTF-8."""
    out_dict = {}
    for k, v in in_dict.items():
        if isinstance(v, unicode_):
            v = v.encode('utf8')
        elif isinstance(v, str):
            # Already a byte string: decode() is called purely to verify
            # that it is valid UTF-8 (raises UnicodeDecodeError if not).
            v.decode('utf8')
        out_dict[k] = v
    return out_dict
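# A minimal usage sketch (assuming the Python 2 semantics provided by
# TechParser.py2x): urlencode() rejects non-ASCII unicode values, so dicts
# are passed through encoded_dict() first, e.g.
#     >>> urlencode(encoded_dict({'q': u'caf\xe9'}))
#     'q=caf%C3%A9'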
def setup_db():
    """Set up the archive database"""
    con = sqlite3.connect(os.path.join(logdir, "archive.db"))
    cur = con.cursor()
    cur.execute("""CREATE TABLE IF NOT EXISTS articles
        (id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT, link TEXT, source TEXT, UNIQUE(link));""")
    con.commit()
    con.close()
def log(text, f=sys.stdout, add_newline=True, clear_str=False,
        ignore_daemon=False):
    if add_newline:
        text += "\n"
    # When running as a daemon (or on request), strip ANSI color codes.
    if (clear_str or running_as_daemon) and not ignore_daemon:
        text = re.sub(r"\033\[(\d+|\d+;\d+)m", "", text)
    f.write(text)
    f.flush()
def logerr(*args, **kwargs):
    kwargs['f'] = sys.stderr
    log(*args, **kwargs)
def split_into_pages(articles, n=30):
    """Split a list of articles into pages of at most n items"""
    pages = []
    i = 0
    for j in articles:
        if i >= n:
            i = 0
        if i == 0:
            pages.append([j])
        else:
            pages[-1].append(j)
        i += 1
    return pages
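# Example: seven items with n=3 yield three pages:
#     >>> split_into_pages(list(range(7)), n=3)
#     [[0, 1, 2], [3, 4, 5], [6]]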
def simple_plural(n, s):
    """Naive pluralization: add "s" unless n ends in 1 (but not in 11)"""
    n = str(n)
    if n.endswith("1") and not n.endswith("11"):
        return s
    else:
        return s + "s"
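# Examples of the "ends in 1, but not 11" rule:
#     simple_plural(1, 'article')  -> 'article'
#     simple_plural(11, 'article') -> 'articles'
#     simple_plural(21, 'article') -> 'article'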
def show_progress(s, shared_object):
    """Prefix s with the current progress percentage (in green)"""
    progress = round(shared_object.value, 2)
    return "\033[0;32m[{0}%]\033[0m ".format(progress) + s
def parse_site(queue, articles, progress):
    """Worker: pull site names off the queue and parse each one"""
    config = get_conf.config
    while not queue.empty():
        site = queue.get()
        s = "Got {0} {1} from {2}"
        if site not in config.sites_to_parse:
            d = config.rss_feeds[site]
        else:
            d = config.sites_to_parse[site]
        # Each site accounts for an equal share of 100%, credited in two
        # halves: one before parsing starts and one after it finishes.
        update_progress(progress,
            100.0 / (len(config.sites_to_parse) + len(config.rss_feeds)) / 2.0)
        log(show_progress("Parsing articles from {0}".format(site), progress))
        if 'module' not in d:
            # Plain RSS feed described by a dict of options
            url = d.get('url', 'about:blank')
            short_name = d.get('short-name', 'unknown')
            icon = d.get('icon', 'about:blank')
            color = d.get('color', '#000')
            if not len(color):
                color = '#000'
            if not len(icon):
                icon = 'about:blank'
            try:
                new_articles = parser.parse_rss(url, short_name, icon, color)
                for i in new_articles:
                    articles.put(i)
                update_progress(progress,
                    100.0 / (len(config.sites_to_parse) + len(config.rss_feeds)) / 2.0)
                log(show_progress(s.format(len(new_articles),
                    simple_plural(len(new_articles), 'article'), site), progress))
            except Exception as error:
                logerr("Failed to parse articles from {0}".format(site))
                logerr(str(error))
        else:
            # Custom parser module called with keyword arguments
            module = d["module"]
            kwargs = d["kwargs"]
            try:
                found = module.get_articles(**kwargs)
                for i in found:
                    articles.put(i)
                update_progress(progress,
                    100.0 / (len(config.sites_to_parse) + len(config.rss_feeds)) / 2.0)
                log(show_progress(s.format(len(found),
                    simple_plural(len(found), 'article'), site), progress))
            except Exception as error:
                logerr("Failed to parse articles from {0}".format(site))
                logerr(str(error))
def update_progress(shared_object, num):
    shared_object.value += num
def dump_articles(filename="articles_dumped"):
    """Dump articles to ~/.tech-parser/<filename>"""
    m = multiprocessing.Manager()
    articles = m.Queue()
    progress = m.Value('d', 0.0, lock=False)
    main_queue = m.Queue()
    for i in get_conf.config.sites_to_parse:
        main_queue.put(i)
    for i in get_conf.config.rss_feeds:
        main_queue.put(i)
    pool = multiprocessing.Pool(processes=get_conf.config.num_threads)
    for i in range(get_conf.config.num_threads):
        pool.apply_async(parse_site, (main_queue, articles, progress))
    pool.close()
    pool.join()
    articles_before = [i[0] for i in load_articles()]
    list_articles = []
    while not articles.empty():
        list_articles.append(articles.get())
    log("Total articles: %d" % (len(list_articles)))
    log("New articles: %d"
        % (len([i for i in list_articles if i not in articles_before])))
    if get_conf.config.save_articles:
        log("Saving articles to archive...")
        setup_db()
        con = sqlite3.connect(os.path.join(logdir, "archive.db"))
        cur = con.cursor()
        for article in list_articles:
            title = article["title"]
            link = article["link"]
            source = article["source"]
            try:
                cur.execute("""INSERT INTO articles(title, link, source)
                    values(?, ?, ?);""", (title, link, source))
            except sqlite3.IntegrityError:
                # Duplicate link (UNIQUE constraint): already archived
                pass
        con.commit()
        con.close()
    # Rank by similarity to liked/disliked articles once there is enough
    # feedback; otherwise shuffle and mark every article with score -1.
    num = len(recommend.get_interesting_articles(db=get_conf.config.db))
    num += len(recommend.get_blacklist(db=get_conf.config.db))
    if num >= 20:
        log("Ranking articles...")
        list_articles = recommend.find_similiar(list_articles, db=get_conf.config.db)
        list_articles.sort(key=lambda x: x[1], reverse=True)
    else:
        log("Shuffling articles...")
        shuffle(list_articles)
        list_articles = [[a, -1] for a in list_articles]
    log("Dumping data to file: {0}...".format(filename))
    dumped = pickle.dumps(list_articles)
    path = os.path.join(logdir, filename)
    f = open(path, "wb")
    f.write(dumped)
    f.close()
    log("Done!")
def dump_articles_per(s):
    """Dump articles every <s> seconds"""
    while True:
        if int(time()) % s == 0:
            dump_articles()
        sleep(1)
def filter_articles(articles):
    """Filter articles according to config.filters"""
    config = get_conf.config
    articles_filtered = []
    for article in articles:
        passing = True
        words_len = len(config.filters["All"]["or"])
        title = article[0]["title"].lower()
        # "or" filter: at least one of the words must appear in the title
        for word in config.filters["All"]["or"]:
            if word.lower() in title:
                passing = True
                break
            else:
                words_len -= 1
                if words_len == 0:
                    passing = False
        # "not" filter: none of the words may appear in the title
        if passing:
            for word in config.filters["All"]["not"]:
                if word.lower() in title:
                    passing = False
                    break
        # "has" filter: all of the words must appear in the title
        if passing:
            for word in config.filters["All"]["has"]:
                if word.lower() not in title:
                    passing = False
                    break
        if passing:
            articles_filtered.append(article)
    return articles_filtered
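# Illustrative filter shape (assumed example values; the real defaults live
# in parser_config.py):
#     filters = {'All': {'or': ['python', 'linux'],  # keep if any match
#                        'not': ['sponsored'],       # drop if any match
#                        'has': []}}                 # keep only if all match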
def load_articles(filename="articles_dumped"):
    """Load articles from ~/.tech-parser/<filename>"""
    log("Reading articles from file: {0}...".format(filename))
    try:
        f = open(os.path.join(logdir, filename), 'rb')
    except IOError:
        log("File '{0}' doesn't exist: returning empty list".format(filename))
        return []
    dumped = f.read()
    f.close()
    articles = pickle.loads(dumped)
    log("Done!")
    return articles
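# Shape of the dumped data (see dump_articles): a pickled list of
# [article, score] pairs, where each article is a dict with at least
# 'title', 'link' and 'source' keys, and score is -1 when the articles
# were shuffled rather than ranked. Illustrative values:
#     [[{'title': '...', 'link': '...', 'source': '...'}, -1], ...]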
@route('/static/<filename:path>')
def serve_static(filename):
    """Serve static files from the static directory"""
    return static_file(filename, root=static_dir_path)