import os
import re
from collections import OrderedDict
from typing import Tuple

from lxml import etree

from ibooks_highlight_exporter.base import HeadingPointer
from ibooks_highlight_exporter.helpers import find_first_file_with_extension
def get_epub_content_path(epub_directory_path: str) -> str:
    """Locate the ``.opf`` file of an EPUB directory.

    The lookup is delegated to ``find_first_file_with_extension`` with the
    extension ``"opf"``; how the directory is searched is defined by that
    helper.

    Args:
        epub_directory_path: Directory handed to the helper as search root.

    Returns:
        Whatever ``find_first_file_with_extension`` returns for ``"opf"``
        (annotated as a path string).
    """
    opf_extension = "opf"
    content_path = find_first_file_with_extension(epub_directory_path, opf_extension)
    return content_path
def get_epub_toc_path(epub_directory_path: str) -> str:
    """Locate the ``.ncx`` table-of-contents file of an EPUB directory.

    The lookup is delegated to ``find_first_file_with_extension`` with the
    extension ``"ncx"``; how the directory is searched is defined by that
    helper.

    Args:
        epub_directory_path: Directory handed to the helper as search root.

    Returns:
        Whatever ``find_first_file_with_extension`` returns for ``"ncx"``
        (annotated as a path string).
    """
    ncx_extension = "ncx"
    toc_path = find_first_file_with_extension(epub_directory_path, ncx_extension)
    return toc_path
def get_parts(epub_directory_path: str) -> OrderedDict:
    """Map manifest item ids to hrefs for the XHTML documents of an EPUB.

    The ``.opf`` file found by ``get_epub_content_path`` is parsed and every
    ``package/manifest/item`` element (matched by local name, so the XML
    namespace does not matter) whose ``media-type`` is
    ``application/xhtml+xml`` is recorded.

    Args:
        epub_directory_path: Directory of the unpacked EPUB.

    Returns:
        An ``OrderedDict`` of ``id`` attribute -> ``href`` attribute, in
        manifest order.
    """
    # Parse inside a context manager so the file handle is closed as soon as
    # the document has been read, instead of being left to the garbage
    # collector.
    with open(get_epub_content_path(epub_directory_path), "r") as opf_file:
        opf = etree.parse(opf_file)

    manifest_items = opf.xpath(
        "/*[local-name() = 'package']/*[local-name() = 'manifest']/*[local-name() = 'item']"
    )
    parts = OrderedDict()
    for item in manifest_items:
        if item.get("media-type") == "application/xhtml+xml":
            parts[item.get("id")] = item.get("href")
    return parts
# Group 1: the bracketed id following the "/6/<n>" step.
# Group 3: the first all-digit step found after the "/4" step.
# Compiled once at import time rather than on every call.
_EPUB_CFI_PATTERN = re.compile(r"epubcfi\(/6/\d+\[(.+)\]!/4(.*?)/(\d+)")


def parse_epub_cfi(epub_cfi: str) -> Tuple[str, int]:
    """Extract the part id and the first position step from an EPUB CFI.

    The string is searched for ``epubcfi(/6/<n>[<part_id>]!/4.../<position>``.
    Anything between ``/4`` and the first ``/<digits>`` step (for example an
    id assertion such as ``[body01]``) is skipped.

    Args:
        epub_cfi: CFI string, e.g. ``"epubcfi(/6/14[chapter1]!/4/2/10/1:0)"``.

    Returns:
        ``(part_id, position)`` where ``part_id`` is the bracketed id and
        ``position`` is the first numeric step after ``/4`` as an ``int``.

    Raises:
        ValueError: If ``epub_cfi`` does not contain the expected pattern.
    """
    match = _EPUB_CFI_PATTERN.search(epub_cfi)
    if match is None:
        raise ValueError("Invalid epubcfi!")
    # Group 3 is matched by ``\d+``, so ``int()`` cannot fail on it; no
    # additional error handling is needed around the conversion.
    return match.group(1), int(match.group(3))
def get_heading_pointers(epub_path):
    """Build heading pointers for every entry in an EPUB's NCX navigation map.

    The ``.ncx`` file found by ``get_epub_toc_path`` is parsed, each top-level
    ``ncx/navMap/navPoint`` (matched by local name) is flattened together
    with its nested nav points by ``traverse_nav_point``, and the resulting
    links are resolved by ``find_chapter_positions``.

    Args:
        epub_path: Directory of the unpacked EPUB.

    Returns:
        The value returned by ``find_chapter_positions`` for the collected
        chapter links.
    """
    toc_path = get_epub_toc_path(epub_path)
    # Parse inside a context manager so the NCX file handle is closed
    # deterministically instead of being leaked.
    with open(toc_path, "r") as toc_file:
        toc = etree.parse(toc_file)

    first_level_nav_points = toc.xpath(
        "/*[local-name() = 'ncx']/*[local-name() = 'navMap']/*[local-name() = 'navPoint']"
    )
    chapter_links = []
    for nav_point in first_level_nav_points:
        chapter_links.extend(traverse_nav_point(nav_point))
    return find_chapter_positions(chapter_links, epub_path)
def traverse_nav_point(nav_point, level=0):
    """Flatten a nav point and all of its nested nav points, parent first.

    Args:
        nav_point: Element exposing ``xpath``; it must have a ``content``
            child carrying a ``src`` attribute and a ``navLabel/text`` child.
        level: Nesting depth recorded for ``nav_point``; each nested nav
            point is recorded one level deeper.

    Returns:
        A list of ``(html_path, chapter_name, a_id, level)`` tuples.
        ``html_path`` is the part of ``src`` before the first ``#`` and
        ``a_id`` the part after it, or ``None`` when ``src`` has no ``#``.

    Raises:
        IndexError: If the ``content`` or ``navLabel/text`` child is missing.
    """
    src = nav_point.xpath("./*[local-name() = 'content']")[0].get("src")
    # Split on the first "#" only: unpacking ``src.split("#")`` into two names
    # raised ValueError whenever the fragment itself contained another "#".
    html_path, separator, fragment = src.partition("#")
    a_id = fragment if separator else None

    chapter_name = nav_point.xpath(
        "./*[local-name() = 'navLabel']/*[local-name() = 'text']"
    )[0].text

    chapter_links = [(html_path, chapter_name, a_id, level)]
    for sub_nav_point in nav_point.xpath("./*[local-name() = 'navPoint']"):
        chapter_links.extend(traverse_nav_point(sub_nav_point, level + 1))
    return chapter_links
def find_chapter_positions(chapter_links, epub_path):
    """Resolve chapter links to ``HeadingPointer`` objects with a position.

    For a link with an anchor id, the element carrying that id is located and
    the direct child of ``<body>`` that contains it (or is it) is determined;
    the position is twice that child's zero-based index within ``<body>``.
    Links without an anchor get position ``0``.
    NOTE(review): the factor of two presumably mirrors the even-numbered
    element steps of EPUB CFIs - confirm against the code consuming positions.

    Args:
        chapter_links: Sequences of ``(html_path, chapter_name, a_id, level)``
            as produced by ``traverse_nav_point``.
        epub_path: Directory that the ``html_path`` values are joined onto.

    Returns:
        A list with one ``HeadingPointer(html_path, position, chapter_name,
        level)`` per link, in input order.

    Raises:
        IndexError: If a document has no ``html/body`` element or an anchor
            id is not found in it.
    """
    body_xpath = "/*[local-name() = 'html'] /*[local-name() = 'body']"
    # Several nav points commonly target the same file; parse each file once.
    bodies = {}
    heading_pointers = []
    for chapter_link in chapter_links:
        html_path, chapter_name, a_id, level = chapter_link[:4]

        body = bodies.get(html_path)
        if body is None:
            # Close the handle right after parsing instead of leaking one
            # open file per chapter link.
            with open(os.path.join(epub_path, html_path), "r") as html_file:
                tree = etree.parse(html_file)
            body = tree.xpath(body_xpath)[0]
            bodies[html_path] = body

        if a_id is not None:
            # Pass the id as an XPath variable so ids containing quotes cannot
            # break (or alter) the expression.
            el = body.xpath("//*[@id=$anchor_id]", anchor_id=a_id)[0]
            # Walk upwards until <body>; the last element seen before it is
            # the direct child of <body> that holds the anchor.
            ancestor_before_body = el
            for ancestor in el.iterancestors():
                if ancestor.tag.endswith("body"):
                    break
                ancestor_before_body = ancestor
            position = body.index(ancestor_before_body) * 2
        else:
            position = 0

        heading_pointers.append(
            HeadingPointer(html_path, int(position), chapter_name, level)
        )
    return heading_pointers
PyPI 官网下载 | ibooks_highlight_exporter-0.1.11.tar.gz
版权申诉
69 浏览量
2022-01-28
01:10:20
上传
评论
收藏 6KB GZ 举报
挣扎的蓝藻
- 粉丝: 13w+
- 资源: 15万+
最新资源
- index.jsp
- Screenshot_20240521_090410_com.huawei.android.launcher.jpg
- 单文件制作工具 7.0.2.3851-x86-x64
- Linux命令.xmind
- 基于Transformer实现的跨域Cross-view实时Map-view语义分割算法-附项目源码-优质项目实战.zip
- linux常用命令大全-.zip
- 2024彩虹聚合DNS管理系统源码 管理系统快速开发平台 聚合平台管理.zip
- elasticsearch介绍-.zip
- nodejs安装及环境配置-.zip
- 谷歌浏览器自动化测试版113.0.5672.0(包含linux,windows32/64,mac三个版本,不会自动更新)
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈