import calendar
import glob
import os
import random
import re
import shutil
import tempfile
import time
import arrow
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
NoSuchElementException,
StaleElementReferenceException,
)
from utility_bill_scraper import convert_divs_to_df, format_fields, process_pdf
def get_name():
    """Return the display name of this utility provider."""
    provider_name = "Kitchener Utilities"
    return provider_name
def get_summary(soup):
    """Extract the 'Your Account Summary' section from a parsed bill.

    Returns a dict mapping summary field names to their values, plus the
    'Water Charges' and 'Gas Charges' totals read from the charges
    sections of the page.
    """

    def _is_summary_header(tag):
        return tag.name == "span" and tag.decode().find("Your Account Summary") >= 0

    def _is_seq_id(tag):
        return tag.name == "div" and tag.decode().find("SEQ-ID") >= 0

    field_names = format_fields(soup.find_all(_is_summary_header)[0].contents)
    field_values = format_fields(
        soup.find_all(_is_seq_id)[0].next_sibling.contents[0].contents
    )
    summary = dict(zip(field_names[1:], field_values))

    def _charge_total(label):
        def _has_label(tag):
            return tag.name == "div" and tag.decode().find(label) >= 0

        anchor = soup.find(_has_label)
        # Extract the top pixel coordinate of the label div.
        top = re.search(r"top:(?P<top>\d+)px", anchor.decode()).group("top")
        # The second div sharing that coordinate holds the amount.
        same_row = soup.find_all(style=re.compile("top:%spx" % top))
        return format_fields(same_row[1].span.contents)[0]

    summary["Water Charges"] = _charge_total("Water charges")
    summary["Gas Charges"] = _charge_total("Gas charges")
    return summary
def get_water_consumption(soup):
    """Return a list of water consumption dicts parsed from the bill.

    Water sections are the 'Total Consumption' divs whose first child
    formats to exactly 3 fields (gas sections carry an extra 'Billing
    Conversion Multiplier' field). More than one consumption section may
    appear on a bill.
    """

    def _is_total_consumption(tag):
        return tag.name == "div" and tag.decode().find("Total Consumption") >= 0

    water_anchors = [
        t
        for t in soup.find_all(_is_total_consumption)
        if len(format_fields(t.contents[0])) == 3
    ]

    results = []
    for anchor in water_anchors:
        # All rows of this section share the anchor's top pixel coordinate.
        top = re.search(r"top:(?P<top>\d+)px", anchor.decode()).group("top")

        def _same_top(tag, top=top):  # bind top per-iteration
            return tag.name == "div" and tag.decode().find("top:%spx" % top) >= 0

        rows = [format_fields(t.contents[0]) for t in soup.find_all(_same_top)]
        results.append(dict(zip(rows[0], rows[2])))
    return results
def get_water_and_sewer_charges(soup):
    """Parse the water/sewer charges table of the bill.

    Returns a dict containing the billing 'Time period' plus, for each
    charge type found (e.g. 'Water', 'Sewer'), a nested dict with its
    'Consumption', 'Rate' and 'Charges' values.
    """

    def _is_water_consumption(tag):
        return (
            (tag.name == "div")
            and (tag.decode().find("Consumption") >= 0)
            and (tag.decode().find("Total Consumption") == -1)
        )

    def _sibling(tag, n):
        # Walk n next_sibling links forward from tag.
        for _ in range(n):
            tag = tag.next_sibling
        return tag

    anchor = soup.find_all(_is_water_consumption)[0]
    # Sibling offsets match the fixed layout of the charges table.
    header = format_fields(_sibling(anchor, 1).contents[0])
    consumption = format_fields(_sibling(anchor, 2).contents[0])
    rates = format_fields(_sibling(anchor, 4).contents[0])
    charges = format_fields(_sibling(anchor, 5).contents[0])

    result = {"Time period": header[0]}
    for i, charge_type in enumerate(header[1:]):
        result[charge_type] = {
            "Consumption": consumption[i],
            "Rate": rates[i],
            "Charges": charges[i],
        }
    return result
def get_water_charges(soup):
    """Return {'Water': ..., 'Sewer': ...} charge amounts for the bill."""
    details = get_water_and_sewer_charges(soup)
    return {kind: details[kind]["Charges"] for kind in ["Water", "Sewer"]}
def get_water_rates(soup):
    """Return {'Water': ..., 'Sewer': ...} per-unit rates for the bill."""
    details = get_water_and_sewer_charges(soup)
    return {kind: details[kind]["Rate"] for kind in ["Water", "Sewer"]}
def get_gas_consumption(soup):
    """Return a list of gas consumption dicts parsed from the bill.

    Gas sections are the 'Total Consumption' divs whose first child
    formats to more than 3 fields (the extra field is the 'Billing
    Conversion Multiplier'). More than one consumption section may
    appear on a bill.
    """

    def _is_total_consumption(tag):
        return tag.name == "div" and tag.decode().find("Total Consumption") >= 0

    gas_anchors = [
        t
        for t in soup.find_all(_is_total_consumption)
        if len(format_fields(t.contents[0])) > 3
    ]

    results = []
    for anchor in gas_anchors:
        # All rows of this section share the anchor's top pixel coordinate.
        top = re.search(r"top:(?P<top>\d+)px", anchor.decode()).group("top")

        def _same_top(tag, top=top):  # bind top per-iteration
            return tag.name == "div" and tag.decode().find("top:%spx" % top) >= 0

        rows = [format_fields(t.contents[0]) for t in soup.find_all(_same_top)]
        results.append(dict(zip(rows[0], rows[2])))
    return results
def get_gas_charges(soup):
    """Parse the gas charges section of the bill.

    Locates the vertical band between the 'GAS' heading and the
    'Gas charges' total line, converts the divs inside that band to a
    DataFrame, and pairs the last row of charge amounts with its charge
    descriptions.

    Returns a dict mapping charge description -> charge amount, or an
    empty dict when the section cannot be parsed.
    """
    try:
        # Regex extracting the bounding-box coordinates of a positioned div.
        pos_re = r"left:(?P<left>\d+)px.*top:(?P<top>\d+)px.*width:(?P<width>\d+)px.*height:(?P<height>\d+)"

        def find_gas_section(tag):
            return tag.name == "div" and tag.decode().find("GAS") >= 0

        tag = soup.find(find_gas_section)
        top_bound = int(re.search(pos_re, tag.decode()).groupdict()["top"])

        def find_gas_charges(tag):
            return tag.name == "div" and tag.decode().find("Gas charges") >= 0

        tag = soup.find(find_gas_charges)
        bottom_bound = int(re.search(pos_re, tag.decode()).groupdict()["top"])

        # Find all of the div tags within this bounding box.
        def find_divs_within_bounds(tag):
            match = re.search(pos_re, tag.decode())
            if match:
                top = int(match.groupdict()["top"])
                return top >= top_bound and top < bottom_bound and tag.name == "div"
            return False

        df = convert_divs_to_df(soup.find_all(find_divs_within_bounds))
        df["fields_str"] = [str(x) for x in df["fields"]]
        df = df.sort_values(["top", "left"])
        # Charges can be grouped in different sections (e.g., if the gas rate
        # changes in the middle of the month). We only care about the last
        # section, because it contains the Fixed Delivery Charge.
        # BUGFIX: the old comparison against "[u'Charges']" only matched the
        # Python 2 unicode repr; under Python 3 str(['Charges']) has no
        # u-prefix, so accept both forms.
        header_rows = df[df["fields_str"].isin(["[u'Charges']", "['Charges']"])]
        charges = df[df["left"] > header_rows["left"].iloc[0]].iloc[-1]["fields"]
        charge_desc = df[df["fields_str"].str.find(" days") >= 0].iloc[-1]["fields"][1:]
        return dict(zip(charge_desc[-len(charges):], charges))
    # IndexError added: an empty .iloc selection raises IndexError, which the
    # old AttributeError-only handler let escape despite the best-effort
    # contract of returning {}.
    except (AttributeError, IndexError) as error:
        print("Error scraping gas charges: %s" % error)
        return {}
def get_gas_rates(soup):
    """Return a dict of gas rates keyed by '<field name> Rate'.

    Fields whose names contain 'HST' or 'Fixed' are excluded (they carry
    no per-unit rate on the bill).
    """

    def find_gas_rates(tag):
        return tag.name == "div" and tag.decode().find("Gas Fixed Delivery Charge") >= 0

    gas_div = soup.find_all(find_gas_rates)[0]
    # First formatted field is a section header; the rest are charge names.
    gas_fields = format_fields(gas_div.contents[0])[1:]
    gas_rates = format_fields(
        gas_div.next_sibling.next_sibling.next_sibling.contents[0]
    )
    # NOTE: a leftover format_fields() call on the fourth sibling whose
    # result was discarded has been removed (dead code).
    rate_names = [
        name + " Rate"
        for name in gas_fields
        if (name.find("HST") == -1) and name.find("Fixed") == -1
    ]
    return dict(zip(rate_names, gas_rates))
def convert_data_to_df(data):
cols = list(data[0]["summary"].keys())
if "Pre-authorized Withdrawal" in cols:
cols.remove("Pre-authorized Withdrawal")
cols.append("Total Due")
for x in data:
if "Pre-authorized Withdrawal" in x["summary"].keys():
x["summary"]["Total Due"] = x["summary"]["Pre-authorized Withdrawal"]
data_sets = []
for col in cols:
data_sets.append([x["summary"][col] for x in data])
df = pd.DataFrame(data=dict(zip(cols, data_sets)))
df["Issue Date"] = [
PyPI 官网下载 | utility-bill-scraper-0.2.4.tar.gz
版权申诉
94 浏览量
2022-01-17
06:23:14
上传
评论
收藏 19KB GZ 举报
挣扎的蓝藻
- 粉丝: 13w+
- 资源: 15万+
最新资源
- 下载安装这个软件.apk
- 【数据集详细解释及案例分析】数据集详细解释及案例分析
- 基于SHT71温湿度传感器、STM32F103C8T6、LCD1602温湿度采集显示系统proteus仿真设计
- 基于TH02温湿度传感器、STM32F103C8T6、LCD1602、FREERTOS的温湿度采集系统proteus仿真设计
- 【TCP-IP协议详细解释及案例分析】TCP-IP协议详细解释及案例分析
- 一文搞懂 LSTM(长短期记忆网络).rar
- 【autosar简介及基本案例解析】autosar简介及基本案例解析
- java模拟斗地主洗牌发牌
- springboot+vue登录系统 vue部分
- 常用常见 SQL语句语法
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈