import calendar
import glob
import os
import random
import re
import shutil
import tempfile
import time
import arrow
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
NoSuchElementException,
StaleElementReferenceException,
)
from utility_bill_scraper import convert_divs_to_df, format_fields, process_pdf
def get_name():
    """Return the display name of this utility provider."""
    provider_name = "Kitchener Utilities"
    return provider_name
def get_summary(soup):
    """Extract the 'Your Account Summary' section from a parsed bill.

    Returns a dict mapping summary field names to their values, plus the
    'Water Charges' and 'Gas Charges' totals read from the charges
    sections of the page.
    """

    def _is_summary_header(tag):
        return tag.name == "span" and tag.decode().find("Your Account Summary") >= 0

    def _is_seq_id(tag):
        return tag.name == "div" and tag.decode().find("SEQ-ID") >= 0

    field_names = format_fields(soup.find_all(_is_summary_header)[0].contents)
    field_values = format_fields(
        soup.find_all(_is_seq_id)[0].next_sibling.contents[0].contents
    )
    summary = dict(zip(field_names[1:], field_values))

    def _charge_total(label):
        def _has_label(tag):
            return tag.name == "div" and tag.decode().find(label) >= 0

        anchor = soup.find(_has_label)
        # Extract the top pixel coordinate of the label div.
        top = re.search(r"top:(?P<top>\d+)px", anchor.decode()).group("top")
        # The second div sharing that coordinate holds the amount.
        same_row = soup.find_all(style=re.compile("top:%spx" % top))
        return format_fields(same_row[1].span.contents)[0]

    summary["Water Charges"] = _charge_total("Water charges")
    summary["Gas Charges"] = _charge_total("Gas charges")
    return summary
def get_water_consumption(soup):
    """Return a list of water consumption dicts parsed from the bill.

    Water sections are the 'Total Consumption' divs whose first child
    formats to exactly 3 fields (gas sections carry an extra 'Billing
    Conversion Multiplier' field). More than one consumption section may
    appear on a bill.
    """

    def _is_total_consumption(tag):
        return tag.name == "div" and tag.decode().find("Total Consumption") >= 0

    water_anchors = [
        t
        for t in soup.find_all(_is_total_consumption)
        if len(format_fields(t.contents[0])) == 3
    ]

    results = []
    for anchor in water_anchors:
        # All rows of this section share the anchor's top pixel coordinate.
        top = re.search(r"top:(?P<top>\d+)px", anchor.decode()).group("top")

        def _same_top(tag, top=top):  # bind top per-iteration
            return tag.name == "div" and tag.decode().find("top:%spx" % top) >= 0

        rows = [format_fields(t.contents[0]) for t in soup.find_all(_same_top)]
        results.append(dict(zip(rows[0], rows[2])))
    return results
def get_water_and_sewer_charges(soup):
    """Parse the water/sewer charges table of the bill.

    Returns a dict containing the billing 'Time period' plus, for each
    charge type found (e.g. 'Water', 'Sewer'), a nested dict with its
    'Consumption', 'Rate' and 'Charges' values.
    """

    def _is_water_consumption(tag):
        return (
            (tag.name == "div")
            and (tag.decode().find("Consumption") >= 0)
            and (tag.decode().find("Total Consumption") == -1)
        )

    def _sibling(tag, n):
        # Walk n next_sibling links forward from tag.
        for _ in range(n):
            tag = tag.next_sibling
        return tag

    anchor = soup.find_all(_is_water_consumption)[0]
    # Sibling offsets match the fixed layout of the charges table.
    header = format_fields(_sibling(anchor, 1).contents[0])
    consumption = format_fields(_sibling(anchor, 2).contents[0])
    rates = format_fields(_sibling(anchor, 4).contents[0])
    charges = format_fields(_sibling(anchor, 5).contents[0])

    result = {"Time period": header[0]}
    for i, charge_type in enumerate(header[1:]):
        result[charge_type] = {
            "Consumption": consumption[i],
            "Rate": rates[i],
            "Charges": charges[i],
        }
    return result
def get_water_charges(soup):
    """Return {'Water': ..., 'Sewer': ...} charge amounts for the bill."""
    details = get_water_and_sewer_charges(soup)
    return {kind: details[kind]["Charges"] for kind in ["Water", "Sewer"]}
def get_water_rates(soup):
    """Return {'Water': ..., 'Sewer': ...} per-unit rates for the bill."""
    details = get_water_and_sewer_charges(soup)
    return {kind: details[kind]["Rate"] for kind in ["Water", "Sewer"]}
def get_gas_consumption(soup):
    """Return a list of gas consumption dicts parsed from the bill.

    Gas sections are the 'Total Consumption' divs whose first child
    formats to more than 3 fields (the extra field is the 'Billing
    Conversion Multiplier'). More than one consumption section may
    appear on a bill.
    """

    def _is_total_consumption(tag):
        return tag.name == "div" and tag.decode().find("Total Consumption") >= 0

    gas_anchors = [
        t
        for t in soup.find_all(_is_total_consumption)
        if len(format_fields(t.contents[0])) > 3
    ]

    results = []
    for anchor in gas_anchors:
        # All rows of this section share the anchor's top pixel coordinate.
        top = re.search(r"top:(?P<top>\d+)px", anchor.decode()).group("top")

        def _same_top(tag, top=top):  # bind top per-iteration
            return tag.name == "div" and tag.decode().find("top:%spx" % top) >= 0

        rows = [format_fields(t.contents[0]) for t in soup.find_all(_same_top)]
        results.append(dict(zip(rows[0], rows[2])))
    return results
def get_gas_charges(soup):
    """Parse the gas charges section of the bill.

    Locates the vertical band between the 'GAS' heading and the
    'Gas charges' total line, converts the divs inside that band to a
    DataFrame, and pairs the last row of charge amounts with its charge
    descriptions.

    Returns a dict mapping charge description -> charge amount, or an
    empty dict when the section cannot be parsed.
    """
    try:
        # Regex extracting the bounding-box coordinates of a positioned div.
        pos_re = r"left:(?P<left>\d+)px.*top:(?P<top>\d+)px.*width:(?P<width>\d+)px.*height:(?P<height>\d+)"

        def find_gas_section(tag):
            return tag.name == "div" and tag.decode().find("GAS") >= 0

        tag = soup.find(find_gas_section)
        top_bound = int(re.search(pos_re, tag.decode()).groupdict()["top"])

        def find_gas_charges(tag):
            return tag.name == "div" and tag.decode().find("Gas charges") >= 0

        tag = soup.find(find_gas_charges)
        bottom_bound = int(re.search(pos_re, tag.decode()).groupdict()["top"])

        # Find all of the div tags within this bounding box.
        def find_divs_within_bounds(tag):
            match = re.search(pos_re, tag.decode())
            if match:
                top = int(match.groupdict()["top"])
                return top >= top_bound and top < bottom_bound and tag.name == "div"
            return False

        df = convert_divs_to_df(soup.find_all(find_divs_within_bounds))
        df["fields_str"] = [str(x) for x in df["fields"]]
        df = df.sort_values(["top", "left"])
        # Charges can be grouped in different sections (e.g., if the gas rate
        # changes in the middle of the month). We only care about the last
        # section, because it contains the Fixed Delivery Charge.
        # BUGFIX: the old comparison against "[u'Charges']" only matched the
        # Python 2 unicode repr; under Python 3 str(['Charges']) has no
        # u-prefix, so accept both forms.
        header_rows = df[df["fields_str"].isin(["[u'Charges']", "['Charges']"])]
        charges = df[df["left"] > header_rows["left"].iloc[0]].iloc[-1]["fields"]
        charge_desc = df[df["fields_str"].str.find(" days") >= 0].iloc[-1]["fields"][1:]
        return dict(zip(charge_desc[-len(charges):], charges))
    # IndexError added: an empty .iloc selection raises IndexError, which the
    # old AttributeError-only handler let escape despite the best-effort
    # contract of returning {}.
    except (AttributeError, IndexError) as error:
        print("Error scraping gas charges: %s" % error)
        return {}
def get_gas_rates(soup):
    """Return a dict of gas rates keyed by '<field name> Rate'.

    Fields whose names contain 'HST' or 'Fixed' are excluded (they carry
    no per-unit rate on the bill).
    """

    def find_gas_rates(tag):
        return tag.name == "div" and tag.decode().find("Gas Fixed Delivery Charge") >= 0

    gas_div = soup.find_all(find_gas_rates)[0]
    # First formatted field is a section header; the rest are charge names.
    gas_fields = format_fields(gas_div.contents[0])[1:]
    gas_rates = format_fields(
        gas_div.next_sibling.next_sibling.next_sibling.contents[0]
    )
    # NOTE: a leftover format_fields() call on the fourth sibling whose
    # result was discarded has been removed (dead code).
    rate_names = [
        name + " Rate"
        for name in gas_fields
        if (name.find("HST") == -1) and name.find("Fixed") == -1
    ]
    return dict(zip(rate_names, gas_rates))
def convert_data_to_df(data):
cols = list(data[0]["summary"].keys())
if "Pre-authorized Withdrawal" in cols:
cols.remove("Pre-authorized Withdrawal")
cols.append("Total Due")
for x in data:
if "Pre-authorized Withdrawal" in x["summary"].keys():
x["summary"]["Total Due"] = x["summary"]["Pre-authorized Withdrawal"]
data_sets = []
for col in cols:
data_sets.append([x["summary"][col] for x in data])
df = pd.DataFrame(data=dict(zip(cols, data_sets)))
df["Issue Date"] = [
PyPI 官网下载 | utility-bill-scraper-0.2.4.tar.gz
版权申诉
94 浏览量
2022-01-17
06:23:14
上传
评论
收藏 19KB GZ 举报
挣扎的蓝藻
- 粉丝: 13w+
- 资源: 15万+
最新资源
- 下载安装这个软件.apk
- 【数据集详细解释及案例分析】数据集详细解释及案例分析
- 基于SHT71温湿度传感器、STM32F103C8T6、LCD1602温湿度采集显示系统proteus仿真设计
- 基于TH02温湿度传感器、STM32F103C8T6、LCD1602、FREERTOS的温湿度采集系统proteus仿真设计
- 【TCP-IP协议详细解释及案例分析】TCP-IP协议详细解释及案例分析
- 一文搞懂 LSTM(长短期记忆网络).rar
- 【autosar简介及基本案例解析】autosar简介及基本案例解析
- java模拟斗地主洗牌发牌
- springboot+vue登录系统 vue部分
- 常用常见 SQL语句语法
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈