from typing import List, Literal, Optional, Union
from pandas import DataFrame, to_datetime, to_timedelta
from pydantic import Field
from weaverbird.render_variables import StepWithVariablesMixin
from weaverbird.steps.base import BaseStep
from weaverbird.types import ColumnName
# Date parts accepted by both the legacy ``operation`` field and the
# ``date_info`` list of ``DateExtractStep``.
BASIC_DATE_PARTS = Literal[
    # calendar components
    'year', 'month', 'day', 'week', 'quarter', 'dayOfWeek', 'dayOfYear',
    # ISO calendar components
    'isoYear', 'isoWeek', 'isoDayOfWeek',
    # time-of-day components
    'hour', 'minutes', 'seconds', 'milliseconds',
]

# Everything ``date_info`` may contain: the basic parts above plus values
# derived from the date (period starts and "previous period" values).
DATE_INFO = Union[
    BASIC_DATE_PARTS,
    Literal[
        # start of the period containing the date
        'firstDayOfYear', 'firstDayOfMonth', 'firstDayOfWeek',
        'firstDayOfQuarter', 'firstDayOfIsoWeek',
        # the day before the date
        'previousDay',
        # start of the period preceding the one containing the date
        'firstDayOfPreviousYear', 'firstDayOfPreviousMonth', 'firstDayOfPreviousWeek',
        'firstDayOfPreviousQuarter', 'firstDayOfPreviousIsoWeek',
        # number of the preceding period
        'previousYear', 'previousMonth', 'previousWeek',
        'previousQuarter', 'previousIsoWeek',
    ],
]

# Date parts whose pandas ``.dt`` attribute is spelled differently from the
# part name; parts absent from this mapping are looked up under their own name.
OPERATIONS_MAPPING = dict(
    minutes='minute',
    seconds='second',
    dayOfYear='dayofyear',
)
def _sunday_first_day_of_week(dates):
    """Number the weekday of each date from 1 (Sunday) to 7 (Saturday)."""
    # ``dt.dayofweek`` counts from Monday=0; shifting by 2 modulo 7 puts Sunday
    # at 1 and leaves Saturday at 0, which is then renumbered to 7.
    return ((dates.dt.dayofweek + 2) % 7).replace({0: 7})


def _at_midnight(dates):
    """Return the same dates with their time-of-day information zeroed."""
    return to_datetime(dates.dt.date)


def _start_of_week(dates, iso=False):
    """Return midnight of the first day of the week containing each date.

    Weeks start on Sunday by default, or follow the ISO calendar day
    numbering when ``iso`` is true.
    """
    day_number = dates.dt.isocalendar().day if iso else _sunday_first_day_of_week(dates)
    # going back (day number - 1) days lands on day 1 of the week
    return _at_midnight(dates - to_timedelta(day_number - 1, unit='d'))


def _first_day_of(year, month):
    """Build the datetimes for day 1 of the given year(s) and month(s)."""
    return to_datetime(DataFrame({'year': year, 'month': month, 'day': 1}))


def _extract_date_part(dates, dt_info):
    """Compute one date part for a whole datetime column.

    ``dates`` is the datetime-like pandas Series to read from and ``dt_info``
    one of the ``DATE_INFO`` values. Returns a Series (or Series-like result
    of the ``.dt`` accessor) holding the requested information.
    """
    accessor = dates.dt
    one_week_earlier = dates - to_timedelta(7, unit='d')

    if dt_info == 'week':
        # cast in float and not in int to manage NaN properly
        return accessor.strftime('%U').astype(float)
    if dt_info == 'dayOfWeek':
        return _sunday_first_day_of_week(dates)
    if dt_info == 'isoYear':
        return accessor.isocalendar().year
    if dt_info == 'isoWeek':
        return accessor.isocalendar().week
    if dt_info == 'isoDayOfWeek':
        return accessor.isocalendar().day
    if dt_info == 'firstDayOfYear':
        return _first_day_of(accessor.year, 1)
    if dt_info == 'firstDayOfMonth':
        return _first_day_of(accessor.year, accessor.month)
    if dt_info == 'firstDayOfWeek':
        return _start_of_week(dates)
    if dt_info == 'firstDayOfQuarter':
        return _first_day_of(accessor.year, 3 * ((accessor.month - 1) // 3) + 1)
    if dt_info == 'firstDayOfIsoWeek':
        return _start_of_week(dates, iso=True)
    if dt_info == 'previousDay':
        return _at_midnight(dates - to_timedelta(1, unit='d'))
    if dt_info == 'firstDayOfPreviousYear':
        return _first_day_of(accessor.year - 1, 1)
    if dt_info == 'firstDayOfPreviousMonth':
        # January (1 - 1 == 0) wraps to December of the year before
        prev_month = (accessor.month - 1).replace({0: 12})
        return _first_day_of(accessor.year - (prev_month == 12), prev_month)
    if dt_info == 'firstDayOfPreviousWeek':
        return _start_of_week(one_week_earlier)
    if dt_info == 'firstDayOfPreviousQuarter':
        # first quarter (1 - 3 == -2) wraps to October of the year before
        first_month_of_quarter = 3 * ((accessor.month - 1) // 3) + 1
        first_month_of_prev_q = (first_month_of_quarter - 3).replace({-2: 10})
        return _first_day_of(
            accessor.year - (first_month_of_prev_q == 10), first_month_of_prev_q
        )
    if dt_info == 'firstDayOfPreviousIsoWeek':
        return _start_of_week(one_week_earlier, iso=True)
    if dt_info == 'previousYear':
        return accessor.year - 1
    if dt_info == 'previousMonth':
        return (accessor.month - 1).replace({0: 12})
    if dt_info == 'previousWeek':
        return one_week_earlier.dt.strftime('%U').astype(float)
    if dt_info == 'previousQuarter':
        return (accessor.quarter - 1).replace({0: 4})
    if dt_info == 'previousIsoWeek':
        return one_week_earlier.dt.isocalendar().week
    if dt_info == 'milliseconds':
        # floor division: report the whole-millisecond component rather than
        # a fractional value carrying the sub-millisecond remainder
        return accessor.microsecond // 1000
    # remaining parts map directly onto an attribute of the ``.dt`` accessor
    return getattr(accessor, OPERATIONS_MAPPING.get(dt_info, dt_info))


class DateExtractStep(BaseStep):
    """Step adding columns that hold information extracted from a date column.

    Two syntaxes are supported:

    * ``date_info`` / ``new_columns``: the date parts to extract and the names
      of the columns receiving them, paired by position;
    * ``operation`` / ``new_column_name``: legacy single-part form, kept for
      retrocompatibility.

    Whenever an output name is missing, ``<column>_<date part>`` is used.
    """

    name = Field('dateextract', const=True)
    # name of the datetime column to read from
    column: str
    date_info: List[DATE_INFO] = Field([], alias='dateInfo')
    new_columns: List[ColumnName] = Field([], alias='newColumns')
    # legacy syntax
    operation: Optional[BASIC_DATE_PARTS]
    new_column_name: Optional[ColumnName]

    def execute(self, df: DataFrame, domain_retriever=None, execute_pipeline=None) -> DataFrame:
        """Return ``df`` with one additional column per requested date part.

        Args:
            df: input frame; ``self.column`` must support the ``.dt`` accessor.
            domain_retriever: not used by this step.
            execute_pipeline: not used by this step.

        Returns:
            A frame with the extracted columns assigned, or ``df`` itself when
            nothing is requested.
        """
        date_info: List[DATE_INFO]
        # The legacy form wins when it is complete (as it always did) and is
        # also honoured, with a default column name, when it is the only form
        # provided; previously such a step was silently a no-op.
        if self.operation and (self.new_column_name or not self.date_info):
            date_info = [self.operation]
            new_columns = [self.new_column_name or f'{self.column}_{self.operation}']
        else:
            date_info = list(self.date_info)
            new_columns = list(self.new_columns[: len(date_info)])
            # zip() would silently drop every date part lacking a column name,
            # so give the unnamed ones a default name instead
            new_columns += [f'{self.column}_{info}' for info in date_info[len(new_columns):]]

        if not date_info:
            return df

        # Read the source once so every part derives from the original values,
        # even if one of the new columns reuses the source column's name.
        dates = df[self.column]
        for dt_info, new_col in zip(date_info, new_columns):
            df = df.assign(**{new_col: _extract_date_part(dates, dt_info)})
        return df
class DateExtractStepWithVariable(DateExtractStep, StepWithVariablesMixin):
    """``DateExtractStep`` combined with ``StepWithVariablesMixin``.

    Adds no fields or methods of its own.
    NOTE(review): presumably the mixin lets the step's fields hold variables
    that are rendered before execution; confirm in weaverbird.render_variables.
    """
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
weaverbird-0.3.1.tar.gz (61个子文件)
weaverbird-0.3.1
PKG-INFO 495B
weaverbird
steps.py 0B
utils
size.py 308B
__init__.py 112B
cleaning.py 2KB
stopwatch.py 231B
exceptions.py 203B
types.py 295B
formula.py 1KB
pipeline_executor.py 3KB
__init__.py 0B
conditions.py 3KB
steps
split.py 909B
rename.py 994B
base.py 478B
duplicate.py 436B
todate.py 655B
cumsum.py 1KB
statistics.py 2KB
fromdate.py 582B
evolution.py 2KB
uniquegroups.py 641B
replace.py 1KB
text.py 536B
join.py 1KB
domain.py 375B
rank.py 1KB
select.py 523B
sort.py 821B
argmin.py 1KB
substring.py 1KB
waterfall.py 7KB
date_extract.py 8KB
argmax.py 1KB
pivot.py 991B
delete.py 407B
addmissingdates.py 2KB
fillna.py 926B
lowercase.py 515B
aggregate.py 3KB
uppercase.py 515B
percentage.py 911B
moving_average.py 1KB
formula.py 668B
convert.py 1KB
comparetext.py 965B
totals.py 4KB
unpivot.py 1KB
concatenate.py 915B
__init__.py 2KB
rollup.py 3KB
append.py 980B
combination.py 734B
top.py 1KB
duration.py 1KB
ifthenelse.py 2KB
filter.py 556B
render_variables.py 420B
pipeline.py 4KB
pyproject.toml 734B
setup.py 696B
共 61 条
- 1
资源评论
- jbxhsn2022-06-30用户下载后在一定时间内未进行评价,系统默认好评。
挣扎的蓝藻
- 粉丝: 13w+
- 资源: 15万+
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功