#!/usr/bin/env python
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# Author: "Chris Ward" <cward@redhat.com>
import logging
logger = logging.getLogger(__name__)
from decorator import decorator
from datetime import datetime
from pandas import DataFrame, Series
import pandas.tseries.offsets as off
from pandas.tslib import Timestamp
import pandas as pd
import numpy as np
from calendar import timegm
from IPython.display import HTML
# NumPy scalar types whose columns are parsed as numbers of seconds
# (unit='s') when converted to datetime; see Result.to_datetime.
NUMPY_NUMERICAL = [np.float16, np.float32, np.float64,
                   np.int8, np.int16, np.int32, np.int64]
# np.float128 is only defined on platforms that provide an extended
# precision long double; referencing it unconditionally raises
# AttributeError at import time elsewhere (e.g. on Windows builds).
if hasattr(np, 'float128'):
    NUMPY_NUMERICAL.append(np.float128)
def mask_filter(f):
    '''
    Decorator for methods that compute a True/False mask frame.

    The decorated method `f` returns a mask (index:True/False, where
    True means the item matches the filter). The wrapper, implemented
    by `_mask_filter`, then looks at the LAST positional argument the
    method was called with (by convention named `filter_`):

    :param Boolean filter_ :
        None returns the mask frame itself.
        True returns only the items which matched the mask.
        False returns only the items which did not match.

    :param function f:
        The mask-producing method to wrap.
    '''
    wrapped = decorator(_mask_filter, f)
    return wrapped
def _mask_filter(f, self, *args, **kwargs):
filter_ = args[-1] # by convention, filter_ expected as last arg
mask_frame = f(self, *args, **kwargs)
if filter_ is None:
return mask_frame
else:
return type(self)(self[mask_frame == filter_])
def filtered(f):
    '''
    Decorator for methods that return frame data derived from `self`.

    The value returned by `f` is wrapped in the caller's own type
    (`type(self)`) and the caller's `_lbound` / `_rbound` date bounds
    are copied onto the new object before it is returned.

    :param function f: the method to wrap
    '''
    def _rewrap(f, self, *args, **kwargs):
        rewrapped = type(self)(f(self, *args, **kwargs))
        # carry the queried date bounds over to the derived result
        rewrapped._lbound, rewrapped._rbound = self._lbound, self._rbound
        return rewrapped
    return decorator(_rewrap, f)
def to_timestamp(d):
    '''
    Convert a datetime-like object to whole seconds since the epoch.

    :param datetime d: object providing `utctimetuple()`
    :returns: integer timestamp computed by `calendar.timegm`
    '''
    utc_fields = d.utctimetuple()
    return timegm(utc_fields)
class Result(DataFrame):
''' Custom DataFrame implementation for Metrique '''
def __init__(self, data=None):
super(Result, self).__init__(data)
self._result_data = data
# The converts are here so that None is converted to NaT
self.to_datetime('_start')
self.to_datetime('_end')
self._lbound = self._rbound = None
def to_datetime(self, column):
if column in self:
if self[column].dtype in NUMPY_NUMERICAL:
self[column] = pd.to_datetime(self[column], unit='s')
else:
self[column] = pd.to_datetime(self[column], utc=True)
def set_date_bounds(self, date):
'''
Pass in the date used in the original query.
:param String date:
Date (date range) that was queried:
date -> 'd', '~d', 'd~', 'd~d'
d -> '%Y-%m-%d %H:%M:%S,%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d'
'''
if date is not None:
split = date.split('~')
if len(split) == 1:
self._lbound = Timestamp(date)
self._rbound = Timestamp(date)
elif len(split) == 2:
if split[0] != '':
self._lbound = Timestamp(split[0])
if split[1] != '':
self._rbound = Timestamp(split[1])
else:
raise Exception('Date %s is not in the correct format' % date)
def check_in_bounds(self, date):
''' Check that left and right bounds are sane '''
dt = Timestamp(date)
return ((self._lbound is None or dt >= self._lbound) and
(self._rbound is None or dt <= self._rbound))
def on_date(self, date, only_count=False):
'''
Filters out only the rows that match the spectified date.
Works only on a Result that has _start and _end columns.
:param String date:
date can be anything Pandas.Timestamp supports parsing
'''
if not self.check_in_bounds(date):
raise ValueError('Date %s is not in the queried range.' % date)
date = Timestamp(date)
after_start = self._start <= date
before_end = (self._end > date) | self._end.isnull()
if only_count:
return np.sum(before_end & after_start)
else:
return self.filter(before_end & after_start)
def _auto_select_scale(self, dts, start=None, end=None, ideal=300):
'''
Guess what a good timeseries scale might be,
given a particular data set, attempting to
make the total number of x values as close to
`ideal` as possible
This is a helper for plotting
'''
start = min(dts) if start is None else start
end = max(dts) if end is None else end
maximum_count = len(filter(lambda dt: start <= dt and dt <= end, dts))
daily_count = (end - start).days
if maximum_count <= ideal:
return 'maximum'
elif daily_count <= ideal:
return 'daily'
elif daily_count / 7 <= ideal:
return 'weekly'
elif daily_count / 30 <= ideal:
return 'monthly'
elif daily_count / 91 <= ideal:
return 'quarterly'
else:
return 'yearly'
def history(self, dates=None, counts=True,
predict_since=None, lin_reg_days=20):
'''
Works only on a Result that has _start and _end columns.
most_recent=False should be set for this to work
:param: List dates:
List of dates
:param Boolean counts:
If True counts will be returned
If False ids will be returned
:param datetime predict_since:
If not None, the values on the dates after this will be estimated
using linear regression.
If not None, the parameter counts must be set to True.
:param integer lin_reg_days:
Specifies how many past days should be used in the linear
regression.
'''
if dates is None:
dates = self.get_dates_range()
idx, vals = [], []
for dt in dates:
idx.append(dt)
if counts:
vals.append(self.on_date(dt, only_count=True))
else:
vals.append(list(self.on_date(dt)._oid))
ret = Series(vals, index=idx)
if predict_since is not None:
if not counts:
raise ValueError('counts must be True if predict_future_since'
'is not None.')
ret = self.predict_future(ret, predict_since, lin_reg_days)
return ret.sort_index()
def predict_future(self, series, since, days=20):
    '''
    Predicts future using linear regression.

    A straight line is fitted (least squares) to the history() values
    of the `days` dates ending at `since`; that line is then evaluated
    for every index entry of `series` later than `since`. Negative
    predictions are replaced by 0.

    :param pandas.Series series:
        A series in which the values will be placed (modified in
        place and also returned).
        The index will not be touched.
        Only the values on dates > `since` will be predicted.
    :param datetime since:
        The starting date from which the future will be predicted.
    :param integer days:
        Specifies how many past days should be used in the linear
        regression.
    '''
    window = pd.date_range(end=since, periods=days)
    past = self.history(window)
    seconds = np.array([to_timestamp(day) for day in past.index])
    # rows are [timestamps, ones]; transposed below so that the fit
    # solves value = coeffs[0] * timestamp + coeffs[1]
    design = np.array([seconds, np.ones(len(past))])
    observed = past.values
    coeffs = np.linalg.lstsq(design.T, observed)[0]
    future = series.index[series.index > since]
    for day in future:
        series[day] = coeffs[0] * to_timestamp(day) + coeffs[1]
        # clamp negative predictions to zero
        series[day] = 0 if series[day] < 0 else series[day]
    return series
def get_dates_range(self, scale='auto', start=None, end=None):
'''
Returns a list of dates sampled according to the specified parameters.
:param String scale: {'auto', 'maximum', 'daily', 'weekly', 'monthly',
'quarterly', 'yearly'}
Scale specifies the sampling intervals.
'auto' will heuritically choose such scale that will give you
fast results.
:param String start:
First date that will be included.