Source code for pyiso.base

import os
import ssl
import warnings
import zipfile
from collections import namedtuple
from datetime import datetime, timedelta
from io import StringIO, BytesIO
from time import sleep

import certifi
import pandas as pd
import pytz
import requests
from dateutil.parser import parse as dateutil_parse
from pytz import AmbiguousTimeError

from pyiso import LOGGER

try:
    from urllib2 import urlopen
except ImportError:
    from urllib.request import urlopen  # Changed from urllib2 for python3.x

# Labels for the time-period intervals a client can report.
IntervalChoices = namedtuple('IntervalChoices', ['hourly', 'fivemin', 'tenmin', 'fifteenmin', 'na', 'dam'])

# Recognized fuel names for generation mix data points.
FUEL_CHOICES = ['biogas', 'biomass', 'coal', 'geo', 'hydro',
                'natgas', 'nonwind', 'nuclear', 'oil', 'other',
                'refuse', 'renewable', 'smhydro', 'solar', 'solarpv',
                'solarth', 'thermal', 'wind', 'fossil', 'dual', 'ccgt']


class BaseClient(object):
    """Base class for scraper/parser clients."""

    # Labels used in the 'market' field of returned data points.
    MARKET_CHOICES = IntervalChoices(hourly='RTHR', fivemin='RT5M', tenmin='RT5M',
                                     fifteenmin='RTPD', na='RT5M', dam='DAHR')
    # Labels used in the 'freq' field of returned data points.
    FREQUENCY_CHOICES = IntervalChoices(hourly='1hr', fivemin='5m', tenmin='10m',
                                        fifteenmin='15m', na='n/a', dam='1hr')

    # Default timezone name for naive timestamps handled by this client.
    TZ_NAME = 'UTC'

    # Balancing Authority name; subclasses set their own.
    NAME = ''

    def __init__(self, timeout_seconds=30):
        """Initialize empty query options and the HTTP connection timeout."""
        # query options, populated later by handle_options
        self.options = {}
        # connection timeout in seconds
        self.timeout_seconds = timeout_seconds
def get_generation(self, latest=False, yesterday=False, start_at=False, end_at=False, **kwargs):
    """
    Scrape and parse generation fuel mix data.

    :param bool latest: If True, return only the single most recent available
        time point. Available for all regions.
    :param bool yesterday: If True, return every time point from yesterday.
        Not available for all regions.
    :param datetime start_at: Inclusive lower bound on returned timestamps.
        Naive datetimes are assumed to be in the Balancing Authority's
        timezone. Must be provided together with ``end_at``. Not available
        for all regions.
    :param datetime end_at: Inclusive upper bound on returned timestamps.
        Naive datetimes are assumed to be in the Balancing Authority's
        timezone. Must be provided together with ``start_at``. Not available
        for all regions.
    :return: List of dicts, each with keys
        ``[ba_name, timestamp, freq, market, fuel_name, gen_MW]``.
        Timestamps are in UTC.
    :rtype: list
    """
    raise NotImplementedError('Derived classes must implement the get_generation method.')
def get_load(self, latest=False, yesterday=False, start_at=False, end_at=False, **kwargs):
    """
    Scrape and parse load data.

    :param bool latest: If True, return only the single most recent available
        time point. Available for all regions.
    :param bool yesterday: If True, return every time point from yesterday.
        Not available for all regions.
    :param datetime start_at: Inclusive lower bound on returned timestamps.
        Naive datetimes are assumed to be in the Balancing Authority's
        timezone. Must be provided together with ``end_at``. Not available
        for all regions.
    :param datetime end_at: Inclusive upper bound on returned timestamps.
        Naive datetimes are assumed to be in the Balancing Authority's
        timezone. Must be provided together with ``start_at``. Not available
        for all regions.
    :return: List of dicts, each with keys
        ``[ba_name, timestamp, freq, market, load_MW]``.
        Timestamps are in UTC.
    :rtype: list
    """
    raise NotImplementedError('Derived classes must implement the get_load method.')
def get_trade(self, latest=False, yesterday=False, start_at=False, end_at=False, **kwargs):
    """
    Scrape and parse import/export data.

    The returned value is net export (export minus import) and may be
    positive or negative.

    :param bool latest: If True, return only the single most recent available
        time point. Available for all regions.
    :param bool yesterday: If True, return every time point from yesterday.
        Not available for all regions.
    :param datetime start_at: Inclusive lower bound on returned timestamps.
        Naive datetimes are assumed to be in the Balancing Authority's
        timezone. Must be provided together with ``end_at``. Not available
        for all regions.
    :param datetime end_at: Inclusive upper bound on returned timestamps.
        Naive datetimes are assumed to be in the Balancing Authority's
        timezone. Must be provided together with ``start_at``. Not available
        for all regions.
    :return: List of dicts, each with keys
        ``[ba_name, timestamp, freq, market, net_exp_MW]``.
        Timestamps are in UTC.
    :rtype: list
    """
    raise NotImplementedError('Derived classes must implement the get_trade method.')
def get_lmp(self, **kwargs):
    """
    Deprecated: Locational Marginal Price (LMP) is no longer considered a
    useful measure in reducing carbon emissions, so this method has been
    removed. See http://watttime.org/lmp for details.
    """
    warnings.warn('PyISO no longer supports the get_lmp method. See http://watttime.org/lmp.',
                  DeprecationWarning)
def handle_options(self, **kwargs):
    """
    Process and store keyword argument options on ``self.options``.

    Derives the 'sliceable', 'latest', and 'forecast' flags and normalizes
    the 'start_at'/'end_at' bounds for the explicit-range, 'yesterday',
    and 'forecast' query styles.
    """
    self.options = kwargs
    opts = self.options  # alias; mutations below hit self.options directly

    # explicit start/end range
    if opts.get('start_at', None) and opts.get('end_at', None):
        assert opts['start_at'] < opts['end_at']
        opts['start_at'] = self.utcify(opts['start_at'])
        opts['end_at'] = self.utcify(opts['end_at'])
        opts['sliceable'] = True
        opts['latest'] = False
        # a range reaching into the future implies a forecast request
        utc_now = pytz.utc.localize(datetime.utcnow())
        opts['forecast'] = opts['end_at'] > utc_now

    # yesterday: midnight-to-midnight in local time
    elif opts.get('yesterday', None):
        local_now = pytz.utc.localize(datetime.utcnow()).astimezone(pytz.timezone(self.TZ_NAME))
        midnight = local_now.replace(hour=0, minute=0, second=0, microsecond=0)
        opts['end_at'] = midnight
        opts['start_at'] = midnight - timedelta(days=1)
        opts['sliceable'] = True
        opts['latest'] = False
        opts['forecast'] = False

    # forecast: from now through the day after tomorrow, in local time
    elif opts.get('forecast', None):
        local_now = pytz.utc.localize(datetime.utcnow()).astimezone(pytz.timezone(self.TZ_NAME))
        opts['start_at'] = local_now.replace(microsecond=0)
        opts['end_at'] = opts['start_at'] + timedelta(days=2)
        opts['sliceable'] = True
        opts['latest'] = False
        opts['forecast'] = True

    # no time bounds requested
    else:
        opts['sliceable'] = False
        opts['forecast'] = False
def utcify(self, local_ts_str, tz_name=None, is_dst=None):
    """
    Convert a datetime or datetime string to UTC.

    Strings are parsed with the default behavior of dateutil.parser.parse.

    :param local_ts_str: The local datetime (or datetime string) to convert.
    :param string tz_name: Timezone assumed for naive values; defaults to the
        client's own timezone when not given.
    :param bool is_dst: If provided, explicitly set daylight savings time
        as True or False during localization.
    :return: Datetime in UTC.
    :rtype: datetime
    """
    # pick the timezone used for naive inputs
    zone = pytz.timezone(self.TZ_NAME if tz_name is None else tz_name)

    # parse strings; anything already datetime-like passes through
    try:
        parsed = dateutil_parse(local_ts_str)
    except (AttributeError, TypeError):
        parsed = local_ts_str

    if parsed.tzinfo is None:
        # naive: attach the local timezone, honoring is_dst when given
        if is_dst is None:
            aware_local = zone.localize(parsed)
        else:
            aware_local = zone.localize(parsed, is_dst=is_dst)
    else:
        # already timezone-aware
        aware_local = parsed

    return aware_local.astimezone(pytz.utc)
def parse_row(self, row, delimiter=',', datetime_col=None, drop_vals=None):
    """
    Split a delimited text row into a list of values.

    :param string row: The raw row to split.
    :param string delimiter: Field separator (default comma).
    :param int datetime_col: If given, that field is converted to UTC
        via ``self.utcify``.
    :param drop_vals: If given, any value contained in this collection is
        removed from the result.
    :return: list of cleaned values
    """
    vals = row.split(delimiter)
    if datetime_col is not None:
        vals[datetime_col] = self.utcify(vals[datetime_col])
    if drop_vals is None:
        return vals
    return [v for v in vals if v not in drop_vals]
def fetch_xls(self, url):
    """
    Fetch a .xls file over the network and wrap it as a pandas object.

    :param url: The URL of the .xls file to request.
    :return: The .xls document's content as a pandas object.
    :rtype: pandas.io.excel.ExcelFile
    """
    # certifi CA bundle works around SSL verification failures; see
    # http://stackoverflow.com/questions/27835619/ssl-certificate-verify-failed-error
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    remote = urlopen(url, context=ssl_context)
    return pd.ExcelFile(remote)
def request(self, url, mode='get', retry_sec=5, retries_remaining=5, **kwargs):
    """
    Get or post to a URL with the provided kwargs.

    Retries with exponentially increasing wait (``retry_sec`` doubles each
    attempt) when the server responds with HTTP 429 throttling, up to
    ``retries_remaining`` attempts.

    :return: The response, or None if a connection error was encountered or
        retries were exhausted.
    :raises ValueError: If ``mode`` is not 'get' or 'post'.
    """
    # check args
    allowed_modes = ['get', 'post']
    if mode not in allowed_modes:
        raise ValueError('Invalid request mode %s' % mode)

    # lazily create a shared session so connections are reused across calls
    try:
        session = getattr(self, 'session')
    except AttributeError:
        self.session = requests.Session()
        session = self.session

    # carry out request
    try:
        response = getattr(session, mode)(url, verify=True,
                                          timeout=self.timeout_seconds,
                                          **kwargs)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
        # eg max retries exceeded
        msg = '%s: connection error for %s, %s:\n%s' % (self.NAME, url, kwargs, e)
        LOGGER.error(msg)
        return None

    if response.status_code == 200:
        # success
        LOGGER.debug('%s: request success for %s, %s with cache hit %s' % (self.NAME, url, kwargs,
                                                                           getattr(response, 'from_cache', None)))
    elif response.status_code == 429:
        if retries_remaining > 0:
            # retry on throttle
            LOGGER.warning('%s: retrying in %d seconds (%d retries remaining), throttled for %s, %s'
                           % (self.NAME, retry_sec, retries_remaining, url, kwargs))
            sleep(retry_sec)
            retries_remaining -= 1
            return self.request(url, mode=mode, retry_sec=retry_sec*2,
                                retries_remaining=retries_remaining, **kwargs)
        else:
            # exhausted retries
            LOGGER.warning('%s: exhausted retries for %s, %s' % (self.NAME, url, kwargs))
            return None
    else:
        # non-throttle error
        LOGGER.error('%s: request failure with code %s for %s, %s' % (self.NAME, response.status_code, url,
                                                                      kwargs))

    # optional verbose request tracing, controlled by environment variable
    if os.environ.get('VERBOSE_REQUESTS') == 'verbose':
        LOGGER.info(mode)
        LOGGER.info(url)
        LOGGER.info(kwargs)
        LOGGER.info(response.status_code)
        print(response.text)

    return response
def unzip(self, content):
    """
    Unzip encoded data.

    Returns the unzipped content as an array of strings, each representing
    one member file's content, or None if an error was encountered.
    ***Previous behavior: Only returned the content from the first file***
    """
    # wrap raw content in a filelike buffer; bytes first, text as fallback
    try:
        buf = BytesIO(content)
    except TypeError:
        buf = StringIO(content)

    try:
        archive = zipfile.ZipFile(buf)
    except zipfile.BadZipfile:
        LOGGER.error('%s: unzip failure for content beginning:\n%s' % (self.NAME, str(content)[0:100]))
        LOGGER.debug('%s: Faulty unzip content:\n%s' % (self.NAME, content))
        return None

    # read every member file, in archive order
    extracted = [archive.read(name) for name in archive.namelist()]
    archive.close()
    return extracted
def parse_to_df(self, filelike, mode='csv', header_names=None, sheet_names=None, **kwargs):
    """
    Parse a delimited or excel file from the provided content and return a
    DataFrame. Any extra kwargs are passed to the appropriate pandas parser;
    read the pandas docs for details. Recommended kwargs: skiprows,
    parse_cols, header.

    :param filelike: string-like or filelike object containing formatted data
        (for 'xls' mode, a pandas ExcelFile)
    :param string mode: Choose from 'csv' or 'xls'. Default 'csv'.
        If 'csv', kwargs are passed to pandas.read_csv.
    :param list header_names: List of strings to use as column names.
        If provided, this will override the header extracted by pandas.
    :param list sheet_names: List of strings for excel sheet names to read.
        Default is to concatenate all sheets.
    :raises ValueError: If mode is not 'csv' or 'xls'.
    :return: DataFrame with a datetime index; rows containing NaN are dropped.
    """
    # check mode
    allowed_modes = ['csv', 'xls']
    if mode not in allowed_modes:
        raise ValueError('Invalid mode %s' % mode)

    # do csv/tsv
    if mode == 'csv':
        # convert string/bytes to filelike if needed
        try:
            filelike.closed
        except AttributeError:
            # string, unicode, etc
            try:
                filelike = BytesIO(filelike)
            except TypeError:
                filelike = StringIO(filelike)
        # read csv
        kwargs['engine'] = 'python'
        df = pd.read_csv(filelike, **kwargs)

    # do xls
    elif mode == 'xls':
        # parse_dates is not implemented for excel, so discard it if present
        kwargs.pop('parse_dates', None)
        # BUGFIX: the documented default of "all sheets" previously crashed
        # with a TypeError because None is not iterable
        if sheet_names is None:
            sheet_names = filelike.sheet_names
        pieces = [filelike.parse(sheet, **kwargs) for sheet in sheet_names]
        df = pd.concat(pieces)

    # parse date index; unparseable values become NaT
    # (infer_datetime_format was dropped: deprecated in pandas 2.0, removed later)
    df.index = pd.to_datetime(df.index, errors='coerce')

    # set names
    if header_names is not None:
        df.columns = header_names

    # drop rows containing NaN values
    df = df.dropna()
    return df
def utcify_index(self, local_index, tz_name=None, tz_col=None):
    """
    Convert a DateTimeIndex to UTC.

    :param DateTimeIndex local_index: The local DateTimeIndex to be converted.
    :param string tz_name: If local_ts is naive, it is assumed to be in timezone tz.
        If tz is not provided, the client's default timezone is used.
    :param tz_col: Optional per-row sequence of timezone names; when given, each
        entry overrides tz_name for the matching index position.
    :return: DatetimeIndex in UTC.
    :rtype: DatetimeIndex
    """
    # set up tz
    if tz_name is None:
        tz_name = self.TZ_NAME

    # use tz col if given
    if tz_col is not None:
        # it seems like we shouldn't have to iterate, but all the smart ways aren't working
        aware_utc_list = []
        for i in range(len(local_index)):
            try:
                aware_local_ts = pytz.timezone(tz_col[i]).localize(local_index[i])
            except pytz.UnknownTimeZoneError:
                # fall back to local ts
                aware_local_ts = pytz.timezone(tz_name).localize(local_index[i])

            # utcify
            aware_utc_ts = self.utcify(aware_local_ts)
            aware_utc_list.append(aware_utc_ts)

        # indexify
        aware_utc_index = pd.DatetimeIndex(aware_utc_list)
    else:
        # localize; DST may make naive timestamps ambiguous, so there are two
        # fallbacks: pandas' ambiguous='infer', then an explicit per-hour bool
        # list built on the assumption of a DST transition day
        try:
            aware_local_index = local_index.tz_localize(tz_name)
        except AmbiguousTimeError as e:
            LOGGER.debug(e)
            try:
                aware_local_index = local_index.tz_localize(tz_name, ambiguous='infer')
            except AmbiguousTimeError:
                LOGGER.warn('Second DatetimeIndex localization fallback, assuming DST transition day.')
                dst_active_list = self._dst_active_hours_for_transition_day(local_dt_index=local_index)
                aware_local_index = local_index.tz_localize(tz_name, ambiguous=dst_active_list)
        except TypeError as e:
            # already aware
            LOGGER.debug(e)
            aware_local_index = local_index

        # convert to utc
        aware_utc_index = aware_local_index.tz_convert('UTC')

    # return
    return aware_utc_index
def slice_times(self, df, options=None):
    """
    Restrict a DatetimeIndex-ed DataFrame to the time range implied by options.

    With 'latest', only the most recent row is kept; otherwise 'start_at'
    and 'end_at' bound the result inclusively.

    :raises ValueError: If neither 'latest' nor both bounds are provided.
    """
    if options is None:
        options = self.options

    # nothing to slice
    if len(df) == 0:
        return df

    if options.get('latest', None):
        # keep only the most recent timestamp
        end_at = df.iloc[-1].name
        start_at = end_at
    else:
        try:
            start_at = options['start_at']
            end_at = options['end_at']
        except KeyError:
            raise ValueError('Slicing by time requires start_at and end_at')

    # sort before truncate eliminates DST KeyError
    return df.sort_index().truncate(before=start_at, after=end_at)
def unpivot(self, df):
    """
    Reshape a wide DataFrame into long format: one row per
    (index, column) pair, with former column labels in a 'level_1' column.
    """
    stacked = df.stack()
    return stacked.reset_index(level=1)
def serialize(self, df, header, extras=None):
    """
    Convert a DataFrame to a list of dicts, one per row.

    :param DataFrame df: Each row (index value first) is zipped against header.
    :param list header: Keys for the resulting dicts, index key first.
    :param dict extras: Optional constant key/value pairs merged into every dict.
    :return: list of dicts
    """
    # BUGFIX: mutable default argument replaced with None sentinel
    if extras is None:
        extras = {}
    data = []
    for row in df.itertuples():
        dp = dict(zip(header, list(row)))
        dp.update(extras)
        data.append(dp)
    return data
def serialize_faster(self, df, extras=None, drop_index=False):
    """DF is a DataFrame with DateTimeIndex and columns fuel_type and gen_MW (or load_mW).
    Index and columns are already properly named.

    :param dict extras: Optional constant columns added to every record.
    :param bool drop_index: If True, discard the index instead of turning it
        into a column.
    :return: list of dicts, one per row
    """
    # BUGFIX: mutable default argument replaced with None sentinel
    if extras is None:
        extras = {}
    # reset_index returns a copy, so the caller's DataFrame is not mutated
    df = df.reset_index(drop=drop_index)
    for key in extras:
        df[key] = extras[key]
    return df.to_dict(orient='records')
def local_now(self):
    """Return a tz-aware datetime for the current moment in the client's local timezone."""
    utc_moment = pytz.utc.localize(datetime.utcnow())
    return utc_moment.astimezone(pytz.timezone(self.TZ_NAME))
def dates(self):
    """
    Return the list of local dates covered by the current options.

    For 'latest', the current local date (plus the previous date when
    within 30 minutes of midnight, since fresh data may still belong to
    it); otherwise every date from start_at through end_at inclusive,
    converted to local time.

    :raises ValueError: If neither 'latest' nor both bounds are set.
    """
    dates = []

    # if latest, use date in local time
    if self.options['latest']:
        local_now = self.local_now()
        half_hour_ago = local_now - timedelta(minutes=30)
        if local_now.date() != half_hour_ago.date():
            dates.append(half_hour_ago.date())
        dates.append(local_now.date())

    # if start and end, use all dates in range
    elif self.options['start_at'] and self.options['end_at']:
        tz = pytz.timezone(self.TZ_NAME)
        local_start = self.options['start_at'].astimezone(tz)
        local_end = self.options['end_at'].astimezone(tz)
        this_date = local_start.date()
        while this_date <= local_end.date():
            dates.append(this_date)
            this_date += timedelta(days=1)

    # have to have some sort of dates
    else:
        raise ValueError(
            'Either latest must be True, or start_at and end_at must both be provided.')

    return dates
def _dst_active_hours_for_transition_day(self, local_dt_index): """ When attempting to localize a timezone-naive list of dates, the daylight savings status may be ambigous. This method is meant as a fallback when the ambiguous='infer' datetime handling in pandas fails. It assumes that the datetime index is a daylight saving transition day. :param pandas.DatetimeIndex local_dt_index: A list of timezone-naive DatetimeIndex values. :return: A list of bool values indicating whether daylight savings time is active for the list provided. This returned list of boolean value is useful for passing to pandas 'ambiguous' kwarg. :rtype: list """ dst_active_list = [] hour_idx = local_dt_index.hour if len(hour_idx) > 3: starting_timestamp = local_dt_index[0] starting_month = starting_timestamp.month starting_hour = starting_timestamp.hour if starting_month == 3 and starting_hour == 0: dst_active_list = [h > 1 for h in hour_idx] elif starting_month == 11 and starting_hour == 0: dst_active_list = [h < 2 for h in hour_idx] elif 3 < starting_month < 11: dst_active_list = [True for h in hour_idx] elif starting_month < 3 or starting_month > 11: dst_active_list = [False for h in hour_idx] else: LOGGER.warn("Uanble to infer fallback DST status for ambiguous DatetimeIndex values.") return dst_active_list