Source code for statsmodels.tsa.deterministic

from __future__ import annotations

from statsmodels.compat.pandas import (
    PD_LT_2_2_0,
    Appender,
    is_int_index,
    to_numpy,
)

from abc import ABC, abstractmethod
import datetime as dt
from typing import TYPE_CHECKING, Optional, Union

import numpy as np
import pandas as pd
from scipy.linalg import qr

from statsmodels.iolib.summary import d_or_f
from statsmodels.tools.validation import (
    bool_like,
    float_like,
    required_int_like,
    string_like,
)
from statsmodels.tsa.tsatools import freq_to_period

if TYPE_CHECKING:
    from collections.abc import Hashable, Sequence

# Scalar objects accepted wherever a single date/time value is expected.
DateLike = Union[dt.datetime, pd.Timestamp, np.datetime64]
# Python or NumPy integer scalars.
IntLike = Union[int, np.integer]


# Error text for a ``start`` value that precedes the first observation of an
# index. The trailing backslashes join the lines of the literal, so the
# message is a single line followed by one newline.
# NOTE(review): not referenced in the visible part of this file; presumably
# used further down -- confirm before removing.
START_BEFORE_INDEX_ERR = """\
start is less than the first observation in the index. Values can only be \
created for observations after the start of the index.
"""


class DeterministicTerm(ABC):
    """Abstract Base Class for all Deterministic Terms"""

    # Set _is_dummy if the term is a dummy variable process
    _is_dummy = False

    @property
    def is_dummy(self) -> bool:
        """Flag indicating whether the values produced are dummy variables"""
        return self._is_dummy

    @abstractmethod
    def in_sample(self, index: Sequence[Hashable]) -> pd.DataFrame:
        """
        Produce deterministic trends for in-sample fitting.

        Parameters
        ----------
        index : index_like
            An index-like object. If not an index, it is converted to an
            index.

        Returns
        -------
        DataFrame
            A DataFrame containing the deterministic terms.
        """

    @abstractmethod
    def out_of_sample(
        self,
        steps: int,
        index: Sequence[Hashable],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        """
        Produce deterministic trends for out-of-sample forecasts

        Parameters
        ----------
        steps : int
            The number of steps to forecast
        index : index_like
            An index-like object. If not an index, it is converted to an
            index.
        forecast_index : index_like
            An Index or index-like object to use for the forecasts. If
            provided must have steps elements.

        Returns
        -------
        DataFrame
            A DataFrame containing the deterministic terms.
        """

    @abstractmethod
    def __str__(self) -> str:
        """A meaningful string representation of the term"""

    def __hash__(self) -> int:
        # The class name is part of the hashed tuple so that terms of
        # different types with identical attributes hash differently.
        name: tuple[Hashable, ...] = (type(self).__name__,)
        return hash(name + self._eq_attr)

    @property
    @abstractmethod
    def _eq_attr(self) -> tuple[Hashable, ...]:
        """tuple of attributes that are used for equality comparison"""

    @staticmethod
    def _index_like(index: Sequence[Hashable]) -> pd.Index:
        """
        Return ``index`` as a pandas Index.

        Parameters
        ----------
        index : index_like
            A pandas Index, which is returned unchanged, or any object that
            ``pd.Index`` can convert.

        Returns
        -------
        Index
            The input as a pandas Index.

        Raises
        ------
        TypeError
            If the conversion to a pandas Index fails for any reason.
        """
        if isinstance(index, pd.Index):
            return index
        try:
            return pd.Index(index)
        except Exception as exc:
            raise TypeError("index must be a pandas Index or index-like") from exc

    @staticmethod
    def _extend_index(
        index: pd.Index,
        steps: int,
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.Index:
        """
        Build the index used to label out-of-sample values.

        Parameters
        ----------
        index : Index
            The in-sample index that is being extended.
        steps : int
            The number of out-of-sample observations.
        forecast_index : index_like, optional
            If provided, it is converted to an Index and returned after
            checking that it has ``steps`` elements.

        Returns
        -------
        Index
            An index with ``steps`` elements.

        Raises
        ------
        ValueError
            If ``forecast_index`` does not contain ``steps`` elements.

        Warns
        -----
        UserWarning
            If ``index`` is not of a type that can be extended. The returned
            index is then ``RangeIndex(nobs + 1, nobs + steps + 1)``.
        """
        if forecast_index is not None:
            forecast_index = DeterministicTerm._index_like(forecast_index)
            if forecast_index.shape[0] != steps:
                raise ValueError(
                    "The number of values in forecast_index "
                    f"({forecast_index.shape[0]}) must match steps ({steps})."
                )
            return forecast_index
        if isinstance(index, pd.PeriodIndex):
            return pd.period_range(
                index[-1] + 1, periods=steps, freq=index.freq
            )
        elif isinstance(index, pd.DatetimeIndex) and index.freq is not None:
            # Element 1 of a 2-element range anchored at the last observation
            # is the first out-of-sample time stamp.
            next_obs = pd.date_range(index[-1], freq=index.freq, periods=2)[1]
            return pd.date_range(next_obs, freq=index.freq, periods=steps)
        elif isinstance(index, pd.RangeIndex):
            step = index.step
            # Continue from the last value actually present in the range.
            # ``index.stop`` is only the next value when it is aligned with
            # ``step``: RangeIndex(0, 9, 2) ends at 8, so the extension must
            # begin at 10 rather than at 9. An empty range has no last value
            # and falls back to ``stop``.
            start = int(index[-1]) + step if len(index) else index.stop
            stop = start + step * steps
            return pd.RangeIndex(start, stop, step=step)
        elif is_int_index(index) and np.all(np.diff(index) == 1):
            idx_arr = np.arange(index[-1] + 1, index[-1] + steps + 1)
            return pd.Index(idx_arr)
        # default range index
        import warnings

        warnings.warn(
            "Only PeriodIndexes, DatetimeIndexes with a frequency set, "
            "RangesIndexes, and Index with a unit increment support "
            "extending. The index is set will contain the position relative "
            "to the data length.",
            UserWarning,
            stacklevel=2,
        )
        nobs = index.shape[0]
        return pd.RangeIndex(nobs + 1, nobs + steps + 1)

    def __repr__(self) -> str:
        return self.__str__() + f" at 0x{id(self):0x}"

    def __eq__(self, other: object) -> bool:
        # Terms are equal only when ``other`` is an instance of this term's
        # type and every attribute reported by ``_eq_attr`` matches.
        if isinstance(other, type(self)):
            own_attr = self._eq_attr
            oth_attr = other._eq_attr
            if len(own_attr) != len(oth_attr):
                return False
            return all(a == b for a, b in zip(own_attr, oth_attr))
        else:
            return False
class TimeTrendDeterministicTerm(DeterministicTerm, ABC):
    """Abstract Base Class for all Time Trend Deterministic Terms"""

    def __init__(self, constant: bool = True, order: int = 0) -> None:
        self._constant = bool_like(constant, "constant")
        self._order = required_int_like(order, "order")

    @property
    def constant(self) -> bool:
        """Flag indicating that a constant is included"""
        return self._constant

    @property
    def order(self) -> int:
        """Order of the time trend"""
        return self._order

    @property
    def _columns(self) -> list[str]:
        """Column labels: "const" (if included) followed by one per power"""
        columns = []
        trend_names = {1: "trend", 2: "trend_squared", 3: "trend_cubed"}
        if self._constant:
            columns.append("const")
        for power in range(1, self._order + 1):
            if power in trend_names:
                columns.append(trend_names[power])
            else:
                columns.append(f"trend**{power}")
        return columns

    def _get_terms(self, locs: np.ndarray) -> np.ndarray:
        """
        Raise the time locations to each of the included powers.

        Parameters
        ----------
        locs : ndarray
            The time locations. The callers in this file pass a floating
            point column vector with shape (nobs, 1).

        Returns
        -------
        ndarray
            One column per term. The constant, when included, is column 0
            (power 0), followed by powers 1, 2, ..., order.
        """
        nterms = int(self._constant) + self._order
        terms = np.tile(locs, (1, nterms))
        power = np.zeros((1, nterms), dtype=int)
        # Power 0 is left in the leading column when a constant is included.
        power[0, int(self._constant) :] = np.arange(1, self._order + 1)
        terms **= power
        return terms

    def __str__(self) -> str:
        terms = []
        if self._constant:
            terms.append("Constant")
        if self._order:
            # The included powers are 1, 2, ..., order (see ``_columns``), so
            # the upper bound reported here is ``order`` itself.
            terms.append(f"Powers 1 to {self._order}")
        if not terms:
            terms = ["Empty"]
        terms_str = ",".join(terms)
        return f"TimeTrend({terms_str})"
class TimeTrend(TimeTrendDeterministicTerm):
    """
    Constant and time trend deterministic terms

    Parameters
    ----------
    constant : bool
        Flag indicating whether a constant should be included.
    order : int
        A non-negative int containing the powers to include (1, 2, ...,
        order).

    See Also
    --------
    DeterministicProcess
    Seasonality
    Fourier
    CalendarTimeTrend

    Examples
    --------
    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import TimeTrend
    >>> data = sunspots.load_pandas().data
    >>> trend_gen = TimeTrend(True, 3)
    >>> trend_gen.in_sample(data.index)
    """

    def __init__(self, constant: bool = True, order: int = 0) -> None:
        super().__init__(constant, order)

    @classmethod
    def from_string(cls, trend: str) -> "TimeTrend":
        """
        Create a TimeTrend from a string description.

        Provided for compatibility with common string names.

        Parameters
        ----------
        trend : {"n", "c", "t", "ct", "ctt"}
            The string representation of the time trend. The terms are:

            * "n": No trend terms
            * "c": A constant only
            * "t": Linear time trend only
            * "ct": A constant and a time trend
            * "ctt": A constant, a time trend and a quadratic time trend

        Returns
        -------
        TimeTrend
            The TimeTrend instance.
        """
        has_constant = trend.startswith("c")
        # "tt" must be tested first since any string containing "tt" also
        # contains "t".
        if "tt" in trend:
            power = 2
        elif "t" in trend:
            power = 1
        else:
            power = 0
        return cls(constant=has_constant, order=power)

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        idx = self._index_like(index)
        last = idx.shape[0]
        # In-sample observations are located at times 1, 2, ..., nobs.
        time = np.arange(1, last + 1, dtype=np.double)[:, np.newaxis]
        return pd.DataFrame(
            self._get_terms(time), columns=self._columns, index=idx
        )

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        idx = self._index_like(index)
        last = idx.shape[0]
        labels = self._extend_index(idx, steps, forecast_index)
        # Forecast times continue from the in-sample ones: nobs + 1, ...
        time = np.arange(last + 1, last + steps + 1, dtype=np.double)
        values = self._get_terms(time[:, np.newaxis])
        return pd.DataFrame(values, columns=self._columns, index=labels)

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._constant, self._order
class Seasonality(DeterministicTerm):
    """
    Seasonal dummy deterministic terms

    Parameters
    ----------
    period : int
        The length of a full cycle. Must be >= 2.
    initial_period : int
        The seasonal index of the first observation. 1-indexed so must
        be in {1, 2, ..., period}.

    See Also
    --------
    DeterministicProcess
    TimeTrend
    Fourier
    CalendarSeasonality

    Examples
    --------
    Solar data has an 11-year cycle

    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import Seasonality
    >>> data = sunspots.load_pandas().data
    >>> seas_gen = Seasonality(11)
    >>> seas_gen.in_sample(data.index)

    To start at a season other than 1

    >>> seas_gen = Seasonality(11, initial_period=4)
    >>> seas_gen.in_sample(data.index)
    """

    _is_dummy = True

    def __init__(self, period: int, initial_period: int = 1) -> None:
        self._period = required_int_like(period, "period")
        self._initial_period = required_int_like(
            initial_period, "initial_period"
        )
        if period < 2:
            raise ValueError("period must be >= 2")
        if self._initial_period < 1 or self._initial_period > period:
            raise ValueError("initial_period must be in {1, 2, ..., period}")

    @property
    def period(self) -> int:
        """The period of the seasonality"""
        return self._period

    @property
    def initial_period(self) -> int:
        """The seasonal index of the first observation"""
        return self._initial_period

    @classmethod
    def from_index(
        cls, index: Union[Sequence[Hashable], pd.DatetimeIndex, pd.PeriodIndex]
    ) -> "Seasonality":
        """
        Construct a seasonality directly from an index using its frequency.

        Parameters
        ----------
        index : {DatetimeIndex, PeriodIndex}
            An index with its frequency (`freq`) set.

        Returns
        -------
        Seasonality
            The initialized Seasonality instance.
        """
        idx = cls._index_like(index)
        if not isinstance(idx, (pd.PeriodIndex, pd.DatetimeIndex)):
            raise TypeError("index must be a DatetimeIndex or PeriodIndex")
        freq = idx.freq
        if isinstance(idx, pd.DatetimeIndex) and not freq:
            # Only a DatetimeIndex falls back to the inferred frequency.
            freq = idx.inferred_freq
        if freq is None:
            raise ValueError("index must have a freq or inferred_freq set")
        return cls(period=freq_to_period(freq))

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._initial_period

    def __str__(self) -> str:
        return f"Seasonality(period={self._period})"

    @property
    def _columns(self) -> list[str]:
        """Column labels s(1,period), ..., s(period,period)"""
        cycle = self._period
        return [f"s({season},{cycle})" for season in range(1, cycle + 1)]

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        idx = self._index_like(index)
        count = idx.shape[0]
        cycle = self._period
        dummies = np.zeros((count, cycle))
        rows = np.arange(count)
        # Row 0 falls in season ``initial_period``; later rows advance one
        # season at a time and wrap around at the end of the cycle.
        seasons = (rows + self._initial_period - 1) % cycle
        dummies[rows, seasons] = 1
        return pd.DataFrame(dummies, columns=self._columns, index=idx)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        idx = self._index_like(index)
        labels = self._extend_index(idx, steps, forecast_index)
        count = idx.shape[0]
        cycle = self._period
        dummies = np.zeros((steps, cycle))
        rows = np.arange(steps)
        # Continue the seasonal pattern from where the in-sample data ended.
        seasons = (count + self._initial_period - 1 + rows) % cycle
        dummies[rows, seasons] = 1
        return pd.DataFrame(dummies, columns=self._columns, index=labels)
class FourierDeterministicTerm(DeterministicTerm, ABC):
    """Abstract Base Class for all Fourier Deterministic Terms"""

    def __init__(self, order: int) -> None:
        self._order = required_int_like(order, "terms")

    @property
    def order(self) -> int:
        """The order of the Fourier terms included"""
        return self._order

    def _get_terms(self, locs: np.ndarray) -> np.ndarray:
        """
        Evaluate the sine and cosine terms at the given locations.

        Parameters
        ----------
        locs : ndarray
            The locations, expressed as a fraction of a full cycle.

        Returns
        -------
        ndarray
            An array with 2 * order columns ordered as
            sin(1), cos(1), sin(2), cos(2), ...
        """
        angles = 2 * np.pi * locs.astype(np.double)
        values = np.empty((angles.shape[0], 2 * self._order))
        for harmonic in range(1, self._order + 1):
            base = 2 * (harmonic - 1)
            scaled = harmonic * angles
            values[:, base] = np.sin(scaled)
            values[:, base + 1] = np.cos(scaled)
        return values
class Fourier(FourierDeterministicTerm):
    r"""
    Fourier series deterministic terms

    Parameters
    ----------
    period : float
        The length of a full cycle.
    order : int
        The number of Fourier components to include. 2 * order must be
        <= period.

    See Also
    --------
    DeterministicProcess
    TimeTrend
    Seasonality
    CalendarFourier

    Notes
    -----
    Both a sine and a cosine term are included for each i=1, ..., order

    .. math::

       f_{i,s,t} & = \sin\left(2 \pi i \times \frac{t}{m} \right) \\
       f_{i,c,t} & = \cos\left(2 \pi i \times \frac{t}{m} \right)

    where m is the length of the period.

    Examples
    --------
    Solar data has an 11-year cycle

    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import Fourier
    >>> data = sunspots.load_pandas().data
    >>> fourier_gen = Fourier(11, order=2)
    >>> fourier_gen.in_sample(data.index)
    """

    _is_dummy = False

    def __init__(self, period: float, order: int):
        super().__init__(order)
        self._period = float_like(period, "period")
        if self._period < 2 * self._order:
            raise ValueError("2 * order must be <= period")

    @property
    def period(self) -> float:
        """The period of the Fourier terms"""
        return self._period

    @property
    def _columns(self) -> list[str]:
        """Column labels sin(i,period), cos(i,period) for i=1, ..., order"""
        label = d_or_f(self._period).strip()
        return [
            f"{func}({harmonic},{label})"
            for harmonic in range(1, self._order + 1)
            for func in ("sin", "cos")
        ]

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        idx = self._index_like(index)
        # Observation t (0-based) is located t / period of the way through
        # a cycle.
        cycle_pos = np.arange(idx.shape[0]) / self._period
        values = self._get_terms(cycle_pos)
        return pd.DataFrame(values, index=idx, columns=self._columns)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        idx = self._index_like(index)
        labels = self._extend_index(idx, steps, forecast_index)
        first = idx.shape[0]
        cycle_pos = np.arange(first, first + steps) / self._period
        values = self._get_terms(cycle_pos)
        return pd.DataFrame(values, index=labels, columns=self._columns)

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._order

    def __str__(self) -> str:
        return f"Fourier(period={self._period}, order={self._order})"
class CalendarDeterministicTerm(DeterministicTerm, ABC):
    """Abstract Base Class for calendar deterministic terms"""

    def __init__(self, freq: str) -> None:
        # A one-element date range is built only so that pandas parses
        # ``freq`` into an offset object.
        try:
            probe = pd.date_range("2020-01-01", freq=freq, periods=1)
        except ValueError as exc:
            raise ValueError("freq is not understood by pandas") from exc
        else:
            self._freq = probe.freq

    @property
    def freq(self) -> str:
        """The frequency of the deterministic terms"""
        return self._freq.freqstr

    def _compute_ratio(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """
        Compute how far each observation is through its ``freq`` period.

        Parameters
        ----------
        index : {DatetimeIndex, PeriodIndex}
            The observation times. A PeriodIndex is first converted to
            time stamps.

        Returns
        -------
        ndarray
            The time elapsed since the start of the containing period
            divided by the length of that period.
        """
        if isinstance(index, pd.PeriodIndex):
            index = index.to_timestamp()
        containing = index.to_period(self._freq)
        period_start = containing.to_timestamp()
        elapsed = index - period_start
        length = (containing + 1).to_timestamp() - period_start
        return to_numpy(elapsed) / to_numpy(length)

    def _check_index_type(
        self,
        index: pd.Index,
        allowed: Union[type, tuple[type, ...]] = (
            pd.DatetimeIndex,
            pd.PeriodIndex,
        ),
    ) -> Union[pd.DatetimeIndex, pd.PeriodIndex]:
        """
        Verify that ``index`` is an instance of one of the allowed types.

        Parameters
        ----------
        index : Index
            The index to check.
        allowed : {type, tuple[type]}
            A single type or a tuple of acceptable index types.

        Returns
        -------
        {DatetimeIndex, PeriodIndex}
            The index, unchanged.

        Raises
        ------
        TypeError
            If ``index`` is not an instance of any type in ``allowed``.
        """
        if isinstance(allowed, type):
            allowed = (allowed,)
        if isinstance(index, allowed):
            assert isinstance(index, (pd.DatetimeIndex, pd.PeriodIndex))
            return index
        names = [kind.__name__ for kind in allowed]
        if len(names) == 1:
            allowed_types = "a " + names[0]
        else:
            # "A and B" for two types; "A, B, and C" for three or more.
            allowed_types = ", ".join(names[:-1])
            if len(names) > 2:
                allowed_types += ","
            allowed_types += " and " + names[-1]
        raise TypeError(
            f"{type(self).__name__} terms can only be computed from "
            f"{allowed_types}"
        )
class CalendarFourier(CalendarDeterministicTerm, FourierDeterministicTerm):
    r"""
    Fourier series deterministic terms based on calendar time

    Parameters
    ----------
    freq : str
        A string convertible to a pandas frequency.
    order : int
        The number of Fourier components to include.

    See Also
    --------
    DeterministicProcess
    CalendarTimeTrend
    CalendarSeasonality
    Fourier

    Notes
    -----
    Both a sine and a cosine term are included for each i=1, ..., order

    .. math::

       f_{i,s,t} & = \sin\left(2 \pi i \tau_t \right) \\
       f_{i,c,t} & = \cos\left(2 \pi i \tau_t \right)

    where :math:`\tau_t` is the frequency normalized time. For example,
    when freq is "D" then an observation with a timestamp of 12:00:00
    would have :math:`\tau_t=0.5`.

    Examples
    --------
    Here we simulate irregularly spaced hourly data and construct the
    calendar Fourier terms for the data.

    >>> import numpy as np
    >>> import pandas as pd
    >>> base = pd.Timestamp("2020-1-1")
    >>> gen = np.random.default_rng()
    >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000))
    >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps]
    >>> index = pd.DatetimeIndex(pd.to_datetime(times))

    >>> from statsmodels.tsa.deterministic import CalendarFourier
    >>> cal_fourier_gen = CalendarFourier("D", 2)
    >>> cal_fourier_gen.in_sample(index)
    """

    def __init__(self, freq: str, order: int) -> None:
        # ``super().__init__`` only runs the calendar initializer, which does
        # not chain further, so the Fourier initializer is invoked
        # explicitly. It validates and stores ``order``; no second
        # assignment of ``_order`` is needed here.
        super().__init__(freq)
        FourierDeterministicTerm.__init__(self, order)

    @property
    def _columns(self) -> list[str]:
        """Column labels sin(i,freq=...), cos(i,freq=...) for each order"""
        columns = []
        for i in range(1, self._order + 1):
            for typ in ("sin", "cos"):
                columns.append(f"{typ}({i},freq={self._freq.freqstr})")
        return columns

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        index = self._check_index_type(index)

        ratio = self._compute_ratio(index)
        terms = self._get_terms(ratio)
        return pd.DataFrame(terms, index=index, columns=self._columns)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        # The forecast index is the one that must be calendar-based, so it is
        # the one checked. ``_check_index_type`` returns the index it was
        # given after validating it.
        fcast_index = self._check_index_type(
            self._extend_index(index, steps, forecast_index)
        )
        ratio = self._compute_ratio(fcast_index)
        terms = self._get_terms(ratio)
        return pd.DataFrame(terms, index=fcast_index, columns=self._columns)

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._freq.freqstr, self._order

    def __str__(self) -> str:
        return f"Fourier(freq={self._freq.freqstr}, order={self._order})"
class CalendarSeasonality(CalendarDeterministicTerm):
    """
    Seasonal dummy deterministic terms based on calendar time

    Parameters
    ----------
    freq : str
        The frequency of the seasonal effect.
    period : str
        The pandas frequency string describing the full period.

    See Also
    --------
    DeterministicProcess
    CalendarTimeTrend
    CalendarFourier
    Seasonality

    Examples
    --------
    Here we simulate irregularly spaced data (in time) and hourly seasonal
    dummies for the data.

    >>> import numpy as np
    >>> import pandas as pd
    >>> base = pd.Timestamp("2020-1-1")
    >>> gen = np.random.default_rng()
    >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000))
    >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps]
    >>> index = pd.DatetimeIndex(pd.to_datetime(times))

    >>> from statsmodels.tsa.deterministic import CalendarSeasonality
    >>> cal_seas_gen = CalendarSeasonality("h", "D")
    >>> cal_seas_gen.in_sample(index)
    """

    _is_dummy = True

    # Maps each full period to the frequencies that may be counted within it
    # and the number of such observations in one period
    # (period -> {freq: count}, i.e. "count freq out of one period").
    # The accepted aliases depend on the installed pandas version.
    if PD_LT_2_2_0:
        _supported = {
            "W": {"B": 5, "D": 7, "h": 24 * 7, "H": 24 * 7},
            "D": {"h": 24, "H": 24},
            "Q": {"MS": 3, "M": 3},
            "A": {"MS": 12, "M": 12},
            "Y": {"MS": 12, "Q": 4, "M": 12},
        }
    else:
        _supported = {
            "W": {"B": 5, "D": 7, "h": 24 * 7},
            "D": {"h": 24},
            "Q": {"MS": 3, "ME": 3},
            "A": {"MS": 12, "ME": 12, "QE": 4},
            "Y": {"MS": 12, "ME": 12, "QE": 4},
            "QE": {"ME": 3},
            "YE": {"ME": 12, "QE": 4},
        }

    def __init__(self, freq: str, period: str) -> None:
        # Every frequency that appears under at least one period
        known_freqs: set[str] = {
            name for within in self._supported.values() for name in within
        }
        known_periods = tuple(self._supported)
        freq = string_like(
            freq, "freq", options=tuple(known_freqs), lower=False
        )
        period = string_like(
            period, "period", options=known_periods, lower=False
        )
        if freq not in self._supported[period]:
            raise ValueError(
                f"The combination of freq={freq} and "
                f"period={period} is not supported."
            )
        super().__init__(freq)
        self._period = period
        # Keep only the part of the pandas frequency string before the
        # first "-", which is the form used as a key in ``_supported``.
        self._freq_str = self._freq.freqstr.split("-")[0]

    @property
    def freq(self) -> str:
        """The frequency of the deterministic terms"""
        return self._freq.freqstr

    @property
    def period(self) -> str:
        """The full period"""
        return self._period

    def _weekly_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Locate each observation within its week"""
        alias = self._freq.freqstr
        if alias in ("h", "H"):
            return index.hour + 24 * index.dayofweek
        if alias == "D":
            return index.dayofweek
        # "B"
        weekdays = pd.bdate_range("2000-1-1", periods=10).dayofweek.unique()
        loc = index.dayofweek
        if loc.isin(weekdays).all():
            return loc
        raise ValueError(
            "freq is B but index contains days that are not business "
            "days."
        )

    def _daily_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Locate each observation within its day"""
        return index.hour

    def _quarterly_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Locate each observation within its quarter"""
        return (index.month - 1) % 3

    def _annual_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Locate each observation within its year"""
        monthly = self._freq.freqstr in ("M", "ME", "MS")
        # Anything that is not monthly is treated as quarterly ("Q")
        return index.month - 1 if monthly else index.quarter - 1

    def _get_terms(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """
        Build the dummy variable array for the observations in ``index``.

        Each row contains a single 1 in the column matching the location of
        the observation within the full period.
        """
        locators = {
            "D": self._daily_to_loc,
            "W": self._weekly_to_loc,
            "Q": self._quarterly_to_loc,
            "QE": self._quarterly_to_loc,
        }
        # All remaining periods ("A", "Y", ...) use the annual locator
        locate = locators.get(self._period, self._annual_to_loc)
        locs = locate(index)
        width = self._supported[self._period][self._freq_str]
        count = locs.shape[0]
        dummies = np.zeros((count, width))
        dummies[np.arange(count), locs] = 1
        return dummies

    @property
    def _columns(self) -> list[str]:
        """Column labels, one per location in the full period (1-based)"""
        width = self._supported[self._period][self._freq_str]
        return [
            f"s({self._freq_str}={pos + 1}, period={self._period})"
            for pos in range(width)
        ]

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        idx = self._check_index_type(self._index_like(index))
        return pd.DataFrame(
            self._get_terms(idx), index=idx, columns=self._columns
        )

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        idx = self._index_like(index)
        labels = self._extend_index(idx, steps, forecast_index)
        self._check_index_type(labels)
        assert isinstance(labels, (pd.DatetimeIndex, pd.PeriodIndex))
        return pd.DataFrame(
            self._get_terms(labels), index=labels, columns=self._columns
        )

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._freq_str

    def __str__(self) -> str:
        return f"Seasonal(freq={self._freq_str})"
[docs] class CalendarTimeTrend(CalendarDeterministicTerm, TimeTrendDeterministicTerm): r""" Constant and time trend determinstic terms based on calendar time Parameters ---------- freq : str A string convertible to a pandas frequency. constant : bool Flag indicating whether a constant should be included. order : int A non-negative int containing the powers to include (1, 2, ..., order). base_period : {str, pd.Timestamp}, default None The base period to use when computing the time stamps. This value is treated as 1 and so all other time indices are defined as the number of periods since or before this time stamp. If not provided, defaults to pandas base period for a PeriodIndex. See Also -------- DeterministicProcess CalendarFourier CalendarSeasonality TimeTrend Notes ----- The time stamp, :math:`\tau_t`, is the number of periods that have elapsed since the base_period. :math:`\tau_t` may be fractional. Examples -------- Here we simulate irregularly spaced hourly data and construct the calendar time trend terms for the data. >>> import numpy as np >>> import pandas as pd >>> base = pd.Timestamp("2020-1-1") >>> gen = np.random.default_rng() >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000)) >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps] >>> index = pd.DatetimeIndex(pd.to_datetime(times)) >>> from statsmodels.tsa.deterministic import CalendarTimeTrend >>> cal_trend_gen = CalendarTimeTrend("D", True, order=1) >>> cal_trend_gen.in_sample(index) Next, we normalize using the first time stamp >>> cal_trend_gen = CalendarTimeTrend("D", True, order=1, ... 
base_period=index[0]) >>> cal_trend_gen.in_sample(index) """ def __init__( self, freq: str, constant: bool = True, order: int = 0, *, base_period: Optional[Union[str, DateLike]] = None, ) -> None: super().__init__(freq) TimeTrendDeterministicTerm.__init__( self, constant=constant, order=order ) self._ref_i8 = 0 if base_period is not None: pr = pd.period_range(base_period, periods=1, freq=self._freq) self._ref_i8 = pr.asi8[0] self._base_period = None if base_period is None else str(base_period) @property def base_period(self) -> Optional[str]: """The base period""" return self._base_period
[docs] @classmethod def from_string( cls, freq: str, trend: str, base_period: Optional[Union[str, DateLike]] = None, ) -> "CalendarTimeTrend": """ Create a TimeTrend from a string description. Provided for compatibility with common string names. Parameters ---------- freq : str A string convertible to a pandas frequency. trend : {"n", "c", "t", "ct", "ctt"} The string representation of the time trend. The terms are: * "n": No trend terms * "c": A constant only * "t": Linear time trend only * "ct": A constant and a time trend * "ctt": A constant, a time trend and a quadratic time trend base_period : {str, pd.Timestamp}, default None The base period to use when computing the time stamps. This value is treated as 1 and so all other time indices are defined as the number of periods since or before this time stamp. If not provided, defaults to pandas base period for a PeriodIndex. Returns ------- TimeTrend The TimeTrend instance. """ constant = trend.startswith("c") order = 0 if "tt" in trend: order = 2 elif "t" in trend: order = 1 return cls(freq, constant, order, base_period=base_period)
def _terms( self, index: Union[pd.DatetimeIndex, pd.PeriodIndex], ratio: np.ndarray ) -> pd.DataFrame: if isinstance(index, pd.DatetimeIndex): index = index.to_period(self._freq) index_i8 = index.asi8 index_i8 = index_i8 - self._ref_i8 + 1 time = index_i8.astype(np.double) + ratio time = time[:, None] terms = self._get_terms(time) return pd.DataFrame(terms, columns=self._columns, index=index)
[docs] @Appender(DeterministicTerm.in_sample.__doc__) def in_sample( self, index: Union[Sequence[Hashable], pd.Index] ) -> pd.DataFrame: index = self._index_like(index) index = self._check_index_type(index) ratio = self._compute_ratio(index) return self._terms(index, ratio)
[docs] @Appender(DeterministicTerm.out_of_sample.__doc__) def out_of_sample( self, steps: int, index: Union[Sequence[Hashable], pd.Index], forecast_index: Optional[Sequence[Hashable]] = None, ) -> pd.DataFrame: index = self._index_like(index) fcast_index = self._extend_index(index, steps, forecast_index) self._check_index_type(fcast_index) assert isinstance(fcast_index, (pd.PeriodIndex, pd.DatetimeIndex)) ratio = self._compute_ratio(fcast_index) return self._terms(fcast_index, ratio)
@property
def _eq_attr(self) -> tuple[Hashable, ...]:
    # Tuple of the values that identify this term: the constant flag, the
    # trend order and the frequency string, followed by the base period when
    # one was set.
    attr: tuple[Hashable, ...] = (
        self._constant,
        self._order,
        self._freq.freqstr,
    )
    if self._base_period is not None:
        attr += (self._base_period,)
    return attr

def __str__(self) -> str:
    # Start from the parent representation, prefix the name with "Calendar"
    # and splice extra fields in before the closing parenthesis.
    value = TimeTrendDeterministicTerm.__str__(self)
    value = "Calendar" + value[:-1] + f", freq={self._freq.freqstr})"
    if self._base_period is not None:
        # The leading ", " keeps base_period separated from the freq field
        # (previously rendered as "freq=<f>base_period=<p>").
        value = value[:-1] + f", base_period={self._base_period})"
    return value
class DeterministicProcess:
    """
    Container class for deterministic terms.

    Directly supports constants, time trends, and either seasonal dummies or
    fourier terms for a single cycle. Additional deterministic terms beyond
    the set that can be directly initialized through the constructor can be
    added.

    Parameters
    ----------
    index : {Sequence[Hashable], pd.Index}
        The index of the process. Should usually be the "in-sample" index when
        used in forecasting applications.
    period : {float, int}, default None
        The period of the seasonal or fourier components. Must be an int for
        seasonal dummies. If not provided, freq is read from index if
        available.
    constant : bool, default False
        Whether to include a constant.
    order : int, default 0
        The order of the time trend to include. For example, 2 will include
        both linear and quadratic terms. 0 exclude time trend terms.
    seasonal : bool = False
        Whether to include seasonal dummies
    fourier : int = 0
        The order of the fourier terms to include.
    additional_terms : Sequence[DeterministicTerm]
        A sequence of additional deterministic terms to include in the
        process.
    drop : bool, default False
        A flag indicating to check for perfect collinearity and to drop any
        linearly dependent terms.

    See Also
    --------
    TimeTrend
    Seasonality
    Fourier
    CalendarTimeTrend
    CalendarSeasonality
    CalendarFourier

    Notes
    -----
    See the notebook `Deterministic Terms in Time Series Models
    <../examples/notebooks/generated/deterministics.html>`__ for an overview.

    Examples
    --------
    >>> from statsmodels.tsa.deterministic import DeterministicProcess
    >>> from pandas import date_range
    >>> index = date_range("2000-1-1", freq="M", periods=240)

    First a deterministic process with a constant and quadratic time trend.

    >>> dp = DeterministicProcess(index, constant=True, order=2)
    >>> dp.in_sample().head(3)
                const  trend  trend_squared
    2000-01-31    1.0    1.0            1.0
    2000-02-29    1.0    2.0            4.0
    2000-03-31    1.0    3.0            9.0

    Seasonal dummies are included by setting seasonal to True.

    >>> dp = DeterministicProcess(index, constant=True, seasonal=True)
    >>> dp.in_sample().iloc[:3,:5]
                const  s(2,12)  s(3,12)  s(4,12)  s(5,12)
    2000-01-31    1.0      0.0      0.0      0.0      0.0
    2000-02-29    1.0      1.0      0.0      0.0      0.0
    2000-03-31    1.0      0.0      1.0      0.0      0.0

    Fourier components can be used to alternatively capture seasonal
    patterns,

    >>> dp = DeterministicProcess(index, constant=True, fourier=2)
    >>> dp.in_sample().head(3)
                const  sin(1,12)  cos(1,12)  sin(2,12)  cos(2,12)
    2000-01-31    1.0   0.000000   1.000000   0.000000        1.0
    2000-02-29    1.0   0.500000   0.866025   0.866025        0.5
    2000-03-31    1.0   0.866025   0.500000   0.866025       -0.5

    Multiple Seasonalities can be captured using additional terms.

    >>> from statsmodels.tsa.deterministic import Fourier
    >>> index = date_range("2000-1-1", freq="D", periods=5000)
    >>> fourier = Fourier(period=365.25, order=1)
    >>> dp = DeterministicProcess(index, period=3, constant=True,
    ...                           seasonal=True, additional_terms=[fourier])
    >>> dp.in_sample().head(3)
                const  s(2,3)  s(3,3)  sin(1,365.25)  cos(1,365.25)
    2000-01-01    1.0     0.0     0.0       0.000000       1.000000
    2000-01-02    1.0     1.0     0.0       0.017202       0.999852
    2000-01-03    1.0     0.0     1.0       0.034398       0.999408
    """

    def __init__(
        self,
        index: Union[Sequence[Hashable], pd.Index],
        *,
        period: Optional[Union[float, int]] = None,
        constant: bool = False,
        order: int = 0,
        seasonal: bool = False,
        fourier: int = 0,
        additional_terms: Sequence[DeterministicTerm] = (),
        drop: bool = False,
    ):
        if not isinstance(index, pd.Index):
            index = pd.Index(index)
        self._index = index
        self._deterministic_terms: list[DeterministicTerm] = []
        # Defaults; _validate_index overrides them for supported index types.
        self._extendable = False
        self._index_freq = None
        self._validate_index()
        period = float_like(period, "period", optional=True)
        self._constant = constant = bool_like(constant, "constant")
        self._order = required_int_like(order, "order")
        self._seasonal = seasonal = bool_like(seasonal, "seasonal")
        self._fourier = required_int_like(fourier, "fourier")
        additional_terms = tuple(additional_terms)
        self._cached_in_sample = None
        self._drop = bool_like(drop, "drop")
        self._additional_terms = additional_terms
        if constant or order:
            self._deterministic_terms.append(TimeTrend(constant, order))
        if seasonal and fourier:
            raise ValueError(
                "seasonal and fourier cannot both be initialized through "
                "the constructor since these will be necessarily perfectly "
                "collinear. Instead, you can pass additional components "
                "using the additional_terms input."
            )
        if (seasonal or fourier) and period is None:
            # No explicit period: derive it from the index frequency.
            period = freq_to_period(self._index_freq)
        if seasonal:
            period = required_int_like(period, "period")
            self._deterministic_terms.append(Seasonality(period))
        elif fourier:
            period = float_like(period, "period")
            assert period is not None
            self._deterministic_terms.append(Fourier(period, order=fourier))
        for term in additional_terms:
            if not isinstance(term, DeterministicTerm):
                raise TypeError(
                    "All additional terms must be instances of subsclasses "
                    "of DeterministicTerm"
                )
            if term not in self._deterministic_terms:
                self._deterministic_terms.append(term)
            else:
                raise ValueError(
                    "One or more terms in additional_terms has been added "
                    "through the parameters of the constructor. Terms must "
                    "be unique."
                )
        self._period = period
        # Columns kept by in_sample; populated on its first full evaluation.
        self._retain_cols: Optional[list[Hashable]] = None

    @property
    def index(self) -> pd.Index:
        """The index of the process"""
        return self._index

    @property
    def terms(self) -> list[DeterministicTerm]:
        """The deterministic terms included in the process"""
        return self._deterministic_terms

    def _adjust_dummies(self, terms: list[pd.DataFrame]) -> list[pd.DataFrame]:
        # Drop the first column of dummy-variable terms when a constant, or
        # an earlier dummy term, is already present. ``terms`` is expected to
        # be aligned position-by-position with self._deterministic_terms and
        # is modified in place.
        has_const: Optional[bool] = None
        for dterm in self._deterministic_terms:
            if isinstance(dterm, (TimeTrend, CalendarTimeTrend)):
                has_const = has_const or dterm.constant
        if has_const is None:
            # No time-trend term to ask: look for a column that is constant
            # and non-zero in the generated values instead.
            has_const = False
            for term in terms:
                const_col = (term == term.iloc[0]).all() & (term.iloc[0] != 0)
                has_const = has_const or const_col.any()
        drop_first = has_const
        for i, dterm in enumerate(self._deterministic_terms):
            is_dummy = dterm.is_dummy
            if is_dummy and drop_first:
                # drop first
                terms[i] = terms[i].iloc[:, 1:]
            # Once one dummy term has been seen, later dummy terms always
            # lose their first column.
            drop_first = drop_first or is_dummy
        return terms

    def _remove_zeros_ones(self, terms: pd.DataFrame) -> pd.DataFrame:
        # Remove columns that are entirely zero, then keep only the first of
        # any columns whose values do not vary.
        all_zero = np.all(terms == 0, axis=0)
        if np.any(all_zero):
            terms = terms.loc[:, ~all_zero]
        is_constant = terms.max(axis=0) == terms.min(axis=0)
        if np.sum(is_constant) > 1:
            # flag surplus constant columns
            surplus_consts = is_constant & is_constant.duplicated()
            terms = terms.loc[:, ~surplus_consts]
        return terms

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(self) -> pd.DataFrame:
        # The docstring is supplied by the Appender decorator.
        if self._cached_in_sample is not None:
            return self._cached_in_sample
        index = self._index
        if not self._deterministic_terms:
            return pd.DataFrame(np.empty((index.shape[0], 0)), index=index)
        raw_terms = [
            term.in_sample(index) for term in self._deterministic_terms
        ]
        raw_terms = self._adjust_dummies(raw_terms)
        terms: pd.DataFrame = pd.concat(raw_terms, axis=1)
        terms = self._remove_zeros_ones(terms)
        if self._drop:
            terms_arr = to_numpy(terms)
            # Column-pivoted QR; the size of R's diagonal entries relative to
            # a tolerance is used as the numerical rank.
            res = qr(terms_arr, mode="r", pivoting=True)
            r = res[0]
            p = res[-1]
            abs_diag = np.abs(np.diag(r))
            tol = abs_diag[0] * terms_arr.shape[1] * np.finfo(float).eps
            rank = int(np.sum(abs_diag > tol))
            rpx = r.T @ terms_arr
            keep = [0]
            last_rank = 1
            # Find the left-most columns that produce full rank
            for i in range(1, terms_arr.shape[1]):
                curr_rank = np.linalg.matrix_rank(rpx[: i + 1, : i + 1])
                if curr_rank > last_rank:
                    keep.append(i)
                    last_rank = curr_rank
                if curr_rank == rank:
                    break
            if len(keep) == rank:
                terms = terms.iloc[:, keep]
            else:
                # Fall back to the columns selected by the QR pivoting.
                terms = terms.iloc[:, np.sort(p[:rank])]
        self._retain_cols = terms.columns
        self._cached_in_sample = terms
        return terms

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        forecast_index: Optional[Union[Sequence[Hashable], pd.Index]] = None,
    ) -> pd.DataFrame:
        # The docstring is supplied by the Appender decorator.
        steps = required_int_like(steps, "steps")
        index = self._index
        if not self._deterministic_terms:
            # NOTE(review): this returns an empty frame on the in-sample
            # index, not one with ``steps`` rows; kept as-is for
            # compatibility.
            return pd.DataFrame(np.empty((index.shape[0], 0)), index=index)
        if self._retain_cols is None:
            # in_sample decides which columns survive dummy adjustment,
            # constant removal and (optionally) collinearity dropping. It
            # must run once regardless of ``drop`` so the forecast columns
            # can be matched to the in-sample ones.
            self.in_sample()
        raw_terms = [
            term.out_of_sample(steps, index, forecast_index)
            for term in self._deterministic_terms
        ]
        terms: pd.DataFrame = pd.concat(raw_terms, axis=1)
        retain_cols = self._retain_cols
        if retain_cols is not None and terms.shape[1] != len(retain_cols):
            terms = terms[retain_cols]
        return terms

    def _extend_time_index(
        self,
        stop: pd.Timestamp,
    ) -> Union[pd.DatetimeIndex, pd.PeriodIndex]:
        # Index running from the first in-sample observation through stop.
        index = self._index
        if isinstance(index, pd.PeriodIndex):
            return pd.period_range(index[0], end=stop, freq=index.freq)
        return pd.date_range(start=index[0], end=stop, freq=self._index_freq)

    def _range_from_range_index(self, start: int, stop: int) -> pd.DataFrame:
        # Terms for the half-open integer range [start, stop) when the
        # process index is a RangeIndex or an integer index.
        index = self._index
        is_int64_index = is_int_index(index)
        assert isinstance(index, pd.RangeIndex) or is_int64_index
        if start < index[0]:
            raise ValueError(START_BEFORE_INDEX_ERR)
        if isinstance(index, pd.RangeIndex):
            idx_step = index.step
        else:
            idx_step = np.diff(index).max() if len(index) > 1 else 1
        if idx_step != 1 and ((start - index[0]) % idx_step) != 0:
            raise ValueError(
                f"The step of the index is not 1 (actual step={idx_step})."
                " start must be in the sequence that would have been "
                "generated by the index."
            )
        if is_int64_index:
            new_idx = pd.Index(np.arange(start, stop))
        else:
            new_idx = pd.RangeIndex(start, stop, step=idx_step)
        if new_idx[-1] <= self._index[-1]:
            # In-sample only
            in_sample = self.in_sample()
            in_sample = in_sample.loc[new_idx]
            return in_sample
        elif new_idx[0] > self._index[-1]:
            # Out of-sample only
            next_value = index[-1] + idx_step
            if new_idx[0] != next_value:
                # The request starts after a gap: forecast from the first
                # value after the index and select the requested rows.
                tmp = pd.RangeIndex(next_value, stop, step=idx_step)
                oos = self.out_of_sample(tmp.shape[0], forecast_index=tmp)
                return oos.loc[new_idx]
            return self.out_of_sample(new_idx.shape[0], forecast_index=new_idx)
        # Using some from each in and out of sample
        in_sample_loc = new_idx <= self._index[-1]
        in_sample_idx = new_idx[in_sample_loc]
        out_of_sample_idx = new_idx[~in_sample_loc]
        in_sample_exog = self.in_sample().loc[in_sample_idx]
        oos_exog = self.out_of_sample(
            steps=out_of_sample_idx.shape[0], forecast_index=out_of_sample_idx
        )
        return pd.concat([in_sample_exog, oos_exog], axis=0)

    def _range_from_time_index(
        self, start: pd.Timestamp, stop: pd.Timestamp
    ) -> pd.DataFrame:
        # Terms between start and stop (label-based slice) for a
        # DatetimeIndex or PeriodIndex.
        index = self._index
        if isinstance(self._index, pd.PeriodIndex):
            # Compare like with like: convert timestamps to periods.
            if isinstance(start, pd.Timestamp):
                start = start.to_period(freq=self._index_freq)
            if isinstance(stop, pd.Timestamp):
                stop = stop.to_period(freq=self._index_freq)
        if start < index[0]:
            raise ValueError(START_BEFORE_INDEX_ERR)
        if stop <= self._index[-1]:
            return self.in_sample().loc[start:stop]
        new_idx = self._extend_time_index(stop)
        oos_idx = new_idx[new_idx > index[-1]]
        oos = self.out_of_sample(oos_idx.shape[0], oos_idx)
        if start >= oos_idx[0]:
            return oos.loc[start:stop]
        both = pd.concat([self.in_sample(), oos], axis=0)
        return both.loc[start:stop]

    def _int_to_timestamp(self, value: int, name: str) -> pd.Timestamp:
        # Translate an integer position into a time stamp, extending past
        # the end of the index with the index frequency when necessary.
        if value < 0:
            raise ValueError(f"{name} must be non-negative.")
        if value < self._index.shape[0]:
            return self._index[value]
        # The generated range starts at the last observation, so one extra
        # period is needed to land on the requested position.
        add_periods = value - (self._index.shape[0] - 1) + 1
        index = self._index
        if isinstance(self._index, pd.PeriodIndex):
            pr = pd.period_range(
                index[-1], freq=self._index_freq, periods=add_periods
            )
            return pr[-1].to_timestamp()
        dr = pd.date_range(
            index[-1], freq=self._index_freq, periods=add_periods
        )
        return dr[-1]

    def range(
        self,
        start: Union[IntLike, DateLike, str],
        stop: Union[IntLike, DateLike, str],
    ) -> pd.DataFrame:
        """
        Deterministic terms spanning a range of observations

        Parameters
        ----------
        start : {int, str, dt.datetime, pd.Timestamp, np.datetime64}
            The first observation.
        stop : {int, str, dt.datetime, pd.Timestamp, np.datetime64}
            The final observation. Inclusive to match most prediction
            function in statsmodels.

        Returns
        -------
        DataFrame
            A data frame of deterministic terms

        Raises
        ------
        TypeError
            If the index of the process does not support extension.
        ValueError
            If start is before the first observation in the index.
        """
        if not self._extendable:
            raise TypeError(
                """The index in the deterministic process does not \
support extension. Only PeriodIndex, DatetimeIndex with a frequency, \
RangeIndex, and integral Indexes that start at 0 and have only unit \
differences can be extended when producing out-of-sample forecasts.
"""
            )
        if isinstance(self._index, pd.RangeIndex) or is_int_index(self._index):
            start = required_int_like(start, "start")
            stop = required_int_like(stop, "stop")
            # Add 1 to ensure that the end point is inclusive
            stop += 1
            return self._range_from_range_index(start, stop)
        if isinstance(start, (int, np.integer)):
            start = self._int_to_timestamp(start, "start")
        else:
            start = pd.Timestamp(start)
        if isinstance(stop, (int, np.integer)):
            stop = self._int_to_timestamp(stop, "stop")
        else:
            stop = pd.Timestamp(stop)
        return self._range_from_time_index(start, stop)

    def _validate_index(self) -> None:
        # Record the index frequency (time-like indexes only) and whether
        # the index can be extended beyond its last observation. Any other
        # index type keeps the defaults assigned in __init__.
        if isinstance(self._index, pd.PeriodIndex):
            self._index_freq = self._index.freq
            self._extendable = True
        elif isinstance(self._index, pd.DatetimeIndex):
            self._index_freq = self._index.freq or self._index.inferred_freq
            self._extendable = self._index_freq is not None
        elif isinstance(self._index, pd.RangeIndex):
            self._extendable = True
        elif is_int_index(self._index):
            # Extendable only when the values are exactly 0, 1, 2, ...; an
            # empty index has no first element and is not extendable.
            self._extendable = bool(
                len(self._index) > 0
                and self._index[0] == 0
                and np.all(np.diff(self._index) == 1)
            )

    def apply(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> "DeterministicProcess":
        """
        Create an identical deterministic process with a different index

        Parameters
        ----------
        index : index_like
            An index-like object. If not an index, it is converted to an
            index.

        Returns
        -------
        DeterministicProcess
            The deterministic process applied to a different index
        """
        return DeterministicProcess(
            index,
            period=self._period,
            constant=self._constant,
            order=self._order,
            seasonal=self._seasonal,
            fourier=self._fourier,
            additional_terms=self._additional_terms,
            drop=self._drop,
        )