from __future__ import annotations
from statsmodels.compat.pandas import (
PD_LT_2_2_0,
Appender,
is_int_index,
to_numpy,
)
from abc import ABC, abstractmethod
import datetime as dt
from typing import TYPE_CHECKING, Optional, Union
import numpy as np
import pandas as pd
from scipy.linalg import qr
from statsmodels.iolib.summary import d_or_f
from statsmodels.tools.validation import (
bool_like,
float_like,
required_int_like,
string_like,
)
from statsmodels.tsa.tsatools import freq_to_period
if TYPE_CHECKING:
from collections.abc import Hashable, Sequence
# Scalar date-like types; used below to annotate a base period argument.
DateLike = Union[dt.datetime, pd.Timestamp, np.datetime64]
# Python or NumPy integer scalars.
IntLike = Union[int, np.integer]
# Message for a start value that precedes the first observation in an index.
# NOTE(review): not referenced in the visible part of this file -- confirm use.
START_BEFORE_INDEX_ERR = """\
start is less than the first observation in the index. Values can only be \
created for observations after the start of the index.
"""
[docs]
class DeterministicTerm(ABC):
    """
    Abstract Base Class for all Deterministic Terms

    Subclasses implement ``in_sample``, ``out_of_sample``, ``__str__`` and
    ``_eq_attr``. Hashing combines the class name with ``_eq_attr`` and
    equality compares ``_eq_attr`` element by element.
    """

    # Set _is_dummy if the term is a dummy variable process
    _is_dummy = False

    @property
    def is_dummy(self) -> bool:
        """Flag indicating whether the values produced are dummy variables"""
        return self._is_dummy

    @abstractmethod
    def in_sample(self, index: Sequence[Hashable]) -> pd.DataFrame:
        """
        Produce deterministic trends for in-sample fitting.

        Parameters
        ----------
        index : index_like
            An index-like object. If not an index, it is converted to an
            index.

        Returns
        -------
        DataFrame
            A DataFrame containing the deterministic terms.
        """

    @abstractmethod
    def out_of_sample(
        self,
        steps: int,
        index: Sequence[Hashable],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        """
        Produce deterministic trends for out-of-sample forecasts

        Parameters
        ----------
        steps : int
            The number of steps to forecast
        index : index_like
            An index-like object. If not an index, it is converted to an
            index.
        forecast_index : index_like
            An Index or index-like object to use for the forecasts. If
            provided must have steps elements.

        Returns
        -------
        DataFrame
            A DataFrame containing the deterministic terms.
        """

    @abstractmethod
    def __str__(self) -> str:
        """A meaningful string representation of the term"""

    def __hash__(self) -> int:
        # The class name is part of the hash so that different term types
        # with identical attribute tuples do not share a hash value.
        name: tuple[Hashable, ...] = (type(self).__name__,)
        return hash(name + self._eq_attr)

    @property
    @abstractmethod
    def _eq_attr(self) -> tuple[Hashable, ...]:
        """tuple of attributes that are used for equality comparison"""

    @staticmethod
    def _index_like(index: Sequence[Hashable]) -> pd.Index:
        """
        Return ``index`` as a pandas Index.

        Parameters
        ----------
        index : index_like
            A pandas Index, which is returned unchanged, or any object
            accepted by the ``pd.Index`` constructor.

        Returns
        -------
        Index
            The input as a pandas Index.

        Raises
        ------
        TypeError
            If ``pd.Index`` cannot be constructed from ``index``.
        """
        if isinstance(index, pd.Index):
            return index
        try:
            return pd.Index(index)
        except Exception as exc:
            raise TypeError(
                "index must be a pandas Index or index-like"
            ) from exc

    @staticmethod
    def _extend_index(
        index: pd.Index,
        steps: int,
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.Index:
        """
        Extend the forecast index

        Parameters
        ----------
        index : Index
            The in-sample index to extend.
        steps : int
            The number of out-of-sample observations.
        forecast_index : index_like, default None
            If provided, it is converted to an Index and returned after
            checking that it contains ``steps`` elements.

        Returns
        -------
        Index
            An index with ``steps`` elements that follows ``index``.

        Raises
        ------
        ValueError
            If ``forecast_index`` does not have ``steps`` elements.
        """
        if forecast_index is not None:
            forecast_index = DeterministicTerm._index_like(forecast_index)
            assert isinstance(forecast_index, pd.Index)
            if forecast_index.shape[0] != steps:
                raise ValueError(
                    "The number of values in forecast_index "
                    f"({forecast_index.shape[0]}) must match steps ({steps})."
                )
            return forecast_index
        if isinstance(index, pd.PeriodIndex):
            return pd.period_range(
                index[-1] + 1, periods=steps, freq=index.freq
            )
        elif isinstance(index, pd.DatetimeIndex) and index.freq is not None:
            # Generate two dates from the last observation so that the
            # second one is the first out-of-sample date.
            next_obs = pd.date_range(index[-1], freq=index.freq, periods=2)[1]
            return pd.date_range(next_obs, freq=index.freq, periods=steps)
        elif isinstance(index, pd.RangeIndex):
            step = index.step
            # ``stop`` is only an exclusive bound and need not be aligned
            # with ``step`` (e.g. RangeIndex(0, 9, 2) ends at 8), so the
            # extension continues from the last element. An empty index
            # has no last element and falls back to ``stop``.
            if len(index):
                start = int(index[-1]) + step
            else:
                start = index.stop
            stop = start + step * steps
            return pd.RangeIndex(start, stop, step=step)
        elif is_int_index(index) and np.all(np.diff(index) == 1):
            idx_arr = np.arange(index[-1] + 1, index[-1] + steps + 1)
            return pd.Index(idx_arr)
        # default range index
        import warnings

        warnings.warn(
            "Only PeriodIndexes, DatetimeIndexes with a frequency set, "
            "RangesIndexes, and Index with a unit increment support "
            "extending. The index is set will contain the position relative "
            "to the data length.",
            UserWarning,
            stacklevel=2,
        )
        nobs = index.shape[0]
        return pd.RangeIndex(nobs + 1, nobs + steps + 1)

    def __repr__(self) -> str:
        return self.__str__() + f" at 0x{id(self):0x}"

    def __eq__(self, other: object) -> bool:
        # Only instances of the same class (or a subclass) can be equal.
        if isinstance(other, type(self)):
            own_attr = self._eq_attr
            oth_attr = other._eq_attr
            if len(own_attr) != len(oth_attr):
                return False
            return all(a == b for a, b in zip(own_attr, oth_attr))
        else:
            return False
[docs]
class TimeTrendDeterministicTerm(DeterministicTerm, ABC):
    """
    Abstract Base Class for all Time Trend Deterministic Terms

    Parameters
    ----------
    constant : bool
        Flag indicating whether a constant should be included.
    order : int
        The highest power of the time trend to include. Powers
        1, 2, ..., order are generated.
    """

    def __init__(self, constant: bool = True, order: int = 0) -> None:
        self._constant = bool_like(constant, "constant")
        self._order = required_int_like(order, "order")

    @property
    def constant(self) -> bool:
        """Flag indicating that a constant is included"""
        return self._constant

    @property
    def order(self) -> int:
        """Order of the time trend"""
        return self._order

    @property
    def _columns(self) -> list[str]:
        """Column names: an optional "const" followed by one per power"""
        columns = []
        trend_names = {1: "trend", 2: "trend_squared", 3: "trend_cubed"}
        if self._constant:
            columns.append("const")
        for power in range(1, self._order + 1):
            if power in trend_names:
                columns.append(trend_names[power])
            else:
                columns.append(f"trend**{power}")
        return columns

    def _get_terms(self, locs: np.ndarray) -> np.ndarray:
        """
        Raise the time locations to the powers implied by the term.

        Parameters
        ----------
        locs : ndarray
            The time locations. The array is tiled along its second axis,
            so a 2-d column is expected (callers in this file pass arrays
            built with ``[:, None]``).

        Returns
        -------
        ndarray
            One column per term: power 0 for the constant, if included,
            followed by powers 1, ..., order.
        """
        nterms = int(self._constant) + self._order
        terms = np.tile(locs, (1, nterms))
        # The exponent of the constant column, when present, stays at 0
        power = np.zeros((1, nterms), dtype=int)
        power[0, int(self._constant) :] = np.arange(1, self._order + 1)
        terms **= power
        return terms

    def __str__(self) -> str:
        terms = []
        if self._constant:
            terms.append("Constant")
        if self._order:
            # Powers 1, ..., order are included, so order is the upper end
            terms.append(f"Powers 1 to {self._order}")
        if not terms:
            terms = ["Empty"]
        terms_str = ",".join(terms)
        return f"TimeTrend({terms_str})"
[docs]
class TimeTrend(TimeTrendDeterministicTerm):
    """
    Constant and time trend deterministic terms

    Parameters
    ----------
    constant : bool
        Flag indicating whether a constant should be included.
    order : int
        A non-negative int containing the powers to include (1, 2, ..., order).

    See Also
    --------
    DeterministicProcess
    Seasonality
    Fourier
    CalendarTimeTrend

    Examples
    --------
    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import TimeTrend
    >>> data = sunspots.load_pandas().data
    >>> trend_gen = TimeTrend(True, 3)
    >>> trend_gen.in_sample(data.index)
    """

    def __init__(self, constant: bool = True, order: int = 0) -> None:
        super().__init__(constant, order)

    @classmethod
    def from_string(cls, trend: str) -> "TimeTrend":
        """
        Create a TimeTrend from a string description.

        Provided for compatibility with common string names.

        Parameters
        ----------
        trend : {"n", "c", "t", "ct", "ctt"}
            The string representation of the time trend. The terms are:

            * "n": No trend terms
            * "c": A constant only
            * "t": Linear time trend only
            * "ct": A constant and a time trend
            * "ctt": A constant, a time trend and a quadratic time trend

        Returns
        -------
        TimeTrend
            The TimeTrend instance.
        """
        # "tt" must be tested first since it also contains "t"
        if "tt" in trend:
            trend_order = 2
        else:
            trend_order = 1 if "t" in trend else 0
        return cls(constant=trend.startswith("c"), order=trend_order)

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        # Time is measured as 1, 2, ..., nobs and stored as a column
        time = np.arange(1, index.shape[0] + 1, dtype=np.double)
        values = self._get_terms(time.reshape(-1, 1))
        return pd.DataFrame(values, columns=self._columns, index=index)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        first = index.shape[0] + 1
        fcast_index = self._extend_index(index, steps, forecast_index)
        # Continue counting from the end of the in-sample period
        time = np.arange(first, first + steps, dtype=np.double)
        values = self._get_terms(time.reshape(-1, 1))
        return pd.DataFrame(values, columns=self._columns, index=fcast_index)

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._constant, self._order
[docs]
class Seasonality(DeterministicTerm):
    """
    Seasonal dummy deterministic terms

    Parameters
    ----------
    period : int
        The length of a full cycle. Must be >= 2.
    initial_period : int
        The seasonal index of the first observation. 1-indexed so must
        be in {1, 2, ..., period}.

    See Also
    --------
    DeterministicProcess
    TimeTrend
    Fourier
    CalendarSeasonality

    Examples
    --------
    Solar data has an 11-year cycle

    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import Seasonality
    >>> data = sunspots.load_pandas().data
    >>> seas_gen = Seasonality(11)
    >>> seas_gen.in_sample(data.index)

    To start at a season other than 1

    >>> seas_gen = Seasonality(11, initial_period=4)
    >>> seas_gen.in_sample(data.index)
    """

    _is_dummy = True

    def __init__(self, period: int, initial_period: int = 1) -> None:
        self._period = required_int_like(period, "period")
        self._initial_period = required_int_like(
            initial_period, "initial_period"
        )
        if period < 2:
            raise ValueError("period must be >= 2")
        if self._initial_period < 1 or self._initial_period > period:
            raise ValueError("initial_period must be in {1, 2, ..., period}")

    @property
    def period(self) -> int:
        """The period of the seasonality"""
        return self._period

    @property
    def initial_period(self) -> int:
        """The seasonal index of the first observation"""
        return self._initial_period

    @classmethod
    def from_index(
        cls, index: Union[Sequence[Hashable], pd.DatetimeIndex, pd.PeriodIndex]
    ) -> "Seasonality":
        """
        Construct a seasonality directly from an index using its frequency.

        Parameters
        ----------
        index : {DatetimeIndex, PeriodIndex}
            An index with its frequency (`freq`) set.

        Returns
        -------
        Seasonality
            The initialized Seasonality instance.
        """
        index = cls._index_like(index)
        if not isinstance(index, (pd.PeriodIndex, pd.DatetimeIndex)):
            raise TypeError("index must be a DatetimeIndex or PeriodIndex")
        freq = index.freq
        # A DatetimeIndex without an explicit freq may still have one inferred
        if isinstance(index, pd.DatetimeIndex) and not freq:
            freq = index.inferred_freq
        if freq is None:
            raise ValueError("index must have a freq or inferred_freq set")
        return cls(period=freq_to_period(freq))

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._initial_period

    def __str__(self) -> str:
        return f"Seasonality(period={self._period})"

    @property
    def _columns(self) -> list[str]:
        n_seasons = self._period
        return [f"s({j},{n_seasons})" for j in range(1, n_seasons + 1)]

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        rows = np.arange(index.shape[0])
        dummies = np.zeros((rows.shape[0], self._period))
        # Observation r belongs to season (r + initial_period - 1) mod period
        season = (rows + (self._initial_period - 1)) % self._period
        dummies[rows, season] = 1
        return pd.DataFrame(dummies, columns=self._columns, index=index)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        rows = np.arange(steps)
        dummies = np.zeros((steps, self._period))
        # The seasonal cycle continues from the end of the in-sample index
        shift = index.shape[0] + self._initial_period - 1
        season = (rows + shift) % self._period
        dummies[rows, season] = 1
        return pd.DataFrame(dummies, columns=self._columns, index=fcast_index)
[docs]
class FourierDeterministicTerm(DeterministicTerm, ABC):
    """
    Abstract Base Class for all Fourier Deterministic Terms

    Parameters
    ----------
    order : int
        The number of sine and cosine pairs to generate.
    """

    def __init__(self, order: int) -> None:
        self._order = required_int_like(order, "terms")

    @property
    def order(self) -> int:
        """The order of the Fourier terms included"""
        return self._order

    def _get_terms(self, locs: np.ndarray) -> np.ndarray:
        """
        Compute the sine and cosine terms at the given locations.

        Parameters
        ----------
        locs : ndarray
            Locations measured in cycles; they are multiplied by 2 * pi
            before the trigonometric functions are applied.

        Returns
        -------
        ndarray
            Array with 2 * order columns ordered sin(1), cos(1), sin(2),
            cos(2), ...
        """
        angle = 2 * np.pi * locs.astype(np.double)
        out = np.empty((angle.shape[0], 2 * self._order))
        for k in range(1, self._order + 1):
            harmonic = k * angle
            # Even columns hold the sine, odd columns the matching cosine
            out[:, 2 * k - 2] = np.sin(harmonic)
            out[:, 2 * k - 1] = np.cos(harmonic)
        return out
[docs]
class Fourier(FourierDeterministicTerm):
    r"""
    Fourier series deterministic terms

    Parameters
    ----------
    period : float
        The length of a full cycle.
    order : int
        The number of Fourier components to include. Must satisfy
        2 * order <= period.

    See Also
    --------
    DeterministicProcess
    TimeTrend
    Seasonality
    CalendarFourier

    Notes
    -----
    Both a sine and a cosine term are included for each i=1, ..., order

    .. math::

       f_{i,s,t} & = \sin\left(2 \pi i \times \frac{t}{m} \right) \\
       f_{i,c,t} & = \cos\left(2 \pi i \times \frac{t}{m} \right)

    where m is the length of the period.

    Examples
    --------
    Solar data has an 11-year cycle

    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import Fourier
    >>> data = sunspots.load_pandas().data
    >>> fourier_gen = Fourier(11, order=2)
    >>> fourier_gen.in_sample(data.index)
    """

    _is_dummy = False

    def __init__(self, period: float, order: int):
        super().__init__(order)
        self._period = float_like(period, "period")
        if self._period < 2 * self._order:
            raise ValueError("2 * order must be <= period")

    @property
    def period(self) -> float:
        """The period of the Fourier terms"""
        return self._period

    @property
    def _columns(self) -> list[str]:
        label = d_or_f(self._period).strip()
        return [
            f"{func}({k},{label})"
            for k in range(1, self._order + 1)
            for func in ("sin", "cos")
        ]

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        # Positions 0, ..., nobs - 1 expressed in cycles
        cycles = np.arange(index.shape[0]) / self._period
        values = self._get_terms(cycles)
        return pd.DataFrame(values, index=index, columns=self._columns)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        first = index.shape[0]
        # Positions continue from the end of the in-sample index
        cycles = np.arange(first, first + steps) / self._period
        values = self._get_terms(cycles)
        return pd.DataFrame(values, index=fcast_index, columns=self._columns)

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._order

    def __str__(self) -> str:
        return f"Fourier(period={self._period}, order={self._order})"
[docs]
class CalendarDeterministicTerm(DeterministicTerm, ABC):
    """
    Abstract Base Class for calendar deterministic terms

    Parameters
    ----------
    freq : str
        A frequency that ``pd.date_range`` accepts.
    """

    def __init__(self, freq: str) -> None:
        # Building a one-element date range lets pandas parse the frequency
        try:
            parsed = pd.date_range("2020-01-01", freq=freq, periods=1)
            self._freq = parsed.freq
        except ValueError as exc:
            raise ValueError("freq is not understood by pandas") from exc

    @property
    def freq(self) -> str:
        """The frequency of the deterministic terms"""
        return self._freq.freqstr

    def _compute_ratio(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """
        Compute the position of each observation within its period.

        Parameters
        ----------
        index : {DatetimeIndex, PeriodIndex}
            The time stamps. A PeriodIndex is converted to time stamps.

        Returns
        -------
        ndarray
            The time elapsed since the start of the containing period of
            frequency ``freq`` divided by the length of that period.
        """
        if isinstance(index, pd.PeriodIndex):
            index = index.to_timestamp()
        periods = index.to_period(self._freq)
        period_start = periods.to_timestamp()
        elapsed = index - period_start
        length = (periods + 1).to_timestamp() - period_start
        return to_numpy(elapsed) / to_numpy(length)

    def _check_index_type(
        self,
        index: pd.Index,
        allowed: Union[type, tuple[type, ...]] = (
            pd.DatetimeIndex,
            pd.PeriodIndex,
        ),
    ) -> Union[pd.DatetimeIndex, pd.PeriodIndex]:
        """
        Verify that the index is an instance of one of the allowed types.

        Parameters
        ----------
        index : Index
            The index to check.
        allowed : {type, tuple[type]}
            The accepted index type or types.

        Returns
        -------
        {DatetimeIndex, PeriodIndex}
            The index that was passed in.

        Raises
        ------
        TypeError
            If ``index`` is not an instance of an allowed type.
        """
        if isinstance(allowed, type):
            allowed = (allowed,)
        if isinstance(index, allowed):
            assert isinstance(index, (pd.DatetimeIndex, pd.PeriodIndex))
            return index
        # Build a readable list of type names for the error message
        if len(allowed) == 1:
            allowed_types = "a " + allowed[0].__name__
        else:
            leading = [kind.__name__ for kind in allowed[:-1]]
            allowed_types = ", ".join(leading)
            if len(allowed) > 2:
                allowed_types += ","
            allowed_types += " and " + allowed[-1].__name__
        raise TypeError(
            f"{type(self).__name__} terms can only be computed from "
            f"{allowed_types}"
        )
[docs]
class CalendarFourier(CalendarDeterministicTerm, FourierDeterministicTerm):
    r"""
    Fourier series deterministic terms based on calendar time

    Parameters
    ----------
    freq : str
        A string convertible to a pandas frequency.
    order : int
        The number of Fourier components to include.

    See Also
    --------
    DeterministicProcess
    CalendarTimeTrend
    CalendarSeasonality
    Fourier

    Notes
    -----
    Both a sine and a cosine term are included for each i=1, ..., order

    .. math::

       f_{i,s,t} & = \sin\left(2 \pi i \tau_t \right) \\
       f_{i,c,t} & = \cos\left(2 \pi i \tau_t \right)

    where :math:`\tau_t` is the frequency normalized time. For example,
    when freq is "D" then an observation with a timestamp of 12:00:00
    would have :math:`\tau_t=0.5`.

    Examples
    --------
    Here we simulate irregularly spaced hourly data and construct the calendar
    Fourier terms for the data.

    >>> import numpy as np
    >>> import pandas as pd
    >>> base = pd.Timestamp("2020-1-1")
    >>> gen = np.random.default_rng()
    >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000))
    >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps]
    >>> index = pd.DatetimeIndex(pd.to_datetime(times))

    >>> from statsmodels.tsa.deterministic import CalendarFourier
    >>> cal_fourier_gen = CalendarFourier("D", 2)
    >>> cal_fourier_gen.in_sample(index)
    """

    def __init__(self, freq: str, order: int) -> None:
        # The calendar base parses freq; the Fourier base validates and
        # stores order, so no further assignment is needed here.
        super().__init__(freq)
        FourierDeterministicTerm.__init__(self, order)

    @property
    def _columns(self) -> list[str]:
        freq_label = self._freq.freqstr
        return [
            f"{func}({k},freq={freq_label})"
            for k in range(1, self._order + 1)
            for func in ("sin", "cos")
        ]

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._check_index_type(self._index_like(index))
        values = self._get_terms(self._compute_ratio(index))
        return pd.DataFrame(values, index=index, columns=self._columns)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        # Only the forecast index must be date-like since it alone is used
        self._check_index_type(fcast_index)
        assert isinstance(fcast_index, (pd.DatetimeIndex, pd.PeriodIndex))
        values = self._get_terms(self._compute_ratio(fcast_index))
        return pd.DataFrame(values, index=fcast_index, columns=self._columns)

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._freq.freqstr, self._order

    def __str__(self) -> str:
        return f"Fourier(freq={self._freq.freqstr}, order={self._order})"
[docs]
class CalendarSeasonality(CalendarDeterministicTerm):
    """
    Seasonal dummy deterministic terms based on calendar time

    Parameters
    ----------
    freq : str
        The frequency of the seasonal effect.
    period : str
        The pandas frequency string describing the full period.

    See Also
    --------
    DeterministicProcess
    CalendarTimeTrend
    CalendarFourier
    Seasonality

    Examples
    --------
    Here we simulate irregularly spaced data (in time) and hourly seasonal
    dummies for the data.

    >>> import numpy as np
    >>> import pandas as pd
    >>> base = pd.Timestamp("2020-1-1")
    >>> gen = np.random.default_rng()
    >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000))
    >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps]
    >>> index = pd.DatetimeIndex(pd.to_datetime(times))

    >>> from statsmodels.tsa.deterministic import CalendarSeasonality
    >>> cal_seas_gen = CalendarSeasonality("H", "D")
    >>> cal_seas_gen.in_sample(index)
    """

    _is_dummy = True

    # Maps each period to the supported freq values and, for each, the
    # number of freq intervals in one full period (out_of: freq). The
    # accepted aliases depend on the installed pandas version.
    if PD_LT_2_2_0:
        _supported = {
            "W": {"B": 5, "D": 7, "h": 24 * 7, "H": 24 * 7},
            "D": {"h": 24, "H": 24},
            "Q": {"MS": 3, "M": 3},
            "A": {"MS": 12, "M": 12},
            "Y": {"MS": 12, "Q": 4, "M": 12},
        }
    else:
        _supported = {
            "W": {"B": 5, "D": 7, "h": 24 * 7},
            "D": {"h": 24},
            "Q": {"MS": 3, "ME": 3},
            "A": {"MS": 12, "ME": 12, "QE": 4},
            "Y": {"MS": 12, "ME": 12, "QE": 4},
            "QE": {"ME": 3},
            "YE": {"ME": 12, "QE": 4},
        }

    def __init__(self, freq: str, period: str) -> None:
        # Every freq that appears under any period is a candidate value
        freq_options: set[str] = {
            key for table in self._supported.values() for key in table
        }
        period_options = tuple(self._supported)
        freq = string_like(
            freq, "freq", options=tuple(freq_options), lower=False
        )
        period = string_like(
            period, "period", options=period_options, lower=False
        )
        if freq not in self._supported[period]:
            raise ValueError(
                f"The combination of freq={freq} and "
                f"period={period} is not supported."
            )
        super().__init__(freq)
        self._period = period
        # Drop any anchor suffix that follows "-" in the frequency string
        self._freq_str = self._freq.freqstr.split("-")[0]

    @property
    def freq(self) -> str:
        """The frequency of the deterministic terms"""
        return self._freq.freqstr

    @property
    def period(self) -> str:
        """The full period"""
        return self._period

    def _weekly_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Location of each observation within its week"""
        freq_str = self._freq.freqstr
        if freq_str in ("h", "H"):
            return index.hour + 24 * index.dayofweek
        if freq_str == "D":
            return index.dayofweek
        # Remaining case is "B": every observation must be a business day
        bdays = pd.bdate_range("2000-1-1", periods=10).dayofweek.unique()
        loc = index.dayofweek
        if not loc.isin(bdays).all():
            raise ValueError(
                "freq is B but index contains days that are not business "
                "days."
            )
        return loc

    def _daily_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Location of each observation within its day"""
        return index.hour

    def _quarterly_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Location of each observation within its quarter"""
        return (index.month - 1) % 3

    def _annual_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Location of each observation within its year"""
        monthly = self._freq.freqstr in ("M", "ME", "MS")
        # Any non-monthly freq is treated as quarterly
        return index.month - 1 if monthly else index.quarter - 1

    def _get_terms(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        """Build the dummy array with a single 1 in each row"""
        locators = {
            "D": self._daily_to_loc,
            "W": self._weekly_to_loc,
            "Q": self._quarterly_to_loc,
            "QE": self._quarterly_to_loc,
        }
        # Every other period is handled as an annual period
        locs = locators.get(self._period, self._annual_to_loc)(index)
        nobs = locs.shape[0]
        width = self._supported[self._period][self._freq_str]
        dummies = np.zeros((nobs, width))
        dummies[np.arange(nobs), locs] = 1
        return dummies

    @property
    def _columns(self) -> list[str]:
        width = self._supported[self._period][self._freq_str]
        return [
            f"s({self._freq_str}={j}, period={self._period})"
            for j in range(1, width + 1)
        ]

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._check_index_type(self._index_like(index))
        dummies = self._get_terms(index)
        return pd.DataFrame(dummies, index=index, columns=self._columns)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        # Only the forecast index must be date-like since it alone is used
        self._check_index_type(fcast_index)
        assert isinstance(fcast_index, (pd.DatetimeIndex, pd.PeriodIndex))
        dummies = self._get_terms(fcast_index)
        return pd.DataFrame(dummies, index=fcast_index, columns=self._columns)

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._freq_str

    def __str__(self) -> str:
        return f"Seasonal(freq={self._freq_str})"
[docs]
class CalendarTimeTrend(CalendarDeterministicTerm, TimeTrendDeterministicTerm):
    r"""
    Constant and time trend deterministic terms based on calendar time

    Parameters
    ----------
    freq : str
        A string convertible to a pandas frequency.
    constant : bool
        Flag indicating whether a constant should be included.
    order : int
        A non-negative int containing the powers to include (1, 2, ..., order).
    base_period : {str, pd.Timestamp}, default None
        The base period to use when computing the time stamps. This value is
        treated as 1 and so all other time indices are defined as the number
        of periods since or before this time stamp. If not provided, defaults
        to pandas base period for a PeriodIndex.

    See Also
    --------
    DeterministicProcess
    CalendarFourier
    CalendarSeasonality
    TimeTrend

    Notes
    -----
    The time stamp, :math:`\tau_t`, is the number of periods that have elapsed
    since the base_period. :math:`\tau_t` may be fractional.

    Examples
    --------
    Here we simulate irregularly spaced hourly data and construct the calendar
    time trend terms for the data.

    >>> import numpy as np
    >>> import pandas as pd
    >>> base = pd.Timestamp("2020-1-1")
    >>> gen = np.random.default_rng()
    >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000))
    >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps]
    >>> index = pd.DatetimeIndex(pd.to_datetime(times))

    >>> from statsmodels.tsa.deterministic import CalendarTimeTrend
    >>> cal_trend_gen = CalendarTimeTrend("D", True, order=1)
    >>> cal_trend_gen.in_sample(index)

    Next, we normalize using the first time stamp

    >>> cal_trend_gen = CalendarTimeTrend("D", True, order=1,
    ...                                   base_period=index[0])
    >>> cal_trend_gen.in_sample(index)
    """

    def __init__(
        self,
        freq: str,
        constant: bool = True,
        order: int = 0,
        *,
        base_period: Optional[Union[str, DateLike]] = None,
    ) -> None:
        super().__init__(freq)
        TimeTrendDeterministicTerm.__init__(
            self, constant=constant, order=order
        )
        # Integer (asi8) value of the base period; 0 when none is given
        self._ref_i8 = 0
        if base_period is not None:
            pr = pd.period_range(base_period, periods=1, freq=self._freq)
            self._ref_i8 = pr.asi8[0]
        self._base_period = None if base_period is None else str(base_period)

    @property
    def base_period(self) -> Optional[str]:
        """The base period"""
        return self._base_period

    @classmethod
    def from_string(
        cls,
        freq: str,
        trend: str,
        base_period: Optional[Union[str, DateLike]] = None,
    ) -> "CalendarTimeTrend":
        """
        Create a CalendarTimeTrend from a string description.

        Provided for compatibility with common string names.

        Parameters
        ----------
        freq : str
            A string convertible to a pandas frequency.
        trend : {"n", "c", "t", "ct", "ctt"}
            The string representation of the time trend. The terms are:

            * "n": No trend terms
            * "c": A constant only
            * "t": Linear time trend only
            * "ct": A constant and a time trend
            * "ctt": A constant, a time trend and a quadratic time trend

        base_period : {str, pd.Timestamp}, default None
            The base period to use when computing the time stamps. This value
            is treated as 1 and so all other time indices are defined as the
            number of periods since or before this time stamp. If not
            provided, defaults to pandas base period for a PeriodIndex.

        Returns
        -------
        CalendarTimeTrend
            The CalendarTimeTrend instance.
        """
        constant = trend.startswith("c")
        order = 0
        # "tt" must be tested first since it also contains "t"
        if "tt" in trend:
            order = 2
        elif "t" in trend:
            order = 1
        return cls(freq, constant, order, base_period=base_period)

    def _terms(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex], ratio: np.ndarray
    ) -> pd.DataFrame:
        """
        Construct the trend terms from an index and within-period ratios.

        Parameters
        ----------
        index : {DatetimeIndex, PeriodIndex}
            The time stamps. A DatetimeIndex is converted to a PeriodIndex
            with frequency ``freq``, and the converted index labels the
            returned DataFrame.
        ratio : ndarray
            The fractional position of each observation within its period.

        Returns
        -------
        DataFrame
            The deterministic terms.
        """
        if isinstance(index, pd.DatetimeIndex):
            index = index.to_period(self._freq)
        index_i8 = index.asi8
        # Shift so that the base period has a whole-period count of 1
        index_i8 = index_i8 - self._ref_i8 + 1
        time = index_i8.astype(np.double) + ratio
        time = time[:, None]
        terms = self._get_terms(time)
        return pd.DataFrame(terms, columns=self._columns, index=index)

    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        index = self._check_index_type(index)
        ratio = self._compute_ratio(index)
        return self._terms(index, ratio)

    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        # Only the forecast index must be date-like since it alone is used
        self._check_index_type(fcast_index)
        assert isinstance(fcast_index, (pd.PeriodIndex, pd.DatetimeIndex))
        ratio = self._compute_ratio(fcast_index)
        return self._terms(fcast_index, ratio)

    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        attr: tuple[Hashable, ...] = (
            self._constant,
            self._order,
            self._freq.freqstr,
        )
        # The base period only takes part in equality when it was provided
        if self._base_period is not None:
            attr += (self._base_period,)
        return attr

    def __str__(self) -> str:
        value = TimeTrendDeterministicTerm.__str__(self)
        # Strip the closing parenthesis before appending each extra field
        value = "Calendar" + value[:-1] + f", freq={self._freq.freqstr})"
        if self._base_period is not None:
            value = value[:-1] + f", base_period={self._base_period})"
        return value
[docs]
class DeterministicProcess:
"""
Container class for deterministic terms.
Directly supports constants, time trends, and either seasonal dummies or
fourier terms for a single cycle. Additional deterministic terms beyond
the set that can be directly initialized through the constructor can be
added.
Parameters
----------
index : {Sequence[Hashable], pd.Index}
The index of the process. Should usually be the "in-sample" index when
used in forecasting applications.
period : {float, int}, default None
The period of the seasonal or fourier components. Must be an int for
seasonal dummies. If not provided, freq is read from index if
available.
constant : bool, default False
Whether to include a constant.
order : int, default 0
The order of the tim trend to include. For example, 2 will include
both linear and quadratic terms. 0 exclude time trend terms.
seasonal : bool = False
Whether to include seasonal dummies
fourier : int = 0
The order of the fourier terms to included.
additional_terms : Sequence[DeterministicTerm]
A sequence of additional deterministic terms to include in the process.
drop : bool, default False
A flag indicating to check for perfect collinearity and to drop any
linearly dependent terms.
See Also
--------
TimeTrend
Seasonality
Fourier
CalendarTimeTrend
CalendarSeasonality
CalendarFourier
Notes
-----
See the notebook `Deterministic Terms in Time Series Models
<../examples/notebooks/generated/deterministics.html>`__ for an overview.
Examples
--------
>>> from statsmodels.tsa.deterministic import DeterministicProcess
>>> from pandas import date_range
>>> index = date_range("2000-1-1", freq="M", periods=240)
First a determinstic process with a constant and quadratic time trend.
>>> dp = DeterministicProcess(index, constant=True, order=2)
>>> dp.in_sample().head(3)
const trend trend_squared
2000-01-31 1.0 1.0 1.0
2000-02-29 1.0 2.0 4.0
2000-03-31 1.0 3.0 9.0
Seasonal dummies are included by setting seasonal to True.
>>> dp = DeterministicProcess(index, constant=True, seasonal=True)
>>> dp.in_sample().iloc[:3,:5]
const s(2,12) s(3,12) s(4,12) s(5,12)
2000-01-31 1.0 0.0 0.0 0.0 0.0
2000-02-29 1.0 1.0 0.0 0.0 0.0
2000-03-31 1.0 0.0 1.0 0.0 0.0
Fourier components can be used to alternatively capture seasonal patterns,
>>> dp = DeterministicProcess(index, constant=True, fourier=2)
>>> dp.in_sample().head(3)
const sin(1,12) cos(1,12) sin(2,12) cos(2,12)
2000-01-31 1.0 0.000000 1.000000 0.000000 1.0
2000-02-29 1.0 0.500000 0.866025 0.866025 0.5
2000-03-31 1.0 0.866025 0.500000 0.866025 -0.5
Multiple Seasonalities can be captured using additional terms.
>>> from statsmodels.tsa.deterministic import Fourier
>>> index = date_range("2000-1-1", freq="D", periods=5000)
>>> fourier = Fourier(period=365.25, order=1)
>>> dp = DeterministicProcess(index, period=3, constant=True,
... seasonal=True, additional_terms=[fourier])
>>> dp.in_sample().head(3)
const s(2,3) s(3,3) sin(1,365.25) cos(1,365.25)
2000-01-01 1.0 0.0 0.0 0.000000 1.000000
2000-01-02 1.0 1.0 0.0 0.017202 0.999852
2000-01-03 1.0 0.0 1.0 0.034398 0.999408
"""
def __init__(
    self,
    index: Union[Sequence[Hashable], pd.Index],
    *,
    period: Optional[Union[float, int]] = None,
    constant: bool = False,
    order: int = 0,
    seasonal: bool = False,
    fourier: int = 0,
    additional_terms: Sequence[DeterministicTerm] = (),
    drop: bool = False,
):
    """
    Build the list of deterministic terms from the constructor options.

    The parameters are described in the class docstring.

    Raises
    ------
    ValueError
        If both ``seasonal`` and ``fourier`` are requested, or if a term
        in ``additional_terms`` equals a term already in the process.
    TypeError
        If an element of ``additional_terms`` is not an instance of
        DeterministicTerm.
    """
    if not isinstance(index, pd.Index):
        index = pd.Index(index)
    self._index = index
    self._deterministic_terms: list[DeterministicTerm] = []
    self._extendable = False
    self._index_freq = None
    # Fills in _index_freq and _extendable from the type of the index
    self._validate_index()
    period = float_like(period, "period", optional=True)
    self._constant = constant = bool_like(constant, "constant")
    self._order = required_int_like(order, "order")
    self._seasonal = seasonal = bool_like(seasonal, "seasonal")
    self._fourier = required_int_like(fourier, "fourier")
    additional_terms = tuple(additional_terms)
    self._cached_in_sample = None
    self._drop = bool_like(drop, "drop")
    self._additional_terms = additional_terms
    if constant or order:
        self._deterministic_terms.append(TimeTrend(constant, order))
    if seasonal and fourier:
        raise ValueError(
            "seasonal and fourier cannot both be initialized through the "
            "constructor since these will be necessarily perfectly "
            "collinear. Instead, you can pass additional components using "
            "the additional_terms input."
        )
    if (seasonal or fourier) and period is None:
        # No explicit period: derive one from the frequency of the index
        period = freq_to_period(self._index_freq)
    if seasonal:
        period = required_int_like(period, "period")
        self._deterministic_terms.append(Seasonality(period))
    elif fourier:
        period = float_like(period, "period")
        assert period is not None
        self._deterministic_terms.append(Fourier(period, order=fourier))
    for term in additional_terms:
        if not isinstance(term, DeterministicTerm):
            raise TypeError(
                "All additional terms must be instances of subsclasses "
                "of DeterministicTerm"
            )
        if term in self._deterministic_terms:
            raise ValueError(
                "One or more terms in additional_terms has been added "
                "through the parameters of the constructor. Terms must "
                "be unique."
            )
        self._deterministic_terms.append(term)
    # Stored after validation so that apply() can rebuild the process
    self._period = period
    # Column labels kept by in_sample(); None until in_sample() has run
    self._retain_cols: Optional[list[Hashable]] = None
@property
def index(self) -> pd.Index:
    """The pandas Index that the process was constructed with"""
    process_index = self._index
    return process_index
@property
def terms(self) -> list[DeterministicTerm]:
    """The list of deterministic terms that make up the process"""
    # The internal list itself is returned, not a copy
    included = self._deterministic_terms
    return included
def _adjust_dummies(self, terms: list[pd.DataFrame]) -> list[pd.DataFrame]:
    """
    Drop the leading column of dummy terms once a constant is present.

    Parameters
    ----------
    terms : list[DataFrame]
        One frame per deterministic term, in the same order as the terms
        of the process. The list is modified in place.

    Returns
    -------
    list[DataFrame]
        The same list, where the first column has been removed from each
        dummy term that follows a constant or an earlier dummy term.
    """
    trend_types = (TimeTrend, CalendarTimeTrend)
    has_const: Optional[bool] = None
    for dterm in self._deterministic_terms:
        if isinstance(dterm, trend_types):
            has_const = has_const or dterm.constant
    if has_const is None:
        # No trend term reported a constant, so look for a column that is
        # constant and non-zero in the data itself
        has_const = False
        for frame in terms:
            first_row = frame.iloc[0]
            const_col = (frame == first_row).all() & (first_row != 0)
            has_const = has_const or const_col.any()
    drop_first = has_const
    for pos, dterm in enumerate(self._deterministic_terms):
        if not dterm.is_dummy:
            continue
        if drop_first:
            # drop first
            terms[pos] = terms[pos].iloc[:, 1:]
        # Every dummy term after the first one seen loses its first column
        drop_first = True
    return terms
def _remove_zeros_ones(self, terms: pd.DataFrame) -> pd.DataFrame:
    """
    Remove all-zero columns and all but the first constant column.

    Parameters
    ----------
    terms : DataFrame
        The concatenated deterministic terms.

    Returns
    -------
    DataFrame
        The terms without columns that are identically zero and with at
        most one column whose minimum equals its maximum.
    """
    zero_cols = (terms == 0).all(axis=0)
    if zero_cols.any():
        terms = terms.loc[:, ~zero_cols]
    col_is_const = terms.min(axis=0) == terms.max(axis=0)
    if col_is_const.sum() > 1:
        # duplicated() marks every repeat of a flag value, so combining it
        # with the flag keeps only the first constant column
        repeats = col_is_const.duplicated()
        terms = terms.loc[:, ~(col_is_const & repeats)]
    return terms
[docs]
@Appender(DeterministicTerm.in_sample.__doc__)
def in_sample(self) -> pd.DataFrame:
    # The result is stored after the first call and reused afterwards
    cached = self._cached_in_sample
    if cached is not None:
        return cached
    index = self._index
    if not self._deterministic_terms:
        # No terms: a frame with one row per observation and no columns
        return pd.DataFrame(np.empty((index.shape[0], 0)), index=index)
    frames = [term.in_sample(index) for term in self._deterministic_terms]
    frames = self._adjust_dummies(frames)
    terms: pd.DataFrame = pd.concat(frames, axis=1)
    terms = self._remove_zeros_ones(terms)
    if self._drop:
        values = to_numpy(terms)
        ncols = values.shape[1]
        # Column-pivoted QR; only R and the pivot order are used
        factored = qr(values, mode="r", pivoting=True)
        r_factor, pivots = factored[0], factored[-1]
        diag_mag = np.abs(np.diag(r_factor))
        # Diagonal entries above this threshold count towards the rank
        tol = diag_mag[0] * ncols * np.finfo(float).eps
        rank = int((diag_mag > tol).sum())
        rpx = r_factor.T @ values
        keep = [0]
        last_rank = 1
        # Find the left-most columns that produce full rank
        for col in range(1, ncols):
            size = col + 1
            curr_rank = np.linalg.matrix_rank(rpx[:size, :size])
            if curr_rank > last_rank:
                keep.append(col)
                last_rank = curr_rank
            if curr_rank == rank:
                break
        if len(keep) != rank:
            # The left-to-right scan did not find enough columns, so use
            # the columns chosen by the pivoting, in their original order
            terms = terms.iloc[:, np.sort(pivots[:rank])]
        else:
            terms = terms.iloc[:, keep]
    # Remembered so that out_of_sample can select the same columns
    self._retain_cols = terms.columns
    self._cached_in_sample = terms
    return terms
[docs]
@Appender(DeterministicTerm.out_of_sample.__doc__)
def out_of_sample(
    self,
    steps: int,
    forecast_index: Optional[Union[Sequence[Hashable], pd.Index]] = None,
) -> pd.DataFrame:
    steps = required_int_like(steps, "steps")
    index = self._index
    if not self._deterministic_terms:
        # NOTE(review): this empty frame is indexed by the in-sample index
        # rather than by a forecast index with `steps` rows - confirm that
        # this is intended.
        return pd.DataFrame(np.empty((index.shape[0], 0)), index=index)
    if self._retain_cols is None:
        # The retained columns are only recorded by in_sample(). It must
        # run whether or not drop is set, since columns can also be
        # removed by the dummy and constant adjustments made there.
        self.in_sample()
    raw_terms = [
        term.out_of_sample(steps, index, forecast_index)
        for term in self._deterministic_terms
    ]
    terms: pd.DataFrame = pd.concat(raw_terms, axis=1)
    retain_cols = self._retain_cols
    assert retain_cols is not None
    if terms.shape[1] != len(retain_cols):
        # Keep only the columns that survived the in-sample adjustments
        terms = terms[retain_cols]
    return terms
def _extend_time_index(
    self,
    stop: pd.Timestamp,
) -> Union[pd.DatetimeIndex, pd.PeriodIndex]:
    """
    Build an index running from the first observation through stop.

    Parameters
    ----------
    stop : {Timestamp, Period}
        The final value of the extended index.

    Returns
    -------
    {DatetimeIndex, PeriodIndex}
        A period range when the index of the process is a PeriodIndex,
        and a date range otherwise.
    """
    index = self._index
    first = index[0]
    if not isinstance(index, pd.PeriodIndex):
        return pd.date_range(start=first, end=stop, freq=self._index_freq)
    return pd.period_range(first, end=stop, freq=index.freq)
def _range_from_range_index(self, start: int, stop: int) -> pd.DataFrame:
    """
    Deterministic terms for integer positions from start up to stop.

    Parameters
    ----------
    start : int
        The first index value to include.
    stop : int
        The end of the range. It is used as an exclusive bound when the
        new index is constructed.

    Returns
    -------
    DataFrame
        The in-sample values, the out-of-sample values, or both stacked
        vertically, depending on where the range lies relative to the
        end of the index.

    Raises
    ------
    ValueError
        If start is before the first index value, or if the step of the
        index is not 1 and start is not on the grid defined by the index.
    """
    index = self._index
    is_int64_index = is_int_index(index)
    is_range = isinstance(index, pd.RangeIndex)
    assert is_range or is_int64_index
    first = index[0]
    last = index[-1]
    if start < first:
        raise ValueError(START_BEFORE_INDEX_ERR)
    if is_range:
        idx_step = index.step
    elif len(index) > 1:
        idx_step = np.diff(index).max()
    else:
        idx_step = 1
    if idx_step != 1:
        if ((start - first) % idx_step) != 0:
            raise ValueError(
                f"The step of the index is not 1 (actual step={idx_step})."
                " start must be in the sequence that would have been "
                "generated by the index."
            )
    if is_int64_index:
        new_idx = pd.Index(np.arange(start, stop))
    else:
        new_idx = pd.RangeIndex(start, stop, step=idx_step)

    if new_idx[-1] <= last:
        # In-sample only
        return self.in_sample().loc[new_idx]

    if new_idx[0] > last:
        # Out of-sample only
        next_value = last + idx_step
        if new_idx[0] == next_value:
            return self.out_of_sample(
                new_idx.shape[0], forecast_index=new_idx
            )
        # The request starts beyond the next observation: forecast from
        # the next observation onwards and then select the requested rows
        tmp = pd.RangeIndex(next_value, stop, step=idx_step)
        oos = self.out_of_sample(tmp.shape[0], forecast_index=tmp)
        return oos.loc[new_idx]

    # Using some from each in and out of sample
    within = new_idx <= last
    in_sample_idx = new_idx[within]
    out_of_sample_idx = new_idx[~within]
    in_sample_exog = self.in_sample().loc[in_sample_idx]
    oos_exog = self.out_of_sample(
        steps=out_of_sample_idx.shape[0], forecast_index=out_of_sample_idx
    )
    return pd.concat([in_sample_exog, oos_exog], axis=0)
def _range_from_time_index(
    self, start: pd.Timestamp, stop: pd.Timestamp
) -> pd.DataFrame:
    """
    Deterministic terms between two points of a date-like index.

    Parameters
    ----------
    start : Timestamp
        The first observation to include.
    stop : Timestamp
        The final observation to include.

    Returns
    -------
    DataFrame
        The rows labelled start through stop, taken from the in-sample
        values, the out-of-sample values, or both.

    Raises
    ------
    ValueError
        If start is before the first observation in the index.
    """
    index = self._index
    if isinstance(index, pd.PeriodIndex):
        # Timestamps are converted so they compare against Period labels
        freq = self._index_freq
        if isinstance(start, pd.Timestamp):
            start = start.to_period(freq=freq)
        if isinstance(stop, pd.Timestamp):
            stop = stop.to_period(freq=freq)
    if start < index[0]:
        raise ValueError(START_BEFORE_INDEX_ERR)
    last = index[-1]
    window = slice(start, stop)
    if stop <= last:
        return self.in_sample().loc[window]
    extended = self._extend_time_index(stop)
    oos_idx = extended[extended > last]
    oos = self.out_of_sample(oos_idx.shape[0], oos_idx)
    if start >= oos_idx[0]:
        return oos.loc[window]
    stacked = pd.concat([self.in_sample(), oos], axis=0)
    return stacked.loc[window]
def _int_to_timestamp(self, value: int, name: str) -> pd.Timestamp:
    """
    Translate an integer position into a label of the time index.

    Parameters
    ----------
    value : int
        The position, counted from the first observation.
    name : str
        The name of the argument, used in the error message.

    Returns
    -------
    Timestamp
        The element of the index at ``value`` when it is in-sample.
        Otherwise the label reached by extending the index past its
        final observation using the frequency of the index.

    Raises
    ------
    ValueError
        If value is negative.
    """
    if value < 0:
        raise ValueError(f"{name} must be non-negative.")
    index = self._index
    nobs = index.shape[0]
    if value < nobs:
        return index[value]
    # The generated range starts at the last observation, so it needs one
    # more element than the number of steps beyond the sample
    add_periods = value - (nobs - 1) + 1
    last_obs = index[-1]
    freq = self._index_freq
    if isinstance(index, pd.PeriodIndex):
        periods = pd.period_range(last_obs, freq=freq, periods=add_periods)
        return periods[-1].to_timestamp()
    dates = pd.date_range(last_obs, freq=freq, periods=add_periods)
    return dates[-1]
[docs]
def range(
    self,
    start: Union[IntLike, DateLike, str],
    stop: Union[IntLike, DateLike, str],
) -> pd.DataFrame:
    """
    Deterministic terms covering the observations from start through stop

    Parameters
    ----------
    start : {int, str, dt.datetime, pd.Timestamp, np.datetime64}
        The first observation to include.
    stop : {int, str, dt.datetime, pd.Timestamp, np.datetime64}
        The last observation to include. The end point is inclusive,
        which matches most prediction functions in statsmodels.

    Returns
    -------
    DataFrame
        A data frame of deterministic terms

    Raises
    ------
    TypeError
        If the index of the process does not support extension.
    """
    if not self._extendable:
        raise TypeError(
            """The index in the deterministic process does not \
support extension. Only PeriodIndex, DatetimeIndex with a frequency, \
RangeIndex, and integral Indexes that start at 0 and have only unit \
differences can be extended when producing out-of-sample forecasts.
"""
        )
    index = self._index
    if type(index) is pd.RangeIndex or is_int_index(index):
        first = required_int_like(start, "start")
        # Add 1 to ensure that the end point is inclusive
        last = required_int_like(stop, "stop") + 1
        return self._range_from_range_index(first, last)

    # Date-like index: integers are positions, anything else is parsed
    int_types = (int, np.integer)
    if isinstance(start, int_types):
        begin = self._int_to_timestamp(start, "start")
    else:
        begin = pd.Timestamp(start)
    if isinstance(stop, int_types):
        end = self._int_to_timestamp(stop, "stop")
    else:
        end = pd.Timestamp(stop)
    return self._range_from_time_index(begin, end)
def _validate_index(self) -> None:
    """
    Record the frequency of the index and whether it can be extended.

    Sets ``_index_freq`` for PeriodIndex and DatetimeIndex inputs and sets
    ``_extendable`` for every index type that is recognized. Both
    attributes are left untouched for any other index.
    """
    index = self._index
    if isinstance(index, pd.PeriodIndex):
        self._index_freq = index.freq
        self._extendable = True
        return
    if isinstance(index, pd.DatetimeIndex):
        # Fall back to the inferred frequency when none is attached
        self._index_freq = index.freq or index.inferred_freq
        self._extendable = self._index_freq is not None
        return
    if isinstance(index, pd.RangeIndex):
        self._extendable = True
        return
    if is_int_index(index):
        # Extendable only when the values are exactly 0, 1, 2, ...
        self._extendable = index[0] == 0 and np.all(np.diff(index) == 1)
[docs]
def apply(self, index):
    """
    Create an identical deterministic process with a different index

    Parameters
    ----------
    index : index_like
        An index-like object. If not an index, it is converted to an
        index.

    Returns
    -------
    DeterministicProcess
        The deterministic process applied to a different index
    """
    # Reuse the settings stored by the constructor of this instance
    settings = dict(
        period=self._period,
        constant=self._constant,
        order=self._order,
        seasonal=self._seasonal,
        fourier=self._fourier,
        additional_terms=self._additional_terms,
        drop=self._drop,
    )
    return DeterministicProcess(index, **settings)