Source code for statsmodels.tsa.deterministic

from statsmodels.compat.pandas import (
    PD_LT_2_2_0,
    Appender,
    is_int_index,
    to_numpy,
)

from abc import ABC, abstractmethod
import datetime as dt
from typing import Optional, Union
from collections.abc import Hashable, Sequence

import numpy as np
import pandas as pd
from scipy.linalg import qr

from statsmodels.iolib.summary import d_or_f
from statsmodels.tools.validation import (
    bool_like,
    float_like,
    required_int_like,
    string_like,
)
from statsmodels.tsa.tsatools import freq_to_period

DateLike = Union[dt.datetime, pd.Timestamp, np.datetime64]
IntLike = Union[int, np.integer]


START_BEFORE_INDEX_ERR = """\
start is less than the first observation in the index. Values can only be \
created for observations after the start of the index.
"""



[docs]
class DeterministicTerm(ABC):
    """Abstract Base Class for all Deterministic Terms"""

    # Set _is_dummy if the term is a dummy variable process
    _is_dummy = False

    @property
    def is_dummy(self) -> bool:
        """Flag indicating whether the values produced are dummy variables"""
        return self._is_dummy


[docs]
    @abstractmethod
    def in_sample(self, index: Sequence[Hashable]) -> pd.DataFrame:
        """
        Produce deterministic trends for in-sample fitting.

        Parameters
        ----------
        index : index_like
            An index-like object. If not an index, it is converted to an
            index.

        Returns
        -------
        DataFrame
            A DataFrame containing the deterministic terms.
        """



[docs]
    @abstractmethod
    def out_of_sample(
        self,
        steps: int,
        index: Sequence[Hashable],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        """
        Produce deterministic trends for out-of-sample forecasts

        Parameters
        ----------
        steps : int
            The number of steps to forecast
        index : index_like
            An index-like object. If not an index, it is converted to an
            index.
        forecast_index : index_like
            An Index or index-like object to use for the forecasts. If
            provided must have steps elements.

        Returns
        -------
        DataFrame
            A DataFrame containing the deterministic terms.
        """


    @abstractmethod
    def __str__(self) -> str:
        """A meaningful string representation of the term"""

    def __hash__(self) -> int:
        name: tuple[Hashable, ...] = (type(self).__name__,)
        return hash(name + self._eq_attr)

    @property
    @abstractmethod
    def _eq_attr(self) -> tuple[Hashable, ...]:
        """tuple of attributes that are used for equality comparison"""

    @staticmethod
    def _index_like(index: Sequence[Hashable]) -> pd.Index:
        if isinstance(index, pd.Index):
            return index
        try:
            return pd.Index(index)
        except Exception:
            raise TypeError("index must be a pandas Index or index-like")

    @staticmethod
    def _extend_index(
        index: pd.Index,
        steps: int,
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.Index:
        """Extend the forecast index"""
        if forecast_index is not None:
            forecast_index = DeterministicTerm._index_like(forecast_index)
            assert isinstance(forecast_index, pd.Index)
            if forecast_index.shape[0] != steps:
                raise ValueError(
                    "The number of values in forecast_index "
                    f"({forecast_index.shape[0]}) must match steps ({steps})."
                )
            return forecast_index
        if isinstance(index, pd.PeriodIndex):
            return pd.period_range(
                index[-1] + 1, periods=steps, freq=index.freq
            )
        elif isinstance(index, pd.DatetimeIndex) and index.freq is not None:
            next_obs = pd.date_range(index[-1], freq=index.freq, periods=2)[1]
            return pd.date_range(next_obs, freq=index.freq, periods=steps)
        elif isinstance(index, pd.RangeIndex):
            assert isinstance(index, pd.RangeIndex)
            try:
                step = index.step
                start = index.stop
            except AttributeError:
                # TODO: Remove after pandas min ver is 1.0.0+
                step = index[-1] - index[-2] if len(index) > 1 else 1
                start = index[-1] + step
            stop = start + step * steps
            return pd.RangeIndex(start, stop, step=step)
        elif is_int_index(index) and np.all(np.diff(index) == 1):
            idx_arr = np.arange(index[-1] + 1, index[-1] + steps + 1)
            return pd.Index(idx_arr)
        # default range index
        import warnings

        warnings.warn(
            "Only PeriodIndexes, DatetimeIndexes with a frequency set, "
            "RangesIndexes, and Index with a unit increment support "
            "extending. The index is set will contain the position relative "
            "to the data length.",
            UserWarning,
            stacklevel=2,
        )
        nobs = index.shape[0]
        return pd.RangeIndex(nobs + 1, nobs + steps + 1)

    def __repr__(self) -> str:
        return self.__str__() + f" at 0x{id(self):0x}"

    def __eq__(self, other: object) -> bool:
        if isinstance(other, type(self)):
            own_attr = self._eq_attr
            oth_attr = other._eq_attr
            if len(own_attr) != len(oth_attr):
                return False
            return all([a == b for a, b in zip(own_attr, oth_attr)])
        else:
            return False




[docs]
class TimeTrendDeterministicTerm(DeterministicTerm, ABC):
    """Abstract Base Class for all Time Trend Deterministic Terms"""

    def __init__(self, constant: bool = True, order: int = 0) -> None:
        self._constant = bool_like(constant, "constant")
        self._order = required_int_like(order, "order")

    @property
    def constant(self) -> bool:
        """Flag indicating that a constant is included"""
        return self._constant

    @property
    def order(self) -> int:
        """Order of the time trend"""
        return self._order

    @property
    def _columns(self) -> list[str]:
        columns = []
        trend_names = {1: "trend", 2: "trend_squared", 3: "trend_cubed"}
        if self._constant:
            columns.append("const")
        for power in range(1, self._order + 1):
            if power in trend_names:
                columns.append(trend_names[power])
            else:
                columns.append(f"trend**{power}")
        return columns

    def _get_terms(self, locs: np.ndarray) -> np.ndarray:
        nterms = int(self._constant) + self._order
        terms = np.tile(locs, (1, nterms))
        power = np.zeros((1, nterms), dtype=int)
        power[0, int(self._constant) :] = np.arange(1, self._order + 1)
        terms **= power
        return terms

    def __str__(self) -> str:
        terms = []
        if self._constant:
            terms.append("Constant")
        if self._order:
            terms.append(f"Powers 1 to {self._order + 1}")
        if not terms:
            terms = ["Empty"]
        terms_str = ",".join(terms)
        return f"TimeTrend({terms_str})"




[docs]
class TimeTrend(TimeTrendDeterministicTerm):
    """
    Constant and time trend determinstic terms

    Parameters
    ----------
    constant : bool
        Flag indicating whether a constant should be included.
    order : int
        A non-negative int containing the powers to include (1, 2, ..., order).

    See Also
    --------
    DeterministicProcess
    Seasonality
    Fourier
    CalendarTimeTrend

    Examples
    --------
    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import TimeTrend
    >>> data = sunspots.load_pandas().data
    >>> trend_gen = TimeTrend(True, 3)
    >>> trend_gen.in_sample(data.index)
    """

    def __init__(self, constant: bool = True, order: int = 0) -> None:
        super().__init__(constant, order)


[docs]
    @classmethod
    def from_string(cls, trend: str) -> "TimeTrend":
        """
        Create a TimeTrend from a string description.

        Provided for compatibility with common string names.

        Parameters
        ----------
        trend : {"n", "c", "t", "ct", "ctt"}
            The string representation of the time trend. The terms are:

            * "n": No trend terms
            * "c": A constant only
            * "t": Linear time trend only
            * "ct": A constant and a time trend
            * "ctt": A constant, a time trend and a quadratic time trend

        Returns
        -------
        TimeTrend
            The TimeTrend instance.
        """
        constant = trend.startswith("c")
        order = 0
        if "tt" in trend:
            order = 2
        elif "t" in trend:
            order = 1
        return cls(constant=constant, order=order)



[docs]
    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        nobs = index.shape[0]
        locs = np.arange(1, nobs + 1, dtype=np.double)[:, None]
        terms = self._get_terms(locs)
        return pd.DataFrame(terms, columns=self._columns, index=index)



[docs]
    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        nobs = index.shape[0]
        fcast_index = self._extend_index(index, steps, forecast_index)
        locs = np.arange(nobs + 1, nobs + steps + 1, dtype=np.double)[:, None]
        terms = self._get_terms(locs)
        return pd.DataFrame(terms, columns=self._columns, index=fcast_index)


    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._constant, self._order




[docs]
class Seasonality(DeterministicTerm):
    """
    Seasonal dummy deterministic terms

    Parameters
    ----------
    period : int
        The length of a full cycle. Must be >= 2.
    initial_period : int
        The seasonal index of the first observation. 1-indexed so must
        be in {1, 2, ..., period}.

    See Also
    --------
    DeterministicProcess
    TimeTrend
    Fourier
    CalendarSeasonality

    Examples
    --------
    Solar data has an 11-year cycle

    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import Seasonality
    >>> data = sunspots.load_pandas().data
    >>> seas_gen = Seasonality(11)
    >>> seas_gen.in_sample(data.index)

    To start at a season other than 1

    >>> seas_gen = Seasonality(11, initial_period=4)
    >>> seas_gen.in_sample(data.index)
    """

    _is_dummy = True

    def __init__(self, period: int, initial_period: int = 1) -> None:
        self._period = required_int_like(period, "period")
        self._initial_period = required_int_like(
            initial_period, "initial_period"
        )
        if period < 2:
            raise ValueError("period must be >= 2")
        if not 1 <= self._initial_period <= period:
            raise ValueError("initial_period must be in {1, 2, ..., period}")

    @property
    def period(self) -> int:
        """The period of the seasonality"""
        return self._period

    @property
    def initial_period(self) -> int:
        """The seasonal index of the first observation"""
        return self._initial_period


[docs]
    @classmethod
    def from_index(
        cls, index: Union[Sequence[Hashable], pd.DatetimeIndex, pd.PeriodIndex]
    ) -> "Seasonality":
        """
        Construct a seasonality directly from an index using its frequency.

        Parameters
        ----------
        index : {DatetimeIndex, PeriodIndex}
            An index with its frequency (`freq`) set.

        Returns
        -------
        Seasonality
            The initialized Seasonality instance.
        """
        index = cls._index_like(index)
        if isinstance(index, pd.PeriodIndex):
            freq = index.freq
        elif isinstance(index, pd.DatetimeIndex):
            freq = index.freq if index.freq else index.inferred_freq
        else:
            raise TypeError("index must be a DatetimeIndex or PeriodIndex")
        if freq is None:
            raise ValueError("index must have a freq or inferred_freq set")
        period = freq_to_period(freq)
        return cls(period=period)


    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._initial_period

    def __str__(self) -> str:
        return f"Seasonality(period={self._period})"

    @property
    def _columns(self) -> list[str]:
        period = self._period
        columns = []
        for i in range(1, period + 1):
            columns.append(f"s({i},{period})")
        return columns


[docs]
    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        nobs = index.shape[0]
        period = self._period
        term = np.zeros((nobs, period))
        offset = self._initial_period - 1
        for i in range(period):
            col = (i + offset) % period
            term[i::period, col] = 1
        return pd.DataFrame(term, columns=self._columns, index=index)



[docs]
    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        nobs = index.shape[0]
        period = self._period
        term = np.zeros((steps, period))
        offset = self._initial_period - 1
        for i in range(period):
            col_loc = (nobs + offset + i) % period
            term[i::period, col_loc] = 1
        return pd.DataFrame(term, columns=self._columns, index=fcast_index)





[docs]
class FourierDeterministicTerm(DeterministicTerm, ABC):
    """Abstract Base Class for all Fourier Deterministic Terms"""

    def __init__(self, order: int) -> None:
        self._order = required_int_like(order, "terms")

    @property
    def order(self) -> int:
        """The order of the Fourier terms included"""
        return self._order

    def _get_terms(self, locs: np.ndarray) -> np.ndarray:
        locs = 2 * np.pi * locs.astype(np.double)
        terms = np.empty((locs.shape[0], 2 * self._order))
        for i in range(self._order):
            for j, func in enumerate((np.sin, np.cos)):
                terms[:, 2 * i + j] = func((i + 1) * locs)
        return terms




[docs]
class Fourier(FourierDeterministicTerm):
    r"""
    Fourier series deterministic terms

    Parameters
    ----------
    period : int
        The length of a full cycle. Must be >= 2.
    order : int
        The number of Fourier components to include. Must be <= 2*period.

    See Also
    --------
    DeterministicProcess
    TimeTrend
    Seasonality
    CalendarFourier

    Notes
    -----
    Both a sine and a cosine term are included for each i=1, ..., order

    .. math::

       f_{i,s,t} & = \sin\left(2 \pi i \times \frac{t}{m} \right)  \\
       f_{i,c,t} & = \cos\left(2 \pi i \times \frac{t}{m} \right)

    where m is the length of the period.

    Examples
    --------
    Solar data has an 11-year cycle

    >>> from statsmodels.datasets import sunspots
    >>> from statsmodels.tsa.deterministic import Fourier
    >>> data = sunspots.load_pandas().data
    >>> fourier_gen = Fourier(11, order=2)
    >>> fourier_gen.in_sample(data.index)
    """
    _is_dummy = False

    def __init__(self, period: float, order: int):
        super().__init__(order)
        self._period = float_like(period, "period")
        if 2 * self._order > self._period:
            raise ValueError("2 * order must be <= period")

    @property
    def period(self) -> float:
        """The period of the Fourier terms"""
        return self._period

    @property
    def _columns(self) -> list[str]:
        period = self._period
        fmt_period = d_or_f(period).strip()
        columns = []
        for i in range(1, self._order + 1):
            for typ in ("sin", "cos"):
                columns.append(f"{typ}({i},{fmt_period})")
        return columns


[docs]
    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        nobs = index.shape[0]
        terms = self._get_terms(np.arange(nobs) / self._period)
        return pd.DataFrame(terms, index=index, columns=self._columns)



[docs]
    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        nobs = index.shape[0]
        terms = self._get_terms(np.arange(nobs, nobs + steps) / self._period)
        return pd.DataFrame(terms, index=fcast_index, columns=self._columns)


    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._order

    def __str__(self) -> str:
        return f"Fourier(period={self._period}, order={self._order})"




[docs]
class CalendarDeterministicTerm(DeterministicTerm, ABC):
    """Abstract Base Class for calendar deterministic terms"""

    def __init__(self, freq: str) -> None:
        try:
            index = pd.date_range("2020-01-01", freq=freq, periods=1)
            self._freq = index.freq
        except ValueError:
            raise ValueError("freq is not understood by pandas")

    @property
    def freq(self) -> str:
        """The frequency of the deterministic terms"""
        return self._freq.freqstr

    def _compute_ratio(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        if isinstance(index, pd.PeriodIndex):
            index = index.to_timestamp()
        delta = index - index.to_period(self._freq).to_timestamp()
        pi = index.to_period(self._freq)
        gap = (pi + 1).to_timestamp() - pi.to_timestamp()
        return to_numpy(delta) / to_numpy(gap)

    def _check_index_type(
        self,
        index: pd.Index,
        allowed: Union[type, tuple[type, ...]] = (
            pd.DatetimeIndex,
            pd.PeriodIndex,
        ),
    ) -> Union[pd.DatetimeIndex, pd.PeriodIndex]:
        if isinstance(allowed, type):
            allowed = (allowed,)
        if not isinstance(index, allowed):
            if len(allowed) == 1:
                allowed_types = "a " + allowed[0].__name__
            else:
                allowed_types = ", ".join(a.__name__ for a in allowed[:-1])
                if len(allowed) > 2:
                    allowed_types += ","
                allowed_types += " and " + allowed[-1].__name__
            msg = (
                f"{type(self).__name__} terms can only be computed from "
                f"{allowed_types}"
            )
            raise TypeError(msg)
        assert isinstance(index, (pd.DatetimeIndex, pd.PeriodIndex))
        return index




[docs]
class CalendarFourier(CalendarDeterministicTerm, FourierDeterministicTerm):
    r"""
    Fourier series deterministic terms based on calendar time

    Parameters
    ----------
    freq : str
        A string convertible to a pandas frequency.
    order : int
        The number of Fourier components to include. Must be <= 2*period.

    See Also
    --------
    DeterministicProcess
    CalendarTimeTrend
    CalendarSeasonality
    Fourier

    Notes
    -----
    Both a sine and a cosine term are included for each i=1, ..., order

    .. math::

       f_{i,s,t} & = \sin\left(2 \pi i \tau_t \right)  \\
       f_{i,c,t} & = \cos\left(2 \pi i \tau_t \right)

    where m is the length of the period and :math:`\tau_t` is the frequency
    normalized time.  For example, when freq is "D" then an observation with
    a timestamp of 12:00:00 would have :math:`\tau_t=0.5`.

    Examples
    --------
    Here we simulate irregularly spaced hourly data and construct the calendar
    Fourier terms for the data.

    >>> import numpy as np
    >>> import pandas as pd
    >>> base = pd.Timestamp("2020-1-1")
    >>> gen = np.random.default_rng()
    >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000))
    >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps]
    >>> index = pd.DatetimeIndex(pd.to_datetime(times))

    >>> from statsmodels.tsa.deterministic import CalendarFourier
    >>> cal_fourier_gen = CalendarFourier("D", 2)
    >>> cal_fourier_gen.in_sample(index)
    """

    def __init__(self, freq: str, order: int) -> None:
        super().__init__(freq)
        FourierDeterministicTerm.__init__(self, order)
        self._order = required_int_like(order, "terms")

    @property
    def _columns(self) -> list[str]:
        columns = []
        for i in range(1, self._order + 1):
            for typ in ("sin", "cos"):
                columns.append(f"{typ}({i},freq={self._freq.freqstr})")
        return columns


[docs]
    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        index = self._check_index_type(index)

        ratio = self._compute_ratio(index)
        terms = self._get_terms(ratio)
        return pd.DataFrame(terms, index=index, columns=self._columns)



[docs]
    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        self._check_index_type(fcast_index)
        assert isinstance(fcast_index, (pd.DatetimeIndex, pd.PeriodIndex))
        ratio = self._compute_ratio(fcast_index)
        terms = self._get_terms(ratio)
        return pd.DataFrame(terms, index=fcast_index, columns=self._columns)


    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._freq.freqstr, self._order

    def __str__(self) -> str:
        return f"Fourier(freq={self._freq.freqstr}, order={self._order})"




[docs]
class CalendarSeasonality(CalendarDeterministicTerm):
    """
    Seasonal dummy deterministic terms based on calendar time

    Parameters
    ----------
    freq : str
        The frequency of the seasonal effect.
    period : str
        The pandas frequency string describing the full period.

    See Also
    --------
    DeterministicProcess
    CalendarTimeTrend
    CalendarFourier
    Seasonality

    Examples
    --------
    Here we simulate irregularly spaced data (in time) and hourly seasonal
    dummies for the data.

    >>> import numpy as np
    >>> import pandas as pd
    >>> base = pd.Timestamp("2020-1-1")
    >>> gen = np.random.default_rng()
    >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000))
    >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps]
    >>> index = pd.DatetimeIndex(pd.to_datetime(times))

    >>> from statsmodels.tsa.deterministic import CalendarSeasonality
    >>> cal_seas_gen = CalendarSeasonality("H", "D")
    >>> cal_seas_gen.in_sample(index)
    """

    _is_dummy = True

    # out_of: freq
    if PD_LT_2_2_0:
        _supported = {
            "W": {"B": 5, "D": 7, "h": 24 * 7, "H": 24 * 7},
            "D": {"h": 24, "H": 24},
            "Q": {"MS": 3, "M": 3},
            "A": {"MS": 12, "M": 12},
            "Y": {"MS": 12, "Q": 4, "M": 12},
        }
    else:
        _supported = {
            "W": {"B": 5, "D": 7, "h": 24 * 7},
            "D": {"h": 24},
            "Q": {"MS": 3, "ME": 3},
            "A": {"MS": 12, "ME": 12, "QE": 4},
            "Y": {"MS": 12, "ME": 12, "QE": 4},
            "QE": {"ME": 3},
            "YE": {"ME": 12, "QE": 4},
        }

    def __init__(self, freq: str, period: str) -> None:
        freq_options: set[str] = set()
        freq_options.update(
            *[list(val.keys()) for val in self._supported.values()]
        )
        period_options = tuple(self._supported.keys())

        freq = string_like(
            freq, "freq", options=tuple(freq_options), lower=False
        )
        period = string_like(
            period, "period", options=period_options, lower=False
        )
        if freq not in self._supported[period]:
            raise ValueError(
                f"The combination of freq={freq} and "
                f"period={period} is not supported."
            )
        super().__init__(freq)
        self._period = period
        self._freq_str = self._freq.freqstr.split("-")[0]

    @property
    def freq(self) -> str:
        """The frequency of the deterministic terms"""
        return self._freq.freqstr

    @property
    def period(self) -> str:
        """The full period"""
        return self._period

    def _weekly_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        if self._freq.freqstr in ("h", "H"):
            return index.hour + 24 * index.dayofweek
        elif self._freq.freqstr == "D":
            return index.dayofweek
        else:  # "B"
            bdays = pd.bdate_range("2000-1-1", periods=10).dayofweek.unique()
            loc = index.dayofweek
            if not loc.isin(bdays).all():
                raise ValueError(
                    "freq is B but index contains days that are not business "
                    "days."
                )
            return loc

    def _daily_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        return index.hour

    def _quarterly_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        return (index.month - 1) % 3

    def _annual_to_loc(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        if self._freq.freqstr in ("M", "ME", "MS"):
            return index.month - 1
        else:  # "Q"
            return index.quarter - 1

    def _get_terms(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex]
    ) -> np.ndarray:
        if self._period == "D":
            locs = self._daily_to_loc(index)
        elif self._period == "W":
            locs = self._weekly_to_loc(index)
        elif self._period in ("Q", "QE"):
            locs = self._quarterly_to_loc(index)
        else:  # "A", "Y":
            locs = self._annual_to_loc(index)
        full_cycle = self._supported[self._period][self._freq_str]
        terms = np.zeros((locs.shape[0], full_cycle))
        terms[np.arange(locs.shape[0]), locs] = 1
        return terms

    @property
    def _columns(self) -> list[str]:
        columns = []
        count = self._supported[self._period][self._freq_str]
        for i in range(count):
            columns.append(
                f"s({self._freq_str}={i + 1}, period={self._period})"
            )
        return columns


[docs]
    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        index = self._check_index_type(index)
        terms = self._get_terms(index)

        return pd.DataFrame(terms, index=index, columns=self._columns)



[docs]
    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        self._check_index_type(fcast_index)
        assert isinstance(fcast_index, (pd.DatetimeIndex, pd.PeriodIndex))
        terms = self._get_terms(fcast_index)
        return pd.DataFrame(terms, index=fcast_index, columns=self._columns)


    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        return self._period, self._freq_str

    def __str__(self) -> str:
        return f"Seasonal(freq={self._freq_str})"




[docs]
class CalendarTimeTrend(CalendarDeterministicTerm, TimeTrendDeterministicTerm):
    r"""
    Constant and time trend determinstic terms based on calendar time

    Parameters
    ----------
    freq : str
        A string convertible to a pandas frequency.
    constant : bool
        Flag indicating whether a constant should be included.
    order : int
        A non-negative int containing the powers to include (1, 2, ..., order).
    base_period : {str, pd.Timestamp}, default None
        The base period to use when computing the time stamps. This value is
        treated as 1 and so all other time indices are defined as the number
        of periods since or before this time stamp. If not provided, defaults
        to pandas base period for a PeriodIndex.

    See Also
    --------
    DeterministicProcess
    CalendarFourier
    CalendarSeasonality
    TimeTrend

    Notes
    -----
    The time stamp, :math:`\tau_t`, is the number of periods that have elapsed
    since the base_period. :math:`\tau_t` may be fractional.

    Examples
    --------
    Here we simulate irregularly spaced hourly data and construct the calendar
    time trend terms for the data.

    >>> import numpy as np
    >>> import pandas as pd
    >>> base = pd.Timestamp("2020-1-1")
    >>> gen = np.random.default_rng()
    >>> gaps = np.cumsum(gen.integers(0, 1800, size=1000))
    >>> times = [base + pd.Timedelta(gap, unit="s") for gap in gaps]
    >>> index = pd.DatetimeIndex(pd.to_datetime(times))

    >>> from statsmodels.tsa.deterministic import CalendarTimeTrend
    >>> cal_trend_gen = CalendarTimeTrend("D", True, order=1)
    >>> cal_trend_gen.in_sample(index)

    Next, we normalize using the first time stamp

    >>> cal_trend_gen = CalendarTimeTrend("D", True, order=1,
    ...                                   base_period=index[0])
    >>> cal_trend_gen.in_sample(index)
    """

    def __init__(
        self,
        freq: str,
        constant: bool = True,
        order: int = 0,
        *,
        base_period: Optional[Union[str, DateLike]] = None,
    ) -> None:
        super().__init__(freq)
        TimeTrendDeterministicTerm.__init__(
            self, constant=constant, order=order
        )
        self._ref_i8 = 0
        if base_period is not None:
            pr = pd.period_range(base_period, periods=1, freq=self._freq)
            self._ref_i8 = pr.asi8[0]
        self._base_period = None if base_period is None else str(base_period)

    @property
    def base_period(self) -> Optional[str]:
        """The base period"""
        return self._base_period


[docs]
    @classmethod
    def from_string(
        cls,
        freq: str,
        trend: str,
        base_period: Optional[Union[str, DateLike]] = None,
    ) -> "CalendarTimeTrend":
        """
        Create a TimeTrend from a string description.

        Provided for compatibility with common string names.

        Parameters
        ----------
        freq : str
            A string convertible to a pandas frequency.
        trend : {"n", "c", "t", "ct", "ctt"}
            The string representation of the time trend. The terms are:

            * "n": No trend terms
            * "c": A constant only
            * "t": Linear time trend only
            * "ct": A constant and a time trend
            * "ctt": A constant, a time trend and a quadratic time trend
        base_period : {str, pd.Timestamp}, default None
            The base period to use when computing the time stamps. This value
            is treated as 1 and so all other time indices are defined as the
            number of periods since or before this time stamp. If not
            provided, defaults to pandas base period for a PeriodIndex.

        Returns
        -------
        TimeTrend
            The TimeTrend instance.
        """
        constant = trend.startswith("c")
        order = 0
        if "tt" in trend:
            order = 2
        elif "t" in trend:
            order = 1
        return cls(freq, constant, order, base_period=base_period)


    def _terms(
        self, index: Union[pd.DatetimeIndex, pd.PeriodIndex], ratio: np.ndarray
    ) -> pd.DataFrame:
        if isinstance(index, pd.DatetimeIndex):
            index = index.to_period(self._freq)

        index_i8 = index.asi8
        index_i8 = index_i8 - self._ref_i8 + 1
        time = index_i8.astype(np.double) + ratio
        time = time[:, None]
        terms = self._get_terms(time)
        return pd.DataFrame(terms, columns=self._columns, index=index)


[docs]
    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(
        self, index: Union[Sequence[Hashable], pd.Index]
    ) -> pd.DataFrame:
        index = self._index_like(index)
        index = self._check_index_type(index)
        ratio = self._compute_ratio(index)
        return self._terms(index, ratio)



[docs]
    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        index: Union[Sequence[Hashable], pd.Index],
        forecast_index: Optional[Sequence[Hashable]] = None,
    ) -> pd.DataFrame:
        index = self._index_like(index)
        fcast_index = self._extend_index(index, steps, forecast_index)
        self._check_index_type(fcast_index)
        assert isinstance(fcast_index, (pd.PeriodIndex, pd.DatetimeIndex))
        ratio = self._compute_ratio(fcast_index)
        return self._terms(fcast_index, ratio)


    @property
    def _eq_attr(self) -> tuple[Hashable, ...]:
        attr: tuple[Hashable, ...] = (
            self._constant,
            self._order,
            self._freq.freqstr,
        )
        if self._base_period is not None:
            attr += (self._base_period,)
        return attr

    def __str__(self) -> str:
        value = TimeTrendDeterministicTerm.__str__(self)
        value = "Calendar" + value[:-1] + f", freq={self._freq.freqstr})"
        if self._base_period is not None:
            value = value[:-1] + f"base_period={self._base_period})"
        return value




[docs]
class DeterministicProcess:
    """
    Container class for deterministic terms.

    Directly supports constants, time trends, and either seasonal dummies or
    fourier terms for a single cycle. Additional deterministic terms beyond
    the set that can be directly initialized through the constructor can be
    added.

    Parameters
    ----------
    index : {Sequence[Hashable], pd.Index}
        The index of the process. Should usually be the "in-sample" index when
        used in forecasting applications.
    period : {float, int}, default None
        The period of the seasonal or fourier components. Must be an int for
        seasonal dummies. If not provided, freq is read from index if
        available.
    constant : bool, default False
        Whether to include a constant.
    order : int, default 0
        The order of the tim trend to include. For example, 2 will include
        both linear and quadratic terms. 0 exclude time trend terms.
    seasonal : bool = False
        Whether to include seasonal dummies
    fourier : int = 0
        The order of the fourier terms to included.
    additional_terms : Sequence[DeterministicTerm]
        A sequence of additional deterministic terms to include in the process.
    drop : bool, default False
        A flag indicating to check for perfect collinearity and to drop any
        linearly dependent terms.

    See Also
    --------
    TimeTrend
    Seasonality
    Fourier
    CalendarTimeTrend
    CalendarSeasonality
    CalendarFourier

    Notes
    -----
    See the notebook `Deterministic Terms in Time Series Models
    <../examples/notebooks/generated/deterministics.html>`__ for an overview.

    Examples
    --------
    >>> from statsmodels.tsa.deterministic import DeterministicProcess
    >>> from pandas import date_range
    >>> index = date_range("2000-1-1", freq="M", periods=240)

    First a determinstic process with a constant and quadratic time trend.

    >>> dp = DeterministicProcess(index, constant=True, order=2)
    >>> dp.in_sample().head(3)
                const  trend  trend_squared
    2000-01-31    1.0    1.0            1.0
    2000-02-29    1.0    2.0            4.0
    2000-03-31    1.0    3.0            9.0

    Seasonal dummies are included by setting seasonal to True.

    >>> dp = DeterministicProcess(index, constant=True, seasonal=True)
    >>> dp.in_sample().iloc[:3,:5]
                const  s(2,12)  s(3,12)  s(4,12)  s(5,12)
    2000-01-31    1.0      0.0      0.0      0.0      0.0
    2000-02-29    1.0      1.0      0.0      0.0      0.0
    2000-03-31    1.0      0.0      1.0      0.0      0.0

    Fourier components can be used to alternatively capture seasonal patterns,

    >>> dp = DeterministicProcess(index, constant=True, fourier=2)
    >>> dp.in_sample().head(3)
                const  sin(1,12)  cos(1,12)  sin(2,12)  cos(2,12)
    2000-01-31    1.0   0.000000   1.000000   0.000000        1.0
    2000-02-29    1.0   0.500000   0.866025   0.866025        0.5
    2000-03-31    1.0   0.866025   0.500000   0.866025       -0.5

    Multiple Seasonalities can be captured using additional terms.

    >>> from statsmodels.tsa.deterministic import Fourier
    >>> index = date_range("2000-1-1", freq="D", periods=5000)
    >>> fourier = Fourier(period=365.25, order=1)
    >>> dp = DeterministicProcess(index, period=3, constant=True,
    ...                           seasonal=True, additional_terms=[fourier])
    >>> dp.in_sample().head(3)
                const  s(2,3)  s(3,3)  sin(1,365.25)  cos(1,365.25)
    2000-01-01    1.0     0.0     0.0       0.000000       1.000000
    2000-01-02    1.0     1.0     0.0       0.017202       0.999852
    2000-01-03    1.0     0.0     1.0       0.034398       0.999408
    """

    def __init__(
        self,
        index: Union[Sequence[Hashable], pd.Index],
        *,
        period: Optional[Union[float, int]] = None,
        constant: bool = False,
        order: int = 0,
        seasonal: bool = False,
        fourier: int = 0,
        additional_terms: Sequence[DeterministicTerm] = (),
        drop: bool = False,
    ):
        if not isinstance(index, pd.Index):
            index = pd.Index(index)
        self._index = index
        self._deterministic_terms: list[DeterministicTerm] = []
        self._extendable = False
        self._index_freq = None
        self._validate_index()
        period = float_like(period, "period", optional=True)
        self._constant = constant = bool_like(constant, "constant")
        self._order = required_int_like(order, "order")
        self._seasonal = seasonal = bool_like(seasonal, "seasonal")
        self._fourier = required_int_like(fourier, "fourier")
        additional_terms = tuple(additional_terms)
        self._cached_in_sample = None
        self._drop = bool_like(drop, "drop")
        self._additional_terms = additional_terms
        if constant or order:
            self._deterministic_terms.append(TimeTrend(constant, order))
        if seasonal and fourier:
            raise ValueError(
                """seasonal and fourier can be initialized through the \
constructor since these will be necessarily perfectly collinear. Instead, \
you can pass additional components using the additional_terms input."""
            )
        if (seasonal or fourier) and period is None:
            if period is None:
                self._period = period = freq_to_period(self._index_freq)
        if seasonal:
            period = required_int_like(period, "period")
            self._deterministic_terms.append(Seasonality(period))
        elif fourier:
            period = float_like(period, "period")
            assert period is not None
            self._deterministic_terms.append(Fourier(period, order=fourier))
        for term in additional_terms:
            if not isinstance(term, DeterministicTerm):
                raise TypeError(
                    "All additional terms must be instances of subsclasses "
                    "of DeterministicTerm"
                )
            if term not in self._deterministic_terms:
                self._deterministic_terms.append(term)
            else:
                raise ValueError(
                    "One or more terms in additional_terms has been added "
                    "through the parameters of the constructor. Terms must "
                    "be unique."
                )
        self._period = period
        self._retain_cols: Optional[list[Hashable]] = None

    @property
    def index(self) -> pd.Index:
        """The index of the process"""
        return self._index

    @property
    def terms(self) -> list[DeterministicTerm]:
        """The deterministic terms included in the process"""
        return self._deterministic_terms

    def _adjust_dummies(self, terms: list[pd.DataFrame]) -> list[pd.DataFrame]:
        has_const: Optional[bool] = None
        for dterm in self._deterministic_terms:
            if isinstance(dterm, (TimeTrend, CalendarTimeTrend)):
                has_const = has_const or dterm.constant
        if has_const is None:
            has_const = False
            for term in terms:
                const_col = (term == term.iloc[0]).all() & (term.iloc[0] != 0)
                has_const = has_const or const_col.any()
        drop_first = has_const
        for i, dterm in enumerate(self._deterministic_terms):
            is_dummy = dterm.is_dummy
            if is_dummy and drop_first:
                # drop first
                terms[i] = terms[i].iloc[:, 1:]
            drop_first = drop_first or is_dummy
        return terms

    def _remove_zeros_ones(self, terms: pd.DataFrame) -> pd.DataFrame:
        all_zero = np.all(terms == 0, axis=0)
        if np.any(all_zero):
            terms = terms.loc[:, ~all_zero]
        is_constant = terms.max(axis=0) == terms.min(axis=0)
        if np.sum(is_constant) > 1:
            # flag surplus constant columns
            surplus_consts = is_constant & is_constant.duplicated()
            terms = terms.loc[:, ~surplus_consts]
        return terms


[docs]
    @Appender(DeterministicTerm.in_sample.__doc__)
    def in_sample(self) -> pd.DataFrame:
        if self._cached_in_sample is not None:
            return self._cached_in_sample
        index = self._index
        if not self._deterministic_terms:
            return pd.DataFrame(np.empty((index.shape[0], 0)), index=index)
        raw_terms = []
        for term in self._deterministic_terms:
            raw_terms.append(term.in_sample(index))

        raw_terms = self._adjust_dummies(raw_terms)
        terms: pd.DataFrame = pd.concat(raw_terms, axis=1)
        terms = self._remove_zeros_ones(terms)
        if self._drop:
            terms_arr = to_numpy(terms)
            res = qr(terms_arr, mode="r", pivoting=True)
            r = res[0]
            p = res[-1]
            abs_diag = np.abs(np.diag(r))
            tol = abs_diag[0] * terms_arr.shape[1] * np.finfo(float).eps
            rank = int(np.sum(abs_diag > tol))
            rpx = r.T @ terms_arr
            keep = [0]
            last_rank = 1
            # Find the left-most columns that produce full rank
            for i in range(1, terms_arr.shape[1]):
                curr_rank = np.linalg.matrix_rank(rpx[: i + 1, : i + 1])
                if curr_rank > last_rank:
                    keep.append(i)
                    last_rank = curr_rank
                if curr_rank == rank:
                    break
            if len(keep) == rank:
                terms = terms.iloc[:, keep]
            else:
                terms = terms.iloc[:, np.sort(p[:rank])]
        self._retain_cols = terms.columns
        self._cached_in_sample = terms
        return terms



[docs]
    @Appender(DeterministicTerm.out_of_sample.__doc__)
    def out_of_sample(
        self,
        steps: int,
        forecast_index: Optional[Union[Sequence[Hashable], pd.Index]] = None,
    ) -> pd.DataFrame:
        steps = required_int_like(steps, "steps")
        if self._drop and self._retain_cols is None:
            self.in_sample()
        index = self._index
        if not self._deterministic_terms:
            return pd.DataFrame(np.empty((index.shape[0], 0)), index=index)
        raw_terms = []
        for term in self._deterministic_terms:
            raw_terms.append(term.out_of_sample(steps, index, forecast_index))
        terms: pd.DataFrame = pd.concat(raw_terms, axis=1)
        assert self._retain_cols is not None
        if terms.shape[1] != len(self._retain_cols):
            terms = terms[self._retain_cols]
        return terms


    def _extend_time_index(
        self,
        stop: pd.Timestamp,
    ) -> Union[pd.DatetimeIndex, pd.PeriodIndex]:
        index = self._index
        if isinstance(index, pd.PeriodIndex):
            return pd.period_range(index[0], end=stop, freq=index.freq)
        return pd.date_range(start=index[0], end=stop, freq=self._index_freq)

    def _range_from_range_index(self, start: int, stop: int) -> pd.DataFrame:
        index = self._index
        is_int64_index = is_int_index(index)
        assert isinstance(index, pd.RangeIndex) or is_int64_index
        if start < index[0]:
            raise ValueError(START_BEFORE_INDEX_ERR)
        if isinstance(index, pd.RangeIndex):
            idx_step = index.step
        else:
            idx_step = np.diff(index).max() if len(index) > 1 else 1
        if idx_step != 1 and ((start - index[0]) % idx_step) != 0:
            raise ValueError(
                f"The step of the index is not 1 (actual step={idx_step})."
                " start must be in the sequence that would have been "
                "generated by the index."
            )
        if is_int64_index:
            new_idx = pd.Index(np.arange(start, stop))
        else:
            new_idx = pd.RangeIndex(start, stop, step=idx_step)
        if new_idx[-1] <= self._index[-1]:
            # In-sample only
            in_sample = self.in_sample()
            in_sample = in_sample.loc[new_idx]
            return in_sample
        elif new_idx[0] > self._index[-1]:
            # Out of-sample only
            next_value = index[-1] + idx_step
            if new_idx[0] != next_value:
                tmp = pd.RangeIndex(next_value, stop, step=idx_step)
                oos = self.out_of_sample(tmp.shape[0], forecast_index=tmp)
                return oos.loc[new_idx]
            return self.out_of_sample(new_idx.shape[0], forecast_index=new_idx)
        # Using some from each in and out of sample
        in_sample_loc = new_idx <= self._index[-1]
        in_sample_idx = new_idx[in_sample_loc]
        out_of_sample_idx = new_idx[~in_sample_loc]
        in_sample_exog = self.in_sample().loc[in_sample_idx]
        oos_exog = self.out_of_sample(
            steps=out_of_sample_idx.shape[0], forecast_index=out_of_sample_idx
        )
        return pd.concat([in_sample_exog, oos_exog], axis=0)

    def _range_from_time_index(
        self, start: pd.Timestamp, stop: pd.Timestamp
    ) -> pd.DataFrame:
        index = self._index
        if isinstance(self._index, pd.PeriodIndex):
            if isinstance(start, pd.Timestamp):
                start = start.to_period(freq=self._index_freq)
            if isinstance(stop, pd.Timestamp):
                stop = stop.to_period(freq=self._index_freq)
        if start < index[0]:
            raise ValueError(START_BEFORE_INDEX_ERR)
        if stop <= self._index[-1]:
            return self.in_sample().loc[start:stop]
        new_idx = self._extend_time_index(stop)
        oos_idx = new_idx[new_idx > index[-1]]
        oos = self.out_of_sample(oos_idx.shape[0], oos_idx)
        if start >= oos_idx[0]:
            return oos.loc[start:stop]
        both = pd.concat([self.in_sample(), oos], axis=0)
        return both.loc[start:stop]

    def _int_to_timestamp(self, value: int, name: str) -> pd.Timestamp:
        if value < 0:
            raise ValueError(f"{name} must be non-negative.")
        if value < self._index.shape[0]:
            return self._index[value]
        add_periods = value - (self._index.shape[0] - 1) + 1
        index = self._index
        if isinstance(self._index, pd.PeriodIndex):
            pr = pd.period_range(
                index[-1], freq=self._index_freq, periods=add_periods
            )
            return pr[-1].to_timestamp()
        dr = pd.date_range(
            index[-1], freq=self._index_freq, periods=add_periods
        )
        return dr[-1]


[docs]
    def range(
        self,
        start: Union[IntLike, DateLike, str],
        stop: Union[IntLike, DateLike, str],
    ) -> pd.DataFrame:
        """
        Deterministic terms spanning a range of observations

        Parameters
        ----------
        start : {int, str, dt.datetime, pd.Timestamp, np.datetime64}
            The first observation.
        stop : {int, str, dt.datetime, pd.Timestamp, np.datetime64}
            The final observation. Inclusive to match most prediction
            function in statsmodels.

        Returns
        -------
        DataFrame
            A data frame of deterministic terms
        """
        if not self._extendable:
            raise TypeError(
                """The index in the deterministic process does not \
support extension. Only PeriodIndex, DatetimeIndex with a frequency, \
RangeIndex, and integral Indexes that start at 0 and have only unit \
differences can be extended when producing out-of-sample forecasts.
"""
            )
        if type(self._index) in (pd.RangeIndex,) or is_int_index(self._index):
            start = required_int_like(start, "start")
            stop = required_int_like(stop, "stop")
            # Add 1 to ensure that the end point is inclusive
            stop += 1
            return self._range_from_range_index(start, stop)
        if isinstance(start, (int, np.integer)):
            start = self._int_to_timestamp(start, "start")
        else:
            start = pd.Timestamp(start)
        if isinstance(stop, (int, np.integer)):
            stop = self._int_to_timestamp(stop, "stop")
        else:
            stop = pd.Timestamp(stop)
        return self._range_from_time_index(start, stop)


    def _validate_index(self) -> None:
        if isinstance(self._index, pd.PeriodIndex):
            self._index_freq = self._index.freq
            self._extendable = True
        elif isinstance(self._index, pd.DatetimeIndex):
            self._index_freq = self._index.freq or self._index.inferred_freq
            self._extendable = self._index_freq is not None
        elif isinstance(self._index, pd.RangeIndex):
            self._extendable = True
        elif is_int_index(self._index):
            self._extendable = self._index[0] == 0 and np.all(
                np.diff(self._index) == 1
            )


[docs]
    def apply(self, index):
        """
        Create an identical determinstic process with a different index

        Parameters
        ----------
        index : index_like
            An index-like object. If not an index, it is converted to an
            index.

        Returns
        -------
        DeterministicProcess
            The deterministic process applied to a different index
        """
        return DeterministicProcess(
            index,
            period=self._period,
            constant=self._constant,
            order=self._order,
            seasonal=self._seasonal,
            fourier=self._fourier,
            additional_terms=self._additional_terms,
            drop=self._drop,
        )
Last update: Oct 21, 2024