Source code for pyreco.datasets

"""
Standard chaotic system datasets for reservoir computing.

This module provides time series data from chaotic dynamical systems,
formatted for direct use with PyReCo reservoir computers.

All data generators use numerical integration of the governing equations.
Lorenz system uses scipy.integrate.solve_ivp for stable ODE integration.
Mackey-Glass uses custom Euler integration for delay differential equations.
"""

import numpy as np
from collections import deque
from sklearn.preprocessing import StandardScaler
from scipy.integrate import solve_ivp


def _generate_lorenz(n_timesteps, sigma=10.0, rho=28.0, beta=8.0/3.0, h=0.01, x0=None):
    """
    Generate Lorenz 63 attractor time series using scipy.integrate.solve_ivp.

    Lorenz system equations:
        dx/dt = σ(y - x)
        dy/dt = x(ρ - z) - y
        dz/dt = xy - βz

    Parameters
    ----------
    n_timesteps : int
        Number of time steps to generate
    sigma : float, default=10.0
        Prandtl number
    rho : float, default=28.0
        Rayleigh number
    beta : float, default=8/3
        Geometric factor
    h : float, default=0.01
        Integration time step
    x0 : array-like of shape (3,), optional
        Initial condition [x, y, z]. If None, defaults to [1.0, 1.0, 1.0]

    Returns
    -------
    trajectory : ndarray of shape (n_timesteps, 3)
        Lorenz attractor trajectory
    """
    if x0 is None:
        x0 = np.array([1.0, 1.0, 1.0], dtype=np.float64)
    else:
        x0 = np.asarray(x0, dtype=np.float64)

    # Define Lorenz system for solve_ivp (requires t, y signature)
    def lorenz_deriv(t, state):
        x, y, z = state
        dx = sigma * (y - x)
        dy = x * (rho - z) - y
        dz = x * y - beta * z
        return [dx, dy, dz]

    # Define time points for solution
    t_span = (0, (n_timesteps - 1) * h)
    t_eval = np.linspace(0, (n_timesteps - 1) * h, n_timesteps)

    # Solve ODE using scipy.integrate.solve_ivp with default parameters
    sol = solve_ivp(lorenz_deriv, t_span, x0, t_eval=t_eval)

    # Check if integration was successful
    if not sol.success:
        raise RuntimeError(f"solve_ivp failed: {sol.message}")

    # Return trajectory as (n_timesteps, 3) array
    return sol.y.T


def _generate_mackey_glass(n_timesteps, tau=17, a=0.2, b=0.1, n=10, x0=1.2, h=1.0, seed=None):
    """
    Generate Mackey-Glass time series using delay differential equation integration.

    Mackey-Glass equation:
        dx/dt = a*x(t-τ)/(1 + x(t-τ)^n) - b*x(t)

    Parameters
    ----------
    n_timesteps : int
        Number of time steps to generate
    tau : int, default=17
        Time delay
    a : float, default=0.2
        Production rate
    b : float, default=0.1
        Degradation rate
    n : float, default=10
        Hill coefficient (nonlinearity)
    x0 : float, default=1.2
        Initial condition
    h : float, default=1.0
        Integration time step
    seed : int, optional
        Random seed for initialization noise. If None, uses deterministic initialization

    Returns
    -------
    timeseries : ndarray of shape (n_timesteps, 1)
        Mackey-Glass time series
    """
    if seed is not None:
        rng = np.random.RandomState(seed)
    else:
        rng = None

    # Validate that tau/h is an integer to ensure correct delay alignment
    delay_steps = tau / h
    if abs(delay_steps - round(delay_steps)) > 1e-9:
        raise ValueError(
            f"tau/h must be an integer to correctly align the time delay. "
            f"Got tau={tau}, h={h}, tau/h={delay_steps:.6f}. "
            f"Please adjust tau or h so that tau/h is an integer."
        )

    # Initialize history buffer
    history_len = int(round(delay_steps)) + 1
    if rng is not None:
        # Add small random perturbations to initial history
        history = deque([x0 + rng.normal(0, 0.001) for _ in range(history_len)], maxlen=history_len)
    else:
        history = deque([x0] * history_len, maxlen=history_len)

    # Pre-allocate output
    timeseries = np.empty((n_timesteps,), dtype=np.float64)

    # Define Mackey-Glass derivative
    def mg_deriv(x_current, x_delayed):
        return a * x_delayed / (1 + x_delayed**n) - b * x_current

    # Euler integration (suitable for DDEs with moderate stiffness)
    for i in range(n_timesteps):
        x_current = history[-1]
        x_delayed = history[0]

        timeseries[i] = x_current

        # Euler step
        dx = mg_deriv(x_current, x_delayed)
        x_next = x_current + h * dx

        history.append(x_next)

    return timeseries.reshape(-1, 1)


def _check_sufficient_data(data, n_in, n_out, data_name):
    """
    Check if dataset has enough timesteps for sliding window.

    Parameters
    ----------
    data : ndarray
        Time series data
    n_in : int
        Input window size
    n_out : int
        Output window size
    data_name : str
        Name of dataset for error message

    Raises
    ------
    ValueError
        If data has insufficient timesteps
    """
    min_required = n_in + n_out
    if len(data) < min_required:
        raise ValueError(
            f"{data_name} has only {len(data)} timesteps but needs "
            f"at least {min_required} (n_in={n_in} + n_out={n_out}). "
            f"Increase n_samples or decrease train_fraction/val_fraction."
        )


def _sliding_window(data, n_in, n_out=1):
    """
    Create sliding window views of time series data.

    Parameters
    ----------
    data : ndarray of shape (n_timesteps, n_features)
        Input time series
    n_in : int
        Number of time steps in input window
    n_out : int, default=1
        Number of time steps in output window

    Returns
    -------
    X : ndarray of shape (n_samples, n_in, n_features)
        Input windows
    y : ndarray of shape (n_samples, n_out, n_features)
        Output windows (targets)
    """
    n_timesteps, n_features = data.shape
    n_samples = n_timesteps - n_in - n_out + 1

    if n_samples < 1:
        raise ValueError(
            f"Not enough data: need at least {n_in + n_out} timesteps, got {n_timesteps}"
        )

    X = np.stack([data[i:i+n_in] for i in range(n_samples)], axis=0)
    y = np.stack([data[i+n_in:i+n_in+n_out] for i in range(n_samples)], axis=0)

    return X, y



[docs]
def load(dataset_name, n_samples=5000, train_fraction=0.7, n_in=100, n_out=1, seed=None, val_fraction=None, standardize=False, **kwargs):
    """
    Load chaotic time series dataset with train/test split.

    This function generates data from chaotic dynamical systems using reservoirpy
    and formats it for reservoir computing tasks using sliding windows.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset. Supported options:
        - 'lorentz69' or 'lorenz' or 'lorenz63': Lorenz 1963 attractor (3D system)
        - 'mackey_glass' or 'mackeyglass' or 'mg': Mackey-Glass delay system (1D system)
    n_samples : int, default=5000
        Number of time steps to generate from the chaotic system
    train_fraction : float, default=0.7
        Fraction of data to use for training (between 0 and 1)
    n_in : int, default=100
        Number of time steps in each input window
    n_out : int, default=1
        Number of time steps in each output window (prediction horizon)
    seed : int, optional
        Random seed for reproducibility (used for Mackey-Glass)
    val_fraction : float, optional
        Fraction of training data to use for validation (between 0 and 1).
        If None (default), no validation split is created.
        If set, the function will:
        - Split validation set BEFORE sliding window (prevents data leakage)
        - Automatically enable standardization (standardize=True)
        - Fit StandardScaler only on the final training set
        - Return 7 items: x_train, y_train, x_val, y_val, x_test, y_test, scaler
    standardize : bool, default=False
        Whether to standardize the data using StandardScaler.
        If True (and val_fraction=None), returns 5 items including the scaler.
        If val_fraction is set, standardization is automatically enabled.
        Standardization is recommended for reservoir computing as it:
        - Normalizes features to similar scales
        - Improves Ridge regression performance
        - Accelerates neural network training
    **kwargs : dict
        Additional parameters passed to the generator function:
        - For Lorenz: sigma, rho, beta, h (time step), x0 (initial condition)
        - For Mackey-Glass: tau, a, b, n, h (time step), x0 (initial value)

    Returns
    -------
    When standardize=False and val_fraction=None (default, backward compatible):
        x_train : ndarray of shape (n_train_samples, n_in, n_features)
            Training input windows
        y_train : ndarray of shape (n_train_samples, n_out, n_features)
            Training output windows (targets)
        x_test : ndarray of shape (n_test_samples, n_in, n_features)
            Test input windows
        y_test : ndarray of shape (n_test_samples, n_out, n_features)
            Test output windows (targets)

    When standardize=True and val_fraction=None:
        x_train : ndarray of shape (n_train_samples, n_in, n_features)
            Training input windows (standardized)
        y_train : ndarray of shape (n_train_samples, n_out, n_features)
            Training output windows (standardized)
        x_test : ndarray of shape (n_test_samples, n_in, n_features)
            Test input windows (standardized)
        y_test : ndarray of shape (n_test_samples, n_out, n_features)
            Test output windows (standardized)
        scaler : StandardScaler
            Fitted scaler object (fitted only on training data)

    When val_fraction is set (standardize automatically enabled):
        x_train : ndarray of shape (n_train_final_samples, n_in, n_features)
            Training input windows (standardized)
        y_train : ndarray of shape (n_train_final_samples, n_out, n_features)
            Training output windows (standardized)
        x_val : ndarray of shape (n_val_samples, n_in, n_features)
            Validation input windows (standardized)
        y_val : ndarray of shape (n_val_samples, n_out, n_features)
            Validation output windows (standardized)
        x_test : ndarray of shape (n_test_samples, n_in, n_features)
            Test input windows (standardized)
        y_test : ndarray of shape (n_test_samples, n_out, n_features)
            Test output windows (standardized)
        scaler : StandardScaler
            Fitted scaler object (fitted only on training data)

    Examples
    --------
    >>> # Load Lorenz system data
    >>> x_train, y_train, x_test, y_test = load('lorentz69', n_samples=5000, seed=42)
    >>> print(x_train.shape)  # (3400, 100, 3)
    >>> print(y_train.shape)  # (3400, 1, 3)

    >>> # Load Mackey-Glass data
    >>> x_train, y_train, x_test, y_test = load('mackey_glass', n_samples=5000, seed=42)
    >>> print(x_train.shape)  # (3400, 100, 1)
    >>> print(y_train.shape)  # (3400, 1, 1)

    Notes
    -----
    - Input: Past n_in time steps
    - Output: Future n_out time steps (immediately following the input window)
    - Data is split chronologically (earlier data for training, later for testing)
    - Reproducible when seed is set (for Mackey-Glass)
    - Data generation uses numerical integration:
      * Lorenz: scipy.integrate.solve_ivp for stable ODE integration
      * Mackey-Glass: Euler integration for delay differential equation
    - When val_fraction is used:
      * Validation split happens BEFORE sliding window to prevent data leakage
      * StandardScaler is fitted ONLY on final training data (not on validation)
      * This ensures validation set statistics don't leak into training preprocessing
      * All datasets (train/val/test) are standardized using the same scaler
    """
    # Normalize dataset name
    dataset_name = dataset_name.lower().replace('-', '_').replace(' ', '_')

    # Generate raw time series using custom integrators
    if dataset_name in ['lorentz69', 'lorenz', 'lorenz63']:
        # Extract Lorenz-specific parameters with defaults
        sigma = kwargs.get('sigma', 10.0)
        rho = kwargs.get('rho', 28.0)
        beta = kwargs.get('beta', 8.0/3.0)
        h = kwargs.get('h', 0.01)
        x0 = kwargs.get('x0', [1.0, 1.0, 1.0])

        raw_data = _generate_lorenz(
            n_timesteps=n_samples,
            sigma=sigma,
            rho=rho,
            beta=beta,
            h=h,
            x0=x0
        )

    elif dataset_name in ['mackey_glass', 'mackeyglass', 'mg']:
        # Extract Mackey-Glass-specific parameters with defaults
        tau = kwargs.get('tau', 17)
        a = kwargs.get('a', 0.2)
        b = kwargs.get('b', 0.1)
        n = kwargs.get('n', 10)
        h = kwargs.get('h', 1.0)
        x0 = kwargs.get('x0', 1.2)

        raw_data = _generate_mackey_glass(
            n_timesteps=n_samples,
            tau=tau,
            a=a,
            b=b,
            n=n,
            h=h,
            x0=x0,
            seed=seed
        )

    else:
        raise ValueError(
            f"Unknown dataset: '{dataset_name}'. "
            f"Supported: 'lorentz69', 'lorenz', 'lorenz63', 'mackey_glass', 'mackeyglass', 'mg'"
        )

    # Split time series first (to avoid data leakage)
    if not (0.0 < train_fraction < 1.0):
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction}")

    n_timesteps = len(raw_data)
    n_train_timesteps = int(n_timesteps * train_fraction)

    train_data = raw_data[:n_train_timesteps]
    test_data = raw_data[n_train_timesteps:]

    # Automatically enable standardization when val_fraction is set
    if val_fraction is not None:
        standardize = True

    # Mode 1: No standardization, no validation (backward compatible)
    if not standardize and val_fraction is None:
        # Check data sufficiency before sliding window
        _check_sufficient_data(train_data, n_in, n_out, "Training data")
        _check_sufficient_data(test_data, n_in, n_out, "Test data")

        x_train, y_train = _sliding_window(train_data, n_in=n_in, n_out=n_out)
        x_test, y_test = _sliding_window(test_data, n_in=n_in, n_out=n_out)
        return x_train, y_train, x_test, y_test

    # Mode 2: Standardization without validation split
    elif standardize and val_fraction is None:
        # Check data sufficiency before processing
        _check_sufficient_data(train_data, n_in, n_out, "Training data")
        _check_sufficient_data(test_data, n_in, n_out, "Test data")

        # Fit scaler on training data
        scaler = StandardScaler()
        scaler.fit(train_data)

        # Transform both datasets
        train_scaled = scaler.transform(train_data)
        test_scaled = scaler.transform(test_data)

        # Create sliding windows
        x_train, y_train = _sliding_window(train_scaled, n_in=n_in, n_out=n_out)
        x_test, y_test = _sliding_window(test_scaled, n_in=n_in, n_out=n_out)

        return x_train, y_train, x_test, y_test, scaler

    # Mode 3: With validation set and standardization (strict no-leakage)
    else:
        if not (0.0 < val_fraction < 1.0):
            raise ValueError(f"val_fraction must be between 0 and 1, got {val_fraction}")

        # Split validation from training BEFORE sliding window
        n_train_final_timesteps = int(len(train_data) * (1 - val_fraction))
        train_final_data = train_data[:n_train_final_timesteps]
        val_data = train_data[n_train_final_timesteps:]

        # Check data sufficiency for all three splits
        _check_sufficient_data(train_final_data, n_in, n_out, "Final training data")
        _check_sufficient_data(val_data, n_in, n_out, "Validation data")
        _check_sufficient_data(test_data, n_in, n_out, "Test data")

        # Fit scaler ONLY on final training data (no leakage)
        scaler = StandardScaler()
        scaler.fit(train_final_data)

        # Transform all three datasets
        train_final_scaled = scaler.transform(train_final_data)
        val_scaled = scaler.transform(val_data)
        test_scaled = scaler.transform(test_data)

        # Create sliding windows on scaled data
        x_train, y_train = _sliding_window(train_final_scaled, n_in=n_in, n_out=n_out)
        x_val, y_val = _sliding_window(val_scaled, n_in=n_in, n_out=n_out)
        x_test, y_test = _sliding_window(test_scaled, n_in=n_in, n_out=n_out)

        return x_train, y_train, x_val, y_val, x_test, y_test, scaler