Source code for infomeasure.estimators.entropy.kozachenko_leonenko

"""Module for the Kozachenko-Leonenko entropy estimator."""

from numpy import column_stack, sum as np_sum, nan, isnan
from numpy import inf, log, issubdtype, integer
from scipy.spatial import KDTree
from scipy.special import digamma

from ..base import EntropyEstimator
from ..mixins import RandomGeneratorMixin
from ..utils.array import assure_2d_data
from ..utils.unit_ball_volume import unit_ball_volume
from ... import Config
from ...utils.types import LogBaseType



[docs]
class KozachenkoLeonenkoEntropyEstimator(RandomGeneratorMixin, EntropyEstimator):
    r"""Kozachenko-Leonenko entropy estimator for continuous data.

    The Kozachenko-Leonenko estimator computes the Shannon entropy of continuous
    data using nearest neighbor distances. The estimator is based on the method
    from :cite:p:`kozachenko1987sample` and follows the implementation approach
    described in :cite:p:`miKSG2004`.

    .. math::

        \hat{H}_{KL} = -\psi(k) + \psi(N) + \log(c_d) + \frac{d}{N} \sum_{i=1}^{N} \log(2\rho_{k,i})

    where :math:`\psi` is the digamma function, :math:`k` is the number of nearest
    neighbors, :math:`N` is the number of data points, :math:`d` is the dimensionality,
    :math:`c_d` is the volume of the :math:`d`-dimensional unit ball for the chosen
    Minkowski norm, and :math:`\rho_{k,i}` is the distance to the :math:`k`-th nearest
    neighbor of point :math:`i`.

    This estimator is particularly suitable for continuous multivariate data and
    provides asymptotically unbiased estimates of differential entropy. The method
    works by exploiting the relationship between nearest neighbor distances and
    local density, making it effective for high-dimensional data where traditional
    histogram-based methods fail.

    Parameters
    ----------
    *data : array-like
        The continuous data used to estimate the entropy. For multivariate data,
        each variable should be a column.
    k : int, default=4
        The number of nearest neighbors to consider. Higher values provide more
        stable estimates but may introduce bias. The default value of 4 is
        recommended by :cite:p:`miKSG2004`.
    noise_level : float, default=1e-10
        The standard deviation of Gaussian noise added to the data to avoid
        issues with zero distances between identical points. Set to 0 to disable
        noise addition.
    minkowski_p : float, default=inf
        The power parameter for the Minkowski metric used in distance calculations.
        Common values are 2 (Euclidean distance) and inf (maximum norm/Chebyshev
        distance). Must satisfy :math:`1 \leq p \leq \infty`.
    ksg_id : int, default=1
        The KSG estimator variant to use (1 or 2).
        Type I uses the standard formula. Type II uses a modified formula with
        :math:`\psi(k) - 1/k`.
    base : LogBaseType, default=Config.get("base")
        The logarithm base for entropy calculation. Can be 2, 10, "e", or any
        positive number.

    Attributes
    ----------
    *data : tuple[array-like]
        The processed data used to estimate the entropy, converted to 2D arrays.
    k : int
        The number of nearest neighbors to consider.
    noise_level : float
        The standard deviation of the Gaussian noise added to the data.
    minkowski_p : float
        The power parameter for the Minkowski metric.
    ksg_id : int
        The KSG estimator variant to use.

    Raises
    ------
    ValueError
        If the number of nearest neighbors is not a positive integer.
    ValueError
        If the noise level is negative.
    ValueError
        If the Minkowski power parameter is invalid (not in range [1, ∞]).

    Notes
    -----
    The choice of the number of nearest neighbors :math:`k` affects the bias-variance
    tradeoff of the estimator. Smaller values of :math:`k` reduce bias but increase
    variance, while larger values have the opposite effect. The default value of
    :math:`k=4` provides a good balance for most applications.

    The noise addition helps handle datasets with repeated values or points that
    are exactly identical, which would otherwise result in zero distances and
    numerical issues. The noise level should be small enough not to significantly
    alter the underlying distribution.

    For high-dimensional data, the curse of dimensionality may affect the estimator's
    performance, as nearest neighbor distances become less informative. In such cases,
    dimensionality reduction or alternative entropy estimation methods may be preferable.

    Examples
    --------
    >>> import numpy as np
    >>> import infomeasure as im
    >>>
    >>> # Generate 2D Gaussian data
    >>> np.random.seed(176250)
    >>> data = np.random.multivariate_normal([0, 0], [[1, 0.5], [0.5, 1]], 1000)
    >>>
    >>> # Estimate entropy
    >>> estimator = im.estimator(data, measure="h", approach="kl", k=4)
    >>> entropy_value = estimator.result()
    >>> print(f"Estimated entropy: {entropy_value:.3f}")
    Estimated entropy: 2.678
    >>> print(f"Local values: {estimator.local_vals()}")
    Local values: [ 3.15330798  2.02688591  2.52250064  2.95236651  3.58801879  1.42033673
        ...
        2.91254223  1.92823136  3.63647704  2.05589055]
    >>> # Use different distance metric
    >>> estimator_euclidean = KozachenkoLeonenkoEntropyEstimator(data, k=4, minkowski_p=2)
    >>> entropy_euclidean = estimator_euclidean.entropy()
    np.float64(2.6772465397252208)
    """

    def __init__(
        self,
        *data,
        k: int = 4,
        ksg_id: int = 1,
        noise_level=1e-10,
        minkowski_p=inf,
        base: LogBaseType = Config.get("base"),
    ):
        r"""Initialize the Kozachenko-Leonenko estimator.

        Parameters
        ----------
        k : int
            The number of nearest neighbors to consider.
        noise_level : float
            The standard deviation of the Gaussian noise to add to the data to avoid
            issues with zero distances.
        minkowski_p : float, :math:`1 \leq p \leq \infty`
            The power parameter for the Minkowski metric.
            Default is np.inf for maximum norm. Use 2 for Euclidean distance.
        """
        if not issubdtype(type(k), integer) or k <= 0:
            raise ValueError(
                "The number of nearest neighbors (k) must be a positive "
                f"integer, but got {k}."
            )
        if noise_level < 0:
            raise ValueError(
                f"The noise level must be non-negative, but got {noise_level}."
            )
        if not (1 <= minkowski_p <= inf):
            raise ValueError(
                "The Minkowski power parameter must be positive, "
                f"but got {minkowski_p}."
            )
        super().__init__(*data, base=base)
        self.data = tuple(assure_2d_data(var) for var in self.data)
        self.k = k
        if ksg_id not in {1, 2}:
            raise ValueError(f"ksg_id must be 1 or 2, but got {ksg_id}.")
        self.ksg_id = ksg_id
        self.noise_level = noise_level
        self.minkowski_p = minkowski_p

    def _simple_entropy(self):
        """Calculate the entropy of the data.

        Returns
        -------
        float
            The calculated entropy.
        """
        # Copy the data to avoid modifying the original
        data_noisy = self.data[0].astype(float).copy()
        # Add small Gaussian noise to data to avoid issues with zero distances
        if self.noise_level and self.noise_level != 0:
            data_noisy += self.rng.normal(0, self.noise_level, self.data[0].shape)

        # Build a KDTree for efficient nearest neighbor search with maximum norm
        tree = KDTree(data_noisy)

        # Find the k-th nearest neighbors for each point
        distances, _ = tree.query(data_noisy, self.k + 1, p=self.minkowski_p)
        # Only keep the k-th nearest neighbor distance
        distances = distances[:, -1]

        # Constants for the entropy formula
        N = self.data[0].shape[0]
        d = self.data[0].shape[1]
        # Volume of the d-dimensional unit ball for maximum norm
        c_d = unit_ball_volume(d, r=1 / 2, p=self.minkowski_p)

        distances[distances == 0] = nan
        # Compute the local entropies
        psi_k = digamma(self.k)
        if self.ksg_id == 2:
            psi_k -= 1.0 / self.k

        local_h = -psi_k + digamma(N) + log(c_d) + d * log(2 * distances)
        local_h[isnan(local_h)] = 0.0
        # return in desired base
        return local_h / log(self.base) if self.base != "e" else local_h

    def _joint_entropy(self):
        """Calculate the joint entropy of the data.

        This is done by joining the variables into one space
        and calculating the entropy.

        Returns
        -------
        float
            The calculated joint entropy.
        """
        self.data = (column_stack(self.data[0]),)
        return self._simple_entropy()

    def _cross_entropy(self) -> float:
        """Calculate the cross-entropy between two distributions.

        Returns
        -------
        float
            The calculated cross-entropy.
        """

        # Copy the data to avoid modifying the original
        data_noisy_p = self.data[0].astype(float).copy()
        data_noisy_q = self.data[1].astype(float).copy()
        # Add small Gaussian noise to data to avoid issues with zero distances
        if self.noise_level and self.noise_level != 0:
            data_noisy_p += self.rng.normal(0, self.noise_level, self.data[0].shape)
            data_noisy_q += self.rng.normal(0, self.noise_level, self.data[1].shape)

        # Build a KDTree for efficient nearest neighbor search with maximum norm
        tree = KDTree(data_noisy_q)

        # Find the k-th nearest neighbors for each point
        distances, _ = tree.query(data_noisy_p, self.k, p=self.minkowski_p)
        # Only keep the k-th nearest neighbor distance
        distances = distances[:, -1]

        # Constants for the entropy formula
        M = self.data[1].shape[0]
        d = self.data[1].shape[1]
        # Volume of the d-dimensional unit ball for maximum norm
        c_d = unit_ball_volume(d, r=1 / 2, p=self.minkowski_p)

        # Compute the cross-entropy
        psi_k = digamma(self.k)
        if self.ksg_id == 2:
            psi_k -= 1.0 / self.k

        hx = (
            -psi_k
            + digamma(M)
            + log(c_d)
            + d * np_sum(log(2 * distances[distances > 0])) / M
        )
        # return in desired base
        return hx / log(self.base) if self.base != "e" else hx