Source code for infomeasure.estimators.entropy.kozachenko_leonenko

"""Module for the Kozachenko-Leonenko entropy estimator."""

from numpy import column_stack, sum as np_sum, nan, isnan
from numpy import inf, log, issubdtype, integer
from scipy.spatial import KDTree
from scipy.special import digamma

from ..base import EntropyEstimator
from ..mixins import RandomGeneratorMixin
from ..utils.array import assure_2d_data
from ..utils.unit_ball_volume import unit_ball_volume
from ... import Config
from ...utils.types import LogBaseType


[docs] class KozachenkoLeonenkoEntropyEstimator(RandomGeneratorMixin, EntropyEstimator): r"""Kozachenko-Leonenko entropy estimator for continuous data. The Kozachenko-Leonenko estimator computes the Shannon entropy of continuous data using nearest neighbor distances. The estimator is based on the method from :cite:p:`kozachenko1987sample` and follows the implementation approach described in :cite:p:`miKSG2004`. .. math:: \hat{H}_{KL} = -\psi(k) + \psi(N) + \log(c_d) + \frac{d}{N} \sum_{i=1}^{N} \log(2\rho_{k,i}) where :math:`\psi` is the digamma function, :math:`k` is the number of nearest neighbors, :math:`N` is the number of data points, :math:`d` is the dimensionality, :math:`c_d` is the volume of the :math:`d`-dimensional unit ball for the chosen Minkowski norm, and :math:`\rho_{k,i}` is the distance to the :math:`k`-th nearest neighbor of point :math:`i`. This estimator is particularly suitable for continuous multivariate data and provides asymptotically unbiased estimates of differential entropy. The method works by exploiting the relationship between nearest neighbor distances and local density, making it effective for high-dimensional data where traditional histogram-based methods fail. Parameters ---------- *data : array-like The continuous data used to estimate the entropy. For multivariate data, each variable should be a column. k : int, default=4 The number of nearest neighbors to consider. Higher values provide more stable estimates but may introduce bias. The default value of 4 is recommended by :cite:p:`miKSG2004`. noise_level : float, default=1e-10 The standard deviation of Gaussian noise added to the data to avoid issues with zero distances between identical points. Set to 0 to disable noise addition. minkowski_p : float, default=inf The power parameter for the Minkowski metric used in distance calculations. Common values are 2 (Euclidean distance) and inf (maximum norm/Chebyshev distance). Must satisfy :math:`1 \leq p \leq \infty`. ksg_id : int, default=1 The KSG estimator variant to use (1 or 2). Type I uses the standard formula. Type II uses a modified formula with :math:`\psi(k) - 1/k`. base : LogBaseType, default=Config.get("base") The logarithm base for entropy calculation. Can be 2, 10, "e", or any positive number. Attributes ---------- *data : tuple[array-like] The processed data used to estimate the entropy, converted to 2D arrays. k : int The number of nearest neighbors to consider. noise_level : float The standard deviation of the Gaussian noise added to the data. minkowski_p : float The power parameter for the Minkowski metric. ksg_id : int The KSG estimator variant to use. Raises ------ ValueError If the number of nearest neighbors is not a positive integer. ValueError If the noise level is negative. ValueError If the Minkowski power parameter is invalid (not in range [1, ∞]). Notes ----- The choice of the number of nearest neighbors :math:`k` affects the bias-variance tradeoff of the estimator. Smaller values of :math:`k` reduce bias but increase variance, while larger values have the opposite effect. The default value of :math:`k=4` provides a good balance for most applications. The noise addition helps handle datasets with repeated values or points that are exactly identical, which would otherwise result in zero distances and numerical issues. The noise level should be small enough not to significantly alter the underlying distribution. For high-dimensional data, the curse of dimensionality may affect the estimator's performance, as nearest neighbor distances become less informative. In such cases, dimensionality reduction or alternative entropy estimation methods may be preferable. Examples -------- >>> import numpy as np >>> import infomeasure as im >>> >>> # Generate 2D Gaussian data >>> np.random.seed(176250) >>> data = np.random.multivariate_normal([0, 0], [[1, 0.5], [0.5, 1]], 1000) >>> >>> # Estimate entropy >>> estimator = im.estimator(data, measure="h", approach="kl", k=4) >>> entropy_value = estimator.result() >>> print(f"Estimated entropy: {entropy_value:.3f}") Estimated entropy: 2.678 >>> print(f"Local values: {estimator.local_vals()}") Local values: [ 3.15330798 2.02688591 2.52250064 2.95236651 3.58801879 1.42033673 ... 2.91254223 1.92823136 3.63647704 2.05589055] >>> # Use different distance metric >>> estimator_euclidean = KozachenkoLeonenkoEntropyEstimator(data, k=4, minkowski_p=2) >>> entropy_euclidean = estimator_euclidean.entropy() np.float64(2.6772465397252208) """ def __init__( self, *data, k: int = 4, ksg_id: int = 1, noise_level=1e-10, minkowski_p=inf, base: LogBaseType = Config.get("base"), ): r"""Initialize the Kozachenko-Leonenko estimator. Parameters ---------- k : int The number of nearest neighbors to consider. noise_level : float The standard deviation of the Gaussian noise to add to the data to avoid issues with zero distances. minkowski_p : float, :math:`1 \leq p \leq \infty` The power parameter for the Minkowski metric. Default is np.inf for maximum norm. Use 2 for Euclidean distance. """ if not issubdtype(type(k), integer) or k <= 0: raise ValueError( "The number of nearest neighbors (k) must be a positive " f"integer, but got {k}." ) if noise_level < 0: raise ValueError( f"The noise level must be non-negative, but got {noise_level}." ) if not (1 <= minkowski_p <= inf): raise ValueError( "The Minkowski power parameter must be positive, " f"but got {minkowski_p}." ) super().__init__(*data, base=base) self.data = tuple(assure_2d_data(var) for var in self.data) self.k = k if ksg_id not in {1, 2}: raise ValueError(f"ksg_id must be 1 or 2, but got {ksg_id}.") self.ksg_id = ksg_id self.noise_level = noise_level self.minkowski_p = minkowski_p def _simple_entropy(self): """Calculate the entropy of the data. Returns ------- float The calculated entropy. """ # Copy the data to avoid modifying the original data_noisy = self.data[0].astype(float).copy() # Add small Gaussian noise to data to avoid issues with zero distances if self.noise_level and self.noise_level != 0: data_noisy += self.rng.normal(0, self.noise_level, self.data[0].shape) # Build a KDTree for efficient nearest neighbor search with maximum norm tree = KDTree(data_noisy) # Find the k-th nearest neighbors for each point distances, _ = tree.query(data_noisy, self.k + 1, p=self.minkowski_p) # Only keep the k-th nearest neighbor distance distances = distances[:, -1] # Constants for the entropy formula N = self.data[0].shape[0] d = self.data[0].shape[1] # Volume of the d-dimensional unit ball for maximum norm c_d = unit_ball_volume(d, r=1 / 2, p=self.minkowski_p) distances[distances == 0] = nan # Compute the local entropies psi_k = digamma(self.k) if self.ksg_id == 2: psi_k -= 1.0 / self.k local_h = -psi_k + digamma(N) + log(c_d) + d * log(2 * distances) local_h[isnan(local_h)] = 0.0 # return in desired base return local_h / log(self.base) if self.base != "e" else local_h def _joint_entropy(self): """Calculate the joint entropy of the data. This is done by joining the variables into one space and calculating the entropy. Returns ------- float The calculated joint entropy. """ self.data = (column_stack(self.data[0]),) return self._simple_entropy() def _cross_entropy(self) -> float: """Calculate the cross-entropy between two distributions. Returns ------- float The calculated cross-entropy. """ # Copy the data to avoid modifying the original data_noisy_p = self.data[0].astype(float).copy() data_noisy_q = self.data[1].astype(float).copy() # Add small Gaussian noise to data to avoid issues with zero distances if self.noise_level and self.noise_level != 0: data_noisy_p += self.rng.normal(0, self.noise_level, self.data[0].shape) data_noisy_q += self.rng.normal(0, self.noise_level, self.data[1].shape) # Build a KDTree for efficient nearest neighbor search with maximum norm tree = KDTree(data_noisy_q) # Find the k-th nearest neighbors for each point distances, _ = tree.query(data_noisy_p, self.k, p=self.minkowski_p) # Only keep the k-th nearest neighbor distance distances = distances[:, -1] # Constants for the entropy formula M = self.data[1].shape[0] d = self.data[1].shape[1] # Volume of the d-dimensional unit ball for maximum norm c_d = unit_ball_volume(d, r=1 / 2, p=self.minkowski_p) # Compute the cross-entropy psi_k = digamma(self.k) if self.ksg_id == 2: psi_k -= 1.0 / self.k hx = ( -psi_k + digamma(M) + log(c_d) + d * np_sum(log(2 * distances[distances > 0])) / M ) # return in desired base return hx / log(self.base) if self.base != "e" else hx