Source code for infomeasure.estimators.entropy.kozachenko_leonenko

"""Module for the Kozacenko-Leonenko entropy estimator."""

from numpy import column_stack
from numpy import inf, log, issubdtype, integer
from scipy.spatial import KDTree
from scipy.special import digamma

from ..base import EntropyEstimator, RandomGeneratorMixin
from ..utils.array import assure_2d_data
from ..utils.unit_ball_volume import unit_ball_volume
from ... import Config
from ...utils.types import LogBaseType


class KozachenkoLeonenkoEntropyEstimator(RandomGeneratorMixin, EntropyEstimator):
    r"""Kozachenko-Leonenko estimator for Shannon entropies.

    Attributes
    ----------
    data : array-like
        The data used to estimate the entropy.
    k : int
        The number of nearest neighbors to consider.
    noise_level : float
        The standard deviation of the Gaussian noise to add to the data to
        avoid issues with zero distances.
    minkowski_p : float, :math:`1 \leq p \leq \infty`
        The power parameter for the Minkowski metric.
        Default is np.inf for maximum norm. Use 2 for Euclidean distance.

    Raises
    ------
    ValueError
        If the number of nearest neighbors is not a positive integer
    ValueError
        If the noise level is negative
    ValueError
        If the Minkowski power parameter is outside :math:`[1, \infty]`

    Notes
    -----
    Changing the number of nearest neighbors ``k`` can change the outcome,
    but the default value of :math:`k=4` is recommended by :cite:p:`miKSG2004`.
    """

    def __init__(
        self,
        data,
        *,  # all following parameters are keyword-only
        k: int = 4,
        noise_level=1e-10,
        minkowski_p=inf,
        base: LogBaseType = Config.get("base"),
    ):
        r"""Initialize the Kozachenko-Leonenko estimator.

        Parameters
        ----------
        data : array-like
            The data used to estimate the entropy. It is forwarded to the
            parent initializer and stored after being passed through
            ``assure_2d_data``.
        k : int
            The number of nearest neighbors to consider.
        noise_level : float
            The standard deviation of the Gaussian noise to add to the data to
            avoid issues with zero distances.
        minkowski_p : float, :math:`1 \leq p \leq \infty`
            The power parameter for the Minkowski metric.
            Default is np.inf for maximum norm. Use 2 for Euclidean distance.
        base : LogBaseType
            The logarithm base of the result; forwarded to the parent
            initializer. The default is read from ``Config`` when the class
            is defined.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, if ``noise_level`` is
            negative, or if ``minkowski_p`` is outside :math:`[1, \infty]`.
        """
        # Accept Python ints as well as numpy integer scalars
        if not issubdtype(type(k), integer) or k <= 0:
            raise ValueError(
                "The number of nearest neighbors (k) must be a positive "
                f"integer, but got {k}."
            )
        if noise_level < 0:
            raise ValueError(
                f"The noise level must be non-negative, but got {noise_level}."
            )
        # The chained comparison also rejects NaN, which fails both bounds
        if not (1 <= minkowski_p <= inf):
            raise ValueError(
                "The Minkowski power parameter must be in the range [1, inf], "
                f"but got {minkowski_p}."
            )
        super().__init__(data, base=base)
        self.data = assure_2d_data(data)
        self.k = k
        self.noise_level = noise_level
        self.minkowski_p = minkowski_p

    def _simple_entropy(self):
        """Calculate the local entropy values of the data.

        Each sample contributes
        ``-digamma(k) + digamma(N) + log(c_d) + d * log(2 * eps)``, where
        ``eps`` is the distance to its ``k``-th nearest neighbor, ``N`` the
        number of samples, ``d`` the number of columns and ``c_d`` the value
        returned by ``unit_ball_volume(d, r=1 / 2, p=minkowski_p)``.

        Returns
        -------
        ndarray
            One entropy value per sample, in the configured base. Reducing
            them to a single number is presumably done by the parent class
            (not visible in this module).
        """
        # Work on a float copy so the stored data is never modified
        data_noisy = self.data.astype(float).copy()
        # Add small Gaussian noise to avoid issues with zero distances.
        # ``self.rng`` is presumably provided by RandomGeneratorMixin.
        if self.noise_level:
            data_noisy += self.rng.normal(0, self.noise_level, self.data.shape)

        # Build a KDTree for efficient nearest neighbor search
        tree = KDTree(data_noisy)
        # Every query point is itself part of the tree, hence k + 1 neighbors
        # are requested to obtain k neighbors besides the point itself.
        distances, _ = tree.query(data_noisy, self.k + 1, p=self.minkowski_p)
        # Only keep the k-th nearest neighbor distance
        distances = distances[:, -1]

        # Constants for the entropy formula
        N = self.data.shape[0]
        d = self.data.shape[1]
        # Ball-volume constant for the configured Minkowski metric
        c_d = unit_ball_volume(d, r=1 / 2, p=self.minkowski_p)

        # Compute the local entropies (natural logarithm)
        local_h = -digamma(self.k) + digamma(N) + log(c_d) + d * log(2 * distances)
        # Return in the desired base
        return local_h / log(self.base) if self.base != "e" else local_h

    def _joint_entropy(self):
        """Calculate the joint entropy of the data.

        This is done by joining the variables into one space and calculating
        the entropy.

        Returns
        -------
        ndarray
            The values returned by ``_simple_entropy`` for the joined data.
        """
        # NOTE(review): this rebinds ``self.data`` to the joined array; a
        # second call would apply ``column_stack`` to the already-joined
        # array again - confirm callers invoke this at most once.
        self.data = column_stack(self.data)
        return self._simple_entropy()