# Source code for infomeasure.estimators.entropy.ansb

"""Module for the Asymptotic NSB entropy estimator."""

from numpy import euler_gamma, log
from scipy.special import digamma

from infomeasure.estimators.base import DiscreteHEstimator
from ...utils.config import logger
from ...utils.exceptions import TheoreticalInconsistencyError
from ... import Config
from ...utils.types import LogBaseType


class AnsbEntropyEstimator(DiscreteHEstimator):
    r"""Asymptotic NSB entropy estimator.

    The Asymptotic NSB (ANSB) estimator provides entropy estimation for
    extremely undersampled discrete data where the number of unique values K
    is comparable to the sample size N.

    .. math::

        \hat{H}_{\text{ANSB}} = (C_\gamma - \log(2)) + 2 \log(N) - \psi(\Delta)

    where :math:`C_\gamma \approx 0.5772156649\dots` is Euler's constant,
    :math:`\psi` is the digamma function, and :math:`\Delta = N - K_{\text{obs}}`
    is the number of coincidences (repeated observations) in the data, with
    :math:`K_{\text{obs}}` the number of distinct values actually observed.

    This estimator is specifically designed for the extremely undersampled
    regime where :math:`K \sim N` and diverges with N when the data is
    well-sampled. The ANSB estimator requires that :math:`N/K \to 0`, which is
    checked by default using the ``undersampled`` parameter
    :cite:p:`nemenmanEntropyInformationNeural2004`.

    If there are no coincidences in the data (:math:`\Delta = 0`), ANSB
    returns NaN as the estimator is undefined in this case.

    Parameters
    ----------
    *data : array-like
        The data used to estimate the entropy.
    K : int, optional
        The support size. If not provided, uses the observed support size.
        It is only used for the ``N/K`` undersampling check; the number of
        coincidences is always computed from the observed distinct values.
    undersampled : float, default=0.1
        Maximum allowed ratio N/K to consider data sufficiently undersampled.
        A warning is issued if this threshold is exceeded.
    base : LogBaseType, default=Config.get("base")
        The logarithm base for entropy calculation.

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy.
    k_given : int or None
        The support size passed as ``K`` (``None`` if not provided).
    undersampled : float
        The N/K threshold above which a warning is logged.

    Notes
    -----
    The ANSB estimator is based on the asymptotic expansion of the NSB
    estimator for the case of extreme undersampling. It provides a
    computationally efficient alternative to the full NSB estimator when
    :math:`K \sim N`.

    Examples
    --------
    >>> import infomeasure as im
    >>> data = [1, 2, 3, 4, 5, 1, 2]  # Some repeated values
    >>> im.entropy(data, approach='ansb')
    np.float64(3.353104447353747)
    """

    def __init__(
        self,
        *data,
        K: int = None,
        undersampled: float = 0.1,
        base: LogBaseType = Config.get("base"),
    ):
        """Initialize the ANSB entropy estimator.

        Parameters
        ----------
        *data : array-like
            The data used to estimate the entropy.
        K : int, optional
            The support size used for the undersampling check.
        undersampled : float, default=0.1
            Maximum allowed ratio N/K before a warning is logged.
        base : LogBaseType, default=Config.get("base")
            The logarithm base for entropy calculation.
        """
        super().__init__(*data, base=base)
        self.k_given = K
        self.undersampled = undersampled

    def _simple_entropy(self):
        """Calculate the ANSB entropy of the data.

        Returns
        -------
        float
            The calculated ANSB entropy in the configured base, or NaN when
            the sample contains no coincidences.
        """
        sample = self.data[0]
        N = sample.N
        observed_k = sample.K
        # The user-supplied support size only informs the regime check below.
        support_size = observed_k if self.k_given is None else self.k_given

        # Check if data is sufficiently undersampled.
        # NOTE(review): with the observed K (<= N) the ratio is >= 1, so the
        # default threshold warns unless a larger support size K is supplied
        # -- confirm this is intended.
        ratio = N / support_size if support_size > 0 else float("inf")
        if ratio > self.undersampled:
            logger.warning(
                f"Data is not sufficiently undersampled (N/K = {ratio:.3f} > {self.undersampled}), "
                "so calculation may diverge..."
            )

        # Coincidences are a property of the sample: N minus the number of
        # distinct observed values. Using a supplied support size here would
        # make Delta negative whenever K > N (exactly the undersampled regime),
        # where digamma has poles and the estimate would always be NaN.
        coincidences = N - observed_k
        if coincidences <= 0:
            logger.warning("No coincidences in data - ANSB estimator is undefined")
            return float("nan")

        # ANSB formula: (γ - log(2)) + 2 * log(N) - ψ(Δ), evaluated in nats
        entropy = (euler_gamma - log(2)) + 2 * log(N) - digamma(coincidences)

        # Convert to the desired base if needed
        if self.base != "e":
            entropy /= log(self.base)
        return entropy

    def _extract_local_values(self):
        """Extract local values for ANSB estimator.

        Raises
        ------
        TheoreticalInconsistencyError
            Local values cannot be meaningfully extracted for ANSB estimator.
        """
        raise TheoreticalInconsistencyError(
            "Local values extraction is not implemented for ANSB estimator. "
            "The ANSB estimator is based on global statistics (coincidences) "
            "and does not provide meaningful local entropy values for individual data points."
        )

    def _cross_entropy(self):
        """Calculate cross-entropy between two distributions.

        Raises
        ------
        TheoreticalInconsistencyError
            Cross-entropy is not theoretically sound for ANSB estimator.
        """
        raise TheoreticalInconsistencyError(
            "Cross-entropy is not implemented for ANSB estimator. "
            "The ANSB estimator is designed for single distribution entropy estimation "
            "in the extremely undersampled regime and does not extend to cross-entropy "
            "calculations between different distributions."
        )