# Source code for infomeasure.estimators.entropy.chao_shen
"""Module for the Chao-Shen entropy estimator."""
from numpy import sum as np_sum, log
from infomeasure.estimators.base import DiscreteHEstimator
from ...utils.exceptions import TheoreticalInconsistencyError
# [docs]
class ChaoShenEntropyEstimator(DiscreteHEstimator):
    r"""Chao-Shen estimator of Shannon entropy.

    Each term of a coverage-adjusted plug-in entropy is divided by the
    probability that the corresponding species appears in the sample:

    .. math::

        \hat{H}_{CS} = - \sum_{i=1}^{K} \frac{\hat{p}_i^{CS} \log \hat{p}_i^{CS}}{1 - (1 - \hat{p}_i^{ML} C)^N}

    with the coverage-adjusted probabilities

    .. math::

        \hat{p}_i^{CS} = C \cdot \hat{p}_i^{ML}

    Here :math:`C = 1 - \frac{f_1}{N}` is the estimated sample coverage,
    :math:`f_1` the number of singletons (species seen exactly once),
    :math:`\hat{p}_i^{ML}` the maximum likelihood probability estimate,
    :math:`N` the sample size and :math:`K` the number of observed species
    :cite:p:`chaoNonparametricEstimationShannons2003`.

    Through the coverage estimate, the estimator corrects the bias caused by
    species that are present in the population but absent from the sample.

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy.
    """

    def _simple_entropy(self):
        """Compute the Chao-Shen entropy estimate of the data.

        Returns
        -------
        float
            The entropy estimate, summed over all observed bins and expressed
            in the logarithm base ``self.base``.
        """
        sample = self.data[0]
        n_obs = sample.N

        # Singletons: bins that were observed exactly once.
        singletons = np_sum(sample.counts == 1)
        if singletons == n_obs:
            # Every observation is a singleton; drop one so that the coverage
            # below does not collapse to zero.
            singletons -= 1

        coverage = 1 - singletons / n_obs

        # Empirical frequencies shrunk by the estimated coverage.
        p_adjusted = coverage * sample.probabilities
        # Probability that a bin (species) shows up in a sample of this size.
        p_seen = 1 - (1 - p_adjusted) ** n_obs

        # Chao-Shen (2003) estimator, in nats.
        entropy = -np_sum(p_adjusted * log(p_adjusted) / p_seen)

        if self.base == "e":
            return entropy
        # Convert from the natural logarithm to the requested base.
        return entropy / log(self.base)

    def _cross_entropy(self):
        """Reject cross-entropy, which this estimator does not support.

        Raises
        ------
        TheoreticalInconsistencyError
            Always. The Chao-Shen bias correction is tied to a single sample,
            so combining corrections from two distributions is not
            theoretically sound.
        """
        reason = (
            "Cross-entropy is not implemented for Chao-Shen estimator. "
            "The Chao-Shen correction creates theoretical inconsistencies when applied to cross-entropy: "
            "(1) Asymmetric nature problem - unclear which distribution should use Chao-Shen correction; "
            "(2) Coverage estimation issues - different coverage estimates for each distribution lack "
            "theoretical foundation when mixed; "
            "(3) Denominator complexity - the correction involves sample-specific denominators "
            "that are tied to individual distributions, making cross-distribution application arbitrary."
        )
        raise TheoreticalInconsistencyError(reason)