Source code for infomeasure.estimators.entropy.chao_wang_jost

"""Module for the Chao Wang Jost entropy estimator."""

from numpy import arange
from numpy import finfo
from numpy import log
from numpy import log1p
from numpy import sum as np_sum
from scipy.special import digamma

from infomeasure.estimators.base import DiscreteHEstimator
from ...utils.config import logger
from ...utils.exceptions import TheoreticalInconsistencyError



[docs]
class ChaoWangJostEntropyEstimator(DiscreteHEstimator):
    r"""Advanced bias-corrected Shannon entropy estimator using coverage estimation.

    The Chao-Wang-Jost estimator provides improved entropy estimates for incomplete sampling
    scenarios by accounting for unobserved species through sophisticated statistical corrections.
    This estimator is particularly valuable when dealing with ecological data, text analysis,
    or any discrete distribution where the sample may not capture all possible outcomes.

    The Chao-Wang-Jost estimator addresses the systematic underestimation of entropy in
    finite samples by applying sophisticated statistical corrections.
    Through coverage estimation using singleton and doubleton counts,
    it provides reliable entropy estimates even with small or incomplete samples.
    Based on species accumulation theory and Good-Turing estimation principles,
    this approach is particularly valuable when the sample doesn't capture all
    possible outcomes, such as in ecological diversity studies with incomplete species
    sampling or text analysis where vocabulary may be incompletely observed.
    The estimator is especially useful when standard entropy estimators show systematic
    bias due to sample size limitations.

    Standard entropy estimators often underestimate diversity in finite samples,
    especially when the sampling is incomplete. This estimator overcomes this limitation
    by leveraging information from rare species (singletons and doubletons) to estimate
    sample coverage and correct for unobserved species. The theoretical foundation in
    species accumulation curves and Good-Turing frequency estimation provides a robust
    statistical framework for addressing sampling bias issues.

    **Mathematical Foundation:**

    The estimator combines observed entropy with a correction term based on coverage estimation:

    .. math::

        \hat{H}_{\text{CWJ}} = \sum_{1 \leq n_i \leq N-1} \frac{n_i}{N} \left(\sum_{k=n_i}^{N-1} \frac{1}{k} \right) +
        \frac{f_1}{N} (1 - A)^{-N + 1} \left\{ - \log(A) - \sum_{r=1}^{N-1} \frac{1}{r} (1 - A)^r \right\}

    where the coverage parameter :math:`A` is estimated as:

    .. math::

        A = \begin{cases}
        \frac{2 f_2}{(N-1) f_1 + 2 f_2} \, & \text{if} \, f_2 > 0 \\
        \frac{2}{(N-1)(f_1 - 1) + 2} \, & \text{if} \, f_2 = 0, \; f_1 \neq 0 \\
        1, & \text{if} \, f_1 = f_2 = 0
        \end{cases}

    Here, :math:`f_1` represents the number of singletons (species observed exactly once) and
    :math:`f_2` the number of doubletons (species observed exactly twice) in the sample
    :cite:p:`chaoEntropySpeciesAccumulation2013`.

    Notes
    -----
    - The algorithm is adapted from the `entropart <https://ericmarcon.github.io/entropart/index.html>`_ R library :cite:p:`marconEntropartPackageMeasure2015`
    - The correction becomes negligible when samples are complete (:math:`f_1 = f_2 = 0`)
    - Since :math:`-\log(A) = \sum_{r \geq 1} (1 - A)^r / r`, the correction factor
      :math:`(1 - A)^{-N + 1} \{ \cdot \}` equals
      :math:`\sum_{j \geq 0} (1 - A)^{j + 1} / (N + j)`. The closed form subtracts two nearly
      equal numbers and then multiplies by :math:`(1 - A)^{-N + 1}`, so it is only evaluated
      directly while that factor is moderate; otherwise the series is summed instead.

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy.

    Examples
    --------
    >>> import infomeasure as im
    >>>
    >>> # Basic usage with incomplete sampling scenario
    >>> data = [1, 1, 2, 3, 4, 5]  # Many singletons suggest incomplete sampling
    >>> h_cwj = im.entropy(data, approach="chao_wang_jost", base=2)
    >>> h_standard = im.entropy(data, approach="discrete", base=2)
    >>> print(f"Chao-Wang-Jost: {h_cwj:.3f} bits")
    Chao-Wang-Jost: 3.635 bits
    >>> print(f"Standard: {h_standard:.3f} bits")
    Standard: 2.252 bits
    >>>
    >>> # Ecological diversity example
    >>> species_counts = [1, 1, 1, 2, 2, 3, 5, 8]  # Species abundance data
    >>> diversity = im.entropy(species_counts, approach="cwj", base="e")
    >>> print(f"Species diversity: {diversity:.3f} nats")
    Species diversity: 2.054 nats

    See Also
    --------
    infomeasure.estimators.functional.entropy : Functional interface for entropy calculation
    infomeasure.estimators.entropy.discrete.DiscreteEntropyEstimator : Standard maximum likelihood entropy estimator
    """

    # Upper bound on log((1 - A)^(1 - N)) for which the closed-form correction is
    # still evaluated directly. The closed form has an absolute rounding error of
    # roughly machine epsilon that gets multiplied by (1 - A)^(1 - N); with
    # exp(9) ~ 8e3 the amplified error stays around 1e-12. Above the bound the
    # equivalent series is summed, which involves no cancellation.
    _MAX_LOG_AMPLIFICATION = 9.0

    def _simple_entropy(self):
        """Calculate the Chao Wang Jost entropy of the data.

        Reads the sample size ``N`` and the per-symbol ``counts`` from
        ``self.data[0]`` and converts the result from nats to ``self.base``
        unless the base is ``"e"``.

        Returns
        -------
        float
            The calculated entropy value.
        """
        N = self.data[0].N
        counts = self.data[0].counts

        # Calculate singletons (f1) and doubletons (f2)
        f1 = np_sum(counts == 1)
        f2 = np_sum(counts == 2)

        # Without singletons and doubletons A is 1 and no correction is applied
        if f1 == 0 and f2 == 0:
            logger.warning(
                "There are no singletons and doubletons in the data, "
                "the corrections becomes negible."
            )

        # Calculate parameter A
        if f2 > 0:
            A = 2 * f2 / ((N - 1) * f1 + 2 * f2)
        elif f1 > 0:
            A = 2 / ((N - 1) * (f1 - 1) + 2)
        else:
            A = 1

        # First part of the formula: sum over observed counts
        # Using digamma(N) - digamma(n_i) = sum_{k=n_i}^{N-1} 1/k
        observed = counts[1 <= counts]
        cwj = (observed * (digamma(N) - digamma(observed))).sum() / N

        # Second part: correction term when A != 1
        if A != 1 and f1 > 0:
            # log of the factor (1 - A)^(1 - N) that multiplies the bracket
            log_amplification = (1 - N) * log1p(-A)
            if log_amplification <= self._MAX_LOG_AMPLIFICATION:
                # Well-conditioned: evaluate the published closed form directly.
                # Calculate sum_{r=1}^{N-1} (1/r) * (1-A)^r
                p2 = sum(1 / r * (1 - A) ** r for r in range(1, N))
                correction = f1 / N * (1 - A) ** (1 - N) * (-log(A) - p2)
            else:
                # Ill-conditioned: the bracket is a difference of nearly equal
                # numbers and the prefactor is huge (it may even overflow), so
                # sum the mathematically identical series instead.
                correction = f1 / N * self._coverage_tail_series(A, N)
            cwj += correction

        # Convert to the desired base
        if self.base != "e":
            cwj /= log(self.base)

        return cwj

    @staticmethod
    def _coverage_tail_series(A, N):
        r"""Evaluate the correction factor of the estimator as a series.

        Computes

        .. math::

            (1 - A)^{-N + 1} \left\{ -\log(A) - \sum_{r=1}^{N-1} \frac{(1 - A)^r}{r} \right\}
            = \sum_{j=0}^{\infty} \frac{(1 - A)^{j + 1}}{N + j}

        by summing the right-hand side, whose terms are all positive.

        Parameters
        ----------
        A : float
            Coverage parameter, expected to lie strictly between 0 and 1.
        N : int
            Sample size.

        Returns
        -------
        float
            The value of the correction factor (without the ``f1 / N`` prefactor).
        """
        x = 1 - A
        # Truncate once x**n_terms <= eps * A. The dropped remainder is bounded by
        # x**(n_terms + 1) / ((N + n_terms) * A), i.e. by eps times the first term.
        n_terms = int(log(finfo(float).eps * A) / log1p(-A)) + 1
        j = arange(n_terms)
        return np_sum(x ** (j + 1) / (N + j))

    def _extract_local_values(self):
        """Calculate local entropy values for each data point.

        Raises
        ------
        TheoreticalInconsistencyError
            Local values are not theoretically well-defined for Chao Wang Jost estimator
            due to the complex bias correction involving global sample statistics.
        """
        raise TheoreticalInconsistencyError(
            "Local values are not implemented for Chao Wang Jost estimator. "
            "The Chao Wang Jost correction involves global sample statistics (singletons, doubletons) "
            "and complex bias corrections that cannot be meaningfully decomposed into local contributions. "
            "The correction term depends on the entire sample structure and cannot be attributed to "
            "individual observations in a theoretically consistent manner."
        )

    def _cross_entropy(self):
        """Calculate cross-entropy between two distributions.

        Raises
        ------
        TheoreticalInconsistencyError
            Cross-entropy is not theoretically sound for Chao Wang Jost estimator
            due to fundamental issues with mixing bias corrections from different distributions.
        """
        raise TheoreticalInconsistencyError(
            "Cross-entropy is not implemented for Chao Wang Jost estimator. "
            "The Chao Wang Jost correction creates theoretical inconsistencies when applied to cross-entropy: "
            "(1) The bias correction depends on sample-specific statistics (singletons, doubletons) "
            "that are tied to individual distributions; "
            "(2) Mixing corrections from different distributions lacks theoretical foundation; "
            "(3) The complex correction terms involving coverage estimation cannot be meaningfully "
            "applied across different probability distributions."
        )