Source code for infomeasure.estimators.entropy.discrete

"""Module for the discrete entropy estimator."""

from numpy import sum as np_sum, ndarray

from ..base import DiscreteHEstimator
from ...utils.config import logger



[docs]
class DiscreteEntropyEstimator(DiscreteHEstimator):
    r"""Standard Shannon entropy estimator for discrete data using maximum likelihood.

    The discrete entropy estimator computes the Shannon entropy using the classical
    maximum likelihood approach:

    .. math::

        \hat{H} = -\sum_{i=1}^{K} \hat{p}_i \log \hat{p}_i

    where :math:`\hat{p}_i = \frac{n_i}{N}` are the empirical probabilities,
    :math:`n_i` are the counts for each unique value :math:`i`, :math:`K` is the number of
    unique values, and :math:`N` is the total number of observations.

    This is the most fundamental entropy estimator and serves as the baseline for
    comparison with other bias-corrected estimators. While it provides an asymptotically
    unbiased estimate of the true entropy, it can exhibit significant bias for small
    sample sizes, particularly when the number of unique values is large relative to
    the sample size.

    The estimator is suitable for:

    - Large datasets where bias is minimal
    - Baseline comparisons with bias-corrected estimators
    - Applications where computational simplicity is preferred
    - Well-sampled distributions with sufficient observations per unique value

    For small sample sizes or distributions with many rare events, consider using
    bias-corrected estimators
    such as :class:`~infomeasure.estimators.entropy.chao_shen.ChaoShenEntropyEstimator`,
    :class:`~infomeasure.estimators.entropy.bonachela.BonachelaEntropyEstimator`,
    or :class:`~infomeasure.estimators.entropy.zhang.ZhangEntropyEstimator`.

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy. For joint entropy, multiple arrays
        can be provided.
    base : float or str, default=Config.get("base")
        The logarithm base for entropy calculation. Common values are 2 (bits),
        10 (dits), or 'e' (nats).

    Examples
    --------
    >>> import infomeasure as im
    >>> # Simple entropy calculation
    >>> data = [1, 1, 2, 3, 3, 4, 5]
    >>> entropy_value = im.entropy(data, approach="discrete")
    >>> print(f"Entropy: {entropy_value:.3f} nats")
    Entropy: 1.550 nats
    >>> # Local values
    >>> estimator = im.estimator(data, measure="h", approach="discrete")
    >>> estimator.local_vals()
    array([1.25276297, 1.25276297, 1.94591015, 1.25276297, 1.25276297,
       1.94591015, 1.94591015])
    """

    def _simple_entropy(self):
        """Calculate the entropy of the data.

        Returns
        -------
        float
            The calculated entropy.
        """
        probabilities = self.data[0].probabilities
        # Calculate the entropy
        return -np_sum(probabilities * self._log_base(probabilities))

    @property
    def dist_dict(self):
        """Return the distribution dictionary for JSD."""
        return self.data[0].distribution_dict

    def _extract_local_values(self):
        """Separately, calculate the local values.

        Returns
        -------
        ndarray[float]
            The calculated local values of entropy.
        """
        distribution_dict = dict(zip(self.data[0].uniq, self.data[0].probabilities))
        p_local = [distribution_dict[val] for val in self.data[0].data]
        return -self._log_base(p_local)

    def _cross_entropy(self) -> float:
        """Calculate the cross-entropy between two distributions.

        Returns
        -------
        float
            The calculated cross-entropy.
        """
        # Calculate distribution of both data sets
        uniq_p = self.data[0].uniq
        dist_p = self.data[0].distribution_dict
        uniq_q = self.data[1].uniq
        dist_q = self.data[1].distribution_dict
        # Only consider the values where both RV have the same support
        uniq = list(set(uniq_p).intersection(set(uniq_q)))  # P ∩ Q
        if len(uniq) == 0:
            logger.warning("No common support between the two distributions.")
            return 0.0
        return -np_sum([dist_p[val] * self._log_base(dist_q[val]) for val in uniq])