Source code for infomeasure.estimators.entropy.bonachela

"""Module for the Bonachela entropy estimator."""

from numpy import log
from numpy import arange, sum as np_sum

from infomeasure.estimators.base import DiscreteHEstimator
from infomeasure.utils.exceptions import TheoreticalInconsistencyError



[docs]
class BonachelaEntropyEstimator(DiscreteHEstimator):
    r"""Bonachela (Bonachela-Hinrichsen-Muñoz) entropy estimator for discrete data.

    The Bonachela estimator computes the Shannon entropy using the formula from
    :cite:p:`bonachelaEntropyEstimatesSmall2008`:

    .. math::

        \hat{H}_{B} = \frac{1}{N+2} \sum_{i=1}^{K} \left( (n_i + 1) \sum_{j=n_i + 2}^{N+2} \frac{1}{j} \right)

    where :math:`n_i` are the counts for each unique value, :math:`K` is the number of
    unique values, and :math:`N` is the total number of observations.

    This estimator is specially designed to provide a compromise between low bias and
    small statistical errors for short data series, particularly when the data sets are
    small and the probabilities are not close to zero.

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy.
    """

    def _simple_entropy(self):
        """Calculate the Bonachela entropy of the data.

        Returns
        -------
        float
            The calculated Bonachela entropy.
        """
        # Get counts and total observations
        counts = self.data[0].counts
        N = self.data[0].N

        # Vectorized computation
        # For each count ni = count + 1, we need sum(1/j for j in range(ni + 1, N + 3))
        ni_values = counts + 1  # Shape: (K,)

        # Create array of all possible j values from 2 to N+2
        j_values = arange(2, N + 3)  # Shape: (N+1,)

        # Create a mask matrix where mask[i, j] is True if j_values[j] > ni_values[i]
        # This uses broadcasting: ni_values[:, None] has shape (K, 1), j_values has shape (N+1,)
        mask = j_values[None, :] > ni_values[:, None]  # Shape: (K, N+1)

        # Create reciprocal array: 1/j for each j
        reciprocals = 1.0 / j_values  # Shape: (N+1,)

        # Apply mask and sum along j dimension to get inner sums for each count
        # mask * reciprocals[None, :] broadcasts reciprocals to shape (K, N+1)
        inner_sums = np_sum(mask * reciprocals[None, :], axis=1)  # Shape: (K,)

        # Calculate contributions: ni * inner_sum for each count
        contributions = ni_values * inner_sums  # Shape: (K,)

        # Sum all contributions
        acc = np_sum(contributions)

        # Calculate final entropy with normalization factor
        ent = acc / (N + 2)

        # Convert to the desired base if needed
        if self.base != "e":
            ent /= log(self.base)

        return ent

    def _extract_local_values(self):
        """Calculate local Bonachela entropy values for each data point.

        Returns
        -------
        ndarray[float]
            The calculated local values of Bonachela entropy.
        """
        raise TheoreticalInconsistencyError(
            "Local values are not implemented for Bonachela estimator due to "
            "theoretical inconsistencies in the mathematical foundation."
        )

    def _cross_entropy(self):
        """Calculate cross-entropy between two distributions using Bonachela estimator.

        Returns
        -------
        float
            The calculated cross-entropy.
        """
        raise TheoreticalInconsistencyError(
            "Cross-entropy is not implemented for Bonachela estimator due to "
            "theoretical inconsistencies in applying bias corrections from "
            "different distributions."
        )