# Source code for infomeasure.estimators.entropy.miller_madow

"""Module for the discrete Miller-Madow entropy estimator."""

from numpy import log
from numpy import sum as np_sum

from infomeasure.estimators.base import DiscreteHEstimator
from ...utils.config import logger



# [docs]  (apparently a documentation-link marker left over from the rendered page)
class MillerMadowEntropyEstimator(DiscreteHEstimator):
    r"""Miller-Madow estimator for the entropy of discrete data.

    The estimator adds a bias-correction term to the maximum likelihood
    (plug-in) entropy:

    .. math::

        \hat{H}_{\tiny{MM}} = \hat{H}_{\tiny{MLE}} + \frac{K - 1}{2N}

    Here :math:`\hat{H}_{\tiny{MM}}` denotes the Miller-Madow entropy and
    :math:`\hat{H}_{\tiny{MLE}}` the maximum likelihood entropy
    (:class:`~infomeasure.estimators.entropy.discrete.DiscreteEntropyEstimator`).
    :math:`K` is the number of unique values in the data,
    and :math:`N` is the number of observations.

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy.
    """

    def _simple_entropy(self):
        """Compute the Miller-Madow entropy of the first data set.

        The plug-in entropy ``-sum(p * log(p))`` is evaluated in the
        configured logarithm base and the Miller-Madow correction is added.

        Returns
        -------
        float
            The calculated entropy.
        """
        probs = self.data[0].probabilities
        bias = self._mm_factor()

        # Plug-in (maximum likelihood) entropy, then shift by the correction.
        plugin_entropy = -np_sum(probs * self._log_base(probs))
        return plugin_entropy + bias

    def _extract_local_values(self):
        """Compute the local (per-observation) entropy values.

        Every observation is mapped to the probability of its value; its
        local value is the negative logarithm of that probability plus the
        Miller-Madow correction.

        Returns
        -------
        ndarray[float]
            The calculated local values of entropy.
        """
        sample = self.data[0]
        prob_of = sample.distribution_dict
        per_observation = [prob_of[obs] for obs in sample.data]

        bias = self._mm_factor()

        surprisal = -self._log_base(per_observation)
        return surprisal + bias

    def _mm_factor(self):
        """Return the Miller-Madow correction term ``(K - 1) / (2 * N)``.

        ``K`` and ``N`` are read from the first data set. The term is
        divided by ``log(self.base)`` unless the base is ``"e"``, so that it
        is expressed in the same logarithm base as the entropy.

        Returns
        -------
        float
            The correction term.
        """
        sample = self.data[0]
        n_unique = sample.K  # number of unique values
        n_obs = sample.N  # total observations
        bias = (n_unique - 1) / (2 * n_obs)
        if self.base == "e":
            return bias
        # Convert from nats to the configured base.
        return bias / log(self.base)

    def _cross_entropy(self) -> float:
        """Compute the Miller-Madow cross-entropy between two data sets.

        Only values occurring in both data sets contribute to the sum. The
        correction term uses the mean number of unique values of the two
        data sets minus one, divided by their summed number of observations.
        When the two data sets share no value, a warning is logged and
        ``0.0`` is returned.

        Returns
        -------
        float
            The calculated cross-entropy.
        """
        sample_p = self.data[0]
        sample_q = self.data[1]
        dist_p = sample_p.distribution_dict
        dist_q = sample_q.distribution_dict

        # Restrict the sum to the common support, P ∩ Q.
        shared = list(set(sample_p.uniq).intersection(set(sample_q.uniq)))
        if not shared:
            logger.warning("No common support between the two distributions.")
            return 0.0

        # Miller-Madow correction built from both data sets.
        total_obs = sample_p.N + sample_q.N
        mean_k_minus_one = ((sample_p.K + sample_q.K) / 2.0) - 1.0
        if self.base == "e":
            bias = mean_k_minus_one / total_obs
        else:
            bias = mean_k_minus_one / (total_obs * log(self.base))

        plugin_cross_entropy = -np_sum(
            [dist_p[val] * self._log_base(dist_q[val]) for val in shared]
        )
        return plugin_cross_entropy + bias