# Source code for infomeasure.estimators.entropy.bayes

"""Module for the Bayesian entropy estimator."""

from numpy import sum as np_sum

from infomeasure.estimators.base import DiscreteHEstimator
from ... import Config
from ...utils.config import logger
from ...utils.types import LogBaseType



# [docs]
class BayesEntropyEstimator(DiscreteHEstimator):
    r"""Bayesian entropy estimator.

    Computes an estimate of Shannon entropy using Bayesian probability estimates with
    a Dirichlet prior characterized by concentration parameter α. This approach provides
    a principled way to handle sparse data and incorporate prior knowledge about the
    probability distribution.

    The Bayesian probabilities are calculated as:

    .. math::

        p_k^{\text{Bayes}} = \frac{n_k + \alpha}{N + K \alpha}

    where :math:`n_k` is the count of symbol :math:`k`, :math:`N` is the total number
    of observations, :math:`K` is the support size (number of unique symbols), and
    :math:`\alpha` is the concentration parameter of the Dirichlet prior.

    The entropy is then :math:`-\sum p_k^{\text{Bayes}} \log p_k^{\text{Bayes}}`,
    same as the maximum likelihood entropy estimator,
    also supporting local entropy values.

    **Concentration Parameter Choices**

    The concentration parameter α controls the strength of the prior belief in uniform
    distribution. Several well-established choices are available
    (the string names are matched case-insensitively):

    **Jeffreys Prior** (``α = 0.5 = "jeffrey"``)
        Non-informative prior that is invariant under reparameterization.
        Provides good performance for most applications
        :cite:p:`krichevskyPerformanceUniversalEncoding1981`.

    **Laplace Prior** (``α = 1.0 = "laplace"``)
        Uniform prior that adds one pseudocount to each symbol
        :cite:p:`bayesEssaySolvingProblem1763`.
        Simple and widely used, equivalent to add-one smoothing.

    **Schürmann-Grassberger Prior** (``α = 1/K = "sch-grass"``)
        Adaptive prior that scales with the alphabet size.
        Particularly effective for large alphabets.

    **Minimax Prior** (``α = √N/K = "min-max"``)
        Minimises the maximum expected loss.
        Balances between sample size and alphabet size.

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy.
    alpha : float | str
        The concentration parameter α of the Dirichlet prior, stored exactly as
        supplied. Named choices are resolved to a number each time probabilities
        are computed, because ``"sch-grass"`` and ``"min-max"`` depend on the
        support size and sample size of the data set they are applied to.
    K_param : int or None
        The user-supplied support size. If ``None``, the observed support size
        of the respective data set is used.
    """

    def __init__(
        self,
        *data,
        alpha: float | str,
        K: int | None = None,
        base: LogBaseType = Config.get("base"),
    ):
        """Initialize the BayesEntropyEstimator.

        Parameters
        ----------
        *data : array-like
            The data used to estimate the entropy.
        alpha : float | str
            The concentration parameter α.
            Either a number or one of the strings ``"jeffrey"``, ``"laplace"``,
            ``"sch-grass"``, ``"min-max"`` (case-insensitive).
        K : int, optional
            The support size. If not provided, uses the observed support size.
        base : LogBaseType, default=Config.get("base")
            The logarithm base for entropy calculation.
        """
        super().__init__(*data, base=base)

        self.alpha = alpha
        self.K_param = K

    def _smoothed_probs(self, sample):
        """Return the Bayesian probabilities ``(n_k + α) / (N + K α)`` of one data set.

        Parameters
        ----------
        sample
            One entry of ``self.data``; its ``counts``, ``N`` and ``K``
            attributes are read.

        Returns
        -------
        array-like
            One probability per entry of ``sample.counts``, in the same order.

        Raises
        ------
        ValueError
            If ``self.alpha`` is neither a number nor a recognised name.
        """
        K = self.K_param if self.K_param is not None else sample.K
        N = sample.N
        # Resolve α per data set and keep it local: "sch-grass" and "min-max"
        # depend on K and N, so a value resolved for one data set must not be
        # reused for another.
        alpha = self._get_alpha(self.alpha, K, N)
        return (sample.counts + alpha) / (N + K * alpha)

    def _simple_entropy(self):
        """Calculate the Bayesian entropy of the data.

        Returns
        -------
        float
            The calculated entropy.
        """
        bayes_probs = self.bayes_probs

        # Calculate entropy: -sum(p_k * log(p_k))
        return -np_sum(bayes_probs * self._log_base(bayes_probs))

    @property
    def bayes_probs(self):
        """Bayesian probabilities of the first data set, aligned with its counts.

        Reading this property does not modify ``self.alpha``.
        """
        return self._smoothed_probs(self.data[0])

    @property
    def dist_dict(self):
        """Return the Bayesian distribution dictionary for JSD."""
        return dict(zip(self.data[0].uniq, self.bayes_probs))

    @staticmethod
    def _get_alpha(alpha, K, N):
        """Resolve the concentration parameter to a number.

        Parameters
        ----------
        alpha : float | str
            A number, returned unchanged, or one of ``"jeffrey"``, ``"laplace"``,
            ``"sch-grass"``, ``"min-max"`` (case-insensitive).
        K : int
            Support size, used by ``"sch-grass"`` and ``"min-max"``.
        N : int
            Number of observations, used by ``"min-max"``.

        Returns
        -------
        float
            The numeric concentration parameter.

        Raises
        ------
        ValueError
            If ``alpha`` is neither a number nor one of the recognised names.
        """
        if isinstance(alpha, (int, float)):
            return alpha

        # Normalise the case once so every named choice is matched the same way.
        choice = alpha.lower() if isinstance(alpha, str) else None
        if choice == "jeffrey":
            return 0.5
        if choice == "laplace":
            return 1.0
        if choice == "sch-grass":
            return 1 / K
        if choice == "min-max":
            return N**0.5 / K

        raise ValueError(
            "Concentration parameter must be a float or one of the following "
            "strings: \n'jeffrey', 'laplace', 'sch-grass', 'min-max'\n"
            f"Received: {alpha}"
        )

    def _cross_entropy(self) -> float:
        """Calculate the Bayesian cross-entropy between two distributions.

        The sum ``-sum(p(x) * log(q(x)))`` runs over the symbols observed in both
        data sets, where ``p`` and ``q`` are the Bayesian probabilities of the
        first and second data set. Each data set uses its own support size
        (unless ``K`` was given) and sample size to resolve α.

        Returns
        -------
        float
            The calculated cross-entropy, or ``0.0`` (with a logged warning) if
            the two data sets share no symbol.
        """
        dist_p = dict(zip(self.data[0].uniq, self._smoothed_probs(self.data[0])))
        dist_q = dict(zip(self.data[1].uniq, self._smoothed_probs(self.data[1])))

        # Common support, kept in the order of the first data set's symbols so
        # the floating-point summation order does not depend on set hashing.
        shared = [val for val in dist_p if val in dist_q]

        if not shared:
            logger.warning("No common support between the two distributions.")
            return 0.0

        # Calculate cross-entropy: -sum(p(x) * log(q(x)))
        return -np_sum([dist_p[val] * self._log_base(dist_q[val]) for val in shared])