Source code for infomeasure.estimators.entropy.bayes

"""Module for the Bayesian entropy estimator."""

from numpy import sum as np_sum

from infomeasure.estimators.base import DiscreteHEstimator
from ... import Config
from ...utils.config import logger
from ...utils.types import LogBaseType


[docs] class BayesEntropyEstimator(DiscreteHEstimator): r"""Bayesian entropy estimator. Computes an estimate of Shannon entropy using Bayesian probability estimates with a Dirichlet prior characterized by concentration parameter α. This approach provides a principled way to handle sparse data and incorporate prior knowledge about the probability distribution. The Bayesian probabilities are calculated as: .. math:: p_k^{\text{Bayes}} = \frac{n_k + \alpha}{N + K \alpha} where :math:`n_k` is the count of symbol :math:`k`, :math:`N` is the total number of observations, :math:`K` is the support size (number of unique symbols), and :math:`\alpha` is the concentration parameter of the Dirichlet prior. The entropy is then :math:`-\sum p_k^{\text{Bayes}} \log p_k^{\text{Bayes}}`, same as the maximum likelihood entropy estimator, also supporting local entropy values. **Concentration Parameter Choices** The concentration parameter α controls the strength of the prior belief in uniform distribution. Several well-established choices are available: **Jeffreys Prior** (``α = 0.5 = "jeffrey"``) Non-informative prior that is invariant under reparameterization. Provides good performance for most applications :cite:p:`krichevskyPerformanceUniversalEncoding1981`. **Laplace Prior** (``α = 1.0 = "laplace"``) Uniform prior that adds one pseudocount to each symbol :cite:p:`bayesEssaySolvingProblem1763`. Simple and widely used, equivalent to add-one smoothing. **Schürmann-Grassberger Prior** (``α = 1/K = "sch-grass"``) Adaptive prior that scales with the alphabet size. Particularly effective for large alphabets. **Minimax Prior** (``α = √N/K = "min-max"``) Minimises the maximum expected loss. Balances between sample size and alphabet size. Attributes ---------- *data : array-like The data used to estimate the entropy. alpha : float The concentration parameter α of the Dirichlet prior. K : int, optional The support size. If not provided, uses the observed support size. """ def __init__( self, *data, alpha: float | str, K: int = None, base: LogBaseType = Config.get("base"), ): """Initialize the BayesEntropyEstimator. Parameters ---------- *data : array-like The data used to estimate the entropy. alpha : float | str The concentration parameter α. Either a float or a string specifying the choice of concentration parameter. K : int, optional The support size. If not provided, uses the observed support size. base : LogBaseType, default=Config.get("base") The logarithm base for entropy calculation. """ super().__init__(*data, base=base) self.alpha = alpha self.K_param = K def _simple_entropy(self): """Calculate the Bayesian entropy of the data. Returns ------- float The calculated entropy. """ bayes_probs = self.bayes_probs # Calculate entropy: -sum(p_k * log(p_k)) return -np_sum(bayes_probs * self._log_base(bayes_probs)) @property def bayes_probs(self): K = self.K_param if self.K_param is not None else self.data[0].K N = self.data[0].N self.alpha = self._get_alpha(self.alpha, K, N) # Calculate Bayesian probabilities: p_k = (n_k + α) / (N + K*α) weight = N + K * self.alpha bayes_probs = (self.data[0].counts + self.alpha) / weight return bayes_probs @property def dist_dict(self): """Return the Bayesian distribution dictionary for JSD.""" return dict(zip(self.data[0].uniq, self.bayes_probs)) @staticmethod def _get_alpha(alpha, K, N): # Alpha if isinstance(alpha, (int, float)): return alpha elif not isinstance(alpha, str) or ( isinstance(alpha, str) and alpha.lower() not in ["jeffrey", "laplace", "sch-grass", "min-max"] ): raise ValueError( "Concentration parameter must be a float or one of the following" "strings: \n'jeffrey', 'laplace', 'sch-grass', 'min-max'\n" f"Received: {alpha}" ) elif alpha.lower() == "jeffrey": return 0.5 elif alpha.lower() == "laplace": return 1.0 elif alpha.lower() == "sch-grass": return 1 / K elif alpha == "min-max": return N**0.5 / K raise ValueError( f"Concentration parameter '{alpha}' not recognized. " f"Must be a float or one of the following strings: \n" f"'jeffrey', 'laplace', 'sch-grass', 'min-max'\n" f"Received: {alpha}" ) def _cross_entropy(self) -> float: """Calculate the Bayesian cross-entropy between two distributions. Returns ------- float The calculated cross-entropy. """ K_p = self.K_param if self.K_param is not None else self.data[0].K K_q = self.K_param if self.K_param is not None else self.data[1].K N_p = self.data[0].N N_q = self.data[1].N alpha_p = self._get_alpha(self.alpha, K_p, N_p) alpha_q = self._get_alpha(self.alpha, K_q, N_q) # Calculate Bayesian distributions using consistent weight calculation weight_p = N_p + K_p * alpha_p weight_q = N_q + K_q * alpha_q dist_p = {} for val, count in zip(self.data[0].uniq, self.data[0].counts): dist_p[val] = (count + alpha_p) / weight_p dist_q = {} for val, count in zip(self.data[1].uniq, self.data[1].counts): dist_q[val] = (count + alpha_q) / weight_q # Find common support uniq_p = set(self.data[0].uniq) uniq_q = set(self.data[1].uniq) uniq = list(uniq_p.intersection(uniq_q)) if len(uniq) == 0: logger.warning("No common support between the two distributions.") return 0.0 # Calculate cross-entropy: -sum(p(x) * log(q(x))) return -np_sum([dist_p[val] * self._log_base(dist_q[val]) for val in uniq])