# Source code for infomeasure.estimators.entropy.kernel

"""Module for the kernel entropy estimator."""

from numpy import column_stack, sum as np_sum, isnan, nan

from ... import Config
from ...utils.types import LogBaseType
from ..base import EntropyEstimator
from ..mixins import WorkersMixin
from ..utils.array import assure_2d_data
from ..utils.kde import kde_probability_density_function


class KernelEntropyEstimator(WorkersMixin, EntropyEstimator):
    r"""Kernel entropy estimator for continuous data using Kernel Density Estimation (KDE).

    The kernel entropy estimator computes the differential Shannon entropy by
    estimating the probability density function using kernel density
    estimation:

    .. math::

        \hat{H}(X) = -\int \hat{f}(x) \log \hat{f}(x) \, dx
        \approx -\frac{1}{N} \sum_{i=1}^{N} \log \hat{f}(x_i)

    where :math:`\hat{f}(x)` is the kernel density estimate:

    .. math::

        \hat{f}(x) = \frac{1}{N h^d} \sum_{i=1}^{N} K\left(\frac{x - x_i}{h}\right)

    with :math:`K(\cdot)` being the kernel function, :math:`h` the bandwidth
    parameter, :math:`d` the dimensionality, and :math:`N` the number of data
    points.

    For joint entropy of multiple variables, the estimator concatenates the
    variables into a single multivariate space and applies the same KDE
    approach.

    The estimator supports both Gaussian and box (uniform) kernels. The choice
    of bandwidth is critical: small values can lead to under-smoothing and
    overfitting, while large values may over-smooth the data and obscure
    important features
    :cite:p:`silverman1986density,garcia-portuguesChapter2Kernel2025`.

    Parameters
    ----------
    *data : array-like
        The continuous data used to estimate the entropy.
        For univariate entropy, pass a single array.
        For joint entropy, pass multiple arrays.
    bandwidth : float | int
        The bandwidth parameter for the kernel. Controls the smoothness of the
        density estimate.
    kernel : str
        Type of kernel to use. Supported options are:

        - ``'gaussian'``: Gaussian (normal) kernel
        - ``'box'``: Box (uniform) kernel

        Compatible with the KDE implementation
        :func:`kde_probability_density_function() <infomeasure.estimators.utils.kde.kde_probability_density_function>`.
    workers : int, optional
        Number of workers to use for parallel processing.
        Default is 1 (no parallelization).
        If set to -1, all available CPU cores will be used.
    base : float | str, optional
        Logarithm base for entropy calculation.
        Default is from global configuration.

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy.
    bandwidth : float | int
        The bandwidth for the kernel.
    kernel : str
        Type of kernel to use.
    workers : int
        Number of workers to use for parallel processing.

    Returns
    -------
    array-like
        Local entropy values for each data point when calling entropy
        calculation methods. The mean of these values gives the overall
        entropy estimate.

    Notes
    -----
    **Bandwidth Selection**: The bandwidth parameter critically affects the
    quality of the entropy estimate. A small bandwidth can lead to
    under-smoothing and high variance, while a large bandwidth may over-smooth
    the data, obscuring important details and introducing bias.

    **Kernel Choice**:

    - Gaussian kernels provide smooth density estimates and are theoretically
      well-founded
    - Box kernels are computationally efficient and provide non-parametric
      estimates

    **Computational Complexity**: The algorithm has O(N²) complexity for box
    kernels using KDTree queries, and varies for Gaussian kernels depending on
    the implementation.

    **Cross-entropy**: Supported between two distributions by evaluating the
    density of the second distribution at points from the first distribution.

    Examples
    --------
    >>> import infomeasure as im
    >>> from numpy.random import default_rng
    >>> rng = default_rng(281769)
    >>> # Generate sample data
    >>> data = rng.normal(0, 1, 1000)
    >>>
    >>> # Create estimator
    >>> estimator = im.estimator(data, measure="h", approach="kernel", bandwidth=0.5, kernel='gaussian')
    >>>
    >>> # Calculate entropy
    >>> estimator.result()
    np.float64(1.366015332652949)
    >>> # Local values
    >>> estimator.local_vals()
    array([1.54017083, 1.35855839, 0.97949819, 0.97333173, 2.62084886,
           ...
           1.08174049, 0.97418054, 1.88055967, 0.99614516, 0.98548583])

    See Also
    --------
    infomeasure.estimators.utils.kde.kde_probability_density_function : Underlying KDE implementation
    infomeasure.estimators.entropy.discrete.DiscreteEntropyEstimator : For discrete data entropy estimation
    """

    def __init__(
        self,
        *data,
        bandwidth: float | int,
        kernel: str,
        workers: int = 1,
        base: LogBaseType = Config.get("base"),
    ):
        """Initialize the KernelEntropyEstimator.

        Parameters
        ----------
        *data : array-like
            The continuous data used to estimate the entropy. Forwarded
            unchanged to the parent initializer; afterwards every entry of
            ``self.data`` is passed through ``assure_2d_data``.
        bandwidth : float | int
            The bandwidth for the kernel.
        kernel : str
            Type of kernel to use, compatible with the KDE implementation
            :func:`kde_probability_density_function() <infomeasure.estimators.utils.kde.kde_probability_density_function>`.
        workers : int, optional
            Number of workers to use for parallel processing.
            Default is 1, meaning no parallel processing.
            If set to -1, all available CPU cores will be used.
        base : float | str, optional
            Logarithm base for entropy calculation.
            Default is from global configuration.
        """
        super().__init__(*data, workers=workers, base=base)
        # Normalise every stored variable with the project's 2D helper so the
        # KDE routine receives a uniform layout.
        self.data = tuple(assure_2d_data(var) for var in self.data)
        self.bandwidth = bandwidth
        self.kernel = kernel

    def _simple_entropy(self):
        """Calculate the entropy of the data.

        The density is estimated for ``self.data[0]`` and the negative
        logarithm (in the configured base) of each density value is returned.
        Points whose estimated density is exactly zero contribute ``0``
        instead of ``inf``.

        Returns
        -------
        array-like
            The local form of the entropy.
        """
        # Compute the KDE densities
        densities = kde_probability_density_function(
            self.data[0], self.bandwidth, kernel=self.kernel, workers=self.n_workers
        )
        # log(0) would yield -inf; mark zero densities as NaN so that they can
        # be identified and neutralised after taking the logarithm.
        densities[densities == 0] = nan

        # Compute the log of the densities
        log_densities = -self._log_base(densities)
        # Every NaN entry (zero density, or NaN already present in the KDE
        # output) is mapped to a local value of 0.
        log_densities[isnan(log_densities)] = 0

        return log_densities

    def _joint_entropy(self):
        """Calculate the joint entropy of the data.

        This is done by joining the variables into one space
        and calculating the entropy.

        Side effect: ``self.data`` is replaced by a one-element tuple holding
        the column-stacked array, so later reads of ``self.data`` see the
        joint space rather than the separate variables.

        Returns
        -------
        array-like
            The local form of the joint entropy.
        """
        # NOTE(review): assumes self.data[0] holds the individual variables at
        # this point - confirm against the base class. Because self.data is
        # overwritten, a second call would stack the already-joined array
        # again; presumably the base class invokes this only once - verify.
        self.data = (column_stack(self.data[0]),)
        return self._simple_entropy()

    def _cross_entropy(self) -> float:
        """Calculate the cross-entropy between two distributions.

        The KDE is called with ``self.data[1]`` as data and ``self.data[0]``
        as the ``at`` argument. Only strictly positive densities enter the
        logarithm, while the sum is divided by the total number of returned
        density values, so zero-density points contribute ``0`` to the
        average.

        Returns
        -------
        float
            The calculated cross-entropy.
        """
        # Compute the KDE densities
        densities = kde_probability_density_function(
            self.data[1],
            self.bandwidth,
            at=self.data[0],
            kernel=self.kernel,
            workers=self.n_workers,
        )
        # Compute the log of the densities
        return -np_sum(self._log_base(densities[densities > 0])) / len(densities)