Source code for infomeasure.estimators.entropy.kernel

"""Module for the kernel entropy estimator."""

from numpy import column_stack, sum as np_sum, isnan, nan

from ... import Config
from ...utils.types import LogBaseType
from ..base import EntropyEstimator, WorkersMixin
from ..utils.array import assure_2d_data
from ..utils.kde import kde_probability_density_function


class KernelEntropyEstimator(WorkersMixin, EntropyEstimator):
    """Shannon entropy estimator based on Kernel Density Estimation (KDE).

    Attributes
    ----------
    *data : array-like
        The data used to estimate the entropy.
    bandwidth : float | int
        The bandwidth for the kernel.
    kernel : str
        Type of kernel to use, compatible with the KDE implementation
        :func:`kde_probability_density_function() <infomeasure.estimators.utils.kde.kde_probability_density_function>`.
    workers : int, optional
        Number of workers to use for parallel processing.
        Default is 1, meaning no parallel processing.
        If set to -1, all available CPU cores will be used.

    Notes
    -----
    A small ``bandwidth`` can lead to under-sampling,
    while a large ``bandwidth`` may over-smooth the data, obscuring details.
    """

    def __init__(
        self,
        *data,
        bandwidth: float | int,
        kernel: str,
        workers: int = 1,
        base: LogBaseType = Config.get("base"),
    ):
        """Set up the estimator and normalise the input data.

        Parameters
        ----------
        *data : array-like
            The data used to estimate the entropy. Forwarded to the parent
            initialiser; every entry of the resulting ``self.data`` is then
            passed through ``assure_2d_data``.
        bandwidth : float | int
            The bandwidth for the kernel.
        kernel : str
            Type of kernel to use, compatible with the KDE implementation
            :func:`kde_probability_density_function() <infomeasure.estimators.utils.kde.kde_probability_density_function>`.
        workers : int, optional
            Number of workers to use for parallel processing.
            Default is 1, meaning no parallel processing.
            If set to -1, all available CPU cores will be used.
        base : LogBaseType, optional
            Logarithm base, forwarded to the parent initialiser. The default
            is read from ``Config.get("base")`` once, when the class body is
            executed.
        """
        super().__init__(*data, workers=workers, base=base)
        # The parent initialiser is expected to have populated ``self.data``.
        self.data = tuple(map(assure_2d_data, self.data))
        self.bandwidth = bandwidth
        self.kernel = kernel

    def _simple_entropy(self):
        """Compute the local entropy values of the first data set.

        Points with a density of exactly zero contribute a local value of 0
        instead of infinity; any NaN produced by the logarithm is likewise
        replaced by 0.

        Returns
        -------
        array-like
            The local form of the entropy.
        """
        # KDE density estimate for the (single) variable
        pdf = kde_probability_density_function(
            self.data[0],
            self.bandwidth,
            kernel=self.kernel,
            workers=self.n_workers,
        )
        # Mark empty regions as NaN so the logarithm never sees a zero
        is_zero = pdf == 0
        pdf[is_zero] = nan
        # Negative log-density in the configured base
        local_values = -self._log_base(pdf)
        # NaN entries (zero densities) carry no contribution
        nan_mask = isnan(local_values)
        local_values[nan_mask] = 0
        return local_values

    def _joint_entropy(self):
        """Compute the local joint entropy of the data.

        The variables held in ``self.data[0]`` are joined column-wise into a
        single space, and the simple entropy of that joint space is returned.

        Returns
        -------
        array-like
            The local form of the joint entropy.
        """
        # NOTE(review): this overwrites ``self.data`` with the stacked array,
        # so the estimator's stored data differs after this call.
        joint_space = column_stack(self.data[0])
        self.data = (joint_space,)
        return self._simple_entropy()

    def _cross_entropy(self) -> float:
        """Compute the cross-entropy between two distributions.

        Only strictly positive densities enter the logarithm, while the
        normalisation uses the total number of density values.

        Returns
        -------
        float
            The calculated cross-entropy.
        """
        # Presumably the KDE of ``self.data[1]`` evaluated at the points of
        # ``self.data[0]`` -- confirm against the KDE helper's ``at`` argument.
        pdf = kde_probability_density_function(
            self.data[1],
            self.bandwidth,
            at=self.data[0],
            kernel=self.kernel,
            workers=self.n_workers,
        )
        # Sum the log-densities over points with non-zero density only
        positive = pdf[pdf > 0]
        log_total = np_sum(self._log_base(positive))
        return -log_total / len(pdf)