Source code for infomeasure.estimators.entropy.kernel
"""Module for the kernel entropy estimator."""
from numpy import column_stack, sum as np_sum, isnan, nan
from ... import Config
from ...utils.types import LogBaseType
from ..base import EntropyEstimator, WorkersMixin
from ..utils.array import assure_2d_data
from ..utils.kde import kde_probability_density_function
[docs]
class KernelEntropyEstimator(WorkersMixin, EntropyEstimator):
"""Estimator for entropy (Shannon) using Kernel Density Estimation (KDE).
Attributes
----------
*data : array-like
The data used to estimate the entropy.
bandwidth : float | int
The bandwidth for the kernel.
kernel : str
Type of kernel to use, compatible with the KDE
implementation :func:`kde_probability_density_function() <infomeasure.estimators.utils.kde.kde_probability_density_function>`.
workers : int, optional
Number of workers to use for parallel processing.
Default is 1, meaning no parallel processing.
If set to -1, all available CPU cores will be used.
Notes
-----
A small ``bandwidth`` can lead to under-sampling,
while a large ``bandwidth`` may over-smooth the data, obscuring details.
"""
def __init__(
self,
*data,
bandwidth: float | int,
kernel: str,
workers: int = 1,
base: LogBaseType = Config.get("base"),
):
"""Initialize the KernelEntropyEstimator.
Parameters
----------
bandwidth : float | int
The bandwidth for the kernel.
kernel : str
Type of kernel to use, compatible with the KDE
implementation :func:`kde_probability_density_function() <infomeasure.estimators.utils.kde.kde_probability_density_function>`.
workers : int, optional
Number of workers to use for parallel processing.
Default is 1, meaning no parallel processing.
If set to -1, all available CPU cores will be used.
"""
super().__init__(*data, workers=workers, base=base)
self.data = tuple(assure_2d_data(var) for var in self.data)
self.bandwidth = bandwidth
self.kernel = kernel
def _simple_entropy(self):
"""Calculate the entropy of the data.
Returns
-------
array-like
The local form of the entropy.
"""
# Compute the KDE densities
densities = kde_probability_density_function(
self.data[0], self.bandwidth, kernel=self.kernel, workers=self.n_workers
)
densities[densities == 0] = nan
# Compute the log of the densities
log_densities = -self._log_base(densities)
log_densities[isnan(log_densities)] = 0
return log_densities
def _joint_entropy(self):
"""Calculate the joint entropy of the data.
This is done by joining the variables into one space
and calculating the entropy.
Returns
-------
array-like
The local form of the joint entropy.
"""
self.data = (column_stack(self.data[0]),)
return self._simple_entropy()
def _cross_entropy(self) -> float:
"""Calculate the cross-entropy between two distributions.
Returns
-------
float
The calculated cross-entropy.
"""
# Compute the KDE densities
densities = kde_probability_density_function(
self.data[1],
self.bandwidth,
at=self.data[0],
kernel=self.kernel,
workers=self.n_workers,
)
# Compute the log of the densities
return -np_sum(self._log_base(densities[densities > 0])) / len(densities)