"""Data structures and containers for the infomeasure package."""
from dataclasses import dataclass, field
from numpy import ndarray, unique, asarray, percentile, integer, issubdtype
[docs]
@dataclass(frozen=True)
class DiscreteData:
    """Container for discrete random variable data.

    Instances are immutable. ``N`` and ``K`` are not constructor arguments:
    they are derived from the other fields in ``__post_init__``.

    Attributes
    ----------
    uniq : ndarray
        Array of unique values in the discrete data.
    counts : ndarray
        Array of counts for each unique value. Must have an integer dtype,
        contain no negative entries and sum to a positive value.
    N : int
        Total number of samples: ``len(data)`` when ``data`` is given,
        otherwise ``counts.sum()``.
    K : int
        Number of unique values (``len(uniq)``).
    data : ndarray, optional
        Original data array. Defaults to None.

    Raises
    ------
    ValueError
        If ``uniq`` and ``counts`` differ in length, if ``counts`` contains
        negative values, sums to zero or is not of integer dtype, or if
        ``data`` is given but empty or inconsistent with ``counts``.
    """

    uniq: ndarray
    counts: ndarray
    N: int = field(init=False)
    K: int = field(init=False)
    data: ndarray = None

    @classmethod
    def from_data(cls, data: ndarray) -> "DiscreteData":
        """Create a DiscreteData object from a data array.

        Parameters
        ----------
        data : ndarray
            Raw data array to analyse.

        Returns
        -------
        DiscreteData
            New instance with computed unique values and counts.

        Raises
        ------
        ValueError
            If ``data`` is empty.
        """
        data = asarray(data)
        if len(data) == 0:
            raise ValueError(
                f"`data` must not be empty, got {data} with shape {data.shape} instead."
            )
        uniq, counts = unique(data, return_counts=True)
        return cls(uniq=uniq, counts=counts, data=data)

    @classmethod
    def from_counts(cls, uniq: ndarray, counts: ndarray) -> "DiscreteData":
        """Construct a `DiscreteData` instance from unique values and counts.

        Both inputs are converted with ``asarray``. The ``data`` attribute of
        the created instance is ``None``.

        Parameters
        ----------
        uniq : ndarray
            An array of unique values.
        counts : ndarray
            An array of counts corresponding to the unique values.

        Returns
        -------
        DiscreteData
            A new instance initialized with the given unique values and
            their counts.
        """
        return cls(uniq=asarray(uniq), counts=asarray(counts))

    def __post_init__(self):
        """Validate attributes and set N and K."""
        counts = self.counts
        if len(self.uniq) != len(counts):
            raise ValueError("uniq and counts must have same length")
        # Vectorised reduction; the builtin ``any`` would walk the array
        # element by element in Python.
        if (counts < 0).any():
            raise ValueError("counts must be non-negative")
        total = counts.sum()
        if total == 0:
            raise ValueError("counts must sum to a positive value")
        if not issubdtype(counts.dtype, integer):
            raise ValueError("counts must be integers")

        # The dataclass is frozen, so derived fields are written through
        # ``object.__setattr__``.
        object.__setattr__(self, "K", len(self.uniq))

        if self.data is None:
            object.__setattr__(self, "N", int(total))
            return

        if len(self.data) == 0:
            raise ValueError("data must not be empty")
        # Consistency between data and counts
        if total != len(self.data):
            raise ValueError("counts must sum to the length of data")
        object.__setattr__(self, "N", len(self.data))

    @staticmethod
    def _arrays_match(left, right) -> bool:
        """Return True if both are None or are element-wise equal arrays."""
        if left is None or right is None:
            return left is right
        left, right = asarray(left), asarray(right)
        if left.shape != right.shape:
            return False
        return bool(asarray(left == right).all())

    def __eq__(self, other: object) -> bool:
        """Compare two instances field by field.

        The dataclass-generated ``__eq__`` truth-tests the element-wise
        result of comparing ndarray fields, which raises ``ValueError`` for
        arrays with more than one element. This version reduces the array
        comparisons explicitly and always returns a bool.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        return (
            self.N == other.N
            and self.K == other.K
            and self._arrays_match(self.uniq, other.uniq)
            and self._arrays_match(self.counts, other.counts)
            and self._arrays_match(self.data, other.data)
        )

    @property
    def probabilities(self) -> ndarray:
        """Probabilities obtained by normalizing the counts.

        Returns
        -------
        ndarray
            ``counts / N``, one entry per unique value.
        """
        return self.counts / self.N

    @property
    def distribution_dict(self) -> dict:
        """Dictionary mapping unique elements to their probabilities.

        Returns
        -------
        dict
            A dictionary where keys are the entries of ``uniq`` and values
            are the corresponding entries of ``probabilities``.
        """
        return dict(zip(self.uniq, self.probabilities))
[docs]
@dataclass(frozen=True)
class StatisticalTestResult:
    """Comprehensive statistical test result containing *p*-value, *t*-score,
    and confidence intervals.

    Attributes
    ----------
    p_value : float
        The *p*-value of the statistical test.
    t_score : float
        The *t*-score (standardized test statistic).
    test_values : ndarray
        The test values from permutation/bootstrap sampling.
    observed_value : float
        The observed value being tested.
    null_mean : float
        Mean of the null distribution (test values).
    null_std : float
        Standard deviation of the null distribution.
    n_tests : int
        Number of tests performed (permutations or bootstrap samples).
    method : str
        The statistical test method used ("permutation_test" or "bootstrap").
    """

    p_value: float
    t_score: float
    test_values: ndarray
    observed_value: float
    null_mean: float
    null_std: float
    n_tests: int
    method: str

    def __eq__(self, other: object) -> bool:
        """Compare two results field by field.

        The dataclass-generated ``__eq__`` truth-tests the element-wise
        result of comparing ``test_values``, which raises ``ValueError`` for
        arrays with more than one element. Here the scalar fields are
        compared as a tuple and ``test_values`` is reduced explicitly, so a
        bool is always returned.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        own_scalars = (
            self.p_value,
            self.t_score,
            self.observed_value,
            self.null_mean,
            self.null_std,
            self.n_tests,
            self.method,
        )
        other_scalars = (
            other.p_value,
            other.t_score,
            other.observed_value,
            other.null_mean,
            other.null_std,
            other.n_tests,
            other.method,
        )
        if own_scalars != other_scalars:
            return False
        mine = asarray(self.test_values)
        theirs = asarray(other.test_values)
        if mine.shape != theirs.shape:
            return False
        return bool(asarray(mine == theirs).all())

    def percentile(self, q, method="linear"):
        """Compute the q-th percentile of the test values.

        This method wraps numpy's percentile function to compute percentiles
        of the test values from the statistical test.

        Parameters
        ----------
        q : array_like of float
            Percentage or sequence of percentages for the percentiles to
            compute. Values must be between 0 and 100 inclusive.
        method : str, optional
            Method to use for estimating the percentile. Default is "linear".
            See :py:func:`numpy.percentile` for available methods.

        Returns
        -------
        percentile : scalar or ndarray
            If `q` is a single percentile, returns a scalar.
            If multiple percentiles are given, returns an array.

        See Also
        --------
        numpy.percentile : Compute percentiles along specified axes.

        Notes
        -----
        For details on the method parameter, reference
        :py:func:`numpy.percentile`.

        Examples
        --------
        >>> result = estimator.statistical_test(n_tests=100,method="permutation_test")
        >>> result.percentile(50)  # Median
        >>> result.percentile([25, 75])  # Quartiles
        >>> result.percentile(95, method="nearest")  # 95th percentile with nearest method
        """
        # Inside the method body ``percentile`` resolves to the module-level
        # numpy function, not to this method.
        return percentile(self.test_values, q, method=method)

    def confidence_interval(self, confidence_level, method="linear"):
        """Get confidence interval for the specified confidence level.

        This is a convenience function that converts a confidence level
        (e.g., 95 for 95% CI) to the appropriate percentile calls.

        Parameters
        ----------
        confidence_level : float
            Confidence level as a percentage (e.g., 95 for 95% CI).
            Must be strictly between 0 and 100 (both bounds excluded).
        method : str, optional
            Method to use for estimating the percentile. Default is "linear".
            See :py:func:`numpy.percentile` for available methods.

        Returns
        -------
        ndarray
            Array containing [lower_bound, upper_bound] of the confidence
            interval.

        Raises
        ------
        ValueError
            If confidence_level is not strictly between 0 and 100.

        Examples
        --------
        >>> result = estimator.statistical_test(n_tests=100)
        >>> result.confidence_interval(95)  # 95% CI
        >>> result.confidence_interval(90, method="nearest")  # 90% CI with nearest method
        """
        if not 0 < confidence_level < 100:
            raise ValueError(
                f"Confidence level must be between 0 and 100, got {confidence_level}"
            )
        # Split the excluded mass evenly between both tails.
        tail = (100 - confidence_level) / 2
        return self.percentile([tail, 100 - tail], method=method)