Source code for infomeasure.utils.data

"""Data structures and containers for the infomeasure package."""

from dataclasses import dataclass, field

from numpy import ndarray, unique, asarray, percentile, integer, issubdtype


@dataclass(frozen=True)
class DiscreteData:
    """Container for discrete random variable data.

    Instances are immutable (frozen dataclass). ``N`` and ``K`` are not
    constructor arguments; they are derived in ``__post_init__``.

    Attributes
    ----------
    uniq : ndarray
        Array of unique values in the discrete data.
    counts : ndarray
        Integer counts, one per unique value. An array-like passed to the
        constructor is converted to an ndarray.
    N : int
        Total number of samples: ``len(data)`` when ``data`` is given,
        otherwise ``counts.sum()``.
    K : int
        Number of unique values (``len(uniq)``).
    data : ndarray, optional
        Original data array. Defaults to None.

    Raises
    ------
    ValueError
        If ``uniq`` and ``counts`` differ in length, if any count is
        negative, if the counts sum to zero, if the counts are not of an
        integer dtype, if ``data`` is given but empty, or if the counts do
        not sum to ``len(data)``.
    """

    uniq: ndarray
    counts: ndarray
    N: int = field(init=False)
    K: int = field(init=False)
    data: ndarray = None

    @classmethod
    def from_data(cls, data: ndarray) -> "DiscreteData":
        """Create a DiscreteData object from a data array.

        Parameters
        ----------
        data : ndarray
            Raw data array to analyse.

        Returns
        -------
        DiscreteData
            New instance with computed unique values and counts.

        Raises
        ------
        ValueError
            If ``data`` is empty.
        """
        data = asarray(data)
        if len(data) == 0:
            raise ValueError(
                f"`data` must not be empty, got {data} with shape {data.shape} instead."
            )
        uniq, counts = unique(data, return_counts=True)
        return cls(uniq=uniq, counts=counts, data=data)

    @classmethod
    def from_counts(cls, uniq: ndarray, counts: ndarray) -> "DiscreteData":
        """Create a DiscreteData object from unique values and their counts.

        The ``data`` attribute of the created instance is ``None``.

        Parameters
        ----------
        uniq : ndarray
            An array of unique values.
        counts : ndarray
            An array of counts corresponding to the unique values.

        Returns
        -------
        DiscreteData
            New instance initialised with the given unique values and counts.
        """
        return cls(uniq=asarray(uniq), counts=asarray(counts))

    def __post_init__(self):
        """Validate attributes and set N and K."""
        # The constructor is public, so `counts` may arrive as a list or tuple.
        # The checks below rely on array semantics; `asarray` returns an
        # ndarray input unchanged.
        counts = asarray(self.counts)
        if counts is not self.counts:
            # Frozen dataclass: regular attribute assignment is blocked.
            object.__setattr__(self, "counts", counts)

        if len(self.uniq) != len(counts):
            raise ValueError("uniq and counts must have same length")
        if (counts < 0).any():
            raise ValueError("counts must be non-negative")
        total = counts.sum()
        if total == 0:
            raise ValueError("counts must sum to a positive value")
        if not issubdtype(counts.dtype, integer):
            raise ValueError("counts must be integers")

        # Set K (number of unique values)
        object.__setattr__(self, "K", len(self.uniq))

        # Set N from the data when available, otherwise from the counts
        if self.data is not None:
            n_samples = len(self.data)
            if n_samples == 0:
                raise ValueError("data must not be empty")
            # Consistency between data and counts
            if total != n_samples:
                raise ValueError("counts must sum to the length of data")
            object.__setattr__(self, "N", n_samples)
        else:
            object.__setattr__(self, "N", int(total))

    @property
    def probabilities(self) -> ndarray:
        """Probabilities obtained by normalising the counts.

        Returns
        -------
        ndarray
            ``counts / N``, one entry per unique value.
        """
        return self.counts / self.N

    @property
    def distribution_dict(self) -> dict:
        """Dictionary mapping unique elements to their probabilities.

        Returns
        -------
        dict
            Keys are the elements of ``uniq``; values are the corresponding
            entries of ``probabilities``.
        """
        return dict(zip(self.uniq, self.probabilities))
@dataclass(frozen=True)
class StatisticalTestResult:
    """Result of a statistical test: *p*-value, *t*-score and null samples.

    Percentiles and confidence intervals are derived on demand from the
    stored test values.

    Attributes
    ----------
    p_value : float
        The *p*-value of the statistical test.
    t_score : float
        The *t*-score (standardized test statistic).
    test_values : ndarray
        The test values from permutation/bootstrap sampling.
    observed_value : float
        The observed value being tested.
    null_mean : float
        Mean of the null distribution (test values).
    null_std : float
        Standard deviation of the null distribution.
    n_tests : int
        Number of tests performed (permutations or bootstrap samples).
    method : str
        The statistical test method used ("permutation_test" or "bootstrap").
    """

    p_value: float
    t_score: float
    test_values: ndarray
    observed_value: float
    null_mean: float
    null_std: float
    n_tests: int
    method: str

    def percentile(self, q, method="linear"):
        """Return the q-th percentile(s) of the test values.

        Thin wrapper around :py:func:`numpy.percentile` applied to
        ``test_values``.

        Parameters
        ----------
        q : array_like of float
            Percentage or sequence of percentages to compute, each between
            0 and 100 inclusive.
        method : str, optional
            Percentile estimation method, forwarded to
            :py:func:`numpy.percentile`. Default is "linear".

        Returns
        -------
        percentile : scalar or ndarray
            A scalar for a single ``q``; an array when several percentiles
            are requested.

        See Also
        --------
        numpy.percentile : Compute percentiles along specified axes.

        Examples
        --------
        >>> result = estimator.statistical_test(n_tests=100, method="permutation_test")
        >>> result.percentile(50)  # Median
        >>> result.percentile([25, 75])  # Quartiles
        >>> result.percentile(95, method="nearest")
        """
        # Inside a method body the bare name resolves to the module-level
        # numpy function, not to this method.
        samples = self.test_values
        return percentile(samples, q, method=method)

    def confidence_interval(self, confidence_level, method="linear"):
        """Return the central confidence interval of the test values.

        A level of ``x`` percent maps to the percentiles ``(100 - x) / 2``
        and ``100 - (100 - x) / 2``.

        Parameters
        ----------
        confidence_level : float
            Confidence level as a percentage (e.g., 95 for a 95% CI).
            Must lie strictly between 0 and 100.
        method : str, optional
            Percentile estimation method, forwarded to :meth:`percentile`.
            Default is "linear".

        Returns
        -------
        ndarray
            Array ``[lower_bound, upper_bound]``.

        Raises
        ------
        ValueError
            If ``confidence_level`` is not strictly between 0 and 100.

        Examples
        --------
        >>> result = estimator.statistical_test(n_tests=100)
        >>> result.confidence_interval(95)  # 95% CI
        >>> result.confidence_interval(90, method="nearest")
        """
        level_is_valid = 0 < confidence_level < 100
        if not level_is_valid:
            raise ValueError(
                f"Confidence level must be between 0 and 100, got {confidence_level}"
            )

        # Probability mass left out on each side of the interval.
        tail = (100 - confidence_level) / 2
        return self.percentile([tail, 100 - tail], method=method)