"""Some generally helpful statistical functions for base QC."""
from __future__ import annotations
import copy
import math
from collections.abc import Sequence
import numpy as np
[docs]
def p_data_given_good(x: float, q: float, r_hi: float, r_lo: float, mu: float, sigma: float) -> float:
"""
Probability of an observed value assuming it comes from a "good" measurement.
Calculate the probability of an observed value x given a normal distribution with mean mu
standard deviation of sigma, where x is constrained to fall between R_hi and R_lo
and is known only to an integer multiple of Q, the quantization level.
Parameters
----------
x : float
Observed value for which probability is required.
q : float
Quantization of x, i.e. x is an integer multiple of Q.
r_hi : float
The upper limit on x imposed by previous QC choices.
r_lo : float
The lower limit on x imposed by previous QC choices.
mu : float
The mean of the distribution.
sigma : float
The standard deviation of the distribution.
Returns
-------
float
Probability of the observed value given the specified distribution.
Raises
------
ValueError
When inputs are incorrectly specified: q<=0, sigma<=0, r_lo > r_hi, x < r_lo or x > r_hi.
"""
if q <= 0.0:
raise ValueError(f"Value is not positive: q = {q}.")
if sigma <= 0.0:
raise ValueError(f"Value is not positive: sigma = {sigma}.")
if r_lo >= r_hi:
raise ValueError(f"Lower limit is greater than upper limit: r_hi = {r_hi}, r_lo = {r_lo}.")
if x < r_lo:
raise ValueError(f"Lower limit is greater than x: r_lo = {r_lo}, x = {x}.")
if x > r_hi:
raise ValueError(f"Upper limit is less than x: r_hi = {r_hi}, x = {x}.")
upper_x = min([x + 0.5 * q, r_hi + 0.5 * q])
lower_x = max([x - 0.5 * q, r_lo - 0.5 * q])
normalizer = 0.5 * (math.erf((r_hi + 0.5 * q - mu) / (sigma * math.sqrt(2))) - math.erf((r_lo - 0.5 * q - mu) / (sigma * math.sqrt(2))))
return 0.5 * (math.erf((upper_x - mu) / (sigma * math.sqrt(2))) - math.erf((lower_x - mu) / (sigma * math.sqrt(2)))) / normalizer
[docs]
def p_data_given_gross(q: float, r_hi: float, r_lo: float) -> float:
"""
Probability of an observed value assuming it is a gross error.
Calculate the probability of the data given a gross error
assuming gross errors are uniformly distributed between
R_low and R_high and that the quantization, rounding level is Q
Parameters
----------
q : float
Quantization of x, i.e. x is an integer multiple of Q.
r_hi : float
The upper limit on x imposed by previous QC choices.
r_lo : float
The lower limit on x imposed by previous QC choices.
Returns
-------
float
Probability of the observed value given that it is a gross error.
Raises
------
ValueError
When limits are not ascending or q<=0.
"""
if r_hi < r_lo:
raise ValueError(f"Lower limit is greater than upper limit: r_hi = {r_hi}, r_lo = {r_lo}.")
if q <= 0.0:
raise ValueError(f"Value is not positive: q = {q}.")
r = r_hi - r_lo
return 1.0 / (1.0 + (r / q))
[docs]
def p_gross(p0: float, q: float, r_hi: float, r_lo: float, x: float, mu: float, sigma: float) -> float:
"""
Posterior probability that an observation is a gross error.
Calculate the posterior probability of a gross error given the prior probability p0,
the quantization level of the observed value, Q, previous limits on the observed value,
R_hi and R_lo, the observed value, x, and the mean (mu) and standard deviation (sigma) of the
distribution of good observations assuming they are normally distributed. Gross errors are
assumed to be uniformly distributed between R_lo and R_hi.
Parameters
----------
p0 : float
Prior probability of gross error.
q : float
Quantization of x, i.e. x is an integer multiple of Q.
r_hi : float
The upper limit on x imposed by previous QC choices.
r_lo : float
The lower limit on x imposed by previous QC choices.
x : float
Observed value for which probability is required.
mu : float
The mean of the distribution of good obs.
sigma : float
The standard deviation of the distribution of good obs.
Returns
-------
float
Probability of gross error given an observed value.
Raises
------
ValueError
When inputs are incorrectly specified: p0 < 0, p0 > 1, q <= 0, r_hi < r_lo, x < r_lo, x > r_hi, sigma <= 0.
"""
if p0 < 0.0:
raise ValueError(f"Value is not positive: p0 = {p0}.")
if p0 > 1.0:
raise ValueError(f"Value is greater than 1.0: p0 = {p0}.")
if q <= 0.0:
raise ValueError(f"Value is not positive: q = {q}.")
if r_hi < r_lo:
raise ValueError(f"Lower limit is greater than upper limit: r_hi = {r_hi}, r_lo = {r_lo}.")
if x < r_lo:
raise ValueError(f"Lower limit is greater than x: r_lo = {r_lo}, x = {x}.")
if x > r_hi:
raise ValueError(f"Upper limit is less than x: r_hi = {r_hi}, x = {x}.")
if sigma <= 0.0:
raise ValueError(f"Value is not positive: sigma = {sigma}.")
pgross = (
p0 * p_data_given_gross(q, r_hi, r_lo) / (p0 * p_data_given_gross(q, r_hi, r_lo) + (1 - p0) * p_data_given_good(x, q, r_hi, r_lo, mu, sigma))
)
if not (0 <= pgross <= 1.0):
raise ValueError(f"Invalid pgross: {pgross}. Must between 0 and 1.")
return pgross
[docs]
def winsorised_mean(inarr: list[float | int]) -> float:
"""
Compute the 25% winsorised mean of the input array.
The winsorised mean is a resistant way of calculating an average.
Parameters
----------
inarr : list of float
Input array to be averaged.
Returns
-------
float
The winsorised mean of the input array with a 25% trimming.
Raises
------
ValueError
if length of `inarr` is equal to 0.
Notes
-----
The winsorised mean is that which you get if you set the first quarter of
the sorted input array to the 1st quartile value and the last quarter
to the 3rd quartile and then take the mean. This is quite a heavy trimming of
the distribution. It makes it very resistant - about half the obs can be egregiously
bad without affecting the mean strongly - but it will be less accurate if
there are lots of observations, or the quality of the obs is higher.
"""
length = len(inarr)
if length == 0:
raise ValueError("Input array must have at least one element.")
inarr_sorted = sorted(inarr)
trim = length // 4
if trim == 0:
return float(sum(inarr_sorted) / length)
middle_sum = sum(inarr_sorted[trim : length - trim])
lower_sum = inarr_sorted[trim] * trim
upper_sum = inarr_sorted[length - trim - 1] * trim
total = lower_sum + middle_sum + upper_sum
return total / length
[docs]
def missing_mean(inarr: list[float | None]) -> float | None:
"""
Return mean of input array.
Parameters
----------
inarr : list of float
List of values for which mean is required. Missing values represented by None in list.
Returns
-------
float or None
Mean of non-missing values or None.
"""
result = 0.0
num = 0.0
for val in inarr:
if val is not None:
result += val
num += 1.0
if num == 0.0:
return None
return result / num
[docs]
def _trim_stat(inarr: np.ndarray | Sequence[int | float], trim: int, stat: str) -> float:
"""
Calculate a resistant (aka robust) statistics of an input array given a trimming criteria.
Parameters
----------
inarr : array-like of float, shape (n,)
1-dimensional value array.
trim : int
Trimming criteria.
A value of 10 trims one tenth of the values off each end of the sorted array
before calculating the mean.
stat : str
Name of the numpy statistic function to apply, e.g., "mean", "std".
Returns
-------
float
The trimmed statistic calculated on the array.
"""
arr = list(copy.deepcopy(inarr))
stat_func = getattr(np, stat)
if trim == 0:
return float(stat_func(arr))
length = len(arr)
arr.sort()
index1 = int(length / trim)
return float(stat_func(arr[index1 : length - index1]))
[docs]
def trim_mean(inarr: np.ndarray | Sequence[int | float], trim: int) -> float:
"""
Calculate a resistant (aka robust) mean of an input array given a trimming criteria.
Parameters
----------
inarr : array-like of float, shape (n,)
1-dimensional value array.
trim : int
Trimming criteria.
A value of 10 trims one tenth of the values off each end of the sorted array
before calculating the mean.
Returns
-------
float
Trimmed mean.
"""
return _trim_stat(inarr, trim, "mean")
[docs]
def trim_std(inarr: np.ndarray | Sequence[int | float], trim: int) -> float:
"""
Calculate a resistant (aka robust) standard deviation of an input array given a trimming criteria.
Parameters
----------
inarr : array-like of float, shape (n,)
1-dimensional value array.
trim : int
Trimming criteria.
A value of 10 trims one tenth of the values off each end of the sorted array before
calculating the standard deviation.
Returns
-------
float
Trimmed standard deviation.
"""
return _trim_stat(inarr, trim, "std")