Source code for marine_qc.statistics

"""Some generally helpful statistical functions for base QC."""

from __future__ import annotations
import copy
import math
from collections.abc import Sequence

import numpy as np



[docs]
def p_data_given_good(x: float, q: float, r_hi: float, r_lo: float, mu: float, sigma: float) -> float:
    """
    Probability of an observed value assuming it comes from a "good" measurement.

    Calculate the probability of an observed value x given a normal distribution with mean mu
    standard deviation of sigma, where x is constrained to fall between R_hi and R_lo
    and is known only to an integer multiple of Q, the quantization level.

    Parameters
    ----------
    x : float
        Observed value for which probability is required.
    q : float
        Quantization of x, i.e. x is an integer multiple of Q.
    r_hi : float
        The upper limit on x imposed by previous QC choices.
    r_lo : float
        The lower limit on x imposed by previous QC choices.
    mu : float
        The mean of the distribution.
    sigma : float
        The standard deviation of the distribution.

    Returns
    -------
    float
        Probability of the observed value given the specified distribution.

    Raises
    ------
    ValueError
        When inputs are incorrectly specified: q<=0, sigma<=0, r_lo > r_hi, x < r_lo or x > r_hi.
    """
    if q <= 0.0:
        raise ValueError(f"Value is not positive: q = {q}.")
    if sigma <= 0.0:
        raise ValueError(f"Value is not positive: sigma = {sigma}.")
    if r_lo >= r_hi:
        raise ValueError(f"Lower limit is greater than upper limit: r_hi = {r_hi}, r_lo = {r_lo}.")
    if x < r_lo:
        raise ValueError(f"Lower limit is greater than x: r_lo = {r_lo}, x = {x}.")
    if x > r_hi:
        raise ValueError(f"Upper limit is less than x: r_hi = {r_hi}, x = {x}.")

    upper_x = min([x + 0.5 * q, r_hi + 0.5 * q])
    lower_x = max([x - 0.5 * q, r_lo - 0.5 * q])

    normalizer = 0.5 * (math.erf((r_hi + 0.5 * q - mu) / (sigma * math.sqrt(2))) - math.erf((r_lo - 0.5 * q - mu) / (sigma * math.sqrt(2))))

    return 0.5 * (math.erf((upper_x - mu) / (sigma * math.sqrt(2))) - math.erf((lower_x - mu) / (sigma * math.sqrt(2)))) / normalizer




[docs]
def p_data_given_gross(q: float, r_hi: float, r_lo: float) -> float:
    """
    Probability of an observed value assuming it is a gross error.

    Calculate the probability of the data given a gross error
    assuming gross errors are uniformly distributed between
    R_low and R_high and that the quantization, rounding level is Q

    Parameters
    ----------
    q : float
        Quantization of x, i.e. x is an integer multiple of Q.
    r_hi : float
        The upper limit on x imposed by previous QC choices.
    r_lo : float
        The lower limit on x imposed by previous QC choices.

    Returns
    -------
    float
        Probability of the observed value given that it is a gross error.

    Raises
    ------
    ValueError
        When limits are not ascending or q<=0.
    """
    if r_hi < r_lo:
        raise ValueError(f"Lower limit is greater than upper limit: r_hi = {r_hi}, r_lo = {r_lo}.")
    if q <= 0.0:
        raise ValueError(f"Value is not positive: q = {q}.")

    r = r_hi - r_lo
    return 1.0 / (1.0 + (r / q))




[docs]
def p_gross(p0: float, q: float, r_hi: float, r_lo: float, x: float, mu: float, sigma: float) -> float:
    """
    Posterior probability that an observation is a gross error.

    Calculate the posterior probability of a gross error given the prior probability p0,
    the quantization level of the observed value, Q, previous limits on the observed value,
    R_hi and R_lo, the observed value, x, and the mean (mu) and standard deviation (sigma) of the
    distribution of good observations assuming they are normally distributed. Gross errors are
    assumed to be uniformly distributed between R_lo and R_hi.

    Parameters
    ----------
    p0 : float
        Prior probability of gross error.
    q : float
        Quantization of x, i.e. x is an integer multiple of Q.
    r_hi : float
        The upper limit on x imposed by previous QC choices.
    r_lo : float
        The lower limit on x imposed by previous QC choices.
    x : float
        Observed value for which probability is required.
    mu : float
        The mean of the distribution of good obs.
    sigma : float
        The standard deviation of the distribution of good obs.

    Returns
    -------
    float
        Probability of gross error given an observed value.

    Raises
    ------
    ValueError
        When inputs are incorrectly specified: p0 < 0, p0 > 1, q <= 0, r_hi < r_lo, x < r_lo, x > r_hi, sigma <= 0.
    """
    if p0 < 0.0:
        raise ValueError(f"Value is not positive: p0 = {p0}.")
    if p0 > 1.0:
        raise ValueError(f"Value is greater than 1.0: p0 = {p0}.")
    if q <= 0.0:
        raise ValueError(f"Value is not positive: q = {q}.")
    if r_hi < r_lo:
        raise ValueError(f"Lower limit is greater than upper limit: r_hi = {r_hi}, r_lo = {r_lo}.")
    if x < r_lo:
        raise ValueError(f"Lower limit is greater than x: r_lo = {r_lo}, x = {x}.")
    if x > r_hi:
        raise ValueError(f"Upper limit is less than x: r_hi = {r_hi}, x = {x}.")
    if sigma <= 0.0:
        raise ValueError(f"Value is not positive: sigma = {sigma}.")

    pgross = (
        p0 * p_data_given_gross(q, r_hi, r_lo) / (p0 * p_data_given_gross(q, r_hi, r_lo) + (1 - p0) * p_data_given_good(x, q, r_hi, r_lo, mu, sigma))
    )

    if not (0 <= pgross <= 1.0):
        raise ValueError(f"Invalid pgross: {pgross}. Must between 0 and 1.")

    return pgross




[docs]
def winsorised_mean(inarr: list[float | int]) -> float:
    """
    Compute the 25% winsorised mean of the input array.

    The winsorised mean is a resistant way of calculating an average.

    Parameters
    ----------
    inarr : list of float
        Input array to be averaged.

    Returns
    -------
    float
        The winsorised mean of the input array with a 25% trimming.

    Raises
    ------
    ValueError
        if length of `inarr` is equal to 0.

    Notes
    -----
    The winsorised mean is that which you get if you set the first quarter of
    the sorted input array to the 1st quartile value and the last quarter
    to the 3rd quartile and then take the mean. This is quite a heavy trimming of
    the distribution. It makes it very resistant - about half the obs can be egregiously
    bad without affecting the mean strongly - but it will be less accurate if
    there are lots of observations, or the quality of the obs is higher.
    """
    length = len(inarr)
    if length == 0:
        raise ValueError("Input array must have at least one element.")

    inarr_sorted = sorted(inarr)

    trim = length // 4

    if trim == 0:
        return float(sum(inarr_sorted) / length)

    middle_sum = sum(inarr_sorted[trim : length - trim])
    lower_sum = inarr_sorted[trim] * trim
    upper_sum = inarr_sorted[length - trim - 1] * trim

    total = lower_sum + middle_sum + upper_sum
    return total / length




[docs]
def missing_mean(inarr: list[float | None]) -> float | None:
    """
    Return mean of input array.

    Parameters
    ----------
    inarr : list of float
        List of values for which mean is required. Missing values represented by None in list.

    Returns
    -------
    float or None
         Mean of non-missing values or None.
    """
    result = 0.0
    num = 0.0
    for val in inarr:
        if val is not None:
            result += val
            num += 1.0
    if num == 0.0:
        return None
    return result / num




[docs]
def _trim_stat(inarr: np.ndarray | Sequence[int | float], trim: int, stat: str) -> float:
    """
    Calculate a resistant (aka robust) statistics of an input array given a trimming criteria.

    Parameters
    ----------
    inarr : array-like of float, shape (n,)
        1-dimensional value array.
    trim : int
        Trimming criteria.
        A value of 10 trims one tenth of the values off each end of the sorted array
        before calculating the mean.
    stat : str
        Name of the numpy statistic function to apply, e.g., "mean", "std".

    Returns
    -------
    float
        The trimmed statistic calculated on the array.
    """
    arr = list(copy.deepcopy(inarr))
    stat_func = getattr(np, stat)
    if trim == 0:
        return float(stat_func(arr))

    length = len(arr)
    arr.sort()

    index1 = int(length / trim)

    return float(stat_func(arr[index1 : length - index1]))




[docs]
def trim_mean(inarr: np.ndarray | Sequence[int | float], trim: int) -> float:
    """
    Calculate a resistant (aka robust) mean of an input array given a trimming criteria.

    Parameters
    ----------
    inarr : array-like of float, shape (n,)
        1-dimensional value array.
    trim : int
        Trimming criteria.
        A value of 10 trims one tenth of the values off each end of the sorted array
        before calculating the mean.

    Returns
    -------
    float
        Trimmed mean.
    """
    return _trim_stat(inarr, trim, "mean")




[docs]
def trim_std(inarr: np.ndarray | Sequence[int | float], trim: int) -> float:
    """
    Calculate a resistant (aka robust) standard deviation of an input array given a trimming criteria.

    Parameters
    ----------
    inarr : array-like of float, shape (n,)
        1-dimensional value array.
    trim : int
        Trimming criteria.
        A value of 10 trims one tenth of the values off each end of the sorted array before
        calculating the standard deviation.

    Returns
    -------
    float
        Trimmed standard deviation.
    """
    return _trim_stat(inarr, trim, "std")