Source code for problem_bank_helpers.stats

# Author: Firas Moosvi, Jake Bobowski, others
# Date: 2023-10-31

from __future__ import annotations

from typing import Any, Literal

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.figure import Figure
from scipy import stats



[docs]
def shaded_normal_density(
    q: float | tuple[float, float],
    /,
    mean: float = 0,
    sd: float = 1,
    rsd: float = 4,
    lower_tail: bool = True,
    add_prob: bool = True,
    add_q: bool = True,
    add_legend: bool = False,
    figsize: tuple[float, float] | None = (8, 6),
    color: Any = "xkcd:sky blue",
    x_label: str = "x",
    y_label: str = "f(x; \u03BC,\u03C3)",
    legend_text: str | None = None,
    **kwargs,
) -> Figure:
    """Generate a normal distribution plot with optional listed probability calculation.

    Parameters
    ----------
    q : float or tuple[float, float]
        If a float, the upper or lower bound of the shaded area. If a tuple of floats, the lower and upper bounds of the shaded area.
    mean : float
        The mean of the normal distribution. Defaults to ``0``
    sd : float
        The standard deviation of the normal distribution. Defaults to ``1``
    rsd : float
        The number of standard deviations to plot on either side of the mean.  Defaults to ``4``
    lower_tail : bool
        Whether the shaded area should represent the lower tail probability :math:`\\operatorname{P}(X \\le x)`,
        or the upper tail probability :math:`\\operatorname{P}(X \\ge x)`. Defaults to ``True``
    add_prob : bool
        Whether to show the probability of the shaded area will be displayed on the plot. Defaults to ``True``
    add_q : bool
        Whether the value(s) of ``q`` should be displayed on the x-axis of the plot. Defaults to ``True``
    add_legend : bool
        Whether a legend with the mean and standard deviation values will be displayed on the plot. Defaults to ``False``
    figsize : tuple or tuple[float, float] or None
        The size of the plot in inches. If None, the default matplotlib figure size
        will be used as this is passed to :func:`matplotlib.pyplot.figure`. Defaults to ``(8, 6)``
    color : Any
        The color of the shaded area as a valid :doc:`matplotlib color <mpl:users/explain/colors/colors>`. Defaults to ``xkcd:sky blue``
    x_label : str
        The label for the x-axis. Defaults to ``x``
    y_label : str
        The label for the y-axis. Defaults to ``f(x; \u03BC,\u03C3)``
    legend_text : str or None, Optional
        The text to display in the legend if add_legend is set to true. By default (None), the legend will display the mean and standard deviation values.
    **kwargs
        Additional keyword arguments to pass to :func:`matplotlib.pyplot.figure`.

    Returns
    -------
    matplotlib.figure.Figure
        The generated matplotlib Figure object.

    Raises
    ------
    TypeError
        If the input parameters are not of the expected type.
    ValueError
        If the input values are out of the expected range.

    Examples
    --------

    Shading the region :math:`P(Z \\le z)` where :math:`Z\\sim N(0,1)` is the standard normal ``N(0,1)``
    
    .. plot::
        :context: reset

        pbh.stats.shaded_normal_density(-0.2533)

    Shading the region :math:`P(Z \\ge z)` where :math:`Z\\sim N(\u03BC,\u03C3)` is a normal distribution with mean \u03BC and standard deviation \u03C3.
    
    .. plot::
        :context: reset

        pbh.stats.shaded_normal_density(7.1, 7, 0.1, lower_tail=False, add_legend=True)
    
    Shading the region :math:`P(a \\le Z \\le b)` where :math:`Z\\sim N(0,1)` is the standard normal ``N(0,1)``
    
    .. plot::
        :context: reset

        pbh.stats.shaded_normal_density((-1.1, 2))

    References
    ----------
    Based off of an R function written by Dr. Irene Vrbick for making `shaded normal density curves <https://irene.vrbik.ok.ubc.ca/blog/2021-11-04-shading-under-the-normal-curve/>`__.

    The R function by Dr. Irene Vrbick was adapted from `here <http://rstudio-pubs-static.s3.amazonaws.com/78857_86c2403ca9c146ba8fcdcda79c3f4738.html>`__.
    """
    if not isinstance(mean, (float, int)):
        msg = f"mean must be a number, not a {mean.__class__.__name__!r}"
        raise TypeError(msg)
    if not isinstance(sd, (float, int)):
        msg = f"sd must be a number, not a {sd.__class__.__name__!r}"
        raise TypeError(msg)
    if not isinstance(rsd, (float, int)):
        msg = f"rsd must be a number, not a {rsd.__class__.__name__!r}"
        raise TypeError(msg)
    if (
        isinstance(q, tuple)
        and len(q) == 2
        and isinstance(q[0], (float, int))
        and isinstance(q[1], (float, int))
    ):
        q_lower, q_upper = sorted(q)
        xx = np.linspace(mean - rsd * sd, mean + rsd * sd, 200)
        yy = stats.norm.pdf(xx, mean, sd)
        fig = plt.figure(figsize=figsize, **kwargs)
        ax = fig.gca()
        ax.plot(xx, yy)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        x = np.linspace(q_lower, q_upper, 200)
        y = stats.norm.pdf(x, mean, sd)
        # fmt: off
        filled, *_ = ax.fill(  # Fill returns a list of polygons, but we're  only making one
            np.concatenate([[q_lower], x, [q_upper]]),
            np.concatenate([[0], y, [0]]),
            color,
        )
        # fmt: on
        if add_prob:
            height = max(y) / 4
            rv = stats.norm(mean, sd)
            prob: float = rv.cdf(q_upper) - rv.cdf(q_lower)
            ax.text((sum(q) / 2), height, f"{prob:.3f}", ha="center")
        if add_q:
            ax.set_xticks(
                [q_lower, q_upper],
                labels=[
                    str(round(q_lower, 4)),
                    str(round(q_upper, 4)),
                ],
                minor=True,
                color=color,
                y=-0.05,
            )
            if q_lower in ax.get_xticks():
                ax.get_xticklabels()[
                    np.where(ax.get_xticks() == q_lower)[0][0]
                ].set_color(color)
            if q_upper in ax.get_xticks():
                ax.get_xticklabels()[
                    np.where(ax.get_xticks() == q_upper)[0][0]
                ].set_color(color)

    elif isinstance(q, (float, int)):
        if not isinstance(lower_tail, bool):
            msg = f"lower_tail must be a bool, not a {lower_tail.__class__.__name__!r}"
            raise TypeError(msg)

        xx = np.linspace(mean - rsd * sd, mean + rsd * sd, 200)
        yy = stats.norm.pdf(xx, mean, sd)
        fig = plt.figure(figsize=figsize, **kwargs)
        ax = fig.gca()
        ax.plot(xx, yy)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)

        if lower_tail is True:
            x = np.linspace(xx[0], q, 100)
            y = stats.norm.pdf(x, mean, sd)
            # fmt: off
            filled, *_ = ax.fill(  # Fill returns a list of polygons, but we're  only making one
                np.concatenate([[xx[0]], x, [q]]),
                np.concatenate([[0], y, [0]]),
                color,
            )
            # fmt: on
            if add_prob:
                height: float = stats.norm.pdf(q, mean, sd) / 4  # type: ignore
                prob: float = stats.norm.cdf(q, mean, sd)  # type: ignore
                ax.text((q - 0.5 * sd), height, f"{prob:.3f}", ha="center")
        else:
            x = np.linspace(q, xx[-1], 100)
            y = stats.norm.pdf(x, mean, sd)
            # fmt: off
            filled, *_ = ax.fill(  # Fill returns a list of polygons, but we're  only making one
                np.concatenate([[q], x, [xx[-1]]]),
                np.concatenate([[0], y, [0]]),
                color,
            )
            # fmt: on
            if add_prob:
                height: float = stats.norm.pdf(q, mean, sd) / 4  # type: ignore
                prob: float = stats.norm.sf(q, mean, sd)  # type: ignore
                ax.text((q + 0.5 * sd), height, f"{prob:.3f}", ha="center")

        if add_q:
            if q in ax.get_xticks():
                ax.get_xticklabels()[np.where(ax.get_xticks() == q)[0][0]].set_color(
                    color
                )
            else:
                ax.set_xticks(
                    [q],
                    labels=[
                        str(round(q, 4)),
                    ],
                    minor=True,
                    color=color,
                    y=-0.05,
                )

    else:
        error_base = "q must be a tuple of two numbers, or a single number"
        if isinstance(q, tuple):
            if len(q) != 2:
                msg = f"{error_base}, not a {len(q)}-tuple"
                raise ValueError(msg)
            msg = f"{error_base}, not a 2-tuple containing a {q[0].__class__.__name__!r} and a {q[1].__class__.__name__!r}"
            raise TypeError(msg)
        else:
            msg = f"{error_base}, not a {q.__class__.__name__!r}"
            raise TypeError(msg)

    if add_legend:
        ax.set_title(legend_text or f"\u03BC = {mean}, \u03C3 = {sd}")

    return fig




[docs]
def shaded_hypothesis_test(
    critical_value: float,
    tail: Literal["left", "right", "both"],
    /,
    mean: float = 0,
    sd: float = 1,
    rsd: float = 4,
    figsize: tuple[float, float] | None = (8, 6),
    color: Any = "xkcd:sky blue",
    x_label: str = "x",
    y_label: str = "Probability Density",
    legend: str | None = None,
    **kwargs
) -> Figure:
    """Generate a normal distribution plot with appropriate tails for a hypothesis test.

    Parameters
    ----------
    critical_value : float
        The critical value to plot. If ``tail`` is ``both``, :code:`-abs(critical_value)` is used for the left tail
        and :code:`abs(critical_value)` is used for the right tail.
    tail : ``left`` or ``right`` or ``both``
        The type of hypothesis test to plot.
    mean : float
        The mean of the normal distribution. Defaults to ``0``
    sd : float
        The standard deviation of the normal distribution. Defaults to ``1``
    rsd : float
        The number of standard deviations to plot on either side of the mean.  Defaults to ``4``
    figsize : tuple or tuple[float, float] or None
        The size of the plot in inches. If None, the default matplotlib figure size
        will be used as this is passed to :func:`matplotlib.pyplot.figure`. Defaults to ``(8, 6)``
    color : Any
        The color of the shaded area as a valid :doc:`matplotlib color <mpl:users/explain/colors/colors>`. Defaults to ``xkcd:sky blue``
    x_label : str
        The label for the x-axis. Defaults to ``x``
    y_label : str
        The label for the y-axis. Defaults to ``Probability Density``
    legend : str or None, Optional
        The text to display in the legend (title) of the plot.
    **kwargs
        Additional keyword arguments to pass to :func:`matplotlib.pyplot.figure`.

    Returns
    -------
    matplotlib.figure.Figure
        The generated matplotlib Figure object.

    Raises
    ------
    TypeError
        If the input parameters are not of the expected type.
    ValueError
        If the input values are out of the expected range.

    Examples
    --------

    Left-tailed hypothesis test with a critical value of ``-1.645`` for the standard normal ``N(0,1)``
    
    .. plot::
        :context: reset

        pbh.stats.shaded_hypothesis_test(-1.645, "left")

    Right-tailed hypothesis test with a critical value of ``1.645`` for the standard normal ``N(0,1)``
    
    .. plot::
        :context: reset

        pbh.stats.shaded_hypothesis_test(1.645, "right")
    
    Two-tailed hypothesis test with a critical value of ``±1.96`` for the standard normal ``N(0,1)``
    
    .. plot::
        :context: reset

        pbh.stats.shaded_hypothesis_test(1.96, "both")
    """
    if not isinstance(mean, (float, int)):
        msg = f"mean must be a number, not a {mean.__class__.__name__!r}!"
        raise TypeError(msg)
    if not isinstance(sd, (float, int)):
        msg = f"sd must be a number, not a {sd.__class__.__name__!r}!"
        raise TypeError(msg)
    if not isinstance(rsd, (float, int)):
        msg = f"rsd must be a number, not a {rsd.__class__.__name__!r}!"
        raise TypeError(msg)
    if not isinstance(critical_value, (float, int)):
        msg = f"critical_value must be a number, not a {critical_value.__class__.__name__!r}!"
        raise TypeError(msg)
    if tail not in {"left", "right", "both"}:
        msg = f"tail must be one of 'left', 'right', or 'both', not {tail!r}!"
        raise ValueError(msg)
    
    # Define the normal distribution
    plot_to = rsd * sd
    xx = np.linspace(mean - plot_to, mean + plot_to, 100)
    yy = stats.norm.pdf(xx, mean, sd)
    
    fig = plt.figure(figsize=figsize, **kwargs)
    ax = fig.gca()
    ax.plot(xx, yy)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

    match tail:
        case "left":
            x = np.linspace(xx[0], critical_value, 100)
            y = stats.norm.pdf(x, mean, sd)
            # fmt: off
            ax.fill(
                np.concatenate([[xx[0]], x, [critical_value]]),
                np.concatenate([[0], y, [0]]),
                color,
            )
        case "right":
            x = np.linspace(critical_value, xx[-1], 100)
            y = stats.norm.pdf(x, mean, sd)
            # fmt: off
            ax.fill(
                np.concatenate([[critical_value], x, [xx[-1]]]),
                np.concatenate([[0], y, [0]]),
                color,
            )
        case "both":
            right_crit = abs(critical_value)
            left_crit = -right_crit
            left_x = np.linspace(xx[0], left_crit, 100)
            left_y = stats.norm.pdf(left_x, mean, sd)
            right_x = np.linspace(critical_value, xx[-1], 100)
            right_y = stats.norm.pdf(right_x, mean, sd)
            # fmt: off
            ax.fill(
                np.concatenate([[xx[0]], left_x, [left_crit]]),
                np.concatenate([[0], left_y, [0]]),
                color,
                np.concatenate([[right_crit], right_x, [xx[-1]]]),
                np.concatenate([[0], right_y, [0]]),
                color,
            )
        case other:
            msg = f"tail must be one of 'left', 'right', or 'both', not {other!r}!"
            raise ValueError(msg)
    
    if legend is not None:
        ax.set_title(legend)

    return fig