Source code for problem_bank_helpers.stats

# Author: Firas Moosvi, Jake Bobowski, others
# Date: 2023-10-31

from __future__ import annotations

from typing import Any, Literal

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.figure import Figure
from scipy import stats


[docs] def shaded_normal_density( q: float | tuple[float, float], /, mean: float = 0, sd: float = 1, rsd: float = 4, lower_tail: bool = True, add_prob: bool = True, add_q: bool = True, add_legend: bool = False, figsize: tuple[float, float] | None = (8, 6), color: Any = "xkcd:sky blue", x_label: str = "x", y_label: str = "f(x; \u03BC,\u03C3)", legend_text: str | None = None, **kwargs, ) -> Figure: """Generate a normal distribution plot with optional listed probability calculation. Parameters ---------- q : float or tuple[float, float] If a float, the upper or lower bound of the shaded area. If a tuple of floats, the lower and upper bounds of the shaded area. mean : float The mean of the normal distribution. Defaults to ``0`` sd : float The standard deviation of the normal distribution. Defaults to ``1`` rsd : float The number of standard deviations to plot on either side of the mean. Defaults to ``4`` lower_tail : bool Whether the shaded area should represent the lower tail probability :math:`\\operatorname{P}(X \\le x)`, or the upper tail probability :math:`\\operatorname{P}(X \\ge x)`. Defaults to ``True`` add_prob : bool Whether to show the probability of the shaded area will be displayed on the plot. Defaults to ``True`` add_q : bool Whether the value(s) of ``q`` should be displayed on the x-axis of the plot. Defaults to ``True`` add_legend : bool Whether a legend with the mean and standard deviation values will be displayed on the plot. Defaults to ``False`` figsize : tuple or tuple[float, float] or None The size of the plot in inches. If None, the default matplotlib figure size will be used as this is passed to :func:`matplotlib.pyplot.figure`. Defaults to ``(8, 6)`` color : Any The color of the shaded area as a valid :doc:`matplotlib color <mpl:users/explain/colors/colors>`. Defaults to ``xkcd:sky blue`` x_label : str The label for the x-axis. Defaults to ``x`` y_label : str The label for the y-axis. Defaults to ``f(x; \u03BC,\u03C3)`` legend_text : str or None, Optional The text to display in the legend if add_legend is set to true. By default (None), the legend will display the mean and standard deviation values. **kwargs Additional keyword arguments to pass to :func:`matplotlib.pyplot.figure`. Returns ------- matplotlib.figure.Figure The generated matplotlib Figure object. Raises ------ TypeError If the input parameters are not of the expected type. ValueError If the input values are out of the expected range. Examples -------- Shading the region :math:`P(Z \\le z)` where :math:`Z\\sim N(0,1)` is the standard normal ``N(0,1)`` .. plot:: :context: reset pbh.stats.shaded_normal_density(-0.2533) Shading the region :math:`P(Z \\ge z)` where :math:`Z\\sim N(\u03BC,\u03C3)` is a normal distribution with mean \u03BC and standard deviation \u03C3. .. plot:: :context: reset pbh.stats.shaded_normal_density(7.1, 7, 0.1, lower_tail=False, add_legend=True) Shading the region :math:`P(a \\le Z \\le b)` where :math:`Z\\sim N(0,1)` is the standard normal ``N(0,1)`` .. plot:: :context: reset pbh.stats.shaded_normal_density((-1.1, 2)) References ---------- Based off of an R function written by Dr. Irene Vrbick for making `shaded normal density curves <https://irene.vrbik.ok.ubc.ca/blog/2021-11-04-shading-under-the-normal-curve/>`__. The R function by Dr. Irene Vrbick was adapted from `here <http://rstudio-pubs-static.s3.amazonaws.com/78857_86c2403ca9c146ba8fcdcda79c3f4738.html>`__. """ if not isinstance(mean, (float, int)): msg = f"mean must be a number, not a {mean.__class__.__name__!r}" raise TypeError(msg) if not isinstance(sd, (float, int)): msg = f"sd must be a number, not a {sd.__class__.__name__!r}" raise TypeError(msg) if not isinstance(rsd, (float, int)): msg = f"rsd must be a number, not a {rsd.__class__.__name__!r}" raise TypeError(msg) if ( isinstance(q, tuple) and len(q) == 2 and isinstance(q[0], (float, int)) and isinstance(q[1], (float, int)) ): q_lower, q_upper = sorted(q) xx = np.linspace(mean - rsd * sd, mean + rsd * sd, 200) yy = stats.norm.pdf(xx, mean, sd) fig = plt.figure(figsize=figsize, **kwargs) ax = fig.gca() ax.plot(xx, yy) ax.set_xlabel(x_label) ax.set_ylabel(y_label) x = np.linspace(q_lower, q_upper, 200) y = stats.norm.pdf(x, mean, sd) # fmt: off filled, *_ = ax.fill( # Fill returns a list of polygons, but we're only making one np.concatenate([[q_lower], x, [q_upper]]), np.concatenate([[0], y, [0]]), color, ) # fmt: on if add_prob: height = max(y) / 4 rv = stats.norm(mean, sd) prob: float = rv.cdf(q_upper) - rv.cdf(q_lower) ax.text((sum(q) / 2), height, f"{prob:.3f}", ha="center") if add_q: ax.set_xticks( [q_lower, q_upper], labels=[ str(round(q_lower, 4)), str(round(q_upper, 4)), ], minor=True, color=color, y=-0.05, ) if q_lower in ax.get_xticks(): ax.get_xticklabels()[ np.where(ax.get_xticks() == q_lower)[0][0] ].set_color(color) if q_upper in ax.get_xticks(): ax.get_xticklabels()[ np.where(ax.get_xticks() == q_upper)[0][0] ].set_color(color) elif isinstance(q, (float, int)): if not isinstance(lower_tail, bool): msg = f"lower_tail must be a bool, not a {lower_tail.__class__.__name__!r}" raise TypeError(msg) xx = np.linspace(mean - rsd * sd, mean + rsd * sd, 200) yy = stats.norm.pdf(xx, mean, sd) fig = plt.figure(figsize=figsize, **kwargs) ax = fig.gca() ax.plot(xx, yy) ax.set_xlabel(x_label) ax.set_ylabel(y_label) if lower_tail is True: x = np.linspace(xx[0], q, 100) y = stats.norm.pdf(x, mean, sd) # fmt: off filled, *_ = ax.fill( # Fill returns a list of polygons, but we're only making one np.concatenate([[xx[0]], x, [q]]), np.concatenate([[0], y, [0]]), color, ) # fmt: on if add_prob: height: float = stats.norm.pdf(q, mean, sd) / 4 # type: ignore prob: float = stats.norm.cdf(q, mean, sd) # type: ignore ax.text((q - 0.5 * sd), height, f"{prob:.3f}", ha="center") else: x = np.linspace(q, xx[-1], 100) y = stats.norm.pdf(x, mean, sd) # fmt: off filled, *_ = ax.fill( # Fill returns a list of polygons, but we're only making one np.concatenate([[q], x, [xx[-1]]]), np.concatenate([[0], y, [0]]), color, ) # fmt: on if add_prob: height: float = stats.norm.pdf(q, mean, sd) / 4 # type: ignore prob: float = stats.norm.sf(q, mean, sd) # type: ignore ax.text((q + 0.5 * sd), height, f"{prob:.3f}", ha="center") if add_q: if q in ax.get_xticks(): ax.get_xticklabels()[np.where(ax.get_xticks() == q)[0][0]].set_color( color ) else: ax.set_xticks( [q], labels=[ str(round(q, 4)), ], minor=True, color=color, y=-0.05, ) else: error_base = "q must be a tuple of two numbers, or a single number" if isinstance(q, tuple): if len(q) != 2: msg = f"{error_base}, not a {len(q)}-tuple" raise ValueError(msg) msg = f"{error_base}, not a 2-tuple containing a {q[0].__class__.__name__!r} and a {q[1].__class__.__name__!r}" raise TypeError(msg) else: msg = f"{error_base}, not a {q.__class__.__name__!r}" raise TypeError(msg) if add_legend: ax.set_title(legend_text or f"\u03BC = {mean}, \u03C3 = {sd}") return fig
[docs] def shaded_hypothesis_test( critical_value: float, tail: Literal["left", "right", "both"], /, mean: float = 0, sd: float = 1, rsd: float = 4, figsize: tuple[float, float] | None = (8, 6), color: Any = "xkcd:sky blue", x_label: str = "x", y_label: str = "Probability Density", legend: str | None = None, **kwargs ) -> Figure: """Generate a normal distribution plot with appropriate tails for a hypothesis test. Parameters ---------- critical_value : float The critical value to plot. If ``tail`` is ``both``, :code:`-abs(critical_value)` is used for the left tail and :code:`abs(critical_value)` is used for the right tail. tail : ``left`` or ``right`` or ``both`` The type of hypothesis test to plot. mean : float The mean of the normal distribution. Defaults to ``0`` sd : float The standard deviation of the normal distribution. Defaults to ``1`` rsd : float The number of standard deviations to plot on either side of the mean. Defaults to ``4`` figsize : tuple or tuple[float, float] or None The size of the plot in inches. If None, the default matplotlib figure size will be used as this is passed to :func:`matplotlib.pyplot.figure`. Defaults to ``(8, 6)`` color : Any The color of the shaded area as a valid :doc:`matplotlib color <mpl:users/explain/colors/colors>`. Defaults to ``xkcd:sky blue`` x_label : str The label for the x-axis. Defaults to ``x`` y_label : str The label for the y-axis. Defaults to ``Probability Density`` legend : str or None, Optional The text to display in the legend (title) of the plot. **kwargs Additional keyword arguments to pass to :func:`matplotlib.pyplot.figure`. Returns ------- matplotlib.figure.Figure The generated matplotlib Figure object. Raises ------ TypeError If the input parameters are not of the expected type. ValueError If the input values are out of the expected range. Examples -------- Left-tailed hypothesis test with a critical value of ``-1.645`` for the standard normal ``N(0,1)`` .. plot:: :context: reset pbh.stats.shaded_hypothesis_test(-1.645, "left") Right-tailed hypothesis test with a critical value of ``1.645`` for the standard normal ``N(0,1)`` .. plot:: :context: reset pbh.stats.shaded_hypothesis_test(1.645, "right") Two-tailed hypothesis test with a critical value of ``±1.96`` for the standard normal ``N(0,1)`` .. plot:: :context: reset pbh.stats.shaded_hypothesis_test(1.96, "both") """ if not isinstance(mean, (float, int)): msg = f"mean must be a number, not a {mean.__class__.__name__!r}!" raise TypeError(msg) if not isinstance(sd, (float, int)): msg = f"sd must be a number, not a {sd.__class__.__name__!r}!" raise TypeError(msg) if not isinstance(rsd, (float, int)): msg = f"rsd must be a number, not a {rsd.__class__.__name__!r}!" raise TypeError(msg) if not isinstance(critical_value, (float, int)): msg = f"critical_value must be a number, not a {critical_value.__class__.__name__!r}!" raise TypeError(msg) if tail not in {"left", "right", "both"}: msg = f"tail must be one of 'left', 'right', or 'both', not {tail!r}!" raise ValueError(msg) # Define the normal distribution plot_to = rsd * sd xx = np.linspace(mean - plot_to, mean + plot_to, 100) yy = stats.norm.pdf(xx, mean, sd) fig = plt.figure(figsize=figsize, **kwargs) ax = fig.gca() ax.plot(xx, yy) ax.set_xlabel(x_label) ax.set_ylabel(y_label) match tail: case "left": x = np.linspace(xx[0], critical_value, 100) y = stats.norm.pdf(x, mean, sd) # fmt: off ax.fill( np.concatenate([[xx[0]], x, [critical_value]]), np.concatenate([[0], y, [0]]), color, ) case "right": x = np.linspace(critical_value, xx[-1], 100) y = stats.norm.pdf(x, mean, sd) # fmt: off ax.fill( np.concatenate([[critical_value], x, [xx[-1]]]), np.concatenate([[0], y, [0]]), color, ) case "both": right_crit = abs(critical_value) left_crit = -right_crit left_x = np.linspace(xx[0], left_crit, 100) left_y = stats.norm.pdf(left_x, mean, sd) right_x = np.linspace(critical_value, xx[-1], 100) right_y = stats.norm.pdf(right_x, mean, sd) # fmt: off ax.fill( np.concatenate([[xx[0]], left_x, [left_crit]]), np.concatenate([[0], left_y, [0]]), color, np.concatenate([[right_crit], right_x, [xx[-1]]]), np.concatenate([[0], right_y, [0]]), color, ) case other: msg = f"tail must be one of 'left', 'right', or 'both', not {other!r}!" raise ValueError(msg) if legend is not None: ax.set_title(legend) return fig