Quartet d’Anscombe

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python3
# coding: utf-8

import numpy as N
import scipy.stats as SS
import matplotlib.pyplot as P


def printStats(x, y):
    """
    Report summary statistics of two equally-shaped samples on stdout.

    Four lines are printed, all numbers with two decimals: mean and
    variance of x, mean and variance of y, the Pearson correlation
    coefficient between x and y, and the straight line fitted to y as
    a function of x.

    Raises AssertionError if x and y do not have the same shape.
    """

    assert N.shape(x) == N.shape(y), "Incompatible input arrays"

    # One line per sample: mean and variance (N.var with default settings)
    for name, sample in (("x", x), ("y", y)):
        print(f"{name}: mean={N.mean(sample):.2f}, "
              f"variance={N.var(sample):.2f}")

    # Correlation coefficient is the first item returned by pearsonr
    corr = SS.pearsonr(x, y)[0]
    print(f"y vs. x: corrcoeff={corr:.2f}")

    # linregress returns slope and intercept as its first two items
    slope, intercept = SS.linregress(x, y)[:2]
    print(f"y vs. x: y = {slope:.2f} x + {intercept:.2f}")


def plotStats(ax, x, y, title='', fancy=True):
    """
    Plot y vs. x, and linear regression.

    ax: Matplotlib axes to draw on.
    x, y: samples of identical shape.
    title: axes title; only applied when `fancy` is true and non-empty.
    fancy: if true, also draw the mean line and a +/- 1 standard
        deviation band (computed with ddof=1) for both x and y, set
        the title, and label the x axis on the last row and the y axis
        on the first column of the subplot grid.

    Raises AssertionError if x and y do not have the same shape.
    """

    assert N.shape(x) == N.shape(y), "Incompatible input arrays"

    # slope, intercept, r_value, p_value, std_err
    a, b, r, _, _ = SS.linregress(x, y)

    # Data + corrcoeff
    ax.plot(x, y, 'bo', label=f"r = {r:.2f}")

    # Linear regression, drawn between x=0 and x=20
    xx = N.array([0, 20])
    yy = a * xx + b
    ax.plot(xx, yy, 'r-', label=f"y = {a:.2f} x + {b:.2f}")

    ax.legend(loc='upper left', fontsize='small')

    if fancy:                   # Additional stuff
        # Add mean line ± stddev
        m = N.mean(x)
        s = N.std(x, ddof=1)
        ax.axvline(m, color='g', ls='--', label='_')    # Mean
        ax.axvspan(m - s, m + s, color='g', alpha=0.2)  # Std-dev

        m = N.mean(y)
        s = N.std(y, ddof=1)
        ax.axhline(m, color='g', ls='--', label='_')    # Mean
        ax.axhspan(m - s, m + s, color='g', alpha=0.2)  # Std-dev

        # Title and labels
        if title:
            ax.set_title(title)

        # Axes.is_last_row()/is_first_col() no longer exist in recent
        # Matplotlib releases: ask the SubplotSpec for the grid position,
        # and only fall back to the legacy Axes methods when needed.
        get_spec = getattr(ax, "get_subplotspec", None)
        spec = get_spec() if get_spec is not None else None
        if spec is not None and hasattr(spec, "is_last_row"):
            last_row = spec.is_last_row()
            first_col = spec.is_first_col()
        elif hasattr(ax, "is_last_row"):
            last_row = ax.is_last_row()
            first_col = ax.is_first_col()
        else:                   # Axes not part of a grid: label both
            last_row = first_col = True

        if last_row:
            ax.set_xlabel("x")
        if first_col:
            ax.set_ylabel("y")


if __name__ == '__main__':

    # Anscombe's Quartet: columns are read as consecutive (x, y) pairs
    data = N.genfromtxt("anscombe.dat")

    figure = P.figure()

    # One panel of a 2x2 grid per dataset, numbered from 1
    for num in range(1, 5):
        panel = figure.add_subplot(2, 2, num)
        header = f" Dataset #{num} "
        print(header.center(40, '='))

        first = 2 * (num - 1)           # Column holding x for this dataset
        pair = data[:, first:first + 2]
        xs, ys = pair.T

        printStats(xs, ys)              # Text summary on stdout
        plotStats(panel, xs, ys, title='#' + str(num))  # Graphical summary

    figure.suptitle("Anscombe's Quartet", fontsize='x-large')
    figure.tight_layout()

    P.show()
$ python3 anscombe.py

============== Dataset #1 ==============
x: mean=9.00, variance=10.00
y: mean=7.50, variance=3.75
y vs. x: corrcoeff=0.82
y vs. x: y = 0.50 x + 3.00
============== Dataset #2 ==============
x: mean=9.00, variance=10.00
y: mean=7.50, variance=3.75
y vs. x: corrcoeff=0.82
y vs. x: y = 0.50 x + 3.00
============== Dataset #3 ==============
x: mean=9.00, variance=10.00
y: mean=7.50, variance=3.75
y vs. x: corrcoeff=0.82
y vs. x: y = 0.50 x + 3.00
============== Dataset #4 ==============
x: mean=9.00, variance=10.00
y: mean=7.50, variance=3.75
y vs. x: corrcoeff=0.82
y vs. x: y = 0.50 x + 3.00
../_images/anscombe.png

Source: anscombe.py