codec/base64.3: Supply missing Oxford comma.
[mLib] / utils / linreg.h
CommitLineData
b64eb60f
MW
1/* -*-c-*-
2 *
3 * Simple linear regression
4 *
5 * (c) 2023 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of the mLib utilities library.
11 *
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
16 *
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28#ifndef MLIB_LINREG_H
29#define MLIB_LINREG_H
30
31#ifdef __cplusplus
32 extern "C" {
33#endif
34
35/* Theory. (This should be well-known.)
36 *
37 * We have a number of data points (x_i, y_i) for 0 <= i < n. We believe
38 * they lie close to a straight line y = m x + c, for unknown constants m and
39 * c. Let e_i = m x_i + c - y_i. We seek the parameters which minimize
40 * E = ∑_{0\le i<n} e_i^2.
41 *
42 * We begin by simplifying
43 *
44 * E = ∑ e_i^2
45 * = ∑ (m x_i + c - y_i)^2
46 * = ∑ (m^2 x_i^2 + c^2 + y_i^2 + 2 c m x_i - 2 m x_i y_i - 2 c y_i) .
47 *
48 * (where all sums are over 0 <= i < n). Taking partial derivatives,
49 *
50 * ∂E/∂m = ∑ (2 m x_i^2 + 2 c x_i - 2 x_i y_i)
51 * = 2 (m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i)
52 *
53 * and
54 *
55 * ∂E/∂c = ∑ (2 c + 2 m x_i - 2 y_i)
56 * = 2 (n c + m ∑ x_i - ∑ y_i) .
57 *
58 * Now we solve for m and c such that the partial derivatives vanish. Note
59 * that the second partial derivatives are nonnegative, being 2 ∑ x_i^2 and 2
60 * n respectively, so we shall indeed find a minimum.
61 *
62 * Restating the problem,
63 *
64 * m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i = 0
65 * m ∑ x_i + c n - ∑ y_i = 0 .
66 *
67 * Writing E[X] = (∑ x_i)/n, E[X Y] = (∑ x_i y_i)/n, etc., this gives
68 *
69 * m n E[X^2] + c n E[X] - n E[X Y] = 0
70 * m n E[X] + c n - n E[Y] = 0 .
71 *
72 * We see a common factor of n, so we can take that out. Now multiply the
73 * latter equation by E[X] and subtract, giving
74 *
75 * m (E[X^2] - E[X]^2) - (E[X Y] - E[X] E[Y])
76 *
77 * whence
78 *
79 * m = (E[X Y] - E[X] E[Y])/(E[X^2] - E[X]^2) .
80 *
81 * The numerator is known as the covariance of X and Y, written
82 *
83 * cov(X, Y) = σ_{XY} = E[X Y] - E[X] E[Y] ;
84 *
85 * the denominator is the variance of X, written
86 *
87 * var(X) = σ_X = E[X^2] - E[X]^2 .
88 *
89 * The second equation gives
90 *
91 * c = E[Y] - m E[X] .
92 *
93 * Also of interest is the correlation coefficient
94 *
95 * r = σ_{XY}/√(σ_X σ_Y) ,
96 *
97 * which I'm not going to derive here.
98 */
99
100/*----- Header files ------------------------------------------------------*/
101
102/*----- Data structures ---------------------------------------------------*/
103
104struct linreg {
105 double sum_x, sum_x2, sum_y, sum_y2, sum_x_y;
106 unsigned long n;
107};
108#define LINREG_INIT { 0.0, 0.0, 0.0, 0.0, 0.0, 0 }
109
110/*----- Functions provided ------------------------------------------------*/
111
112/* --- @linreg_init@ --- *
113 *
114 * Arguments: @struct linreg *lr@ = linear regression state
115 *
116 * Returns: ---
117 *
118 * Use: Initializes a linear-regression state ready for use.
119 */
120
121extern void linreg_init(struct linreg */*lr*/);
122
123/* --- @linreg_update@ --- *
124 *
125 * Arguments: @struct linreg *lr@ = linear regression state
126 * @double x, y@ = point coordinates
127 *
128 * Returns: ---
129 *
130 * Use: Informs the linear regression machinery of a point.
131 *
132 * Note that the state size is constant, and independent of the
133 * number of points.
134 */
135
136extern void linreg_update(struct linreg */*lr*/, double /*x*/, double /*y*/);
137
138/* --- @linreg_fit@ --- *
139 *
140 * Arguments: @const struct linreg *lr@ = linear regression state
141 * @double *m_out, *c_out, *r_out@ = where to write outputs
142 *
143 * Returns: ---
144 *
145 * Use: Compute the best-fit line through the previously-specified
146 * points. The line has the equation %$y = m x + c$%; %$m$% and
147 * %$c$% are written to @*m_out@ and @*c_out@ respectively, and
148 * the correlation coefficient %$r$% is written to @*r_out@.
149 * Any (or all, but that would be useless) of the output
150 * pointers may be null to discard that result.
151 *
152 * At least one point must have been given.
153 */
154
155extern void linreg_fit(const struct linreg */*lr*/,
156 double */*m_out*/, double */*c_out*/,
157 double */*r_out*/);
158
159/*----- That's all, folks -------------------------------------------------*/
160
161#ifdef __cplusplus
162 }
163#endif
164
165#endif