Commit | Line | Data |
---|---|---|
b64eb60f MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Simple linear regression | |
4 | * | |
5 | * (c) 2023 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | #ifndef MLIB_LINREG_H | |
29 | #define MLIB_LINREG_H | |
30 | ||
31 | #ifdef __cplusplus | |
32 | extern "C" { | |
33 | #endif | |
34 | ||
35 | /* Theory. (This should be well-known.) | |
36 | * | |
37 | * We have a number of data points (x_i, y_i) for 0 <= i < n. We believe | |
38 | * they lie close to a straight line y = m x + c, for unknown constants m and | |
39 | * c. Let e_i = m x_i + c - y_i. We seek the parameters which minimize | |
40 | * E = ∑_{0\le i<n} e_i^2. | |
41 | * | |
42 | * We begin by simplifying | |
43 | * | |
44 | * E = ∑ e_i^2 | |
45 | * = ∑ (m x_i + c - y_i)^2 | |
46 | * = ∑ (m^2 x_i^2 + c^2 + y_i^2 + 2 c m x_i - 2 m x_i y_i - 2 c y_i) . | |
47 | * | |
48 | * (where all sums are over 0 <= i < n). Taking partial derivatives, | |
49 | * | |
50 | * ∂E/∂m = ∑ (2 m x_i^2 + 2 c x_i - 2 x_i y_i) | |
51 | * = 2 (m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i) | |
52 | * | |
53 | * and | |
54 | * | |
55 | * ∂E/∂c = ∑ (2 c + 2 m x_i - 2 y_i) | |
56 | * = 2 (n c + m ∑ x_i - ∑ y_i) . | |
57 | * | |
58 | * Now we solve for m and c such that the partial derivatives vanish. Note | |
59 | * that the second partial derivatives are nonnegative, being 2 ∑ x_i^2 and 2 | |
60 | * n respectively, so we shall indeed find a minimum. | |
61 | * | |
62 | * Restating the problem, | |
63 | * | |
64 | * m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i = 0 | |
65 | * m ∑ x_i + c n - ∑ y_i = 0 . | |
66 | * | |
67 | * Writing E[X] = (∑ x_i)/n, E[X Y] = (∑ x_i y_i)/n, etc., this gives | |
68 | * | |
69 | * m n E[X^2] + c n E[X] - n E[X Y] = 0 | |
70 | * m n E[X] + c n - n E[Y] = 0 . | |
71 | * | |
72 | * We see a common factor of n, so we can take that out. Now multiply the | |
73 | * latter equation by E[X] and subtract, giving | |
74 | * | |
75 | * m (E[X^2] - E[X]^2) - (E[X Y] - E[X] E[Y]) | |
76 | * | |
77 | * whence | |
78 | * | |
79 | * m = (E[X Y] - E[X] E[Y])/(E[X^2] - E[X]^2) . | |
80 | * | |
81 | * The numerator is known as the covariance of X and Y, written | |
82 | * | |
83 | * cov(X, Y) = σ_{XY} = E[X Y] - E[X] E[Y] ; | |
84 | * | |
85 | * the denominator is the variance of X, written | |
86 | * | |
87 | * var(X) = σ_X = E[X^2] - E[X]^2 . | |
88 | * | |
89 | * The second equation gives | |
90 | * | |
91 | * c = E[Y] - m E[X] . | |
92 | * | |
93 | * Also of interest is the correlation coefficient | |
94 | * | |
95 | * r = σ_{XY}/√(σ_X σ_Y) , | |
96 | * | |
97 | * which I'm not going to derive here. | |
98 | */ | |
99 | ||
100 | /*----- Header files ------------------------------------------------------*/ | |
101 | ||
102 | /*----- Data structures ---------------------------------------------------*/ | |
103 | ||
104 | struct linreg { | |
105 | double sum_x, sum_x2, sum_y, sum_y2, sum_x_y; | |
106 | unsigned long n; | |
107 | }; | |
108 | #define LINREG_INIT { 0.0, 0.0, 0.0, 0.0, 0.0, 0 } | |
109 | ||
110 | /*----- Functions provided ------------------------------------------------*/ | |
111 | ||
112 | /* --- @linreg_init@ --- * | |
113 | * | |
114 | * Arguments: @struct linreg *lr@ = linear regression state | |
115 | * | |
116 | * Returns: --- | |
117 | * | |
118 | * Use: Initializes a linear-regression state ready for use. | |
119 | */ | |
120 | ||
121 | extern void linreg_init(struct linreg */*lr*/); | |
122 | ||
123 | /* --- @linreg_update@ --- * | |
124 | * | |
125 | * Arguments: @struct linreg *lr@ = linear regression state | |
126 | * @double x, y@ = point coordinates | |
127 | * | |
128 | * Returns: --- | |
129 | * | |
130 | * Use: Informs the linear regression machinery of a point. | |
131 | * | |
132 | * Note that the state size is constant, and independent of the | |
133 | * number of points. | |
134 | */ | |
135 | ||
136 | extern void linreg_update(struct linreg */*lr*/, double /*x*/, double /*y*/); | |
137 | ||
138 | /* --- @linreg_fit@ --- * | |
139 | * | |
140 | * Arguments: @const struct linreg *lr@ = linear regression state | |
141 | * @double *m_out, *c_out, *r_out@ = where to write outputs | |
142 | * | |
143 | * Returns: --- | |
144 | * | |
145 | * Use: Compute the best-fit line through the previously-specified | |
146 | * points. The line has the equation %$y = m x + c$%; %$m$% and | |
147 | * %$c$% are written to @*m_out@ and @*c_out@ respectively, and | |
148 | * the correlation coefficient %$r$% is written to @*r_out@. | |
149 | * Any (or all, but that would be useless) of the output | |
150 | * pointers may be null to discard that result. | |
151 | * | |
152 | * At least one point must have been given. | |
153 | */ | |
154 | ||
155 | extern void linreg_fit(const struct linreg */*lr*/, | |
156 | double */*m_out*/, double */*c_out*/, | |
157 | double */*r_out*/); | |
158 | ||
159 | /*----- That's all, folks -------------------------------------------------*/ | |
160 | ||
161 | #ifdef __cplusplus | |
162 | } | |
163 | #endif | |
164 | ||
165 | #endif |