[mLib] / utils / linreg.h

/* -*-c-*-
 *
 * Simple linear regression
 *
 * (c) 2023 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of the mLib utilities library.
 *
 * mLib is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * mLib is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with mLib.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */

#ifndef MLIB_LINREG_H
#define MLIB_LINREG_H

#ifdef __cplusplus
  extern "C" {
#endif

/* Theory.  (This should be well-known.)
 *
 * We have a number of data points (x_i, y_i) for 0 <= i < n.  We believe
 * they lie close to a straight line y = m x + c, for unknown constants m and
 * c.  Let e_i = m x_i + c - y_i.  We seek the parameters which minimize
 * E = ∑_{0\le i<n} e_i^2.
 *
 * We begin by simplifying
 *
 *	E = ∑ e_i^2
 *	  = ∑ (m x_i + c - y_i)^2
 *	  = ∑ (m^2 x_i^2 + c^2 + y_i^2 + 2 c m x_i - 2 m x_i y_i - 2 c y_i) .
 *
 * (where all sums are over 0 <= i < n).  Taking partial derivatives,
 *
 *	∂E/∂m = ∑ (2 m x_i^2 + 2 c x_i - 2 x_i y_i)
 *	      = 2 (m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i)
 *
 * and
 *
 *	∂E/∂c = ∑ (2 c + 2 m x_i - 2 y_i)
 *	      = 2 (n c + m ∑ x_i - ∑ y_i) .
 *
 * Now we solve for m and c such that the partial derivatives vanish.  Note
 * that the second partial derivatives are nonnegative, being 2 ∑ x_i^2 and 2
 * n respectively, so we shall indeed find a minimum.
 *
 * Restating the problem,
 *
 *	m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i = 0
 *	m ∑ x_i   + c n     - ∑ y_i     = 0 .
 *
 * Writing E[X] = (∑ x_i)/n, E[X Y] = (∑ x_i y_i)/n, etc., this gives
 *
 *	m n E[X^2] + c n E[X] - n E[X Y] = 0
 *	m n E[X]   + c n      - n E[Y]   = 0 .
 *
 * We see a common factor of n, so we can take that out.  Now multiply the
 * latter equation by E[X] and subtract, giving
 *
 *	m (E[X^2] - E[X]^2) - (E[X Y] - E[X] E[Y])
 *
 * whence
 *
 *	m = (E[X Y] - E[X] E[Y])/(E[X^2] - E[X]^2) .
 *
 * The numerator is known as the covariance of X and Y, written
 *
 *	cov(X, Y) = σ_{XY} = E[X Y] - E[X] E[Y] ;
 *
 * the denominator is the variance of X, written
 *
 *	var(X) = σ_X = E[X^2] - E[X]^2 .
 *
 * The second equation gives
 *
 *	c = E[Y] - m E[X] .
 *
 * Also of interest is the correlation coefficient
 *
 *	r = σ_{XY}/√(σ_X σ_Y) ,
 *
 * which I'm not going to derive here.
 */

/*----- Header files ------------------------------------------------------*/

/*----- Data structures ---------------------------------------------------*/

struct linreg {
  double sum_x, sum_x2, sum_y, sum_y2, sum_x_y;
  unsigned long n;
};
#define LINREG_INIT { 0.0, 0.0, 0.0, 0.0, 0.0, 0 }

/*----- Functions provided ------------------------------------------------*/

/* --- @linreg_init@ --- *
 *
 * Arguments:	@struct linreg *lr@ = linear regression state
 *
 * Returns:	---
 *
 * Use:		Initializes a linear-regression state ready for use.
 */

extern void linreg_init(struct linreg */*lr*/);

/* --- @linreg_update@ --- *
 *
 * Arguments:	@struct linreg *lr@ = linear regression state
 *		@double x, y@ = point coordinates
 *
 * Returns:	---
 *
 * Use:		Informs the linear regression machinery of a point.
 *
 *		Note that the state size is constant, and independent of the
 *		number of points.
 */

extern void linreg_update(struct linreg */*lr*/, double /*x*/, double /*y*/);

/* --- @linreg_fit@ --- *
 *
 * Arguments:	@const struct linreg *lr@ = linear regression state
 *		@double *m_out, *c_out, *r_out@ = where to write outputs
 *
 * Returns:	---
 *
 * Use:		Compute the best-fit line through the previously-specified
 *		points.  The line has the equation %$y = m x + c$%; %$m$% and
 *		%$c$% are written to @*m_out@ and @*c_out@ respectively, and
 *		the correlation coefficient %$r$% is written to @*r_out@.
 *		Any (or all, but that would be useless) of the output
 *		pointers may be null to discard that result.
 *
 *		At least one point must have been given.
 */

extern void linreg_fit(const struct linreg */*lr*/,
		       double */*m_out*/, double */*c_out*/,
		       double */*r_out*/);

/*----- That's all, folks -------------------------------------------------*/

#ifdef __cplusplus
  }
#endif

#endif
Commit	Line	Data
b64eb60f MW	1	/* --c--
	2	*
	3	* Simple linear regression
	4	*
	5	* (c) 2023 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of the mLib utilities library.
	11	*
	12	* mLib is free software: you can redistribute it and/or modify it under
	13	* the terms of the GNU Library General Public License as published by
	14	* the Free Software Foundation; either version 2 of the License, or (at
	15	* your option) any later version.
	16	*
	17	* mLib is distributed in the hope that it will be useful, but WITHOUT
	18	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	19	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
	20	* License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with mLib. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	#ifndef MLIB_LINREG_H
	29	#define MLIB_LINREG_H
	30
	31	#ifdef __cplusplus
	32	extern "C" {
	33	#endif
	34
	35	/* Theory. (This should be well-known.)
	36	*
	37	* We have a number of data points (x_i, y_i) for 0 <= i < n. We believe
	38	* they lie close to a straight line y = m x + c, for unknown constants m and
	39	* c. Let e_i = m x_i + c - y_i. We seek the parameters which minimize
	40	* E = ∑_{0\le i<n} e_i^2.
	41	*
	42	* We begin by simplifying
	43	*
	44	* E = ∑ e_i^2
	45	* = ∑ (m x_i + c - y_i)^2
	46	* = ∑ (m^2 x_i^2 + c^2 + y_i^2 + 2 c m x_i - 2 m x_i y_i - 2 c y_i) .
	47	*
	48	* (where all sums are over 0 <= i < n). Taking partial derivatives,
	49	*
	50	* ∂E/∂m = ∑ (2 m x_i^2 + 2 c x_i - 2 x_i y_i)
	51	* = 2 (m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i)
	52	*
	53	* and
	54	*
	55	* ∂E/∂c = ∑ (2 c + 2 m x_i - 2 y_i)
	56	* = 2 (n c + m ∑ x_i - ∑ y_i) .
	57	*
	58	* Now we solve for m and c such that the partial derivatives vanish. Note
	59	* that the second partial derivatives are nonnegative, being 2 ∑ x_i^2 and 2
	60	* n respectively, so we shall indeed find a minimum.
	61	*
	62	* Restating the problem,
	63	*
	64	* m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i = 0
65	* m ∑ x_i + c n - ∑ y_i = 0 .
66	*
67	* Writing E[X] = (∑ x_i)/n, E[X Y] = (∑ x_i y_i)/n, etc., this gives
68	*
69	* m n E[X^2] + c n E[X] - n E[X Y] = 0
70	* m n E[X] + c n - n E[Y] = 0 .
71	*
72	* We see a common factor of n, so we can take that out. Now multiply the
73	* latter equation by E[X] and subtract, giving
74	*
75	* m (E[X^2] - E[X]^2) - (E[X Y] - E[X] E[Y])
76	*
77	* whence
78	*
79	* m = (E[X Y] - E[X] E[Y])/(E[X^2] - E[X]^2) .
80	*
81	* The numerator is known as the covariance of X and Y, written
82	*
83	* cov(X, Y) = σ_{XY} = E[X Y] - E[X] E[Y] ;
84	*
85	* the denominator is the variance of X, written
86	*
87	* var(X) = σ_X = E[X^2] - E[X]^2 .
88	*
89	* The second equation gives
90	*
91	* c = E[Y] - m E[X] .
92	*
93	* Also of interest is the correlation coefficient
94	*
95	* r = σ_{XY}/√(σ_X σ_Y) ,
96	*
97	* which I'm not going to derive here.
98	*/
99
100	/----- Header files ------------------------------------------------------/
101
102	/----- Data structures ---------------------------------------------------/
103
104	struct linreg {
105	double sum_x, sum_x2, sum_y, sum_y2, sum_x_y;
106	unsigned long n;
107	};
108	#define LINREG_INIT { 0.0, 0.0, 0.0, 0.0, 0.0, 0 }
109
110	/----- Functions provided ------------------------------------------------/
111
112	/* --- @linreg_init@ --- *
113	*
114	* Arguments: @struct linreg *lr@ = linear regression state
115	*
116	* Returns: ---
117	*
118	* Use: Initializes a linear-regression state ready for use.
119	*/
120
121	extern void linreg_init(struct linreg /lr*/);
122
123	/* --- @linreg_update@ --- *
124	*
125	* Arguments: @struct linreg *lr@ = linear regression state
126	* @double x, y@ = point coordinates
127	*
128	* Returns: ---
129	*
130	* Use: Informs the linear regression machinery of a point.
131	*
132	* Note that the state size is constant, and independent of the
133	* number of points.
134	*/
135
136	extern void linreg_update(struct linreg /lr/, double /x/, double /y*/);
137
138	/* --- @linreg_fit@ --- *
139	*
140	* Arguments: @const struct linreg *lr@ = linear regression state
141	* @double m_out, c_out, *r_out@ = where to write outputs
142	*
143	* Returns: ---
144	*
145	* Use: Compute the best-fit line through the previously-specified
146	* points. The line has the equation %$y = m x + c$%; %$m$% and
147	* %$c$% are written to @m_out@ and @c_out@ respectively, and
148	* the correlation coefficient %$r$% is written to @*r_out@.
149	* Any (or all, but that would be useless) of the output
150	* pointers may be null to discard that result.
151	*
152	* At least one point must have been given.
153	*/
154
155	extern void linreg_fit(const struct linreg /lr*/,
156	double /m_out/, double /c_out/,
157	double /r_out*/);
158
159	/----- That's all, folks -------------------------------------------------/
160
161	#ifdef __cplusplus
162	}
163	#endif
164
165	#endif