QUICK CLUSTER: Use fixed format for cluster centers.
[pspp.git] / src / math / moments.c
blob517219d54fcf5d3cc159b4b64c8f6454f64a5bd4
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "math/moments.h"
21 #include <assert.h>
22 #include <math.h>
23 #include <stdlib.h>
25 #include "data/val-type.h"
26 #include "data/value.h"
27 #include "libpspp/misc.h"
29 #include "gl/xalloc.h"
32 /* Calculates variance, skewness, and kurtosis into *VARIANCE,
33 *SKEWNESS, and *KURTOSIS if they are non-null and not greater
34 moments than MAX_MOMENT. Accepts W as the total weight, D1 as
35 the total deviation from the estimated mean, and D2, D3, and
36 D4 as the sum of the squares, cubes, and 4th powers,
37 respectively, of the deviation from the estimated mean. */
38 static void
39 calc_moments (enum moment max_moment,
40 double w, double d1, double d2, double d3, double d4,
41 double *variance, double *skewness, double *kurtosis)
43 assert (w > 0.);
45 if (max_moment >= MOMENT_VARIANCE && w > 1.)
47 double s2 = (d2 - pow2 (d1) / w) / (w - 1.);
48 if (variance != NULL)
49 *variance = s2;
51 /* From _SPSS Statistical Algorithms, 2nd ed.,
52 0-918469-89-9, section "DESCRIPTIVES". */
53 if (fabs (s2) >= 1e-20)
55 if (max_moment >= MOMENT_SKEWNESS && skewness != NULL && w > 2.)
57 double s3 = s2 * sqrt (s2);
58 double g1 = (w * d3) / ((w - 1.0) * (w - 2.0) * s3);
59 if (isfinite (g1))
60 *skewness = g1;
62 if (max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && w > 3.)
64 double den = (w - 2.) * (w - 3.) * pow2 (s2);
65 double g2 = (w * (w + 1) * d4 / (w - 1.) / den
66 - 3. * pow2 (d2) / den);
67 if (isfinite (g2))
68 *kurtosis = g2;
74 /* Two-pass moments. */
76 /* A set of two-pass moments. */
77 struct moments
79 enum moment max_moment; /* Highest-order moment we're computing. */
80 int pass; /* Current pass (1 or 2). */
82 /* Pass one. */
83 double w1; /* Total weight for pass 1, so far. */
84 double sum; /* Sum of values so far. */
85 double mean; /* Mean = sum / w1. */
87 /* Pass two. */
88 double w2; /* Total weight for pass 2, so far. */
89 double d1; /* Sum of deviations from the mean. */
90 double d2; /* Sum of squared deviations from the mean. */
91 double d3; /* Sum of cubed deviations from the mean. */
92 double d4; /* Sum of (deviations from the mean)**4. */
95 /* Initializes moments M for calculating moment MAX_MOMENT and
96 lower moments. */
97 static void
98 init_moments (struct moments *m, enum moment max_moment)
100 assert (m != NULL);
101 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
102 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
103 m->max_moment = max_moment;
104 moments_clear (m);
107 /* Clears out a set of moments so that it can be reused for a new
108 set of values. The moments to be calculated are not changed. */
109 void
110 moments_clear (struct moments *m)
112 m->pass = 1;
113 m->w1 = m->w2 = 0.;
114 m->sum = 0.;
117 /* Creates and returns a data structure for calculating moment
118 MAX_MOMENT and lower moments on a data series. The user
119 should call moments_pass_one() for each value in the series,
120 then call moments_pass_two() for the same set of values in the
121 same order, then call moments_calculate() to obtain the
122 moments. The user may ask for the mean at any time during the
123 first pass (using moments_calculate()), but otherwise no
124 statistics may be requested until the end of the second pass.
125 Call moments_destroy() when the moments are no longer
126 needed. */
127 struct moments *
128 moments_create (enum moment max_moment)
130 struct moments *m = xmalloc (sizeof *m);
131 init_moments (m, max_moment);
132 return m;
135 /* Adds VALUE with the given WEIGHT to the calculation of
136 moments for the first pass. */
137 void
138 moments_pass_one (struct moments *m, double value, double weight)
140 assert (m != NULL);
141 assert (m->pass == 1);
143 if (value != SYSMIS && weight > 0.)
145 m->sum += value * weight;
146 m->w1 += weight;
150 /* Adds VALUE with the given WEIGHT to the calculation of
151 moments for the second pass. */
152 void
153 moments_pass_two (struct moments *m, double value, double weight)
155 assert (m != NULL);
157 if (m->pass == 1)
159 m->pass = 2;
160 m->mean = m->w1 != 0. ? m->sum / m->w1 : 0.;
161 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
164 if (value != SYSMIS && weight >= 0.)
166 double d = value - m->mean;
167 double d1_delta = d * weight;
168 m->d1 += d1_delta;
169 if (m->max_moment >= MOMENT_VARIANCE)
171 double d2_delta = d1_delta * d;
172 m->d2 += d2_delta;
173 if (m->max_moment >= MOMENT_SKEWNESS)
175 double d3_delta = d2_delta * d;
176 m->d3 += d3_delta;
177 if (m->max_moment >= MOMENT_KURTOSIS)
179 double d4_delta = d3_delta * d;
180 m->d4 += d4_delta;
184 m->w2 += weight;
188 /* Calculates moments based on the input data. Stores the total
189 weight in *WEIGHT, the mean in *MEAN, the variance in
190 *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
191 *KURTOSIS. Any of these result parameters may be null
192 pointers, in which case the values are not calculated. If any
193 result cannot be calculated, either because they are undefined
194 based on the input data or because their moments are higher
195 than the maximum requested on moments_create(), then SYSMIS is
196 stored into that result. */
197 void
198 moments_calculate (const struct moments *m,
199 double *weight,
200 double *mean, double *variance,
201 double *skewness, double *kurtosis)
203 assert (m != NULL);
205 if (mean != NULL)
206 *mean = SYSMIS;
207 if (variance != NULL)
208 *variance = SYSMIS;
209 if (skewness != NULL)
210 *skewness = SYSMIS;
211 if (kurtosis != NULL)
212 *kurtosis = SYSMIS;
214 if (weight != NULL)
215 *weight = m->w1;
217 /* How many passes so far? */
218 if (m->pass == 1)
220 /* In the first pass we can only calculate the mean. */
221 if (mean != NULL && m->w1 > 0.)
222 *mean = m->sum / m->w1;
224 else
226 /* After the second pass we can calculate any stat. */
227 assert (m->pass == 2);
229 if (m->w2 > 0.)
231 if (mean != NULL)
232 *mean = m->mean;
233 calc_moments (m->max_moment,
234 m->w2, m->d1, m->d2, m->d3, m->d4,
235 variance, skewness, kurtosis);
240 /* Destroys a set of moments. */
241 void
242 moments_destroy (struct moments *m)
244 free (m);
247 /* Calculates the requested moments on the N values in ARRAY.
248 Each value is given a weight of 1. The total weight is stored
249 into *WEIGHT (trivially) and the mean, variance, skewness, and
250 kurtosis are stored into *MEAN, *VARIANCE, *SKEWNESS, and
251 *KURTOSIS, respectively. Any of the result pointers may be
252 null, in which case no value is stored. */
253 void
254 moments_of_doubles (const double *array, size_t n,
255 double *weight,
256 double *mean, double *variance,
257 double *skewness, double *kurtosis)
259 enum moment max_moment;
260 struct moments m;
261 size_t idx;
263 if (kurtosis != NULL)
264 max_moment = MOMENT_KURTOSIS;
265 else if (skewness != NULL)
266 max_moment = MOMENT_SKEWNESS;
267 else if (variance != NULL)
268 max_moment = MOMENT_VARIANCE;
269 else
270 max_moment = MOMENT_MEAN;
272 init_moments (&m, max_moment);
273 for (idx = 0; idx < n; idx++)
274 moments_pass_one (&m, array[idx], 1.);
275 for (idx = 0; idx < n; idx++)
276 moments_pass_two (&m, array[idx], 1.);
277 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
280 /* Calculates the requested moments on the N numeric values in
281 ARRAY. Each value is given a weight of 1. The total weight
282 is stored into *WEIGHT (trivially) and the mean, variance,
283 skewness, and kurtosis are stored into *MEAN, *VARIANCE,
284 *SKEWNESS, and *KURTOSIS, respectively. Any of the result
285 pointers may be null, in which case no value is stored. */
286 void
287 moments_of_values (const union value *array, size_t n,
288 double *weight,
289 double *mean, double *variance,
290 double *skewness, double *kurtosis)
292 enum moment max_moment;
293 struct moments m;
294 size_t idx;
296 if (kurtosis != NULL)
297 max_moment = MOMENT_KURTOSIS;
298 else if (skewness != NULL)
299 max_moment = MOMENT_SKEWNESS;
300 else if (variance != NULL)
301 max_moment = MOMENT_VARIANCE;
302 else
303 max_moment = MOMENT_MEAN;
305 init_moments (&m, max_moment);
306 for (idx = 0; idx < n; idx++)
307 moments_pass_one (&m, array[idx].f, 1.);
308 for (idx = 0; idx < n; idx++)
309 moments_pass_two (&m, array[idx].f, 1.);
310 moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
313 /* One-pass moments. */
315 /* A set of one-pass moments. */
316 struct moments1
318 enum moment max_moment; /* Highest-order moment we're computing. */
319 double w; /* Total weight so far. */
320 double d1; /* Sum of deviations from the mean. */
321 double d2; /* Sum of squared deviations from the mean. */
322 double d3; /* Sum of cubed deviations from the mean. */
323 double d4; /* Sum of (deviations from the mean)**4. */
326 /* Initializes one-pass moments M for calculating moment
327 MAX_MOMENT and lower moments. */
328 static void
329 init_moments1 (struct moments1 *m, enum moment max_moment)
331 assert (m != NULL);
332 assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
333 || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
334 m->max_moment = max_moment;
335 moments1_clear (m);
338 /* Clears out a set of one-pass moments so that it can be reused
339 for a new set of values. The moments to be calculated are not
340 changed. */
341 void
342 moments1_clear (struct moments1 *m)
344 m->w = 0.;
345 m->d1 = m->d2 = m->d3 = m->d4 = 0.;
348 /* Creates and returns a data structure for calculating moment
349 MAX_MOMENT and lower moments on a data series in a single
350 pass. The user should call moments1_add() for each value in
351 the series. The user may call moments1_calculate() to obtain
352 the current moments at any time. Call moments1_destroy() when
353 the moments are no longer needed.
355 One-pass moments should only be used when two passes over the
356 data are impractical. */
357 struct moments1 *
358 moments1_create (enum moment max_moment)
360 struct moments1 *m = xmalloc (sizeof *m);
361 init_moments1 (m, max_moment);
362 return m;
365 /* Adds VALUE with the given WEIGHT to the calculation of
366 one-pass moments. */
367 void
368 moments1_add (struct moments1 *m, double value, double weight)
370 assert (m != NULL);
372 if (value != SYSMIS && weight > 0.)
374 double prev_w, v1;
376 prev_w = m->w;
377 m->w += weight;
378 v1 = (weight / m->w) * (value - m->d1);
379 m->d1 += v1;
381 if (m->max_moment >= MOMENT_VARIANCE)
383 double v2 = v1 * v1;
384 double w_prev_w = m->w * prev_w;
385 double prev_m2 = m->d2;
387 m->d2 += w_prev_w / weight * v2;
388 if (m->max_moment >= MOMENT_SKEWNESS)
390 double w2 = weight * weight;
391 double v3 = v2 * v1;
392 double prev_m3 = m->d3;
394 m->d3 += (-3. * v1 * prev_m2
395 + w_prev_w / w2 * (m->w - 2. * weight) * v3);
396 if (m->max_moment >= MOMENT_KURTOSIS)
398 double w3 = w2 * weight;
399 double v4 = v2 * v2;
401 m->d4 += (-4. * v1 * prev_m3
402 + 6. * v2 * prev_m2
403 + ((pow2 (m->w) - 3. * weight * prev_w)
404 * v4 * w_prev_w / w3));
411 /* Calculates one-pass moments based on the input data. Stores
412 the total weight in *WEIGHT, the mean in *MEAN, the variance
413 in *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
414 *KURTOSIS. Any of these result parameters may be null
415 pointers, in which case the values are not calculated. If any
416 result cannot be calculated, either because they are undefined
417 based on the input data or because their moments are higher
418 than the maximum requested on moments_create(), then SYSMIS is
419 stored into that result. */
420 void
421 moments1_calculate (const struct moments1 *m,
422 double *weight,
423 double *mean, double *variance,
424 double *skewness, double *kurtosis)
426 assert (m != NULL);
428 if (mean != NULL)
429 *mean = SYSMIS;
430 if (variance != NULL)
431 *variance = SYSMIS;
432 if (skewness != NULL)
433 *skewness = SYSMIS;
434 if (kurtosis != NULL)
435 *kurtosis = SYSMIS;
437 if (weight != NULL)
438 *weight = m->w;
440 if (m->w > 0.)
442 if (mean != NULL)
443 *mean = m->d1;
445 calc_moments (m->max_moment,
446 m->w, 0., m->d2, m->d3, m->d4,
447 variance, skewness, kurtosis);
451 /* Destroy one-pass moments M. */
452 void
453 moments1_destroy (struct moments1 *m)
455 free (m);
459 double
460 calc_semean (double variance, double W)
462 return sqrt (variance / W);
465 /* Returns the standard error of the skewness for the given total
466 weight W.
468 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
469 section "DESCRIPTIVES". */
470 double
471 calc_seskew (double W)
473 return sqrt ((6. * W * (W - 1.)) / ((W - 2.) * (W + 1.) * (W + 3.)));
476 /* Returns the standard error of the kurtosis for the given total
477 weight W.
479 From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
480 section "DESCRIPTIVES", except that the sqrt symbol is omitted
481 there. */
482 double
483 calc_sekurt (double W)
485 return sqrt ((4. * (pow2 (W) - 1.) * pow2 (calc_seskew (W)))
486 / ((W - 3.) * (W + 5.)));