Independent Samples T-Test Dialog: Fix Crash
[pspp.git] / src / math / covariance.c
blobc7fb4b70ad8b7ec99a842aef4d91995f92a0ca9d
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include "math/covariance.h"
21 #include <gsl/gsl_matrix.h>
23 #include "data/case.h"
24 #include "data/variable.h"
25 #include "libpspp/assertion.h"
26 #include "libpspp/misc.h"
27 #include "math/categoricals.h"
28 #include "math/interaction.h"
29 #include "math/moments.h"
31 #include "gl/xalloc.h"
33 #define n_MOMENTS (MOMENT_VARIANCE + 1)
36 /* Create a new matrix of NEW_SIZE x NEW_SIZE and copy the elements of
37 matrix IN into it. IN must be a square matrix, and in normal usage
38 it will be smaller than NEW_SIZE.
39 IN is destroyed by this function. The return value must be destroyed
40 when no longer required.
42 static gsl_matrix *
43 resize_matrix (gsl_matrix *in, size_t new_size)
45 size_t i, j;
47 gsl_matrix *out = NULL;
49 assert (in->size1 == in->size2);
51 if (new_size <= in->size1)
52 return in;
54 out = gsl_matrix_calloc (new_size, new_size);
56 for (i = 0; i < in->size1; ++i)
58 for (j = 0; j < in->size2; ++j)
60 double x = gsl_matrix_get (in, i, j);
62 gsl_matrix_set (out, i, j, x);
66 gsl_matrix_free (in);
68 return out;
71 struct covariance
73 /* True if the covariances are centerered. (ie Real covariances) */
74 bool centered;
76 /* The variables for which the covariance matrix is to be calculated. */
77 size_t n_vars;
78 const struct variable *const *vars;
80 /* Categorical variables. */
81 struct categoricals *categoricals;
83 /* Array containing number of categories per categorical variable. */
84 size_t *n_categories;
86 /* Dimension of the covariance matrix. */
87 size_t dim;
89 /* The weight variable (or NULL if none) */
90 const struct variable *wv;
92 /* A set of matrices containing the 0th, 1st and 2nd moments */
93 gsl_matrix **moments;
95 /* The class of missing values to exclude */
96 enum mv_class exclude;
98 /* An array of doubles representing the covariance matrix.
99 Only the top triangle is included, and no diagonals */
100 double *cm;
101 int n_cm;
103 /* 1 for single pass algorithm;
104 2 for double pass algorithm
106 short passes;
109 0 : No pass has been made
110 1 : First pass has been started
111 2 : Second pass has been
113 IE: How many passes have been (partially) made. */
114 short state;
116 /* Flags indicating that the first case has been seen */
117 bool pass_one_first_case_seen;
118 bool pass_two_first_case_seen;
120 gsl_matrix *unnormalised;
125 /* Return a matrix containing the M th moments.
126 The matrix is of size NxN where N is the number of variables.
127 Each row represents the moments of a variable.
128 In the absence of missing values, the columns of this matrix will
129 be identical. If missing values are involved, then element (i,j)
130 is the moment of the i th variable, when paired with the j th variable.
132 gsl_matrix *
133 covariance_moments (const struct covariance *cov, int m)
135 return cov->moments[m];
140 /* Create a covariance struct.
142 struct covariance *
143 covariance_1pass_create (size_t n_vars, const struct variable *const *vars,
144 const struct variable *weight, enum mv_class exclude,
145 bool centered)
147 size_t i;
148 struct covariance *cov = XZALLOC (struct covariance);
150 cov->centered = centered;
151 cov->passes = 1;
152 cov->state = 0;
153 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
155 cov->vars = vars;
157 cov->wv = weight;
158 cov->n_vars = n_vars;
159 cov->dim = n_vars;
161 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
163 for (i = 0; i < n_MOMENTS; ++i)
164 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
166 cov->exclude = exclude;
168 cov->n_cm = (n_vars * (n_vars - 1)) / 2;
171 cov->cm = xcalloc (cov->n_cm, sizeof *cov->cm);
172 cov->categoricals = NULL;
174 return cov;
178 Create a covariance struct for a two-pass algorithm. If categorical
179 variables are involed, the dimension cannot be know until after the
180 first data pass, so the actual covariances will not be allocated
181 until then.
183 struct covariance *
184 covariance_2pass_create (size_t n_vars, const struct variable *const *vars,
185 struct categoricals *cats,
186 const struct variable *wv, enum mv_class exclude,
187 bool centered)
189 size_t i;
190 struct covariance *cov = xmalloc (sizeof *cov);
192 cov->centered = centered;
193 cov->passes = 2;
194 cov->state = 0;
195 cov->pass_one_first_case_seen = cov->pass_two_first_case_seen = false;
197 cov->vars = vars;
199 cov->wv = wv;
200 cov->n_vars = n_vars;
201 cov->dim = n_vars;
203 cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
205 for (i = 0; i < n_MOMENTS; ++i)
206 cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
208 cov->exclude = exclude;
210 cov->n_cm = -1;
211 cov->cm = NULL;
213 cov->categoricals = cats;
214 cov->unnormalised = NULL;
216 return cov;
219 /* Return an integer, which can be used to index
220 into COV->cm, to obtain the I, J th element
221 of the covariance matrix. If COV->cm does not
222 contain that element, then a negative value
223 will be returned.
225 static int
226 cm_idx (const struct covariance *cov, int i, int j)
228 int as;
229 const int n2j = cov->dim - 2 - j;
230 const int nj = cov->dim - 2 ;
232 assert (i >= 0);
233 assert (j < cov->dim);
235 if (i == 0)
236 return -1;
238 if (j >= cov->dim - 1)
239 return -1;
241 if (i <= j)
242 return -1 ;
244 as = nj * (nj + 1) ;
245 as -= n2j * (n2j + 1) ;
246 as /= 2;
248 return i - 1 + as;
253 Returns true iff the variable corresponding to the Ith element of the covariance matrix
254 has a missing value for case C
256 static bool
257 is_missing (const struct covariance *cov, int i, const struct ccase *c)
259 const struct variable *var = i < cov->n_vars ?
260 cov->vars[i] :
261 categoricals_get_interaction_by_subscript (cov->categoricals, i - cov->n_vars)->vars[0];
263 const union value *val = case_data (c, var);
265 return (var_is_value_missing (var, val) & cov->exclude) != 0;
269 static double
270 get_val (const struct covariance *cov, int i, const struct ccase *c)
272 if (i < cov->n_vars)
274 const struct variable *var = cov->vars[i];
276 const union value *val = case_data (c, var);
278 return val->f;
281 return categoricals_get_effects_code_for_case (cov->categoricals, i - cov->n_vars, c);
284 #if 0
285 void
286 dump_matrix (const gsl_matrix *m)
288 size_t i, j;
290 for (i = 0 ; i < m->size1; ++i)
292 for (j = 0 ; j < m->size2; ++j)
293 printf ("%02f ", gsl_matrix_get (m, i, j));
294 printf ("\n");
297 #endif
299 /* Call this function for every case in the data set */
300 void
301 covariance_accumulate_pass1 (struct covariance *cov, const struct ccase *c)
303 size_t i, j, m;
304 const double weight = cov->wv ? case_num (c, cov->wv) : 1.0;
306 assert (cov->passes == 2);
307 if (!cov->pass_one_first_case_seen)
309 assert (cov->state == 0);
310 cov->state = 1;
313 if (cov->categoricals)
314 categoricals_update (cov->categoricals, c);
316 for (i = 0 ; i < cov->dim; ++i)
318 double v1 = get_val (cov, i, c);
320 if (is_missing (cov, i, c))
321 continue;
323 for (j = 0 ; j < cov->dim; ++j)
325 double pwr = 1.0;
327 if (is_missing (cov, j, c))
328 continue;
330 for (m = 0 ; m <= MOMENT_MEAN; ++m)
332 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
334 *x += pwr * weight;
335 pwr *= v1;
340 cov->pass_one_first_case_seen = true;
344 /* Call this function for every case in the data set */
345 void
346 covariance_accumulate_pass2 (struct covariance *cov, const struct ccase *c)
348 size_t i, j;
349 const double weight = cov->wv ? case_num (c, cov->wv) : 1.0;
351 assert (cov->passes == 2);
352 assert (cov->state >= 1);
354 if (! cov->pass_two_first_case_seen)
356 size_t m;
357 assert (cov->state == 1);
358 cov->state = 2;
360 if (cov->categoricals)
361 categoricals_done (cov->categoricals);
363 cov->dim = cov->n_vars;
365 if (cov->categoricals)
366 cov->dim += categoricals_df_total (cov->categoricals);
368 cov->n_cm = (cov->dim * (cov->dim - 1)) / 2;
369 cov->cm = xcalloc (cov->n_cm, sizeof *cov->cm);
371 /* Grow the moment matrices so that they're large enough to accommodate the
372 categorical elements */
373 for (i = 0; i < n_MOMENTS; ++i)
375 cov->moments[i] = resize_matrix (cov->moments[i], cov->dim);
378 /* Populate the moments matrices with the categorical value elements */
379 for (i = cov->n_vars; i < cov->dim; ++i)
381 for (j = 0 ; j < cov->dim ; ++j) /* FIXME: This is WRONG !!! */
383 double w = categoricals_get_weight_by_subscript (cov->categoricals, i - cov->n_vars);
385 gsl_matrix_set (cov->moments[MOMENT_NONE], i, j, w);
387 w = categoricals_get_sum_by_subscript (cov->categoricals, i - cov->n_vars);
389 gsl_matrix_set (cov->moments[MOMENT_MEAN], i, j, w);
393 /* FIXME: This is WRONG!! It must be fixed to properly handle missing values. For
394 now it assumes there are none */
395 for (m = 0 ; m < n_MOMENTS; ++m)
397 for (i = 0 ; i < cov->dim ; ++i)
399 double x = gsl_matrix_get (cov->moments[m], i, cov->n_vars -1);
400 for (j = cov->n_vars; j < cov->dim; ++j)
402 gsl_matrix_set (cov->moments[m], i, j, x);
407 /* Divide the means by the number of samples */
408 for (i = 0; i < cov->dim; ++i)
410 for (j = 0; j < cov->dim; ++j)
412 double *x = gsl_matrix_ptr (cov->moments[MOMENT_MEAN], i, j);
413 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
418 for (i = 0 ; i < cov->dim; ++i)
420 double v1 = get_val (cov, i, c);
422 if (is_missing (cov, i, c))
423 continue;
425 for (j = 0 ; j < cov->dim; ++j)
427 int idx;
428 double ss ;
429 double v2 = get_val (cov, j, c);
431 const double s = pow2 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)) * weight;
433 if (is_missing (cov, j, c))
434 continue;
437 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
438 *x += s;
441 ss =
442 (v1 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
444 (v2 - gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
445 * weight
448 idx = cm_idx (cov, i, j);
449 if (idx >= 0)
451 cov->cm [idx] += ss;
456 cov->pass_two_first_case_seen = true;
460 /* Call this function for every case in the data set.
461 After all cases have been passed, call covariance_calculate
463 void
464 covariance_accumulate (struct covariance *cov, const struct ccase *c)
466 size_t i, j, m;
467 const double weight = cov->wv ? case_num (c, cov->wv) : 1.0;
469 assert (cov->passes == 1);
471 if (!cov->pass_one_first_case_seen)
473 assert (cov->state == 0);
474 cov->state = 1;
477 for (i = 0 ; i < cov->dim; ++i)
479 const union value *val1 = case_data (c, cov->vars[i]);
481 if (is_missing (cov, i, c))
482 continue;
484 for (j = 0 ; j < cov->dim; ++j)
486 double pwr = 1.0;
487 int idx;
488 const union value *val2 = case_data (c, cov->vars[j]);
490 if (is_missing (cov, j, c))
491 continue;
493 idx = cm_idx (cov, i, j);
494 if (idx >= 0)
496 cov->cm [idx] += val1->f * val2->f * weight;
499 for (m = 0 ; m < n_MOMENTS; ++m)
501 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
503 *x += pwr * weight;
504 pwr *= val1->f;
509 cov->pass_one_first_case_seen = true;
514 Allocate and return a gsl_matrix containing the covariances of the
515 data.
517 static gsl_matrix *
518 cm_to_gsl (struct covariance *cov)
520 int i, j;
521 gsl_matrix *m = gsl_matrix_calloc (cov->dim, cov->dim);
523 /* Copy the non-diagonal elements from cov->cm */
524 for (j = 0 ; j < cov->dim - 1; ++j)
526 for (i = j+1 ; i < cov->dim; ++i)
528 double x = cov->cm [cm_idx (cov, i, j)];
529 gsl_matrix_set (m, i, j, x);
530 gsl_matrix_set (m, j, i, x);
534 /* Copy the diagonal elements from cov->moments[2] */
535 for (j = 0 ; j < cov->dim ; ++j)
537 double sigma = gsl_matrix_get (cov->moments[2], j, j);
538 gsl_matrix_set (m, j, j, sigma);
541 return m;
545 static gsl_matrix *
546 covariance_calculate_double_pass (struct covariance *cov)
548 size_t i, j;
549 for (i = 0 ; i < cov->dim; ++i)
551 for (j = 0 ; j < cov->dim; ++j)
553 int idx;
554 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
555 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
557 idx = cm_idx (cov, i, j);
558 if (idx >= 0)
560 x = &cov->cm [idx];
561 *x /= gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
566 return cm_to_gsl (cov);
569 static gsl_matrix *
570 covariance_calculate_single_pass (struct covariance *cov)
572 size_t i, j;
573 size_t m;
575 for (m = 0; m < n_MOMENTS; ++m)
577 /* Divide the moments by the number of samples */
578 if (m > 0)
580 for (i = 0 ; i < cov->dim; ++i)
582 for (j = 0 ; j < cov->dim; ++j)
584 double *x = gsl_matrix_ptr (cov->moments[m], i, j);
585 *x /= gsl_matrix_get (cov->moments[0], i, j);
587 if (m == MOMENT_VARIANCE)
588 *x -= pow2 (gsl_matrix_get (cov->moments[1], i, j));
594 if (cov->centered)
596 /* Centre the moments */
597 for (j = 0 ; j < cov->dim - 1; ++j)
599 for (i = j + 1 ; i < cov->dim; ++i)
601 double *x = &cov->cm [cm_idx (cov, i, j)];
603 *x /= gsl_matrix_get (cov->moments[0], i, j);
605 *x -=
606 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
608 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i);
613 return cm_to_gsl (cov);
617 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
618 caller owns the returned matrix and must free it when it is no longer
619 needed.
621 Call this function only after all data have been accumulated. */
622 gsl_matrix *
623 covariance_calculate (struct covariance *cov)
625 if (cov->state <= 0)
626 return NULL;
628 switch (cov->passes)
630 case 1:
631 return covariance_calculate_single_pass (cov);
632 break;
633 case 2:
634 return covariance_calculate_double_pass (cov);
635 break;
636 default:
637 NOT_REACHED ();
642 Covariance computed without dividing by the sample size.
644 static gsl_matrix *
645 covariance_calculate_double_pass_unnormalized (struct covariance *cov)
647 return cm_to_gsl (cov);
650 static gsl_matrix *
651 covariance_calculate_single_pass_unnormalized (struct covariance *cov)
653 size_t i, j;
655 if (cov->centered)
657 for (i = 0 ; i < cov->dim; ++i)
659 for (j = 0 ; j < cov->dim; ++j)
661 double *x = gsl_matrix_ptr (cov->moments[MOMENT_VARIANCE], i, j);
662 *x -= pow2 (gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j))
663 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
667 for (j = 0 ; j < cov->dim - 1; ++j)
669 for (i = j + 1 ; i < cov->dim; ++i)
671 double *x = &cov->cm [cm_idx (cov, i, j)];
673 *x -=
674 gsl_matrix_get (cov->moments[MOMENT_MEAN], i, j)
676 gsl_matrix_get (cov->moments[MOMENT_MEAN], j, i)
677 / gsl_matrix_get (cov->moments[MOMENT_NONE], i, j);
682 return cm_to_gsl (cov);
686 /* Return a pointer to gsl_matrix containing the pairwise covariances. The
687 returned matrix is owned by the structure, and must not be freed.
689 Call this function only after all data have been accumulated. */
690 const gsl_matrix *
691 covariance_calculate_unnormalized (struct covariance *cov)
693 if (cov->state <= 0)
694 return NULL;
696 if (cov->unnormalised != NULL)
697 return cov->unnormalised;
699 switch (cov->passes)
701 case 1:
702 cov->unnormalised = covariance_calculate_single_pass_unnormalized (cov);
703 break;
704 case 2:
705 cov->unnormalised = covariance_calculate_double_pass_unnormalized (cov);
706 break;
707 default:
708 NOT_REACHED ();
711 return cov->unnormalised;
714 /* Function to access the categoricals used by COV
715 The return value is owned by the COV
717 const struct categoricals *
718 covariance_get_categoricals (const struct covariance *cov)
720 return cov->categoricals;
724 /* Destroy the COV object */
725 void
726 covariance_destroy (struct covariance *cov)
728 size_t i;
730 categoricals_destroy (cov->categoricals);
732 for (i = 0; i < n_MOMENTS; ++i)
733 gsl_matrix_free (cov->moments[i]);
735 gsl_matrix_free (cov->unnormalised);
736 free (cov->moments);
737 free (cov->cm);
738 free (cov);
741 size_t
742 covariance_dim (const struct covariance * cov)
744 return (cov->dim);
750 Routines to assist debugging.
751 The following are not thoroughly tested and in certain respects
752 unreliable. They should only be
753 used for aids to development. Not as user accessible code.
756 #include "libpspp/str.h"
757 #include "output/pivot-table.h"
758 #include "data/format.h"
761 /* Create a table which can be populated with the encodings for
762 the covariance matrix COV */
763 struct pivot_table *
764 covariance_dump_enc_header (const struct covariance *cov)
766 struct pivot_table *table = pivot_table_create ("Covariance Encoding");
768 struct pivot_dimension *factors = pivot_dimension_create (
769 table, PIVOT_AXIS_COLUMN, "Factor");
770 for (size_t i = 0 ; i < cov->n_vars; ++i)
771 pivot_category_create_leaf (factors->root,
772 pivot_value_new_variable (cov->vars[i]));
773 for (size_t i = 0, n = 0; i < cov->dim - cov->n_vars; n++)
775 const struct interaction *iact =
776 categoricals_get_interaction_by_subscript (cov->categoricals, i);
778 struct string str = DS_EMPTY_INITIALIZER;
779 interaction_to_string (iact, &str);
780 struct pivot_category *group = pivot_category_create_group__ (
781 factors->root,
782 pivot_value_new_user_text_nocopy (ds_steal_cstr (&str)));
784 int df = categoricals_df (cov->categoricals, n);
785 for (int j = 0; j < df; j++)
786 pivot_category_create_leaf_rc (group, pivot_value_new_integer (j),
787 PIVOT_RC_INTEGER);
789 i += df;
792 struct pivot_dimension *matrix = pivot_dimension_create (
793 table, PIVOT_AXIS_ROW, "Matrix", "Matrix");
794 matrix->hide_all_labels = true;
796 return table;
801 Append table T, which should have been returned by covariance_dump_enc_header
802 with an entry corresponding to case C for the covariance matrix COV
804 void
805 covariance_dump_enc (const struct covariance *cov, const struct ccase *c,
806 struct pivot_table *table)
808 int row = pivot_category_create_leaf (
809 table->dimensions[1]->root,
810 pivot_value_new_integer (table->dimensions[1]->n_leaves));
812 for (int i = 0 ; i < cov->dim; ++i)
813 pivot_table_put2 (
814 table, i, row, pivot_value_new_number (get_val (cov, i, c)));