1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/categoricals.h"
20 #include "math/interaction.h"
25 #include "data/case.h"
26 #include "data/value.h"
27 #include "data/variable.h"
28 #include "libpspp/array.h"
29 #include "libpspp/hmap.h"
30 #include "libpspp/pool.h"
31 #include "libpspp/str.h"
32 #include "libpspp/hash-functions.h"
34 #include "gl/xalloc.h"
36 #define CATEGORICALS_DEBUG 0
40 struct hmap_node node
; /* Node in hash map. */
42 union value val
; /* The value */
44 int index
; /* A zero based unique index for this value */
48 struct interaction_value
50 struct hmap_node node
; /* Node in hash map */
52 struct ccase
*ccase
; /* A case (probably the first in the dataset) which matches
55 double cc
; /* Total of the weights of cases matching this interaction */
57 void *user_data
; /* A pointer to data which the caller can store stuff */
60 static struct value_node
*
61 lookup_value (const struct hmap
*map
, const union value
*val
, unsigned int hash
, int width
)
63 struct value_node
*vn
= NULL
;
64 HMAP_FOR_EACH_WITH_HASH (vn
, struct value_node
, node
, hash
, map
)
66 if (value_equal (&vn
->val
, val
, width
))
75 struct hmap_node node
; /* Node in hash map. */
76 const struct variable
*var
; /* The variable */
78 struct hmap valmap
; /* A map of value nodes */
79 int n_vals
; /* Number of values for this variable */
83 /* Comparison function to sort value_nodes in ascending order */
85 compare_value_node_3way (const void *vn1_
, const void *vn2_
, const void *aux
)
87 const struct value_node
*const *vn1p
= vn1_
;
88 const struct value_node
*const *vn2p
= vn2_
;
90 const struct variable_node
*vn
= aux
;
93 return value_compare_3way (&(*vn1p
)->val
, &(*vn2p
)->val
, var_get_width (vn
->var
));
98 static struct variable_node
*
99 lookup_variable (const struct hmap
*map
, const struct variable
*var
, unsigned int hash
)
101 struct variable_node
*vn
= NULL
;
102 HMAP_FOR_EACH_WITH_HASH (vn
, struct variable_node
, node
, hash
, map
)
107 fprintf (stderr
, "%s:%d Warning: Hash table collision\n", __FILE__
, __LINE__
);
114 struct interact_params
116 /* A map of cases indexed by a interaction_value */
119 const struct interaction
*iact
;
121 int base_subscript_short
;
122 int base_subscript_long
;
124 /* The number of distinct values of this interaction */
127 /* An array of integers df_n * df_{n-1} * df_{n-2} ...
128 These are the products of the degrees of freedom for the current
129 variable and all preceeding variables */
134 /* A map of interaction_values indexed by subscript */
135 struct interaction_value
**reverse_interaction_value_map
;
141 /* Comparison function to sort the reverse_value_map in ascending order */
143 compare_interaction_value_3way (const void *vn1_
, const void *vn2_
, const void *aux
)
145 const struct interaction_value
*const *vn1p
= vn1_
;
146 const struct interaction_value
*const *vn2p
= vn2_
;
148 const struct interact_params
*iap
= aux
;
150 return interaction_case_cmp_3way (iap
->iact
, (*vn1p
)->ccase
, (*vn2p
)->ccase
);
155 /* The weight variable */
156 const struct variable
*wv
;
158 /* An array of interact_params */
159 struct interact_params
*iap
;
161 /* Map whose members are the union of the variables which comprise IAP */
164 /* The size of IAP. (ie, the number of interactions involved.) */
167 /* The number of categorical variables which contain entries.
168 In the absence of missing values, this will be equal to N_IAP */
173 /* A map to enable the lookup of variables indexed by subscript.
174 This map considers only the N - 1 of the N variables.
176 int *reverse_variable_map_short
;
178 /* Like the above, but uses all N variables */
179 int *reverse_variable_map_long
;
185 /* Missing values in the dependent varirable to be excluded */
186 enum mv_class dep_excl
;
188 /* Missing values in the factor variables to be excluded */
189 enum mv_class fctr_excl
;
196 const struct payload
*payload
;
201 categoricals_isbalanced (const struct categoricals
*cat
)
205 for (i
= 0 ; i
< cat
->n_iap
; ++i
)
208 const struct interact_params
*iap
= &cat
->iap
[i
];
211 for (v
= 0; v
< hmap_count (&iap
->ivmap
); ++v
)
213 const struct interaction_value
*iv
= iap
->reverse_interaction_value_map
[v
];
225 categoricals_dump (const struct categoricals
*cat
)
227 if (CATEGORICALS_DEBUG
)
231 printf ("Reverse Variable Map (short):\n");
232 for (i
= 0; i
< cat
->df_sum
; ++i
)
234 printf (" %d", cat
->reverse_variable_map_short
[i
]);
238 printf ("Reverse Variable Map (long):\n");
239 for (i
= 0; i
< cat
->n_cats_total
; ++i
)
241 printf (" %d", cat
->reverse_variable_map_long
[i
]);
245 printf ("Number of interactions %zu\n", cat
->n_iap
);
246 for (i
= 0 ; i
< cat
->n_iap
; ++i
)
250 const struct interact_params
*iap
= &cat
->iap
[i
];
251 const struct interaction
*iact
= iap
->iact
;
253 ds_init_empty (&str
);
254 interaction_to_string (iact
, &str
);
256 printf ("\nInteraction: \"%s\" (number of categories: %d); ", ds_cstr (&str
), iap
->n_cats
);
258 printf ("Base index (short/long): %d/%d\n", iap
->base_subscript_short
, iap
->base_subscript_long
);
261 for (v
= 0; v
< hmap_count (&iap
->ivmap
); ++v
)
264 const struct interaction_value
*iv
= iap
->reverse_interaction_value_map
[v
];
266 if (v
> 0) printf (" ");
268 for (vv
= 0; vv
< iact
->n_vars
; ++vv
)
270 const struct variable
*var
= iact
->vars
[vv
];
271 const union value
*val
= case_data (iv
->ccase
, var
);
272 unsigned int varhash
= hash_pointer (var
, 0);
273 struct variable_node
*vn
= lookup_variable (&cat
->varmap
, var
, varhash
);
275 const int width
= var_get_width (var
);
276 unsigned int valhash
= value_hash (val
, width
, 0);
277 struct value_node
*valn
= lookup_value (&vn
->valmap
, val
, valhash
, width
);
279 assert (vn
->var
== var
);
281 printf ("%.*g(%d)", DBL_DIG
+ 1, val
->f
, valn
->index
);
282 if (vv
< iact
->n_vars
- 1)
293 categoricals_destroy (struct categoricals
*cat
)
295 struct variable_node
*vn
= NULL
;
300 for (i
= 0; i
< cat
->n_iap
; ++i
)
302 struct interaction_value
*iv
= NULL
;
303 /* Interate over each interaction value, and unref any cases that we reffed */
304 HMAP_FOR_EACH (iv
, struct interaction_value
, node
, &cat
->iap
[i
].ivmap
)
306 if (cat
->payload
&& cat
->payload
->destroy
)
307 cat
->payload
->destroy (cat
->aux1
, cat
->aux2
, iv
->user_data
);
308 case_unref (iv
->ccase
);
311 free (cat
->iap
[i
].enc_sum
);
312 free (cat
->iap
[i
].df_prod
);
313 hmap_destroy (&cat
->iap
[i
].ivmap
);
316 /* Interate over each variable and delete its value map */
317 HMAP_FOR_EACH (vn
, struct variable_node
, node
, &cat
->varmap
)
319 hmap_destroy (&vn
->valmap
);
322 hmap_destroy (&cat
->varmap
);
324 pool_destroy (cat
->pool
);
331 static struct interaction_value
*
332 lookup_case (const struct hmap
*map
, const struct interaction
*iact
, const struct ccase
*c
)
334 struct interaction_value
*iv
= NULL
;
335 size_t hash
= interaction_case_hash (iact
, c
, 0);
337 HMAP_FOR_EACH_WITH_HASH (iv
, struct interaction_value
, node
, hash
, map
)
339 if (interaction_case_equal (iact
, c
, iv
->ccase
))
342 fprintf (stderr
, "Warning: Hash table collision\n");
349 categoricals_sane (const struct categoricals
*cat
)
354 struct categoricals
*
355 categoricals_create (struct interaction
*const*inter
, size_t n_inter
,
356 const struct variable
*wv
, enum mv_class dep_excl
, enum mv_class fctr_excl
)
359 struct categoricals
*cat
= xmalloc (sizeof *cat
);
361 cat
->n_iap
= n_inter
;
363 cat
->n_cats_total
= 0;
365 cat
->reverse_variable_map_short
= NULL
;
366 cat
->reverse_variable_map_long
= NULL
;
367 cat
->pool
= pool_create ();
368 cat
->dep_excl
= dep_excl
;
369 cat
->fctr_excl
= fctr_excl
;
374 cat
->iap
= pool_calloc (cat
->pool
, cat
->n_iap
, sizeof *cat
->iap
);
376 hmap_init (&cat
->varmap
);
377 for (i
= 0 ; i
< cat
->n_iap
; ++i
)
380 hmap_init (&cat
->iap
[i
].ivmap
);
381 cat
->iap
[i
].iact
= inter
[i
];
382 cat
->iap
[i
].cc
= 0.0;
383 for (v
= 0; v
< inter
[i
]->n_vars
; ++v
)
385 const struct variable
*var
= inter
[i
]->vars
[v
];
386 unsigned int hash
= hash_pointer (var
, 0);
387 struct variable_node
*vn
= lookup_variable (&cat
->varmap
, var
, hash
);
390 vn
= pool_malloc (cat
->pool
, sizeof *vn
);
393 hmap_init (&vn
->valmap
);
395 hmap_insert (&cat
->varmap
, &vn
->node
, hash
);
406 categoricals_update (struct categoricals
*cat
, const struct ccase
*c
)
409 struct variable_node
*vn
= NULL
;
415 weight
= cat
->wv
? case_data (c
, cat
->wv
)->f
: 1.0;
416 weight
= var_force_valid_weight (cat
->wv
, weight
, NULL
);
418 assert (NULL
== cat
->reverse_variable_map_short
);
419 assert (NULL
== cat
->reverse_variable_map_long
);
421 /* Interate over each variable, and add the value of that variable
422 to the appropriate map, if it's not already present. */
423 HMAP_FOR_EACH (vn
, struct variable_node
, node
, &cat
->varmap
)
425 const int width
= var_get_width (vn
->var
);
426 const union value
*val
= case_data (c
, vn
->var
);
427 unsigned int hash
= value_hash (val
, width
, 0);
429 struct value_node
*valn
= lookup_value (&vn
->valmap
, val
, hash
, width
);
432 valn
= pool_malloc (cat
->pool
, sizeof *valn
);
435 value_init (&valn
->val
, width
);
436 value_copy (&valn
->val
, val
, width
);
437 hmap_insert (&vn
->valmap
, &valn
->node
, hash
);
441 for (i
= 0 ; i
< cat
->n_iap
; ++i
)
443 const struct interaction
*iact
= cat
->iap
[i
].iact
;
446 struct interaction_value
*node
;
448 if ( interaction_case_is_missing (iact
, c
, cat
->fctr_excl
))
451 hash
= interaction_case_hash (iact
, c
, 0);
452 node
= lookup_case (&cat
->iap
[i
].ivmap
, iact
, c
);
456 node
= pool_malloc (cat
->pool
, sizeof *node
);
457 node
->ccase
= case_ref (c
);
460 hmap_insert (&cat
->iap
[i
].ivmap
, &node
->node
, hash
);
464 node
->user_data
= cat
->payload
->create (cat
->aux1
, cat
->aux2
);
471 cat
->iap
[i
].cc
+= weight
;
475 cat
->payload
->update (cat
->aux1
, cat
->aux2
, node
->user_data
, c
, weight
);
480 /* Return the number of categories (distinct values) for interction N */
482 categoricals_n_count (const struct categoricals
*cat
, size_t n
)
484 return hmap_count (&cat
->iap
[n
].ivmap
);
489 categoricals_df (const struct categoricals
*cat
, size_t n
)
491 const struct interact_params
*iap
= &cat
->iap
[n
];
492 return iap
->df_prod
[iap
->iact
->n_vars
- 1];
496 /* Return the total number of categories */
498 categoricals_n_total (const struct categoricals
*cat
)
500 if (!categoricals_is_complete (cat
))
503 return cat
->n_cats_total
;
507 categoricals_df_total (const struct categoricals
*cat
)
516 categoricals_is_complete (const struct categoricals
*cat
)
518 return (NULL
!= cat
->reverse_variable_map_short
);
522 /* This function must be called *before* any call to categoricals_get_*_by subscript and
523 *after* all calls to categoricals_update */
525 categoricals_done (const struct categoricals
*cat_
)
527 /* Implementation Note: Whilst this function is O(n) in cat->n_cats_total, in most
528 uses it will be more efficient that using a tree based structure, since it
529 is called only once, and means that subsequent lookups will be O(1).
531 1 call of O(n) + 10^9 calls of O(1) is better than 10^9 calls of O(log n).
533 struct categoricals
*cat
= CONST_CAST (struct categoricals
*, cat_
);
543 cat
->n_cats_total
= 0;
545 /* Calculate the degrees of freedom, and the number of categories */
546 for (i
= 0 ; i
< cat
->n_iap
; ++i
)
549 const struct interaction
*iact
= cat
->iap
[i
].iact
;
551 cat
->iap
[i
].df_prod
= iact
->n_vars
? xcalloc (iact
->n_vars
, sizeof (int)) : NULL
;
553 cat
->iap
[i
].n_cats
= 1;
555 for (v
= 0 ; v
< iact
->n_vars
; ++v
)
558 const struct variable
*var
= iact
->vars
[v
];
560 struct variable_node
*vn
= lookup_variable (&cat
->varmap
, var
, hash_pointer (var
, 0));
562 struct value_node
*valnd
= NULL
;
563 struct value_node
**array
;
565 assert (vn
->n_vals
== hmap_count (&vn
->valmap
));
573 /* Sort the VALMAP here */
574 array
= xcalloc (sizeof *array
, vn
->n_vals
);
576 HMAP_FOR_EACH (valnd
, struct value_node
, node
, &vn
->valmap
)
578 /* Note: This loop is probably superfluous, it could be done in the
579 update stage (at the expense of a realloc) */
583 sort (array
, vn
->n_vals
, sizeof (*array
),
584 compare_value_node_3way
, vn
);
586 for (x
= 0; x
< vn
->n_vals
; ++x
)
588 struct value_node
*vvv
= array
[x
];
593 cat
->iap
[i
].df_prod
[v
] = df
* (vn
->n_vals
- 1);
594 df
= cat
->iap
[i
].df_prod
[v
];
596 cat
->iap
[i
].n_cats
*= vn
->n_vals
;
600 cat
->df_sum
+= cat
->iap
[i
].df_prod
[v
- 1];
602 cat
->n_cats_total
+= cat
->iap
[i
].n_cats
;
606 cat
->reverse_variable_map_short
= pool_calloc (cat
->pool
,
608 sizeof *cat
->reverse_variable_map_short
);
610 cat
->reverse_variable_map_long
= pool_calloc (cat
->pool
,
612 sizeof *cat
->reverse_variable_map_long
);
614 for (i
= 0 ; i
< cat
->n_iap
; ++i
)
616 struct interaction_value
*ivn
= NULL
;
619 struct interact_params
*iap
= &cat
->iap
[i
];
621 iap
->base_subscript_short
= idx_short
;
622 iap
->base_subscript_long
= idx_long
;
624 iap
->reverse_interaction_value_map
= pool_calloc (cat
->pool
, iap
->n_cats
,
625 sizeof *iap
->reverse_interaction_value_map
);
627 HMAP_FOR_EACH (ivn
, struct interaction_value
, node
, &iap
->ivmap
)
629 iap
->reverse_interaction_value_map
[x
++] = ivn
;
632 assert (x
<= iap
->n_cats
);
634 /* For some purposes (eg CONTRASTS in ONEWAY) the values need to be sorted */
635 sort (iap
->reverse_interaction_value_map
, x
, sizeof (*iap
->reverse_interaction_value_map
),
636 compare_interaction_value_3way
, iap
);
638 /* Fill the remaining values with null */
639 for (ii
= x
; ii
< iap
->n_cats
; ++ii
)
640 iap
->reverse_interaction_value_map
[ii
] = NULL
;
642 /* Populate the reverse variable maps. */
645 for (ii
= 0; ii
< iap
->df_prod
[iap
->iact
->n_vars
- 1]; ++ii
)
646 cat
->reverse_variable_map_short
[idx_short
++] = i
;
649 for (ii
= 0; ii
< iap
->n_cats
; ++ii
)
650 cat
->reverse_variable_map_long
[idx_long
++] = i
;
653 assert (cat
->n_vars
<= cat
->n_iap
);
655 categoricals_dump (cat
);
657 /* Tally up the sums for all the encodings */
658 for (i
= 0 ; i
< cat
->n_iap
; ++i
)
661 struct interact_params
*iap
= &cat
->iap
[i
];
662 const struct interaction
*iact
= iap
->iact
;
664 const int df
= iap
->df_prod
? iap
->df_prod
[iact
->n_vars
- 1] : 0;
666 iap
->enc_sum
= xcalloc (df
, sizeof (*(iap
->enc_sum
)));
668 for (y
= 0; y
< hmap_count (&iap
->ivmap
); ++y
)
670 struct interaction_value
*iv
= iap
->reverse_interaction_value_map
[y
];
671 for (x
= iap
->base_subscript_short
; x
< iap
->base_subscript_short
+ df
;++x
)
673 const double bin
= categoricals_get_effects_code_for_case (cat
, x
, iv
->ccase
);
674 iap
->enc_sum
[x
- iap
->base_subscript_short
] += bin
* iv
->cc
;
676 if (cat
->payload
&& cat
->payload
->calculate
)
677 cat
->payload
->calculate (cat
->aux1
, cat
->aux2
, iv
->user_data
);
686 reverse_variable_lookup_short (const struct categoricals
*cat
, int subscript
)
688 assert (cat
->reverse_variable_map_short
);
689 assert (subscript
>= 0);
690 assert (subscript
< cat
->df_sum
);
692 return cat
->reverse_variable_map_short
[subscript
];
696 reverse_variable_lookup_long (const struct categoricals
*cat
, int subscript
)
698 assert (cat
->reverse_variable_map_long
);
699 assert (subscript
>= 0);
700 assert (subscript
< cat
->n_cats_total
);
702 return cat
->reverse_variable_map_long
[subscript
];
706 /* Return the interaction corresponding to SUBSCRIPT */
707 const struct interaction
*
708 categoricals_get_interaction_by_subscript (const struct categoricals
*cat
, int subscript
)
710 int index
= reverse_variable_lookup_short (cat
, subscript
);
712 return cat
->iap
[index
].iact
;
716 categoricals_get_weight_by_subscript (const struct categoricals
*cat
, int subscript
)
718 int vindex
= reverse_variable_lookup_short (cat
, subscript
);
719 const struct interact_params
*vp
= &cat
->iap
[vindex
];
725 categoricals_get_sum_by_subscript (const struct categoricals
*cat
, int subscript
)
727 int vindex
= reverse_variable_lookup_short (cat
, subscript
);
728 const struct interact_params
*vp
= &cat
->iap
[vindex
];
730 return vp
->enc_sum
[subscript
- vp
->base_subscript_short
];
734 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
735 for that subscript */
737 categoricals_get_code_for_case (const struct categoricals
*cat
, int subscript
,
738 const struct ccase
*c
,
741 const struct interaction
*iact
= categoricals_get_interaction_by_subscript (cat
, subscript
);
743 const int i
= reverse_variable_lookup_short (cat
, subscript
);
745 const int base_index
= cat
->iap
[i
].base_subscript_short
;
750 const struct interact_params
*iap
= &cat
->iap
[i
];
753 for (v
= 0; v
< iact
->n_vars
; ++v
)
755 const struct variable
*var
= iact
->vars
[v
];
757 const union value
*val
= case_data (c
, var
);
758 const int width
= var_get_width (var
);
759 const struct variable_node
*vn
= lookup_variable (&cat
->varmap
, var
, hash_pointer (var
, 0));
761 const unsigned int hash
= value_hash (val
, width
, 0);
762 const struct value_node
*valn
= lookup_value (&vn
->valmap
, val
, hash
, width
);
766 const double df
= iap
->df_prod
[v
] / dfp
;
768 /* Translate the subscript into an index for the individual variable */
769 const int index
= ((subscript
- base_index
) % iap
->df_prod
[v
] ) / dfp
;
770 dfp
= iap
->df_prod
[v
];
772 if (effects_coding
&& valn
->index
== df
)
774 else if ( valn
->index
!= index
)
784 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
785 for that subscript */
787 categoricals_get_dummy_code_for_case (const struct categoricals
*cat
, int subscript
,
788 const struct ccase
*c
)
790 return categoricals_get_code_for_case (cat
, subscript
, c
, false);
793 /* Returns unity if the value in case C at SUBSCRIPT is equal to the category
795 Else if it is the last category, return -1.
799 categoricals_get_effects_code_for_case (const struct categoricals
*cat
, int subscript
,
800 const struct ccase
*c
)
802 return categoricals_get_code_for_case (cat
, subscript
, c
, true);
807 categoricals_get_n_variables (const struct categoricals
*cat
)
809 printf ("%s\n", __FUNCTION__
);
814 /* Return a case containing the set of values corresponding to
815 the Nth Category of the IACTth interaction */
817 categoricals_get_case_by_category_real (const struct categoricals
*cat
, int iact
, int n
)
819 const struct interaction_value
*vn
;
821 const struct interact_params
*vp
= &cat
->iap
[iact
];
823 if ( n
>= hmap_count (&vp
->ivmap
))
826 vn
= vp
->reverse_interaction_value_map
[n
];
831 /* Return a the user data corresponding to the Nth Category of the IACTth interaction. */
833 categoricals_get_user_data_by_category_real (const struct categoricals
*cat
, int iact
, int n
)
835 const struct interact_params
*vp
= &cat
->iap
[iact
];
836 const struct interaction_value
*iv
;
838 if ( n
>= hmap_count (&vp
->ivmap
))
841 iv
= vp
->reverse_interaction_value_map
[n
];
843 return iv
->user_data
;
848 /* Return a case containing the set of values corresponding to SUBSCRIPT */
850 categoricals_get_case_by_category (const struct categoricals
*cat
, int subscript
)
852 int vindex
= reverse_variable_lookup_long (cat
, subscript
);
853 const struct interact_params
*vp
= &cat
->iap
[vindex
];
854 const struct interaction_value
*vn
= vp
->reverse_interaction_value_map
[subscript
- vp
->base_subscript_long
];
860 categoricals_get_user_data_by_category (const struct categoricals
*cat
, int subscript
)
862 int vindex
= reverse_variable_lookup_long (cat
, subscript
);
863 const struct interact_params
*vp
= &cat
->iap
[vindex
];
865 const struct interaction_value
*iv
= vp
->reverse_interaction_value_map
[subscript
- vp
->base_subscript_long
];
866 return iv
->user_data
;
873 categoricals_set_payload (struct categoricals
*cat
, const struct payload
*p
,
874 const void *aux1
, void *aux2
)