1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2006, 2007, 2009, 2010, 2011, 2012, 2013, 2014, 2016 Free Software Foundation, Inc
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include <gsl/gsl_cdf.h>
22 #include "data/case.h"
23 #include "data/casegrouper.h"
24 #include "data/casereader.h"
25 #include "data/dataset.h"
26 #include "data/dictionary.h"
27 #include "data/format.h"
28 #include "data/variable.h"
29 #include "data/subcase.h"
30 #include "data/casewriter.h"
31 #include "data/short-names.h"
32 #include "language/command.h"
33 #include "language/lexer/lexer.h"
34 #include "language/lexer/variable-parser.h"
35 #include "language/stats/sort-criteria.h"
36 #include "math/sort.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/i18n.h"
39 #include "libpspp/message.h"
40 #include "libpspp/misc.h"
41 #include "libpspp/pool.h"
42 #include "libpspp/string-set.h"
43 #include "libpspp/taint.h"
45 #include "output/tab.h"
48 #define _(msgid) gettext (msgid)
49 #define N_(msgid) (msgid)
53 typedef double (*rank_function_t
) (const struct rank
*, double c
, double cc
, double cc_1
,
56 static double rank_proportion (const struct rank
*, double c
, double cc
, double cc_1
,
59 static double rank_normal (const struct rank
*, double c
, double cc
, double cc_1
,
62 static double rank_percent (const struct rank
*, double c
, double cc
, double cc_1
,
65 static double rank_rfraction (const struct rank
*, double c
, double cc
, double cc_1
,
68 static double rank_rank (const struct rank
*, double c
, double cc
, double cc_1
,
71 static double rank_n (const struct rank
*, double c
, double cc
, double cc_1
,
74 static double rank_savage (const struct rank
*, double c
, double cc
, double cc_1
,
77 static double rank_ntiles (const struct rank
*, double c
, double cc
, double cc_1
,
94 static const struct fmt_spec dest_format
[n_RANK_FUNCS
] = {
95 {FMT_F
, 9, 3}, /* rank */
96 {FMT_F
, 6, 4}, /* normal */
97 {FMT_F
, 6, 2}, /* percent */
98 {FMT_F
, 6, 4}, /* rfraction */
99 {FMT_F
, 6, 4}, /* proportion */
100 {FMT_F
, 6, 0}, /* n */
101 {FMT_F
, 3, 0}, /* ntiles */
102 {FMT_F
, 8, 4} /* savage */
105 static const char * const function_name
[n_RANK_FUNCS
] = {
116 static const rank_function_t rank_func
[n_RANK_FUNCS
] = {
146 enum rank_func rfunc
;
147 const char **dest_names
;
148 const char **dest_labels
;
151 /* If NEW_NAME exists in DICT or NEW_NAMES, returns NULL without changing
152 anything. Otherwise, inserts NEW_NAME in NEW_NAMES and returns the copy of
153 NEW_NAME now in NEW_NAMES. */
155 try_new_name (const char *new_name
,
156 const struct dictionary
*dict
, struct string_set
*new_names
)
158 return (!dict_lookup_var (dict
, new_name
)
159 && string_set_insert (new_names
, new_name
)
160 ? string_set_find_node (new_names
, new_name
)->string
164 /* Returns a variable name for storing ranks of a variable named SRC_NAME
165 accoring to the rank function F. The name chosen will not be one already in
168 If successful, adds the new name to NEW_NAMES and returns the name added.
169 If no name can be generated, returns NULL. */
171 rank_choose_dest_name (struct dictionary
*dict
, struct string_set
*new_names
,
172 enum rank_func f
, const char *src_name
)
179 /* Try the first character of the ranking function followed by the first 7
180 bytes of the srcinal variable name. */
181 src_name_7
= utf8_encoding_trunc (src_name
, dict_get_encoding (dict
), 7);
182 snprintf (name
, sizeof name
, "%c%s", function_name
[f
][0], src_name_7
);
184 s
= try_new_name (name
, dict
, new_names
);
189 for (i
= 1; i
<= 999; i
++)
191 sprintf (name
, "%.3s%03d", function_name
[f
], i
);
192 s
= try_new_name (name
, dict
, new_names
);
198 for (i
= 1; i
<= 99; i
++)
200 sprintf (name
, "RNK%.2s%02d", function_name
[f
], i
);
201 s
= try_new_name (name
, dict
, new_names
);
206 msg (ME
, _("Cannot generate variable name for ranking %s with %s. "
207 "All candidates in use."),
208 src_name
, function_name
[f
]);
214 struct dictionary
*dict
;
218 const struct variable
**vars
;
221 const struct variable
**group_vars
;
225 enum mv_class exclude
;
227 struct rank_spec
*rs
;
232 enum fraction fraction
;
237 /* Pool on which cell functions may allocate data */
243 destroy_rank (struct rank
*rank
)
246 free (rank
->group_vars
);
247 subcase_destroy (&rank
->sc
);
248 pool_destroy (rank
->pool
);
252 parse_into (struct lexer
*lexer
, struct rank
*cmd
,
253 struct string_set
*new_names
)
256 struct rank_spec
*rs
= NULL
;
258 cmd
->rs
= pool_realloc (cmd
->pool
, cmd
->rs
, sizeof (*cmd
->rs
) * (cmd
->n_rs
+ 1));
259 rs
= &cmd
->rs
[cmd
->n_rs
];
261 if (lex_match_id (lexer
, "RANK"))
265 else if (lex_match_id (lexer
, "NORMAL"))
269 else if (lex_match_id (lexer
, "RFRACTION"))
271 rs
->rfunc
= RFRACTION
;
273 else if (lex_match_id (lexer
, "N"))
277 else if (lex_match_id (lexer
, "SAVAGE"))
281 else if (lex_match_id (lexer
, "PERCENT"))
285 else if (lex_match_id (lexer
, "PROPORTION"))
287 rs
->rfunc
= PROPORTION
;
289 else if (lex_match_id (lexer
, "NTILES"))
291 if ( !lex_force_match (lexer
, T_LPAREN
))
294 if (! lex_force_int (lexer
) )
297 cmd
->k_ntiles
= lex_integer (lexer
);
300 if ( !lex_force_match (lexer
, T_RPAREN
))
307 lex_error (lexer
, NULL
);
312 rs
->dest_names
= pool_calloc (cmd
->pool
, cmd
->n_vars
,
313 sizeof *rs
->dest_names
);
315 if (lex_match_id (lexer
, "INTO"))
317 while( lex_token (lexer
) == T_ID
)
319 const char *name
= lex_tokcstr (lexer
);
321 if ( var_count
>= subcase_get_n_fields (&cmd
->sc
) )
322 msg (SE
, _("Too many variables in %s clause."), "INTO");
323 else if ( dict_lookup_var (cmd
->dict
, name
) != NULL
)
324 msg (SE
, _("Variable %s already exists."), name
);
325 else if (string_set_contains (new_names
, name
))
326 msg (SE
, _("Duplicate variable name %s."), name
);
329 string_set_insert (new_names
, name
);
330 rs
->dest_names
[var_count
++] = pool_strdup (cmd
->pool
, name
);
343 /* Hardly a rank function !! */
345 rank_n (const struct rank
*cmd UNUSED
, double c UNUSED
, double cc UNUSED
, double cc_1 UNUSED
,
346 int i UNUSED
, double w
)
353 rank_rank (const struct rank
*cmd
, double c
, double cc
, double cc_1
,
354 int i
, double w UNUSED
)
369 rank
= cc_1
+ (c
+ 1.0)/ 2.0;
389 rank
= cc_1
+ c
/ 2.0 ;
404 rank_rfraction (const struct rank
*cmd
, double c
, double cc
, double cc_1
,
407 return rank_rank (cmd
, c
, cc
, cc_1
, i
, w
) / w
;
412 rank_percent (const struct rank
*cmd
, double c
, double cc
, double cc_1
,
415 return rank_rank (cmd
, c
, cc
, cc_1
, i
, w
) * 100.0 / w
;
420 rank_proportion (const struct rank
*cmd
, double c
, double cc
, double cc_1
,
423 const double r
= rank_rank (cmd
, c
, cc
, cc_1
, i
, w
) ;
427 switch ( cmd
->fraction
)
430 f
= (r
- 3.0/8.0) / (w
+ 0.25);
436 f
= (r
- 1.0/3.0) / (w
+ 1.0/3.0);
446 return (f
> 0) ? f
: SYSMIS
;
450 rank_normal (const struct rank
*cmd
, double c
, double cc
, double cc_1
,
453 double f
= rank_proportion (cmd
, c
, cc
, cc_1
, i
, w
);
455 return gsl_cdf_ugaussian_Pinv (f
);
459 rank_ntiles (const struct rank
*cmd
, double c
, double cc
, double cc_1
,
462 double r
= rank_rank (cmd
, c
, cc
, cc_1
, i
, w
);
465 return ( floor (( r
* cmd
->k_ntiles
) / ( w
+ 1) ) + 1);
468 /* Expected value of the order statistics from an exponential distribution */
470 ee (int j
, double w_star
)
475 for (k
= 1 ; k
<= j
; k
++)
476 sum
+= 1.0 / ( w_star
+ 1 - k
);
483 rank_savage (const struct rank
*cmd UNUSED
, double c
, double cc
, double cc_1
,
484 int i UNUSED
, double w
)
487 const int i_1
= floor (cc_1
);
488 const int i_2
= floor (cc
);
490 const double w_star
= (modf (w
, &int_part
) == 0 ) ? w
: floor (w
) + 1;
492 const double g_1
= cc_1
- i_1
;
493 const double g_2
= cc
- i_2
;
495 /* The second factor is infinite, when the first is zero.
496 Therefore, evaluate the second, only when the first is non-zero */
497 const double expr1
= (1 - g_1
) ? (1 - g_1
) * ee(i_1
+1, w_star
) : ( 1 - g_1
);
498 const double expr2
= g_2
? g_2
* ee (i_2
+1, w_star
) : g_2
;
501 return ee (i_1
+ 1, w_star
) - 1;
503 if ( i_1
+ 1 == i_2
)
504 return ( ( expr1
+ expr2
)/c
) - 1;
506 if ( i_1
+ 2 <= i_2
)
510 for (j
= i_1
+ 2 ; j
<= i_2
; ++j
)
511 sigma
+= ee (j
, w_star
);
512 return ( (expr1
+ expr2
+ sigma
) / c
) -1;
519 sum_weights (const struct casereader
*input
, int weight_idx
)
521 if (weight_idx
== -1)
522 return casereader_count_cases (input
);
525 struct casereader
*pass
;
530 pass
= casereader_clone (input
);
531 for (; (c
= casereader_read (pass
)) != NULL
; case_unref (c
))
532 w
+= case_num_idx (c
, weight_idx
);
533 casereader_destroy (pass
);
540 rank_sorted_file (struct casereader
*input
,
541 struct casewriter
*output
,
543 const struct rank
*cmd
)
545 struct casegrouper
*tie_grouper
;
546 struct casereader
*tied_cases
;
547 struct subcase input_var
;
553 /* Get total group weight. */
554 w
= sum_weights (input
, weight_idx
);
557 subcase_init (&input_var
, 0, 0, SC_ASCEND
);
558 tie_grouper
= casegrouper_create_subcase (input
, &input_var
);
559 subcase_destroy (&input_var
);
560 for (; casegrouper_get_next_group (tie_grouper
, &tied_cases
);
561 casereader_destroy (tied_cases
))
563 double tw
= sum_weights (tied_cases
, weight_idx
);
567 taint_propagate (casereader_get_taint (tied_cases
),
568 casewriter_get_taint (output
));
570 /* Rank tied cases. */
571 for (; (c
= casereader_read (tied_cases
)) != NULL
; case_unref (c
))
573 struct ccase
*out_case
;
576 out_case
= case_create (casewriter_get_proto (output
));
577 case_data_rw_idx (out_case
, 0)->f
= case_num_idx (c
, 1);
578 for (i
= 0; i
< cmd
->n_rs
; ++i
)
580 rank_function_t func
= rank_func
[cmd
->rs
[i
].rfunc
];
581 double rank
= func (cmd
, tw
, cc
, cc_1
, tie_group
, w
);
582 case_data_rw_idx (out_case
, i
+ 1)->f
= rank
;
585 casewriter_write (output
, out_case
);
589 casegrouper_destroy (tie_grouper
);
594 rank_cmd (struct dataset
*ds
, const struct rank
*cmd
);
597 fraction_name (const struct rank
*cmd
)
599 switch (cmd
->fraction
)
601 case FRAC_BLOM
: return "BLOM";
602 case FRAC_RANKIT
: return "RANKIT";
603 case FRAC_TUKEY
: return "TUKEY";
604 case FRAC_VW
: return "VW";
605 default: NOT_REACHED ();
609 /* Returns a label for a variable derived from SRC_VAR with function F. */
611 create_var_label (struct rank
*cmd
, const struct variable
*src_var
,
615 const char *pool_label
;
617 ds_init_empty (&label
);
619 if ( cmd
->n_group_vars
> 0 )
621 struct string group_var_str
;
624 ds_init_empty (&group_var_str
);
626 for (g
= 0 ; g
< cmd
->n_group_vars
; ++g
)
628 if ( g
> 0 ) ds_put_cstr (&group_var_str
, " ");
629 ds_put_cstr (&group_var_str
, var_get_name (cmd
->group_vars
[g
]));
632 ds_put_format (&label
, _("%s of %s by %s"), function_name
[f
],
633 var_get_name (src_var
), ds_cstr (&group_var_str
));
634 ds_destroy (&group_var_str
);
637 ds_put_format (&label
, _("%s of %s"),
638 function_name
[f
], var_get_name (src_var
));
640 pool_label
= pool_strdup (cmd
->pool
, ds_cstr (&label
));
648 cmd_rank (struct lexer
*lexer
, struct dataset
*ds
)
650 struct string_set new_names
;
652 struct rank_spec
*rs
;
655 subcase_init_empty (&rank
.sc
);
659 rank
.exclude
= MV_ANY
;
660 rank
.n_group_vars
= 0;
661 rank
.group_vars
= NULL
;
662 rank
.dict
= dataset_dict (ds
);
663 rank
.ties
= TIES_MEAN
;
664 rank
.fraction
= FRAC_BLOM
;
666 rank
.pool
= pool_create ();
668 string_set_init (&new_names
);
670 if (lex_match_id (lexer
, "VARIABLES"))
671 lex_force_match (lexer
, T_EQUALS
);
673 if (!parse_sort_criteria (lexer
, rank
.dict
,
678 rank
.n_vars
= rank
.sc
.n_fields
;
680 if (lex_match (lexer
, T_BY
) )
682 if ( ! parse_variables_const (lexer
, rank
.dict
,
683 &rank
.group_vars
, &rank
.n_group_vars
,
684 PV_NO_DUPLICATE
| PV_NO_SCRATCH
))
689 while (lex_token (lexer
) != T_ENDCMD
)
691 lex_force_match (lexer
, T_SLASH
);
692 if (lex_match_id (lexer
, "TIES"))
694 lex_force_match (lexer
, T_EQUALS
);
695 if (lex_match_id (lexer
, "MEAN"))
697 rank
.ties
= TIES_MEAN
;
699 else if (lex_match_id (lexer
, "LOW"))
701 rank
.ties
= TIES_LOW
;
703 else if (lex_match_id (lexer
, "HIGH"))
705 rank
.ties
= TIES_HIGH
;
707 else if (lex_match_id (lexer
, "CONDENSE"))
709 rank
.ties
= TIES_CONDENSE
;
713 lex_error (lexer
, NULL
);
717 else if (lex_match_id (lexer
, "FRACTION"))
719 lex_force_match (lexer
, T_EQUALS
);
720 if (lex_match_id (lexer
, "BLOM"))
722 rank
.fraction
= FRAC_BLOM
;
724 else if (lex_match_id (lexer
, "TUKEY"))
726 rank
.fraction
= FRAC_TUKEY
;
728 else if (lex_match_id (lexer
, "VW"))
730 rank
.fraction
= FRAC_VW
;
732 else if (lex_match_id (lexer
, "RANKIT"))
734 rank
.fraction
= FRAC_RANKIT
;
738 lex_error (lexer
, NULL
);
742 else if (lex_match_id (lexer
, "PRINT"))
744 lex_force_match (lexer
, T_EQUALS
);
745 if (lex_match_id (lexer
, "YES"))
749 else if (lex_match_id (lexer
, "NO"))
755 lex_error (lexer
, NULL
);
759 else if (lex_match_id (lexer
, "MISSING"))
761 lex_force_match (lexer
, T_EQUALS
);
762 if (lex_match_id (lexer
, "INCLUDE"))
764 rank
.exclude
= MV_SYSTEM
;
766 else if (lex_match_id (lexer
, "EXCLUDE"))
768 rank
.exclude
= MV_ANY
;
772 lex_error (lexer
, NULL
);
776 else if (! parse_into (lexer
, &rank
, &new_names
))
781 /* If no rank specs are given, then apply a default */
784 struct rank_spec
*rs
;
786 rs
= pool_calloc (rank
.pool
, 1, sizeof *rs
);
788 rs
->dest_names
= pool_calloc (rank
.pool
, rank
.n_vars
,
789 sizeof *rs
->dest_names
);
795 /* Choose variable names for all rank destinations which haven't already been
796 created with INTO. */
797 for (rs
= rank
.rs
; rs
< &rank
.rs
[rank
.n_rs
]; rs
++)
801 rs
->dest_labels
= pool_calloc (rank
.pool
, rank
.n_vars
,
802 sizeof *rs
->dest_labels
);
803 for ( v
= 0 ; v
< rank
.n_vars
; v
++ )
805 const char **dst_name
= &rs
->dest_names
[v
];
806 if ( *dst_name
== NULL
)
808 *dst_name
= rank_choose_dest_name (rank
.dict
, &new_names
,
810 var_get_name (rank
.vars
[v
]));
811 if (*dst_name
== NULL
)
815 rs
->dest_labels
[v
] = create_var_label (&rank
, rank
.vars
[v
],
824 tab_output_text_format (0, _("Variables Created By %s"), "RANK");
825 tab_output_text (0, "");
827 for (i
= 0 ; i
< rank
.n_rs
; ++i
)
829 for ( v
= 0 ; v
< rank
.n_vars
; v
++ )
831 if ( rank
.n_group_vars
> 0 )
833 struct string varlist
;
836 ds_init_empty (&varlist
);
837 for ( g
= 0 ; g
< rank
.n_group_vars
; ++g
)
839 ds_put_cstr (&varlist
, var_get_name (rank
.group_vars
[g
]));
841 if ( g
< rank
.n_group_vars
- 1)
842 ds_put_cstr (&varlist
, " ");
845 if ( rank
.rs
[i
].rfunc
== NORMAL
||
846 rank
.rs
[i
].rfunc
== PROPORTION
)
847 tab_output_text_format (0,
848 _("%s into %s(%s of %s using %s BY %s)"),
849 var_get_name (rank
.vars
[v
]),
850 rank
.rs
[i
].dest_names
[v
],
851 function_name
[rank
.rs
[i
].rfunc
],
852 var_get_name (rank
.vars
[v
]),
853 fraction_name (&rank
),
857 tab_output_text_format (0,
858 _("%s into %s(%s of %s BY %s)"),
859 var_get_name (rank
.vars
[v
]),
860 rank
.rs
[i
].dest_names
[v
],
861 function_name
[rank
.rs
[i
].rfunc
],
862 var_get_name (rank
.vars
[v
]),
864 ds_destroy (&varlist
);
868 if ( rank
.rs
[i
].rfunc
== NORMAL
||
869 rank
.rs
[i
].rfunc
== PROPORTION
)
870 tab_output_text_format (0,
871 _("%s into %s(%s of %s using %s)"),
872 var_get_name (rank
.vars
[v
]),
873 rank
.rs
[i
].dest_names
[v
],
874 function_name
[rank
.rs
[i
].rfunc
],
875 var_get_name (rank
.vars
[v
]),
876 fraction_name (&rank
));
879 tab_output_text_format (0,
880 _("%s into %s(%s of %s)"),
881 var_get_name (rank
.vars
[v
]),
882 rank
.rs
[i
].dest_names
[v
],
883 function_name
[rank
.rs
[i
].rfunc
],
884 var_get_name (rank
.vars
[v
]));
891 rank_cmd (ds
, &rank
);
893 destroy_rank (&rank
);
894 string_set_destroy (&new_names
);
899 destroy_rank (&rank
);
900 string_set_destroy (&new_names
);
904 /* RANK transformation. */
909 struct rank_trns_input_var
*input_vars
;
915 struct rank_trns_input_var
917 struct casereader
*input
;
918 struct ccase
*current
;
920 struct variable
**output_vars
;
924 advance_ranking (struct rank_trns_input_var
*iv
)
926 case_unref (iv
->current
);
927 iv
->current
= casereader_read (iv
->input
);
931 rank_trns_proc (void *trns_
, struct ccase
**c
, casenumber case_idx UNUSED
)
933 struct rank_trns
*trns
= trns_
;
934 double order
= case_num_idx (*c
, trns
->order_case_idx
);
935 struct rank_trns_input_var
*iv
;
937 *c
= case_unshare (*c
);
938 for (iv
= trns
->input_vars
; iv
< &trns
->input_vars
[trns
->n_input_vars
]; iv
++)
939 while (iv
->current
!= NULL
)
941 double iv_order
= case_num_idx (iv
->current
, 0);
942 if (iv_order
== order
)
946 for (i
= 0; i
< trns
->n_funcs
; i
++)
947 case_data_rw (*c
, iv
->output_vars
[i
])->f
948 = case_num_idx (iv
->current
, i
+ 1);
949 advance_ranking (iv
);
952 else if (iv_order
> order
)
955 advance_ranking (iv
);
957 return TRNS_CONTINUE
;
961 rank_trns_free (void *trns_
)
963 struct rank_trns
*trns
= trns_
;
964 struct rank_trns_input_var
*iv
;
966 for (iv
= trns
->input_vars
; iv
< &trns
->input_vars
[trns
->n_input_vars
]; iv
++)
968 casereader_destroy (iv
->input
);
969 case_unref (iv
->current
);
971 free (iv
->output_vars
);
973 free (trns
->input_vars
);
980 rank_cmd (struct dataset
*ds
, const struct rank
*cmd
)
982 struct dictionary
*d
= dataset_dict (ds
);
983 struct variable
*weight_var
= dict_get_weight (d
);
984 struct casewriter
**outputs
;
985 struct variable
*order_var
;
986 struct casereader
*input
;
987 struct rank_trns
*trns
;
991 order_var
= add_permanent_ordering_transformation (ds
);
993 /* Create output files. */
995 struct caseproto
*output_proto
;
996 struct subcase by_order
;
998 output_proto
= caseproto_create ();
999 for (i
= 0; i
< cmd
->n_rs
+ 1; i
++)
1000 output_proto
= caseproto_add_width (output_proto
, 0);
1002 subcase_init (&by_order
, 0, 0, SC_ASCEND
);
1004 outputs
= xnmalloc (cmd
->n_vars
, sizeof *outputs
);
1005 for (i
= 0; i
< cmd
->n_vars
; i
++)
1006 outputs
[i
] = sort_create_writer (&by_order
, output_proto
);
1008 subcase_destroy (&by_order
);
1009 caseproto_unref (output_proto
);
1012 /* Open the active file and make one pass per input variable. */
1013 input
= proc_open (ds
);
1014 input
= casereader_create_filter_weight (input
, d
, NULL
, NULL
);
1015 for (i
= 0 ; i
< cmd
->n_vars
; ++i
)
1017 const struct variable
*input_var
= cmd
->vars
[i
];
1018 struct casereader
*input_pass
;
1019 struct casegrouper
*split_grouper
;
1020 struct casereader
*split_group
;
1021 struct subcase rank_ordering
;
1022 struct subcase projection
;
1023 struct subcase split_vars
;
1024 struct subcase group_vars
;
1028 /* Discard cases that have missing values of input variable. */
1029 input_pass
= i
== cmd
->n_vars
- 1 ? input
: casereader_clone (input
);
1030 input_pass
= casereader_create_filter_missing (input_pass
, &input_var
, 1,
1031 cmd
->exclude
, NULL
, NULL
);
1033 /* Keep only the columns we really need, to save time and space when we
1034 sort them just below.
1036 After this projection, the input_pass case indexes look like:
1040 - 2 and up: cmd->n_group_vars group variables
1041 - 2 + cmd->n_group_vars and up: split variables
1042 - 2 + cmd->n_group_vars + n_split_vars: weight var
1044 subcase_init_empty (&projection
);
1045 subcase_add_var_always (&projection
, input_var
, SC_ASCEND
);
1046 subcase_add_var_always (&projection
, order_var
, SC_ASCEND
);
1047 subcase_add_vars_always (&projection
,
1048 cmd
->group_vars
, cmd
->n_group_vars
);
1049 subcase_add_vars_always (&projection
, dict_get_split_vars (d
),
1050 dict_get_split_cnt (d
));
1051 if (weight_var
!= NULL
)
1053 subcase_add_var_always (&projection
, weight_var
, SC_ASCEND
);
1054 weight_idx
= 2 + cmd
->n_group_vars
+ dict_get_split_cnt (d
);
1058 input_pass
= casereader_project (input_pass
, &projection
);
1059 subcase_destroy (&projection
);
1061 /* Prepare 'group_vars' as the set of grouping variables. */
1062 subcase_init_empty (&group_vars
);
1063 for (j
= 0; j
< cmd
->n_group_vars
; j
++)
1064 subcase_add_always (&group_vars
,
1065 j
+ 2, var_get_width (cmd
->group_vars
[j
]),
1068 /* Prepare 'rank_ordering' for sorting with the group variables as
1069 primary key and the input variable as secondary key. */
1070 subcase_clone (&rank_ordering
, &group_vars
);
1071 subcase_add (&rank_ordering
, 0, 0, subcase_get_direction (&cmd
->sc
, i
));
1073 /* Group by split variables */
1074 subcase_init_empty (&split_vars
);
1075 for (j
= 0; j
< dict_get_split_cnt (d
); j
++)
1076 subcase_add_always (&split_vars
, 2 + j
+ cmd
->n_group_vars
,
1077 var_get_width (dict_get_split_vars (d
)[j
]),
1079 split_grouper
= casegrouper_create_subcase (input_pass
, &split_vars
);
1080 subcase_destroy (&split_vars
);
1081 while (casegrouper_get_next_group (split_grouper
, &split_group
))
1083 struct casereader
*ordered
;
1084 struct casegrouper
*by_grouper
;
1085 struct casereader
*by_group
;
1087 ordered
= sort_execute (split_group
, &rank_ordering
);
1088 by_grouper
= casegrouper_create_subcase (ordered
, &group_vars
);
1089 while (casegrouper_get_next_group (by_grouper
, &by_group
))
1090 rank_sorted_file (by_group
, outputs
[i
], weight_idx
, cmd
);
1091 ok
= casegrouper_destroy (by_grouper
) && ok
;
1093 subcase_destroy (&group_vars
);
1094 subcase_destroy (&rank_ordering
);
1096 ok
= casegrouper_destroy (split_grouper
) && ok
;
1098 ok
= proc_commit (ds
) && ok
;
1100 /* Re-fetch the dictionary and order variable, because if TEMPORARY was in
1101 effect then there's a new dictionary. */
1102 d
= dataset_dict (ds
);
1103 order_var
= dict_lookup_var_assert (d
, "$ORDER");
1105 /* Merge the original data set with the ranks (which we already sorted on
1107 trns
= xmalloc (sizeof *trns
);
1108 trns
->order_case_idx
= var_get_case_index (order_var
);
1109 trns
->input_vars
= xnmalloc (cmd
->n_vars
, sizeof *trns
->input_vars
);
1110 trns
->n_input_vars
= cmd
->n_vars
;
1111 trns
->n_funcs
= cmd
->n_rs
;
1112 for (i
= 0; i
< trns
->n_input_vars
; i
++)
1114 struct rank_trns_input_var
*iv
= &trns
->input_vars
[i
];
1117 iv
->input
= casewriter_make_reader (outputs
[i
]);
1118 iv
->current
= casereader_read (iv
->input
);
1119 iv
->output_vars
= xnmalloc (trns
->n_funcs
, sizeof *iv
->output_vars
);
1120 for (j
= 0; j
< trns
->n_funcs
; j
++)
1122 struct rank_spec
*rs
= &cmd
->rs
[j
];
1123 struct variable
*var
;
1125 var
= dict_create_var_assert (d
, rs
->dest_names
[i
], 0);
1126 var_set_both_formats (var
, &dest_format
[rs
->rfunc
]);
1127 var_set_label (var
, rs
->dest_labels
[i
]);
1129 iv
->output_vars
[j
] = var
;
1134 add_transformation (ds
, rank_trns_proc
, rank_trns_free
, trns
);
1136 /* Delete our sort key, which we don't need anymore. */
1137 dict_delete_var (d
, order_var
);