2 PSPP - a program for statistical analysis.
3 Copyright (C) 2012, 2013, 2015 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 * This module implements the graph command
26 #include "gl/xalloc.h"
27 #include <gsl/gsl_cdf.h>
29 #include "libpspp/assertion.h"
30 #include "libpspp/message.h"
31 #include "libpspp/pool.h"
34 #include "data/dataset.h"
35 #include "data/dictionary.h"
36 #include "data/casegrouper.h"
37 #include "data/casereader.h"
38 #include "data/casewriter.h"
39 #include "data/caseproto.h"
40 #include "data/subcase.h"
43 #include "data/format.h"
45 #include "math/chart-geometry.h"
46 #include "math/histogram.h"
47 #include "math/moments.h"
48 #include "math/sort.h"
49 #include "math/order-stats.h"
50 #include "output/charts/plot-hist.h"
51 #include "output/charts/scatterplot.h"
52 #include "output/charts/barchart.h"
54 #include "language/command.h"
55 #include "language/lexer/lexer.h"
56 #include "language/lexer/value-parser.h"
57 #include "language/lexer/variable-parser.h"
58 #include "language/stats/freq.h"
59 #include "language/stats/chart-category.h"
61 #include "output/tab.h"
64 #define _(msgid) gettext (msgid)
65 #define N_(msgid) msgid
97 /* Variable index for histogram case */
104 struct exploratory_stats
117 /* The minimum weight */
127 const struct variable
**dep_vars
;
128 struct exploratory_stats
*es
;
130 enum mv_class dep_excl
;
131 enum mv_class fctr_excl
;
133 const struct dictionary
*dict
;
137 /* ------------ Graph ---------------- */
138 bool normal
; /* For histograms, draw the normal curve */
140 enum chart_type chart_type
;
141 enum scatter_type scatter_type
;
142 enum bar_type bar_type
;
143 const struct variable
*by_var
[2];
146 struct subcase ordering
; /* Ordering for aggregation */
147 int agr
; /* Index into ag_func */
149 /* A caseproto that contains the plot data */
150 struct caseproto
*gr_proto
;
157 calc_mom1 (double acc
, double x
, double w
)
163 calc_mom0 (double acc
, double x UNUSED
, double w
)
169 pre_low_extreme (void)
175 calc_max (double acc
, double x
, double w UNUSED
)
177 return (acc
> x
) ? acc
: x
;
181 pre_high_extreme (void)
187 calc_min (double acc
, double x
, double w UNUSED
)
189 return (acc
< x
) ? acc
: x
;
193 post_normalise (double acc
, double cc
)
199 post_percentage (double acc
, double ccc
)
201 return acc
/ ccc
* 100.0;
205 const struct ag_func ag_func
[] =
207 {"COUNT", N_("Count"), 0, 0, NULL
, calc_mom0
, 0, 0},
208 {"PCT", N_("Percentage"), 0, 0, NULL
, calc_mom0
, 0, post_percentage
},
209 {"CUFREQ", N_("Cumulative Count"), 0, 1, NULL
, calc_mom0
, 0, 0},
210 {"CUPCT", N_("Cumulative Percent"), 0, 1, NULL
, calc_mom0
, 0, post_percentage
},
212 {"MEAN", N_("Mean"), 1, 0, NULL
, calc_mom1
, post_normalise
, 0},
213 {"SUM", N_("Sum"), 1, 0, NULL
, calc_mom1
, 0, 0},
214 {"MAXIMUM", N_("Maximum"), 1, 0, pre_low_extreme
, calc_max
, 0, 0},
215 {"MINIMUM", N_("Minimum"), 1, 0, pre_high_extreme
, calc_min
, 0, 0},
218 const int N_AG_FUNCS
= sizeof (ag_func
) / sizeof (ag_func
[0]);
221 parse_function (struct lexer
*lexer
, struct graph
*graph
)
224 for (i
= 0 ; i
< N_AG_FUNCS
; ++i
)
226 if (lex_match_id (lexer
, ag_func
[i
].name
))
237 graph
->n_dep_vars
= ag_func
[i
].arity
;
238 if (ag_func
[i
].arity
> 0)
241 if (!lex_force_match (lexer
, T_LPAREN
))
244 graph
->dep_vars
= xzalloc (sizeof (graph
->dep_vars
) * graph
->n_dep_vars
);
245 for (v
= 0; v
< ag_func
[i
].arity
; ++v
)
247 graph
->dep_vars
[v
] = parse_variable (lexer
, graph
->dict
);
248 if (! graph
->dep_vars
[v
])
252 if (!lex_force_match (lexer
, T_RPAREN
))
256 if (!lex_force_match (lexer
, T_BY
))
259 graph
->by_var
[0] = parse_variable (lexer
, graph
->dict
);
260 if (!graph
->by_var
[0])
264 subcase_add_var (&graph
->ordering
, graph
->by_var
[0], SC_ASCEND
);
267 if (lex_match (lexer
, T_BY
))
269 graph
->by_var
[1] = parse_variable (lexer
, graph
->dict
);
270 if (!graph
->by_var
[1])
274 subcase_add_var (&graph
->ordering
, graph
->by_var
[1], SC_ASCEND
);
281 lex_error (lexer
, NULL
);
287 show_scatterplot (const struct graph
*cmd
, struct casereader
*input
)
290 struct scatterplot_chart
*scatterplot
;
291 bool byvar_overflow
= false;
293 ds_init_empty (&title
);
295 if (cmd
->n_by_vars
> 0)
297 ds_put_format (&title
, _("%s vs. %s by %s"),
298 var_to_string (cmd
->dep_vars
[1]),
299 var_to_string (cmd
->dep_vars
[0]),
300 var_to_string (cmd
->by_var
[0]));
304 ds_put_format (&title
, _("%s vs. %s"),
305 var_to_string (cmd
->dep_vars
[1]),
306 var_to_string (cmd
->dep_vars
[0]));
309 scatterplot
= scatterplot_create (input
,
310 var_to_string(cmd
->dep_vars
[0]),
311 var_to_string(cmd
->dep_vars
[1]),
312 (cmd
->n_by_vars
> 0) ? cmd
->by_var
[0] : NULL
,
315 cmd
->es
[0].minimum
, cmd
->es
[0].maximum
,
316 cmd
->es
[1].minimum
, cmd
->es
[1].maximum
);
317 scatterplot_chart_submit (scatterplot
);
322 msg (MW
, _("Maximum number of scatterplot categories reached. "
323 "Your BY variable has too many distinct values. "
324 "The coloring of the plot will not be correct."));
329 show_histogr (const struct graph
*cmd
, struct casereader
*input
)
331 struct histogram
*histogram
;
334 if (cmd
->es
[0].cc
<= 0)
336 casereader_destroy (input
);
342 double bin_width
= fabs (cmd
->es
[0].minimum
- cmd
->es
[0].maximum
)
343 / (1 + log2 (cmd
->es
[0].cc
))
347 histogram_create (bin_width
, cmd
->es
[0].minimum
, cmd
->es
[0].maximum
);
350 if (NULL
== histogram
)
352 casereader_destroy (input
);
356 for (;(c
= casereader_read (input
)) != NULL
; case_unref (c
))
358 const double x
= case_data_idx (c
, HG_IDX_X
)->f
;
359 const double weight
= case_data_idx (c
, HG_IDX_WT
)->f
;
360 moments_pass_two (cmd
->es
[0].mom
, x
, weight
);
361 histogram_add (histogram
, x
, weight
);
363 casereader_destroy (input
);
371 ds_init_cstr (&label
,
372 var_to_string (cmd
->dep_vars
[0]));
374 moments_calculate (cmd
->es
[0].mom
, &n
, &mean
, &var
, NULL
, NULL
);
377 ( histogram_chart_create (histogram
->gsl_hist
,
378 ds_cstr (&label
), n
, mean
,
379 sqrt (var
), cmd
->normal
));
381 statistic_destroy (&histogram
->parent
);
387 cleanup_exploratory_stats (struct graph
*cmd
)
391 for (v
= 0; v
< cmd
->n_dep_vars
; ++v
)
393 moments_destroy (cmd
->es
[v
].mom
);
399 run_barchart (struct graph
*cmd
, struct casereader
*input
)
401 struct casegrouper
*grouper
;
402 struct casereader
*group
;
405 if ( cmd
->missing_pw
== false)
406 input
= casereader_create_filter_missing (input
,
414 input
= sort_execute (input
, &cmd
->ordering
);
416 struct freq
**freqs
= NULL
;
419 for (grouper
= casegrouper_create_vars (input
, cmd
->by_var
,
421 casegrouper_get_next_group (grouper
, &group
);
422 casereader_destroy (group
))
425 struct ccase
*c
= casereader_peek (group
, 0);
427 /* Deal with missing values in the categorical variables */
428 for (v
= 0; v
< cmd
->n_by_vars
; ++v
)
430 if (var_is_value_missing (cmd
->by_var
[v
], case_data (c
, cmd
->by_var
[v
]), cmd
->fctr_excl
) )
434 if (v
< cmd
->n_by_vars
)
440 freqs
= xrealloc (freqs
, sizeof (*freqs
) * ++n_freqs
);
441 freqs
[n_freqs
- 1] = xzalloc (sizeof (**freqs
) +
442 sizeof (union value
) * (cmd
->n_by_vars
- 1) );
444 if (ag_func
[cmd
->agr
].cumulative
&& n_freqs
>= 2)
445 freqs
[n_freqs
- 1]->count
= freqs
[n_freqs
- 2]->count
;
447 freqs
[n_freqs
- 1]->count
= 0;
448 if (ag_func
[cmd
->agr
].pre
)
449 freqs
[n_freqs
- 1]->count
= ag_func
[cmd
->agr
].pre();
452 for (v
= 0; v
< cmd
->n_by_vars
; ++v
)
454 value_clone (&freqs
[n_freqs
- 1]->values
[v
], case_data (c
, cmd
->by_var
[v
]),
455 var_get_width (cmd
->by_var
[v
])
461 for (;(c
= casereader_read (group
)) != NULL
; case_unref (c
))
463 const double weight
= dict_get_case_weight (cmd
->dict
,c
,NULL
);
464 const double x
= (cmd
->n_dep_vars
> 0) ? case_data (c
, cmd
->dep_vars
[0])->f
: SYSMIS
;
468 freqs
[n_freqs
- 1]->count
469 = ag_func
[cmd
->agr
].calc (freqs
[n_freqs
- 1]->count
, x
, weight
);
472 if (ag_func
[cmd
->agr
].post
)
473 freqs
[n_freqs
- 1]->count
474 = ag_func
[cmd
->agr
].post (freqs
[n_freqs
- 1]->count
, cc
);
479 casegrouper_destroy (grouper
);
481 for (int i
= 0; i
< n_freqs
; ++i
)
483 if (ag_func
[cmd
->agr
].ppost
)
484 freqs
[i
]->count
= ag_func
[cmd
->agr
].ppost (freqs
[i
]->count
, ccc
);
490 ds_init_empty (&label
);
492 if (cmd
->n_dep_vars
> 0)
493 ds_put_format (&label
, _("%s of %s"),
494 ag_func
[cmd
->agr
].description
,
495 var_get_name (cmd
->dep_vars
[0]));
498 ag_func
[cmd
->agr
].description
);
500 chart_item_submit (barchart_create (cmd
->by_var
, cmd
->n_by_vars
,
501 ds_cstr (&label
), false,
507 for (int i
= 0; i
< n_freqs
; ++i
)
515 run_graph (struct graph
*cmd
, struct casereader
*input
)
518 struct casereader
*reader
;
519 struct casewriter
*writer
;
521 cmd
->es
= pool_calloc (cmd
->pool
,cmd
->n_dep_vars
,sizeof(struct exploratory_stats
));
522 for(int v
=0;v
<cmd
->n_dep_vars
;v
++)
524 cmd
->es
[v
].mom
= moments_create (MOMENT_KURTOSIS
);
525 cmd
->es
[v
].cmin
= DBL_MAX
;
526 cmd
->es
[v
].maximum
= -DBL_MAX
;
527 cmd
->es
[v
].minimum
= DBL_MAX
;
529 /* Always remove cases listwise. This is correct for */
530 /* the histogram because there is only one variable */
531 /* and a simple bivariate scatterplot */
532 /* if ( cmd->missing_pw == false) */
533 input
= casereader_create_filter_missing (input
,
540 writer
= autopaging_writer_create (cmd
->gr_proto
);
542 /* The case data is copied to a new writer */
543 /* The setup of the case depends on the Charttype */
544 /* For Scatterplot x is assumed in dep_vars[0] */
545 /* y is assumed in dep_vars[1] */
546 /* For Histogram x is assumed in dep_vars[0] */
547 assert(SP_IDX_X
== 0 && SP_IDX_Y
== 1 && HG_IDX_X
== 0);
549 for (;(c
= casereader_read (input
)) != NULL
; case_unref (c
))
551 struct ccase
*outcase
= case_create (cmd
->gr_proto
);
552 const double weight
= dict_get_case_weight (cmd
->dict
,c
,NULL
);
553 if (cmd
->chart_type
== CT_HISTOGRAM
)
554 case_data_rw_idx (outcase
, HG_IDX_WT
)->f
= weight
;
555 if (cmd
->chart_type
== CT_SCATTERPLOT
&& cmd
->n_by_vars
> 0)
556 value_copy (case_data_rw_idx (outcase
, SP_IDX_BY
),
557 case_data (c
, cmd
->by_var
[0]),
558 var_get_width (cmd
->by_var
[0]));
559 for(int v
=0;v
<cmd
->n_dep_vars
;v
++)
561 const struct variable
*var
= cmd
->dep_vars
[v
];
562 const double x
= case_data (c
, var
)->f
;
564 if (var_is_value_missing (var
, case_data (c
, var
), cmd
->dep_excl
))
566 cmd
->es
[v
].missing
+= weight
;
569 /* Magically v value fits to SP_IDX_X, SP_IDX_Y, HG_IDX_X */
570 case_data_rw_idx (outcase
, v
)->f
= x
;
572 if (x
> cmd
->es
[v
].maximum
)
573 cmd
->es
[v
].maximum
= x
;
575 if (x
< cmd
->es
[v
].minimum
)
576 cmd
->es
[v
].minimum
= x
;
578 cmd
->es
[v
].non_missing
+= weight
;
580 moments_pass_one (cmd
->es
[v
].mom
, x
, weight
);
582 cmd
->es
[v
].cc
+= weight
;
584 if (cmd
->es
[v
].cmin
> weight
)
585 cmd
->es
[v
].cmin
= weight
;
587 casewriter_write (writer
,outcase
);
590 reader
= casewriter_make_reader (writer
);
592 switch (cmd
->chart_type
)
595 show_histogr (cmd
,reader
);
598 show_scatterplot (cmd
,reader
);
605 casereader_destroy (input
);
606 cleanup_exploratory_stats (cmd
);
611 cmd_graph (struct lexer
*lexer
, struct dataset
*ds
)
615 graph
.missing_pw
= false;
617 graph
.pool
= pool_create ();
619 graph
.dep_excl
= MV_ANY
;
620 graph
.fctr_excl
= MV_ANY
;
622 graph
.dict
= dataset_dict (ds
);
624 graph
.dep_vars
= NULL
;
625 graph
.chart_type
= CT_NONE
;
626 graph
.scatter_type
= ST_BIVARIATE
;
628 graph
.gr_proto
= caseproto_create ();
630 subcase_init_empty (&graph
.ordering
);
632 while (lex_token (lexer
) != T_ENDCMD
)
634 lex_match (lexer
, T_SLASH
);
636 if (lex_match_id (lexer
, "HISTOGRAM"))
638 if (graph
.chart_type
!= CT_NONE
)
640 lex_error (lexer
, _("Only one chart type is allowed."));
643 graph
.normal
= false;
644 if (lex_match (lexer
, T_LPAREN
))
646 if (!lex_force_match_id (lexer
, "NORMAL"))
649 if (!lex_force_match (lexer
, T_RPAREN
))
654 if (!lex_force_match (lexer
, T_EQUALS
))
656 graph
.chart_type
= CT_HISTOGRAM
;
657 if (!parse_variables_const (lexer
, graph
.dict
,
658 &graph
.dep_vars
, &graph
.n_dep_vars
,
659 PV_NO_DUPLICATE
| PV_NUMERIC
))
661 if (graph
.n_dep_vars
> 1)
663 lex_error (lexer
, _("Only one variable is allowed."));
667 else if (lex_match_id (lexer
, "BAR"))
669 if (graph
.chart_type
!= CT_NONE
)
671 lex_error (lexer
, _("Only one chart type is allowed."));
674 graph
.chart_type
= CT_BAR
;
675 graph
.bar_type
= CBT_SIMPLE
;
677 if (lex_match (lexer
, T_LPAREN
))
679 if (lex_match_id (lexer
, "SIMPLE"))
681 /* This is the default anyway */
683 else if (lex_match_id (lexer
, "GROUPED"))
685 graph
.bar_type
= CBT_GROUPED
;
688 else if (lex_match_id (lexer
, "STACKED"))
690 graph
.bar_type
= CBT_STACKED
;
691 lex_error (lexer
, _("%s is not yet implemented."), "STACKED");
694 else if (lex_match_id (lexer
, "RANGE"))
696 graph
.bar_type
= CBT_RANGE
;
697 lex_error (lexer
, _("%s is not yet implemented."), "RANGE");
702 lex_error (lexer
, NULL
);
705 if (!lex_force_match (lexer
, T_RPAREN
))
709 if (!lex_force_match (lexer
, T_EQUALS
))
712 if (! parse_function (lexer
, &graph
))
715 else if (lex_match_id (lexer
, "SCATTERPLOT"))
717 if (graph
.chart_type
!= CT_NONE
)
719 lex_error (lexer
, _("Only one chart type is allowed."));
722 graph
.chart_type
= CT_SCATTERPLOT
;
723 if (lex_match (lexer
, T_LPAREN
))
725 if (lex_match_id (lexer
, "BIVARIATE"))
727 /* This is the default anyway */
729 else if (lex_match_id (lexer
, "OVERLAY"))
731 lex_error (lexer
, _("%s is not yet implemented."),"OVERLAY");
734 else if (lex_match_id (lexer
, "MATRIX"))
736 lex_error (lexer
, _("%s is not yet implemented."),"MATRIX");
739 else if (lex_match_id (lexer
, "XYZ"))
741 lex_error(lexer
, _("%s is not yet implemented."),"XYZ");
746 lex_error_expecting (lexer
, "BIVARIATE", NULL
);
749 if (!lex_force_match (lexer
, T_RPAREN
))
752 if (!lex_force_match (lexer
, T_EQUALS
))
755 if (!parse_variables_const (lexer
, graph
.dict
,
756 &graph
.dep_vars
, &graph
.n_dep_vars
,
757 PV_NO_DUPLICATE
| PV_NUMERIC
))
760 if (graph
.scatter_type
== ST_BIVARIATE
&& graph
.n_dep_vars
!= 1)
762 lex_error(lexer
, _("Only one variable is allowed."));
766 if (!lex_force_match (lexer
, T_WITH
))
769 if (!parse_variables_const (lexer
, graph
.dict
,
770 &graph
.dep_vars
, &graph
.n_dep_vars
,
771 PV_NO_DUPLICATE
| PV_NUMERIC
| PV_APPEND
))
774 if (graph
.scatter_type
== ST_BIVARIATE
&& graph
.n_dep_vars
!= 2)
776 lex_error (lexer
, _("Only one variable is allowed."));
780 if (lex_match (lexer
, T_BY
))
782 const struct variable
*v
= NULL
;
783 if (!lex_match_variable (lexer
,graph
.dict
,&v
))
785 lex_error (lexer
, _("Variable expected"));
792 else if (lex_match_id (lexer
, "LINE"))
794 lex_error (lexer
, _("%s is not yet implemented."),"LINE");
797 else if (lex_match_id (lexer
, "PIE"))
799 lex_error (lexer
, _("%s is not yet implemented."),"PIE");
802 else if (lex_match_id (lexer
, "ERRORBAR"))
804 lex_error (lexer
, _("%s is not yet implemented."),"ERRORBAR");
807 else if (lex_match_id (lexer
, "PARETO"))
809 lex_error (lexer
, _("%s is not yet implemented."),"PARETO");
812 else if (lex_match_id (lexer
, "TITLE"))
814 lex_error (lexer
, _("%s is not yet implemented."),"TITLE");
817 else if (lex_match_id (lexer
, "SUBTITLE"))
819 lex_error (lexer
, _("%s is not yet implemented."),"SUBTITLE");
822 else if (lex_match_id (lexer
, "FOOTNOTE"))
824 lex_error (lexer
, _("%s is not yet implemented."),"FOOTNOTE");
825 lex_error (lexer
, _("FOOTNOTE is not implemented yet for GRAPH"));
828 else if (lex_match_id (lexer
, "MISSING"))
830 lex_match (lexer
, T_EQUALS
);
832 while (lex_token (lexer
) != T_ENDCMD
833 && lex_token (lexer
) != T_SLASH
)
835 if (lex_match_id (lexer
, "LISTWISE"))
837 graph
.missing_pw
= false;
839 else if (lex_match_id (lexer
, "VARIABLE"))
841 graph
.missing_pw
= true;
843 else if (lex_match_id (lexer
, "EXCLUDE"))
845 graph
.dep_excl
= MV_ANY
;
847 else if (lex_match_id (lexer
, "INCLUDE"))
849 graph
.dep_excl
= MV_SYSTEM
;
851 else if (lex_match_id (lexer
, "REPORT"))
853 graph
.fctr_excl
= MV_NEVER
;
855 else if (lex_match_id (lexer
, "NOREPORT"))
857 graph
.fctr_excl
= MV_ANY
;
861 lex_error (lexer
, NULL
);
868 lex_error (lexer
, NULL
);
873 switch (graph
.chart_type
)
876 /* See scatterplot.h for the setup of the case prototype */
877 graph
.gr_proto
= caseproto_add_width (graph
.gr_proto
, 0); /* x value - SP_IDX_X*/
878 graph
.gr_proto
= caseproto_add_width (graph
.gr_proto
, 0); /* y value - SP_IDX_Y*/
879 /* The by_var contains the plot categories for the different xy plot colors */
880 if (graph
.n_by_vars
> 0) /* SP_IDX_BY */
881 graph
.gr_proto
= caseproto_add_width (graph
.gr_proto
, var_get_width(graph
.by_var
[0]));
884 graph
.gr_proto
= caseproto_add_width (graph
.gr_proto
, 0); /* x value */
885 graph
.gr_proto
= caseproto_add_width (graph
.gr_proto
, 0); /* weight value */
890 lex_error_expecting (lexer
, "HISTOGRAM", "SCATTERPLOT", "BAR", NULL
);
898 struct casegrouper
*grouper
;
899 struct casereader
*group
;
902 grouper
= casegrouper_create_splits (proc_open (ds
), graph
.dict
);
903 while (casegrouper_get_next_group (grouper
, &group
))
905 if (graph
.chart_type
== CT_BAR
)
906 run_barchart (&graph
, group
);
908 run_graph (&graph
, group
);
910 ok
= casegrouper_destroy (grouper
);
911 ok
= proc_commit (ds
) && ok
;
914 subcase_destroy (&graph
.ordering
);
915 free (graph
.dep_vars
);
916 pool_destroy (graph
.pool
);
917 caseproto_unref (graph
.gr_proto
);
922 subcase_destroy (&graph
.ordering
);
923 caseproto_unref (graph
.gr_proto
);
924 free (graph
.dep_vars
);
925 pool_destroy (graph
.pool
);