3 Copyright (C) 2006,2012 Mario Rodriguez Riotorto
5 This program is free software; you can redistribute
6 it and/or modify it under the terms of the
7 GNU General Public License as published by
8 the Free Software Foundation; either version 2
9 of the License, or (at your option) any later version.
11 This program is distributed in the hope that it
12 will be useful, but WITHOUT ANY WARRANTY;
13 without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details at
16 http://www.gnu.org/copyleft/gpl.html
22 This is a Maxima package for some classical statistical inference
27 put('stats, 1, 'version) $
29 if not get('descriptive, 'version)
30 then load("descriptive")$
32 if not get('distrib, 'version)
35 load("inference_result")$
41 /* This is the mean test. The first argument 'x' is a list or a column matrix */
42 /* of expressions (one sample) */
43 /* Admits the following options: */
44 /* 'mean=0: this is the value of the mean to be checked. */
45 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
46 /* values are: 'twosided, 'greater and 'less. */
47 /* 'dev='unknown: this is the value of the standard deviation when it is */
48 /* known; valid values are: 'unknown, an expression or a positive */
50 /* 'conflevel=95/100: confidence level of the confidence interval; valid */
51 /* values are: a symbol or an expression which takes a value in (0,1)*/
52 /* 'asymptotic=false: whether it performs an exact t-test or an asymptotic */
53 /* one; valid values are true and false */
55 /* The output of this function is an 'inference_result' object with the */
56 /* following results: */
57 /* 1. 'mean_estimate= sample estimate for the mean */
58 /* 2. 'conf_level= confidence level */
59 /* 3. 'conf_interval= confidence interval */
60 /* 4. 'method= assumption about the standard deviations, asymptotic. */
61 /* 5. 'hypotheses= null hypothesis and alternative */
62 /* 6. 'statistic= statistic used in the procedure */
63 /* 7. 'distribution= distribution and its parameters */
64 /* 8. 'p_value= p-value of the sample statistic */
65 test_mean(x,[select]):=
66 block([numer:stats_numer, options, defaults, m, n, coef, cinterval, aux,
67 statistic, method, distribution, hypo, pvalue, listarith:true],
69 /* controlling sample format */
70 if not listofexpr(x) and not (matrixp(x) and length(x[1]) = 1 and every('identity,map('listofexpr,args(x))))
71 then error("Sample 'x' should be a list with expressions or a column matrix")
74 /* updating and controlling options */
75 options: ['mean, 'alternative, 'dev, 'conflevel, 'asymptotic],
76 defaults: [0, 'twosided, 'unknown, 95/100, false],
78 aux: ?position(lhs(i),options),
79 if numberp(aux) and aux <= length(options) and aux >= 1
80 then defaults[aux]: rhs(i)),
81 if not member(defaults[2],['twosided, 'greater, 'less])
82 then error("Option 'alternative' is not correct"),
83 if member(sign(defaults[3]), ['neg, 'zero, 'nz])
84 then error("Option 'dev' can't be negative nor zero"),
85 if numberp(defaults[4]) and (defaults[4] <= 0 or defaults[4] >= 1)
86 then error("Option 'conflevel' can't be outside interval (0,1)"),
87 if not member(defaults[5],[true, false])
88 then error("Option 'asymptotic' must be true or false"),
92 if listp(m) then m: m[1],
94 /* coef: standard_deviation / sqrt(n) */
95 if /*standard_deviation*/ defaults[3] = 'unknown
96 then (coef: std1(x) / sqrt(n),
97 if listp(coef) then coef: coef[1])
98 else coef: defaults[3] / sqrt(n),
101 method: concat(if /*asymptotic*/ defaults[5] = true
102 then "Large sample z-test. "
103 else "Exact t-test. ",
104 if /*standard_deviation*/ defaults[3] = 'unknown
105 then "Unknown variance."
106 else "Known variance."),
108 /* confidence interval (one and two-sided), distribution and */
109 /* p-value for alternative=less */
111 then statistic: distribution: pvalue: cinterval: hypo: 'undefined
114 statistic: (m - defaults[1]) / coef,
115 if /*alternative*/ defaults[2] = 'twosided
116 then statistic: abs(statistic),
117 if /*asymptotic*/ defaults[5] = false and
118 /*standard_deviation*/ defaults[3] = 'unknown
119 then (distribution: ['student_t, n-1],
120 pvalue: cdf_student_t(statistic,n-1),
121 if /*alternative*/ defaults[2] = 'greater
122 then cinterval: [m - quantile_student_t(defaults[4], n-1) * coef,'inf]
123 else if /*alternative*/ defaults[2] = 'less
124 then cinterval: ['minf, m + quantile_student_t(defaults[4], n-1) * coef]
125 else cinterval: m + [-1,1] * quantile_student_t((1 + defaults[4])/2, n-1) * coef)
126 else (distribution: ['normal, 0, 1],
127 pvalue: cdf_normal(statistic,0,1),
128 if /*alternative*/ defaults[2] = 'greater
129 then cinterval: [m - quantile_normal(defaults[4],0,1) * coef, 'inf]
130 else if /*alternative*/ defaults[2] = 'less
131 then cinterval: ['minf, m + quantile_normal(defaults[4],0,1) * coef]
132 else cinterval: m + [-1,1] * quantile_normal((1 + defaults[4])/2,0,1) * coef),
134 /* hypotheses, pvalue (for alternative=greater and alternative=twosided) */
135 aux: string(defaults[1]),
136 if /*alternative*/ defaults[2] = 'greater
137 then (hypo: concat("H0: mean = ", aux," , H1: mean > ", aux),
139 else if /*alternative*/ defaults[2] = 'less
140 then hypo: concat("H0: mean = ", aux, " , H1: mean < ", aux)
141 else (hypo: concat("H0: mean = ", aux, " , H1: mean # ", aux),
142 pvalue: 2 * (1 - pvalue) ) ),
144 /* result as an 'inference_result' object*/
145 inference_result("MEAN TEST",
146 [ ['mean_estimate, m],
147 ['conf_level, defaults[4]],
148 ['conf_interval, cinterval],
151 ['statistic, statistic],
152 ['distribution, distribution],
153 ['p_value, pvalue] ],
154 [1,2,3,4,5,6,7,8]) )$
164 /* This is the difference of means test. The first two arguments 'x1' and 'x2' */
165 /* are lists or column matrices with expressions, generally numbers. */
166 /* Admits the following options: */
167 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
168 /* values are: 'twosided, 'greater (m1>m2) and less (m1<m2). */
169 /* 'dev1='unknown: this is the value of the standard deviation of the x1 */
170 /* sample when it is known; valid values are: 'unknown, a symbol or */
171 /* a positive number */
172 /* 'dev2='unknown: this is the value of the standard deviation of the x2 */
173 /* sample when it is known; valid values are: 'unknown, a symbol or */
174 /* a positive number */
175 /* 'varequal=false: whether variances are equal or not */
176 /* 'conflevel=95/100: confidence level of the confidence interval; valid */
177 /* values are: a symbol or an expression which takes a value in (0,1)*/
178 /* 'asymptotic=false: whether it performs an exact t-test or an asymptotic */
179 /* one; valid values are true and false */
181 /* The output of this function is an 'inference_result' object */
182 /* with the following results: */
183 /* 1. 'diff_estimate= difference of means estimate (m1-m2) */
184 /* 2. 'conf_level= confidence level */
185 /* 3. 'conf_interval= confidence interval */
186 /* 4. 'method: assumptions about the standard deviations, asymptotic. */
187 /* 5. 'hypotheses: null hypothesis and alternative */
188 /* 6. 'statistic: statistic used in the procedure */
189 /* 7. 'distribution: distribution and its parameters */
190 /* 8. 'p_value: p-value of the sample statistic */
191 test_means_difference(x1,x2,[select]):=
192 block([numer:stats_numer, options, defaults, dm, n1, n2, v1, v2, coef, cinterval, aux, df,
193 statistic, method, distribution, hypo, pvalue, listarith:true],
195 /* controlling sample format */
196 if not listofexpr(x1) and not (matrixp(x1) and length(x1[1]) = 1 and every('identity,map('listofexpr,args(x1))))
197 then error("Sample 'x1' should be a list with expressions or a column matrix")
199 if not listofexpr(x2) and not (matrixp(x2) and length(x2[1]) = 1 and every('identity,map('listofexpr,args(x2))))
200 then error("Sample 'x2' should be a list with expressions or a column matrix")
203 /* updating and controlling options */
204 options: ['alternative, 'dev1, 'dev2, 'varequal, 'conflevel, 'asymptotic],
205 defaults: ['twosided, 'unknown, 'unknown, false, 95/100, false],
207 aux: ?position(lhs(i),options),
208 if numberp(aux) and aux <= length(options) and aux >= 1
209 then defaults[aux]: rhs(i)),
210 if not member(defaults[1],['twosided, 'greater, 'less])
211 then error("Option 'alternative' is not correct"),
212 if member(sign(defaults[2]), ['neg, 'zero, 'nz]) or
213 member(sign(defaults[3]), ['neg, 'zero, 'nz])
214 then error("Option 'dev' can't be negative nor zero"),
215 /* in the next two lines, ignorance is contagious */
216 if defaults[2] = 'unknown then defaults[3]: 'unknown,
217 if defaults[3] = 'unknown then defaults[2]: 'unknown,
218 if not member(defaults[4],[true, false])
219 then error("Option 'varequal' must be true or false"),
220 if numberp(defaults[5]) and (defaults[5] <= 0 or defaults[5] >= 1)
221 then error("Option 'conflevel' can't be outside interval (0,1)"),
222 if not member(defaults[6],[true, false])
223 then error("Option 'asymptotic' must be true or false"),
225 /* difference of means estimate */
226 dm: mean(x1) - mean(x2),
227 if listp(dm) then dm: dm[1],
229 /* coef: standard_deviation / sqrt(n) */
230 if /*standard deviations*/ not defaults[2] = 'unknown
231 then coef: sqrt(defaults[2]^2 / n1 + defaults[3]^2 / n2)
233 if listp(v1) then v1: v1[1],
235 if listp(v2) then v2: v2[1],
236 if /*varequal*/ defaults[4] = true and
237 /*asymptotic*/ defaults[6] = false
238 then coef: sqrt(((n1-1)*v1 + (n2-1)*v2) / (n1+n2-2) * (1/n1+1/n2))
239 else coef: sqrt(v1 / n1 + v2 / n2)),
240 if listp(coef) then coef: coef[1],
242 /* method and Welch approximation */
243 method: concat(if /*asymptotic*/ defaults[6] = true
244 then "Asymptotic z-test (for large samples). "
245 else "Exact t-test. ",
246 if /*standard deviations*/ defaults[2] = 'unknown
247 then if /*varequal*/ defaults[4] = true
248 then "Unknown equal variances"
250 else "Known variances."),
253 then statistic: distribution: pvalue: cinterval: hypo: 'undefined
256 statistic: dm / coef,
257 if /*alternative*/ defaults[1] = 'twosided
258 then statistic: abs(statistic),
260 if /*asymptotic*/ defaults[6] = false and
261 /*standard deviations*/ defaults[2] = 'unknown and
262 /*varequal*/ defaults[4] = false
263 then /* Welch approximation */
264 df: (v1/n1+v2/n2)^2 / ((v1/n1)^2/(n1-1) + (v2/n2)^2/(n2-1))
265 else df: n1 + n2 - 2,
267 /* confidence interval, distribution and p-value */
268 if /*asymptotic*/ defaults[6] = false and
269 /*standard deviations*/ defaults[2] = 'unknown
270 then (distribution: ['student_t, df],
271 pvalue: cdf_student_t(statistic,df),
272 /* two or one-sided confidence interval */
273 if /*alternative*/ defaults[1] = 'greater
274 then cinterval: [dm-quantile_student_t(defaults[5], df) * coef,'inf]
275 else if /*alternative*/ defaults[1] = 'less
276 then cinterval: ['minf, dm+quantile_student_t(defaults[5], df) * coef]
277 else cinterval: dm + [-1,1] * quantile_student_t((1 + defaults[5])/2, df) * coef )
278 else (distribution: ['normal, 0, 1],
279 pvalue: cdf_normal(statistic, 0, 1),
280 if /*alternative*/ defaults[1] = 'greater
281 then cinterval: [dm-quantile_normal(defaults[5],0,1) * coef,'inf]
282 else if /*alternative*/ defaults[1] = 'less
283 then cinterval: ['minf, dm+quantile_normal(defaults[5],0,1) * coef]
284 else cinterval: dm + [-1,1] * quantile_normal((1 + defaults[5])/2, 0, 1) * coef ),
286 /* hypotheses, pvalue (for alternative=greater and alternative=twosided) */
287 if /*alternative*/ defaults[1] = 'greater
288 then (hypo: "H0: mean1 = mean2 , H1: mean1 > mean2",
290 else if /*alternative*/ defaults[1] = 'less
291 then hypo: "H0: mean1 = mean2 , H1: mean1 < mean2"
292 else (hypo: "H0: mean1 = mean2 , H1: mean1 # mean2",
293 pvalue: 2 * (1 - pvalue) ) ),
295 /* result as an 'inference_result' object*/
296 inference_result("DIFFERENCE OF MEANS TEST",
297 [ ['diff_estimate, dm],
298 ['conf_level, defaults[5]],
299 ['conf_interval, cinterval],
302 ['statistic, statistic],
303 ['distribution, distribution],
304 ['p_value, pvalue] ],
305 [1,2,3,4,5,6,7,8]) )$
314 /* This is the variance test for a normal population. The first argument 'x' */
315 /* is a list or a column matrix of expressions (one sample) */
316 /* Admits the following options: */
317 /* 'mean='unknown: this is the value of the population's mean when it is */
318 /* known; valid values are: 'unknown, a symbol or a number */
319 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
320 /* values are: 'twosided, 'greater and 'less. */
321 /* 'variance=1: this is the value of the variance to be checked. */
322 /* 'conflevel=95/100: confidence level of the confidence interval; valid */
323 /* values are: a symbol or an expression which takes a value in (0,1)*/
325 /* The output of this function is an 'inference_result' object with the */
326 /* following results: */
327 /* 1. 'var_estimate= variance estimate */
328 /* 2. 'conf_level= confidence level */
329 /* 3. 'conf_interval= confidence interval */
330 /* 4. 'method: method and assumptions */
331 /* 5. 'hypotheses: null hypothesis and alternative */
332 /* 6. 'statistic: statistic used in the procedure */
333 /* 7. 'distribution: distribution and its parameters */
334 /* 8. 'p_value: p-value of the sample statistic */
335 test_variance(x,[select]):=
336 block([numer:stats_numer, options, defaults, s2, n, coef, df, cinterval, aux,
337 statistic, method, distribution, hypo, pvalue, listarith:true],
339 /* controlling sample format */
340 if not listofexpr(x) and not (matrixp(x) and length(x[1]) = 1 and every('identity,map('listofexpr,args(x))))
341 then error("Sample 'x' should be a list with expressions or a column matrix")
344 /* updating and controlling options */
345 options: ['mean, 'alternative, 'variance, 'conflevel],
346 defaults: ['unknown, 'twosided, 1, 95/100],
348 aux: ?position(lhs(i),options),
349 if numberp(aux) and aux <= length(options) and aux >= 1
350 then defaults[aux]: rhs(i)),
351 if not member(defaults[2],['twosided, 'greater, 'less])
352 then error("Option 'alternative' is not correct"),
353 if member(sign(defaults[3]), ['neg, 'zero, 'nz])
354 then error("Option 'variance' can't be nonpositive"),
355 if numberp(defaults[4]) and (defaults[4] <= 0 or defaults[4] >= 1)
356 then error("Option 'conflevel' can't be outside interval (0,1)"),
358 /* sample statistic's numerator (coef), degrees of freedom, */
359 /* confidence interval, distribution and variance estimate */
360 if /*mean*/ defaults[1] = 'unknown
362 if listp(s2) then s2: s2[1],
364 else (s2: mean((x - defaults[1])^2),
368 /* distribution, confidence interval and statistic */
369 distribution: ['chi2, df],
370 if /*alternative*/ defaults[2] = 'greater
371 then cinterval: [coef / quantile_chi2(defaults[4],df), 'inf]
372 else if /*alternative*/ defaults[2] = 'less
373 then cinterval: [0, coef / quantile_chi2(1-defaults[4],df)]
374 else cinterval: coef / [quantile_chi2((1+defaults[4])/2,df),
375 quantile_chi2((1-defaults[4])/2,df)],
376 statistic: coef / defaults[3],
379 method: concat("Variance Chi-square test. ",
380 if /*mean*/ defaults[1] = 'unknown
384 /* hypotheses, pvalue */
385 pvalue: cdf_chi2(statistic,df), /* pvalue for alternative=less */
386 aux: string(defaults[3]),
387 if /*alternative*/ defaults[2] = 'greater
388 then (hypo: concat("H0: var = ", aux, " , H1: var > ", aux),
390 else if /*alternative*/ defaults[2] = 'less
391 then hypo: concat("H0: var = ", aux," , H1: var < ", aux)
392 else (hypo: concat("H0: var = ", aux," , H1: var # ", aux),
393 if /* compares the sample statistics to the median */
394 statistic <= quantile_chi2(1/2,df)
395 then pvalue: 2 * pvalue
396 else pvalue: 2 * (1 - pvalue) ),
398 /* result as an 'inference_result' object*/
399 inference_result("VARIANCE TEST",
400 [ ['var_estimate, s2],
401 ['conf_level, defaults[4]],
402 ['conf_interval, cinterval],
405 ['statistic, statistic],
406 ['distribution, distribution],
407 ['p_value, pvalue] ],
408 [1,2,3,4,5,6,7,8]) )$
416 /* This is the variance ratio test. The first two arguments 'x1' and 'x2' */
417 /* are lists or column matrices with expressions, generally numbers. */
418 /* Admits the following options: */
419 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
420 /* values are: 'twosided, 'greater (m1>m2) and less (m1<m2). */
421 /* 'mean1='unknown: this is the value of the mean in the x1 sample when it */
422 /* is known; valid values are: 'unknown, a symbol or number */
423 /* 'mean2='unknown: this is the value of the mean in the x2 sample when it */
424 /* is known; valid values are: 'unknown, a symbol or number */
425 /* 'conflevel=95/100: confidence level of the confidence interval; valid */
426 /* values are: a symbol or an expression which takes a value in (0,1) */
428 /* The output of this function is an 'inference_result' object */
429 /* with the following results: */
430 /* 1. 'ratio_estimate= variance ratio estimate (variance1/variance2) */
431 /* 2. 'conf_level= confidence level */
432 /* 3. 'conf_interval= confidence interval */
433 /* 4. 'method: assumptions about the means. */
434 /* 5. 'hypotheses: null hypothesis and alternative */
435 /* 6. 'statistic: statistic used in the procedure */
436 /* 7. 'distribution: distribution and its parameters */
437 /* 8. 'p_value: p-value of the sample statistic */
438 test_variance_ratio(x1,x2,[select]):=
439 block([numer:stats_numer, options, defaults, v1, v2, vr, n1, n2, t1, t2, df1, df2,
440 cinterval, aux, statistic, method, distribution, hypo, pvalue, listarith:true],
442 /* controlling sample format */
443 if not listofexpr(x1) and not (matrixp(x1) and length(x1[1]) = 1 and every('identity,map('listofexpr,args(x1))))
444 then error("Sample 'x1' should be a list with expressions or a column matrix")
445 else (n1: length(x1)),
446 if not listofexpr(x2) and not (matrixp(x2) and length(x2[1]) = 1 and every('identity,map('listofexpr,args(x2))))
447 then error("Sample 'x2' should be a list with expressions or a column matrix")
450 /* updating and controlling options */
451 options: ['alternative, 'mean1, 'mean2, 'conflevel],
452 defaults: ['twosided, 'unknown, 'unknown, 95/100],
454 aux: ?position(lhs(i),options),
455 if numberp(aux) and aux <= length(options) and aux >= 1
456 then defaults[aux]: rhs(i)),
457 if not member(defaults[1],['twosided, 'greater, 'less])
458 then error("Option 'alternative' is not correct"),
459 /* in the next two lines, ignorance about the means is contagious */
460 if defaults[2] = 'unknown then defaults[3]: 'unknown,
461 if defaults[3] = 'unknown then defaults[2]: 'unknown,
462 if numberp(defaults[4]) and (defaults[4] <= 0 or defaults[4] >= 1)
463 then error("Option 'conflevel' can't be outside interval (0,1)"),
466 method: concat("Variance ratio F-test. ",
467 if /*means*/ defaults[2] = 'unknown
468 then "Unknown means."
469 else "Known means."),
474 then vr: statistic: distribution: pvalue: cinterval: hypo: 'undefined
476 /* variance ratio estimate, degrees of freedom, */
477 if /*means*/ defaults[2] = 'unknown
479 if listp(vr) then vr: vr[1],
482 else (t1: mean((x1 - defaults[2])^2),
483 t2: mean((x2 - defaults[3])^2),
488 /* distribution, confidence interval and statistic */
489 distribution: ['f, df1, df2],
490 if /*alternative*/ defaults[1] = 'greater
491 then cinterval: [vr / quantile_f(defaults[4],df1,df2), 'inf]
492 else if /*alternative*/ defaults[1] = 'less
493 then cinterval: [0, vr / quantile_f(1-defaults[4],df1,df2)]
494 else cinterval: vr / [quantile_f((1+defaults[4])/2,df1,df2),
495 quantile_f((1-defaults[4])/2,df1,df2)],
498 /* hypotheses, pvalue */
499 pvalue: cdf_f(statistic,df1,df2), /* pvalue for alternative=less */
500 if /*alternative*/ defaults[1] = 'greater
501 then (hypo: "H0: var1 = var2 , H1: var1 > var2",
503 else if /*alternative*/ defaults[1] = 'less
504 then hypo: "H0: var1 = var2 , H1: var1 < var2"
505 else (hypo: "H0: var1 = var2 , H1: var1 # var2",
506 if /* compares the sample statistics to the median */
507 statistic <= quantile_f(1/2,df1,df2)
508 then pvalue: 2 * pvalue
509 else pvalue: 2 * (1 - pvalue))),
511 /* result as an 'inference_result' object*/
512 inference_result("VARIANCE RATIO TEST",
513 [ ['ratio_estimate, vr],
514 ['conf_level, defaults[4]],
515 ['conf_interval, cinterval],
518 ['statistic, statistic],
519 ['distribution, distribution],
520 ['p_value, pvalue] ],
521 [1,2,3,4,5,6,7,8]) )$
529 /* This is the non parametric sign test for the median. Argument 'x' is a */
530 /* list or column matrix with expressions, generally numbers. */
531 /* Admits the following option: */
532 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
533 /* values are: 'twosided, 'greater (med1>median) and less (med1<median)*/
534 /* 'median=0: the median value to be checked */
536 /* The output of this function is an 'inference_result' object */
537 /* with the following results: */
538 /* 1. 'med_estimate= median estimate */
539 /* 2. 'method= assumptions about the means. */
540 /* 3. 'hypotheses= null hypothesis and alternative */
541 /* 4. 'statistic= statistic used in the procedure */
542 /* 5. 'distribution= distribution and its parameters */
543 /* 6. 'p_value= p-value of the sample statistic */
544 test_sign(x,[select]):=
545 block([numer:stats_numer, options, defaults, med, n, npos, aux, xm,
546 statistic, method, distribution, hypo, pvalue, listarith:true],
548 /* controlling sample format */
549 if not listofexpr(x) and not (matrixp(x) and length(x[1]) = 1 and every('identity,map('listofexpr,args(x))))
550 then error("Sample 'x' should be a list with expressions or a column matrix"),
552 /* updating and controlling options */
553 options: ['alternative, 'median],
554 defaults: ['twosided, 0],
556 aux: ?position(lhs(i),options),
557 if numberp(aux) and aux <= length(options) and aux >= 1
558 then defaults[aux]: rhs(i)),
559 if not member(defaults[1],['twosided, 'greater, 'less])
560 then error("Option 'alternative' is not correct"),
562 /* median estimate */
564 if listp(med) then med: med[1],
568 if matrixp(xm) then xm: transpose(xm)[1],
569 statistic: apply("+", map(lambda([z], if is(z<0) then 1 else 0), xm)),
570 npos: apply("+", map(lambda([z], if is(z>0) then 1 else 0), xm)),
574 method: "Non parametric sign test.",
577 distribution: ['binomial, n, 1/2],
579 /* hypotheses, pvalue */
580 aux: string(defaults[2]),
581 if /*alternative*/ defaults[1] = 'greater
582 then (hypo: concat("H0: median = ", aux," , H1: median > ", aux),
583 pvalue: 1 - cdf_binomial(statistic,n,1/2) )
584 else if /*alternative*/ defaults[1] = 'less
585 then (hypo: concat("H0: median = ", aux," , H1: median < ", aux),
586 pvalue: cdf_binomial(statistic,n,1/2) )
587 else (hypo: concat("H0: median = ", aux," , H1: median # ", aux),
589 then pvalue: 2 * cdf_binomial(statistic,n,1/2)
590 else pvalue: 2 * (1 - cdf_binomial(statistic,n,1/2)) ),
592 /* result as an 'inference_result' object*/
593 inference_result("SIGN TEST",
594 [ ['med_estimate, med],
597 ['statistic, statistic],
598 ['distribution, distribution],
599 ['p_value, pvalue] ],
607 /* This is the signed rank test to make inferences about the median of a */
608 /* continuous population. Argument 'x' is a list or column matrix with */
609 /* expressions, generally numbers. Performs normal approximation if the */
610 /* sample size is >20, or if there are zeroes or ties. (Cuadras, 13.2; */
611 /* R, wilcox.test.R) */
612 /* Admits the following option: */
613 /* 'median=0: this is the value of the median to be checked. */
614 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
615 /* values are: 'twosided, 'greater (med1>med2) and less (med1<med2). */
616 /* The output of this function is an 'inference_result' object */
617 /* with the following results: */
618 /* 1. 'med_estimate= median estimate */
619 /* 2. 'method= assumptions about the means. */
620 /* 3. 'hypotheses= null hypothesis and alternative */
621 /* 4. 'statistic= statistic used in the procedure */
622 /* 5. 'distribution= distribution and its parameters */
623 /* 6. 'p_value= p-value of the sample statistic */
624 test_signed_rank(x,[select]):=
625 block([numer:stats_numer, options, defaults, n, aux, med, zeroes:false,
626 pos, nequal, npositive, rank, ties:[], sigma, mu, noties, statistic,
627 method, distribution, hypo, pvalue, listarith:true],
629 /* controlling sample format */
630 if not listofexpr(x) and not (matrixp(x) and length(x[1]) = 1 and every('identity,map('listofexpr,args(x))))
631 then error("Sample 'x' should be a list with expressions or a column matrix")
634 /* updating and controlling options */
635 options: ['median, 'alternative],
636 defaults: [0, 'twosided],
638 aux: ?position(lhs(i),options),
639 if numberp(aux) and aux <= length(options) and aux >= 1
640 then defaults[aux]: rhs(i)),
641 if not member(defaults[2],['twosided, 'greater, 'less])
642 then error("Option 'alternative' is not correct"),
644 /* sample size and median estimate */
647 if listp(med) then med: med[1],
650 x: x - defaults[1], /* sustract the median to be checked */
652 x: sublist(x, lambda([z], is(float(z) # 0.0))),
656 x: sort(makelist([x[k],abs(x[k])],k,1,n),
657 lambda([u,v], orderlessp(u[2], v[2]))),
666 while pos+nequal <= n and x[pos+nequal][2] = x[pos][2] do
667 (if x[pos+nequal][1] > 0 then npositive: npositive + 1,
668 rank: rank + pos + nequal,
670 statistic: statistic + npositive * rank / nequal,
672 ties: cons(nequal, ties)),
674 /* pvalue, method, distribution */
675 noties: every(lambda([z],z=1), ties),
677 if n < 20 and noties and not zeroes
678 then (/* performs exact test */
679 method: "Exact test",
680 distribution: ['signed_rank, n],
681 if /*alternative*/ defaults[2] = 'twosided
682 then (if statistic > mu
683 then pvalue: 1 - cdf_signed_rank(statistic-1, n)
684 else pvalue: cdf_signed_rank(statistic, n),
685 pvalue: min(2*pvalue, 1) )
686 else if /*alternative*/ defaults[2] = 'greater
687 then pvalue: 1 - cdf_signed_rank(statistic-1, n)
688 else pvalue: cdf_signed_rank(statistic, n) )
689 else (/* asymptotic test */
690 method: "Asymptotic test",
691 if not noties then method: concat(method, ". Ties"),
692 if zeroes then method: concat(method, ". Zeroes"),
693 sigma: sqrt(mu * (2*n + 1) / 6 - apply("+",ties^3-ties) / 48),
694 if /*alternative*/ defaults[2] = 'twosided
695 then (pvalue: cdf_normal(statistic, mu + signum(statistic - mu)/2, sigma),
696 pvalue: 2 * min(pvalue, 1 - pvalue),
697 distribution: ['normal, mu + signum(statistic - mu)/2, sigma])
698 else if /*alternative*/ defaults[2] = 'greater
699 then (pvalue: 1 - cdf_normal(statistic, mu+1/2, sigma),
700 distribution: ['normal, mu + 1/2, sigma] )
701 else pvalue: (cdf_normal(statistic, mu-1/2, sigma),
702 distribution: ['normal, mu-1/2, sigma] )),
705 aux: string(defaults[1]),
706 if /*alternative*/ defaults[2] = 'greater
707 then hypo: concat("H0: med = ", aux," , H1: med > ", aux)
708 else if /*alternative*/ defaults[2] = 'less
709 then hypo: concat("H0: med = ", aux," , H1: med < ", aux)
710 else hypo: concat("H0: med = ", aux," , H1: med # ", aux),
712 /* result as an 'inference_result' object*/
713 inference_result("SIGNED RANK TEST",
714 [ ['med_estimate, med],
717 ['statistic, statistic],
718 ['distribution, distribution],
719 ['p_value, pvalue] ],
728 /* This is the Wilcoxon-Mann-Whitney test to compare the medians of two */
729 /* independent samples taken from two continuous populations. The first two */
730 /* arguments 'x1' and 'x2' are lists or column matrices with expressions, */
731 /* generally numbers. Performs normal approximation if any of the sample sizes */
732 /* is >10, or if there are ties. (Cuadras, 13.3; R, wilcox.test.R) */
733 /* Admits the following option: */
734 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
735 /* values are: 'twosided, 'greater (med1>med2) and less (med1<med2). */
736 /* The output of this function is an 'inference_result' object */
737 /* with the following results: */
738 /* 1. 'method= type of test. */
739 /* 2. 'hypotheses: null hypothesis and alternative */
740 /* 3. 'statistic: statistic used in the procedure */
741 /* 4. 'distribution: sample statistic distribution. */
742 /* 5. 'p_value: p-value of the sample statistic */
743 test_rank_sum(x1,x2,[select]):=
744 block([numer:stats_numer, options, defaults, n1, n2, n, aux, ordered, pos, nequal, nfirst, rank, ties:[],
745 sigma, mu, noties, statistic, method, distribution, hypo, pvalue, listarith:true],
747 /* controlling sample format */
748 if not listofexpr(x1) and not (matrixp(x1) and length(x1[1]) = 1 and every('identity,map('listofexpr,args(x1))))
749 then error("Sample 'x1' should be a list with expressions or a column matrix")
750 else (n1: length(x1)),
751 if not listofexpr(x2) and not (matrixp(x2) and length(x2[1]) = 1 and every('identity,map('listofexpr,args(x2))))
752 then error("Sample 'x2' should be a list with expressions or a column matrix")
755 /* updating and controlling options */
756 options: ['alternative],
757 defaults: ['twosided],
759 aux: ?position(lhs(i),options),
760 if numberp(aux) and aux <= length(options) and aux >= 1
761 then defaults[aux]: rhs(i)),
762 if not member(defaults[1],['twosided, 'greater, 'less])
763 then error("Option 'alternative' is not correct"),
770 /* statistic W: both samples are combined in one */
771 /* ordered list and ranks of the second sample are */
772 /* added. In case of ties, ranks of equal numbers */
773 /* are averaged and the p value is computed by the */
774 /* normal approximation. [Cuadras, p. 13.13] */
776 ordered: sort(append(makelist([x1[i],1],i,1,n1),
777 makelist([x2[i],2],i,1,n2)),
778 lambda([u,v], orderlessp(u[1], v[1]))),
782 if ordered[pos][2] = 2
786 while pos+nequal <= n and ordered[pos+nequal][1] = ordered[pos][1] do
787 (if ordered[pos+nequal][2] = 1 then nfirst: nfirst + 1,
788 rank: rank + pos + nequal,
790 statistic: statistic + nfirst * rank / nequal,
792 ties: cons(nequal, ties)),
793 /* convert to Mann-Whitney statistic */
794 statistic: statistic - n1 * (n1 + 1) / 2,
796 /* pvalue, method, distribution */
797 noties: every(lambda([z],z=1), ties),
799 if n1 < 10 and n2 < 10 and noties
800 then (/* performs exact test */
801 method: "Exact test",
802 distribution: ['rank_sum, n1, n2],
803 if /*alternative*/ defaults[1] = 'twosided
804 then (if statistic > mu
805 then pvalue: 1 - cdf_rank_sum(statistic-1, n1, n2)
806 else pvalue: cdf_rank_sum(statistic, n1, n2),
807 pvalue: min(2*pvalue, 1) )
808 else if /*alternative*/ defaults[1] = 'greater
809 then pvalue: 1 - cdf_rank_sum(statistic-1, n1, n2)
810 else pvalue: cdf_rank_sum(statistic, n1, n2) )
811 else (/* asymptotic test */
812 method: "Asymptotic test",
813 if not noties then method: concat(method, ". Ties"),
814 sigma: sqrt(mu / 6 * (n + 1 - apply("+",ties^3-ties) / (n * (n-1)))),
815 if /*alternative*/ defaults[1] = 'twosided
816 then (pvalue: cdf_normal(statistic, mu + signum(statistic - mu)/2, sigma),
817 pvalue: 2 * min(pvalue, 1 - pvalue),
818 distribution: ['normal, mu + signum(statistic - mu)/2, sigma])
819 else if /*alternative*/ defaults[1] = 'greater
820 then (pvalue: 1 - cdf_normal(statistic, mu+1/2, sigma),
821 distribution: ['normal, mu + 1/2, sigma] )
822 else (pvalue: cdf_normal(statistic, mu-1/2, sigma),
823 distribution: ['normal, mu-1/2, sigma] )),
826 if /*alternative*/ defaults[1] = 'greater
827 then hypo: "H0: med1 = med2 , H1: med1 > med2"
828 else if /*alternative*/ defaults[1] = 'less
829 then hypo: "H0: med1 = med2 , H1: med1 < med2"
830 else hypo: "H0: med1 = med2 , H1: med1 # med2",
832 /* result as an 'inference_result' object*/
833 inference_result("RANK SUM TEST",
836 ['statistic, statistic],
837 ['distribution, distribution],
838 ['p_value, pvalue] ],
848 /* This is the proportions test. The first argument 'x' is the number of */
849 /* successes, the second number 'n>=x' is the number of trials. */
850 /* It admits the following options: */
851 /* 'proportion=1/2: this is the value of the proportion to be checked. */
852 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
853 /* values are: 'twosided, 'greater and 'less. */
854 /* 'conflevel=95/100: confidence level of the confidence interval; valid */
855 /* values are: a symbol or an expression which takes a value in (0,1) */
856 /* 'asymptotic=false: whether it performs an exact test based on the binomial */
857 /* distribution, or an asymptotic one based on the normal. */
858 /* 'correct=true: whether Yates correction must be applied in case of the */
859 /* asymptotic calculation of the confidence interval. */
861 /* The output of this function is an 'inference_result' object with the */
862 /* following results: */
863 /* 1. 'sample_proportion= sample estimate for the mean */
864 /* 2. 'conf_level= confidence level */
865 /* 3. 'conf_interval= confidence interval */
866 /* 4. 'method= type of test and whether Yates correction is applied. */
867 /* 5. 'hypotheses= null hypothesis and alternative */
868 /* 6. 'statistic= statistic used in the procedure */
869 /* 7. 'distribution= statistic's distribution and its parameters */
870 /* 8. 'p_value= p-value of this test */
871 test_proportion(x,n,[select]):=
872 block([numer:stats_numer, options, defaults, aux, phat, method, statistic,
873 distribution, cinterval, hypo, pvalue, alpha, pl:0, pu:1, stdev],
875 /* controlling input data */
876 if integerp(x) and integerp(n) and (n < x or x < 0)
877 then error("Input data must be 0 <= x <= n"),
879 /* updating and controlling options */
880 options: ['proportion, 'alternative, 'conflevel, 'correct, 'asymptotic],
881 defaults: [1/2, 'twosided , 95/100, true, false],
883 aux: ?position(lhs(i),options),
884 if numberp(aux) and aux <= length(options) and aux >= 1
885 then defaults[aux]: rhs(i)),
886 if numberp(defaults[1]) and (defaults[1] <= 0 or defaults[1] >= 1)
887 then error("Option 'proportion' is not correct"),
888 if not member(defaults[2],['twosided, 'greater, 'less])
889 then error("Option 'alternative' is not correct"),
890 if numberp(defaults[3]) and (defaults[3] <= 0 or defaults[3] >= 1)
891 then error("Option 'conflevel' can't be outside interval (0,1)"),
892 if not member(defaults[4],[true, false])
893 then error("Option 'correct' must be true or false"),
894 if not member(defaults[5],[true, false])
895 then error("Option 'asymptotic' must be true or false"),
897 /* proportion estimate */
901 method: concat(if /*asymptotic*/ defaults[5] = true
902 then "Asymptotic test"
903 else "Exact binomial test",
904 if /*correct*/ defaults[4] and defaults[5]
905 then " with Yates correction."
909 if /*asymptotic*/ defaults[5] = false
911 distribution: ['binomial, n, defaults[1]],
912 if /*alternative*/ defaults[2] = 'twosided
913 then (alpha: (1 - defaults[3])/2,
915 then pl: quantile_beta(alpha, x, n-x+1),
917 then pu: quantile_beta(1-alpha, x+1, n-x),
919 block([m: n * defaults[1],
920 d: pdf_binomial(x, n, defaults[1]),
922 if sign(x - m) = 'zero
925 then (for k: ceiling(m) thru n do
926 if pdf_binomial(k, n, defaults[1]) <= d
928 pvalue: cdf_binomial(x, n, defaults[1]) +
929 1 - cdf_binomial(n-y, n, defaults[1]))
930 else (for k: 0 thru floor(m) do
931 if pdf_binomial(k, n, defaults[1]) <= d
933 pvalue: cdf_binomial(y-1, n, defaults[1]) +
934 1 - cdf_binomial(x-1, n, defaults[1]))))
935 elseif /*alternative*/ defaults[2] = 'less
937 then pu: quantile_beta(defaults[3], x+1, n-x),
939 pvalue: cdf_binomial(x, n, defaults[1]))
940 else ( /* alternative is greater */
942 then pl: quantile_beta(1 - defaults[3], x, n-x+1),
944 pvalue: 1 - cdf_binomial(x-1, n, defaults[1])))
946 else (/* asymptotic test*/
948 stdev: sqrt((phat*(1-phat))/n),
949 distribution: ['normal, defaults[1], stdev],
951 /* calculate pvalue */
952 if defaults[2] = 'twosided
953 then /* alternative is twosided */
954 pvalue: 2*(1-cdf_normal(defaults[1]+abs(phat-defaults[1]), defaults[1], stdev))
955 elseif defaults[2] = 'less
956 then /* alternative is less */
957 pvalue: cdf_normal(phat, defaults[1], stdev)
958 else /* alternative is greater */
959 pvalue: 1 - cdf_normal(phat, defaults[1], stdev),
961 /* calculate Wilson score confidence interval */
962 block([z, yates, z22n, pc, pl:0, pu:1],
963 if defaults[2] = 'twosided
964 then z: quantile_normal((1 + defaults[3])/2, 0 ,1)
965 else z: quantile_normal(defaults[3], 0 ,1),
966 if defaults[4] /* Yates correction */
967 then yates: min(1/2, abs(x - n * defaults[1]))
970 if defaults[2] = 'twosided or defaults[2] = 'greater
971 then (pc: phat - yates/n,
973 then pl: (pc+z22n-z*sqrt((pc*(1-pc))/n + z22n/(2*n))) / (1+2*z22n)),
974 if defaults[2] = 'twosided or defaults[2] = 'less
975 then (pc: phat + yates/n,
977 then pu: (pc+z22n+z*sqrt((pc*(1-pc))/n + z22n/(2*n))) / (1+2*z22n)),
978 cinterval: [pl, pu] )),
981 aux: string(defaults[1]),
982 if /*alternative*/ defaults[2] = 'greater
983 then hypo: concat("H0: p = ", aux," , H1: p > ", aux)
984 elseif /*alternative*/ defaults[2] = 'less
985 then hypo: concat("H0: p = ", aux," , H1: p < ", aux)
986 else hypo: concat("H0: p = ", aux," , H1: p # ", aux),
988 /* result as an 'inference_result' object*/
989 inference_result("PROPORTION TEST",
990 [ ['sample_proportion, phat],
991 ['conf_level, defaults[3]],
992 ['conf_interval, cinterval],
995 ['statistic, statistic],
996 ['distribution, distribution],
997 ['p_value, pvalue] ],
998 [1,2,3,4,5,6,7,8]) )$
1006 /* Test for the difference of two proportions. The first and second arguments, */
1007 /* 'x1' and 'n1>=x1', are the number of successes and total number of trials in */
1008 /* the first sample, respectively; and the third and fourth arguments, */
1009 /* 'x2' and 'n2>=x2', are the corresponding numbers in the second sample. */
1010 /* This is an asymptotic test which requires n1 and n2 to be both >= 10, */
1011 /* and both samples are considered independent. */
1012 /* It admits the following options: */
1013 /* 'alternative='twosided: this is the alternative hypothesis H1; valid */
1014 /* values are: 'twosided, 'greater and 'less. */
1015 /* 'conflevel=95/100: confidence level of the confidence interval; valid */
1016 /* values are: a symbol or an expression which takes a value in (0,1) */
1017 /* 'correct=true: whether Yates correction must be applied or not */
1019 /* The output of this function is an 'inference_result' object with the */
1020 /* following results: */
1021 /* 1. 'proportions= list with the two sample proportions */
1022 /* 2. 'conf_level= confidence level */
1023 /* 3. 'conf_interval= confidence interval */
1024 /* 4. 'method= name of the test and/or warning message in case of n1 or n2<10 */
1025 /* 5. 'hypotheses= null hypothesis and alternative */
1026 /* 6. 'statistic= statistic used in the procedure, namely the difference p1-p2 */
1027 /* 7. 'distribution= statistic's asymptotic distribution and its parameters */
1028 /* 8. 'p_value= p-value of this test */
1029 test_proportions_difference(x1,n1,x2,n2,[select]) :=
1030 block([numer:stats_numer, options, defaults, aux, phat, difphat, method,
1031 cinterval, hypo, sd, yates: 1/2, width],
1033 /* controlling input data */
1034 if integerp(x1) and integerp(n1) and (n1 < x1 or x1 < 0)
1035 then error("Input data must be 0 <= x1 <= n1"),
1036 if integerp(x2) and integerp(n2) and (n2 < x2 or x2 < 0)
1037 then error("Input data must be 0 <= x2 <= n2"),
1039 /* updating and controlling options */
1040 options: ['alternative, 'conflevel, 'correct],
1041 defaults: ['twosided , 95/100, true],
1043 aux: ?position(lhs(i),options),
1044 if numberp(aux) and aux <= length(options) and aux >= 1
1045 then defaults[aux]: rhs(i)),
1046 if not member(defaults[1],['twosided, 'greater, 'less])
1047 then error("Option 'alternative' is not correct"),
1048 if numberp(defaults[2]) and (defaults[2] <= 0 or defaults[2] >= 1)
1049 then error("Option 'conflevel' can't be outside interval (0,1)"),
1050 if not member(defaults[3],[true, false])
1051 then error("Option 'correct' must be true or false"),
1053 /* proportions estimates */
1054 phat: [x1/n1, x2/n2],
1055 difphat: phat[1] - phat[2],
1058 method: concat("Asymptotic test.",
1060 then " Yates correction."
1062 if integerp(n1) and n1 < 10 or integerp(n2) and n2 < 10
1063 then " Warning: small sample."
1066 /* confidence interval */
1067 if not defaults[3] /* don't apply Yates correction */
1069 yates: min(yates, abs(difphat) / (1/n1 + 1/n2)),
1070 if /*alternative*/ defaults[1] = 'twosided
1071 then aux: quantile_normal((1 + defaults[2])/2, 0, 1)
1072 else aux: quantile_normal(defaults[2], 0, 1),
1073 width: aux * sqrt((phat[1]*(1-phat[1]))/n1 + (phat[2]*(1-phat[2]))/n2) +
1074 yates * (1/n1 + 1/n2),
1075 if /*alternative*/ defaults[1] = 'twosided
1076 then cinterval : [max(difphat - width, -1), min(difphat + width, 1)]
1077 elseif defaults[1] = 'greater
1078 then cinterval : [max(difphat - width, -1), 1]
1079 else cinterval : [-1, min(difphat + width, 1)],
1082 if /*alternative*/ defaults[1] = 'greater
1083 then hypo: concat("H0: p1 = p2 , H1: p1 > p2")
1084 elseif /*alternative*/ defaults[1] = 'less
1085 then hypo: concat("H0: p1 = p2 , H1: p1 < p2")
1086 else hypo: concat("H0: p1 = p2 , H1: p1 # p2"),
1088 /* distribution and p-value */
1089 aux: (x1+x2)/(n1+n2),
1090 sd: sqrt(aux * (1-aux) * (1/n1 + 1/n2)),
1091 distribution: ['normal, 0, sd],
1092 if /*alternative*/ defaults[1] = 'twosided
1093 then pvalue : 2 * (1 - cdf_normal(abs(difphat), 0, sd))
1094 elseif defaults[1] = 'greater
1095 then pvalue : 1 - cdf_normal(difphat, 0, sd)
1096 else pvalue : cdf_normal(difphat, 0, sd),
1098 /* result as an 'inference_result' object*/
1099 inference_result("DIFFERENCE OF PROPORTIONS TEST",
1100 [ ['proportions, phat],
1101 ['conf_level, defaults[2]],
1102 ['conf_interval, cinterval],
1104 ['hypotheses, hypo],
1105 ['statistic, difphat],
1106 ['distribution, distribution],
1107 ['p_value, pvalue] ],
1108 [1,2,3,4,5,6,7,8]) ) $
1114 simple_linear_regression(dat,[select]):=block([numer:stats_numer, options, defaults, n, aux,
1115 means, covar, corr, resvar, adc, a, b, pred, res, sig2, aconfint,
1116 coef, bconfint, vconfint,
1117 statistic, distribution, hypo, pvalue, listarith:true],
1119 /* controlling sample format */
1120 if not matrixp(dat) then dat: apply('matrix,dat),
1121 if length(dat[1]) # 2 or not every('identity,map('listofexpr,args(dat)))
1122 then error("Sample must contain pairs of expressions")
1123 else (n: length(dat)),
1125 if n < 3 then error("Sample size must be greater than 2"),
1127 /* updating and controlling options */
1128 options: ['alternative, 'conflevel, 'regressor],
1129 defaults: ['twosided, 95/100, 'x],
1131 aux: ?position(lhs(i),options),
1132 if numberp(aux) and aux <= length(options) and aux >= 1
1133 then defaults[aux]: rhs(i)),
1134 if not member(defaults[1],['twosided, 'greater, 'less])
1135 then error("Option 'alternative' is not correct"),
1136 if numberp(defaults[2]) and (defaults[2] <= 0 or defaults[2] >= 1)
1137 then error("Option 'conflevel' can't be outside interval (0,1)"),
1138 if not symbolp(defaults[3])
1139 then error("Name of independent variable must be a symbol"),
1144 corr: covar[1,2] / sqrt(covar[1,1] * covar[2,2]),
1145 b: covar[1,2] / covar[1,1],
1146 a: means[2] - b * means[1],
1148 /* computing predictions, residuals, residual variance and adc */
1149 pred: transpose(a + b * col(dat,1))[1],
1150 res: transpose(col(dat,2))[1] - pred,
1151 sig2: mean(res^2), /* ml estimator for sigma^2 */
1152 resvar: n * sig2 / (n-2), /* residual variance */
1153 adc: 1 - (1-1/n) * resvar / covar[2,2],
1155 /* two sided confidence interval for a */
1156 aconfint: a + [-1,1] * quantile_student_t((1 + defaults[2])/2, n-2) *
1157 sqrt(apply("+",transpose(col(dat,1))[1]^2) * resvar / (n^2 * covar[1,1])),
1159 /* confidence interval for b and hypothesis test. */
1160 /* Note that at this moment, there is not any option for the */
1161 /* alternative hypothesis, it is always considered two-sided.*/
1162 /* I maintain these conditionals here in case there are */
1163 /* changes in future releases. */
1164 if float(resvar) = 0.0
1165 then /* data on the straight line */
1166 bconfint: statistic: hypo: statistic: distribution: pvalue: 'undefined
1168 coef: sqrt(resvar / (n * covar[1,1])),
1169 statistic: b / coef,
1170 if /*alternative*/ defaults[1] = 'twosided
1171 then statistic: abs(statistic),
1172 if /*alternative*/ defaults[1] = 'greater
1173 then (bconfint: [b - quantile_student_t(defaults[2], n-2) * coef,'inf],
1174 hypo: "H0: b = 0 ,H1: b > 0",
1175 pvalue: 1 - cdf_student_t(statistic,n-2) )
1176 else if /*alternative*/ defaults[1] = 'less
1177 then (bconfint: ['minf, b + quantile_student_t(defaults[2], n-2) * coef],
1178 hypo: "H0: b = 0 ,H1: b < 0",
1179 pvalue: cdf_student_t(statistic,n-2) )
1180 else /* twosided alternative */
1181 (bconfint: b + [-1,1] * quantile_student_t((1 + defaults[2])/2, n-2) * coef,
1182 hypo: "H0: b = 0 ,H1: b # 0",
1183 pvalue: 2 * (1 - cdf_student_t(statistic,n-2)) ),
1184 distribution: ['student_t, n-2] ),
1186 /* two sided confidence interval for sigma^2 */
1187 vconfint: (n-2) * resvar / [quantile_chi2((1+defaults[2])/2,n-2),
1188 quantile_chi2((1-defaults[2])/2,n-2)],
1190 /* result as an 'inference_result' object*/
1191 inference_result("SIMPLE LINEAR REGRESSION",
1192 [ ['model, a + b * defaults[3] /* = regressor name */],
1194 ['variances, [covar[1,1], covar[2,2]]],
1195 ['correlation, corr],
1198 ['a_conf_int, aconfint],
1200 ['b_conf_int, bconfint],
1201 ['hypotheses, hypo],
1202 ['statistic, statistic],
1203 ['distribution, distribution],
1205 ['v_estimation, resvar],
1206 ['v_conf_int, vconfint],
1207 ['cond_mean_conf_int,
1209 [-1,1] * quantile_student_t((1 + defaults[2])/2, n-2)*
1210 sqrt(resvar * (1/n+(means[1]-defaults[3])^2/(covar[1,1]*n)))],
1211 ['new_pred_conf_int,
1213 [-1,1] * quantile_student_t((1 + defaults[2])/2, n-2)*
1214 sqrt(resvar * ((n+1)/n+(means[1]-defaults[3])^2/(covar[1,1]*n)))],
1215 ['residuals, sort(args(transpose(matrix(pred,res))),
1216 lambda([x,y], orderlessp(x,y)) )] ],
1217 [1,4,14,9,10,11,12,13]) )$
1224 /* Multivariate linear regression, y_i=b_0+b_1*x_1i+...+b_k*x_ki+u_i, where */
1225 /* u_i are N(0,sigma) iid random variables. Argument dat must be a matrix */
1226 /* with more than one column. */
1227 /* Admits the following options: */
1228 /* 'conflevel=95/100: confidence level of the confidence intervals; valid */
1229 /* values are: a symbol or an expression which takes a value in (0,1) */
1230 /* The output of this function is an 'inference_result' object */
1231 /* with the following results: */
1232 /* 1. 'b_estimation= coefficient estimates */
1233 /* - 2. 'b_covariances= covariance matrix of coefficient estimates */
1234 /* - 3. 'b_conf_int= confidence intervals of coefficient estimates */
1235 /* 4. 'b_statistics= statistics for testing coefficient */
1236 /* 5. 'b_p_values= p-values for coefficient tests */
1237 /* 6. 'b_distribution= probability distribution for coefficient tests */
1238 /* 7. 'v_estimation= unbiased variance estimator */
1239 /* 8. 'v_conf_int= variance confidence interval */
1240 /* 9. 'v_distribution= probability distribution for variance test */
1241 /* -10. 'covariances= data covariance matrix */
1242 /* -11. 'residuals= residuals */
1243 /* 12. 'adc= adjusted determination coefficient */
1244 /* -13. 'aic= akaike's information criterion */
1245 /* -14. 'bic= bayes's information criterion */
1246 /* Items marked with the minus sign are kept hidden. */
1247 linear_regression(dat,[select]) :=
1249 [numer:stats_numer, options, defaults, n, p, aux, x, y, covb, q, b, r,
1250 df, sR2, sumsquares, Rbar, ci, t, pv, s2, aic, bic],
1252 /* controlling sample format */
1253 if not matrixp(dat) then dat: apply('matrix,dat),
1254 if length(dat[1]) < 2 or not every('identity,map('listofexpr,args(dat)))
1255 then error("Sample must be a matrix or a list of lists of equal length"),
1257 /* updating and controlling options */
1258 options: ['conflevel],
1261 aux: ?position(lhs(i),options),
1262 if numberp(aux) and aux <= length(options) and aux >= 1
1263 then defaults[aux]: rhs(i)),
1264 if numberp(defaults[1]) and (defaults[1] <= 0 or defaults[1] >= 1)
1265 then error("Option 'conflevel' can't be outside interval (0,1)"),
1269 /* number or regressors */
1270 p: length(dat[1]) - 1,
1274 x: addcol(apply(matrix, makelist([1],k,n)), submatrix(dat, p+1)),
1276 b: block([xt: transpose(x)],
1277 covb: invert(xt . x),
1283 /* degrees of freedom */
1285 /* unbiased variance estimator */
1287 sR2: sumsquares / df,
1288 /* covariance matrix of estimators */
1290 q: makelist(covb[j,j],j,length(covb)),
1291 /* adjusted coefficient of determination */
1292 Rbar: 1 - sR2/first(var1(y)),
1293 /* confidence intervals for coefficients */
1294 aux: quantile_student_t((1+defaults[1])/2, df) * sqrt(q),
1295 ci: makelist(b[k]+[-1,1]*aux[k],k,1,length(b)),
1296 /* statistic contrasts */
1299 pv: 2*(1-map(lambda([z], cdf_student_t(abs(z),df)), t)),
1300 /* confidence interval for the variance */
1301 s2: df*sR2*[1,1] / [quantile_chi2((1+defaults[1])/2,df),
1302 quantile_chi2((1-defaults[1])/2,df)],
1304 aic: n*log(sumsquares/n) + 2*(p+1),
1305 bic: n*log(sumsquares/n) + log(n)*(p+1),
1308 [b,covb,ci,t,pv,sR2,s2,r,Rbar,aic,bic]: float([b,covb,ci,t,pv,sR2,s2,r,Rbar,aic,bic]),
1310 /* result as an 'inference_result' object */
1311 inference_result("LINEAR REGRESSION MODEL",
1312 [ ['b_estimation, b],
1313 ['b_covariances, covb],
1317 ['b_distribution, ['student_t, df]],
1318 ['v_estimation, sR2],
1320 ['v_distribution, ['chi2, df]],
1322 ['adc, Rbar], /* adjusted determination coefficient */
1325 [1,4,5,6,7,8,9,11]) )$
1337 /***************************************/
1338 /* SPECIAL PROBABILITY DISTRIBUTIONS */
1339 /***************************************/
1342 /* SOME AUXILIARY FUNCTIONS TO BE */
1343 /* USED BY THE SPECIAL PROBABILITY DISTRIBUTIONS */
1346 /* If m and n are positive integers, it returns 1.
1347 If at least one of them is not a positive integer, the
1348 function returns -1. If there is not enough information, the
1351 if integerp(m) and sign(m)='pos and
1352 integerp(n) and sign(n)='pos
1354 else if numberp(m) and not integerp(m) or
1355 numberp(n) and not integerp(n) or
1356 member(sign(m),['neg,'zero,'nz]) or
1357 member(sign(n),['neg,'zero,'nz])
1362 /* SIGNED RANK DISTRIBUTION */
1365 /* R: dsignrank(x,n) */
1366 pdf_signed_rank(x,n):=block([cp:controli(n),t],
1367 if cp=-1 then error("Illegal parameter"),
1368 if cp=0 then return(funmake('pdf_signed_rank,[x,n])),
1370 if sign(x)='neg or numberp(x) and not integerp(x) or sign(x-t)='pos
1372 if numberp(x) and integerp(x) and numberp(n)
1373 then (/* take advantage of the symmetry */
1374 if x > t/2 then x: t - x,
1375 return(?signed_rank_recursion(x,n) / 2^n)),
1376 funmake('pdf_signed_rank,[x,n]) )$
1378 /* R: psignrank(x,n) */
1379 cdf_signed_rank(x,n):=block([cp:controli(n),t,xbis,sum:0],
1380 if cp=-1 then error("Illegal parameter"),
1381 if cp=0 then return(funmake('cdf_signed_rank,[x,n])),
1383 if sign(x)='neg then return(0),
1384 if sign(x-t)='pos then return(1),
1385 if numberp(x) and numberp(n)
1386 then (/* take advantage of the symmetry */
1388 then xbis: t - floor(x) - 1
1389 else xbis: floor(x),
1390 for k:0 thru xbis do
1391 sum: sum + ?signed_rank_recursion(k,n),
1393 if x > t/2 then sum: 1 - sum,
1395 funmake('cdf_signed_rank,[x,n]) )$
1398 /* RANK SUM DISTRIBUTION */
1401 /* R: dwilcox(x,m,n) */
1402 pdf_rank_sum(x,m,n):=block([cp:controlw(m,n),t],
1403 if cp=-1 then error("Illegal parameter"),
1404 if cp=0 then return(funmake('pdf_rank_sum,[x,m,n])),
1406 if sign(x)='neg or numberp(x) and not integerp(x) or sign(x-t)='pos
1408 if numberp(x) and integerp(x) and numberp(m) and numberp(n)
1409 then (/* take advantage of the symmetry */
1410 if x > t/2 then x: t - x,
1411 return(?rank_sum_recursion(x,m+n,m) / binomial(m+n,m))),
1412 funmake('pdf_rank_sum,[x,m,n]) )$
1414 /* R: pwilcox(x,m,n) */
1415 cdf_rank_sum(x,m,n):=block([cp:controlw(m,n),t,xbis,sum:0],
1416 if cp=-1 then error("Illegal parameter"),
1417 if cp=0 then return(funmake('cdf_rank_sum,[x,m,n])),
1419 if sign(x)='neg then return(0),
1420 if sign(x-t)='pos then return(1),
1421 if numberp(x) and numberp(m) and numberp(n)
1422 then (/* take advantage of the symmetry */
1424 then xbis: t - floor(x) - 1
1425 else xbis: floor(x),
1426 for k:0 thru xbis do
1427 sum: sum + ?rank_sum_recursion(k,m+n,m),
1428 sum: sum / binomial(m+n,m),
1429 if x > t/2 then sum: 1 - sum,
1431 funmake('cdf_rank_sum,[x,m,n]) )$