share/stats/stats.mac

   1 /*               COPYRIGHT NOTICE
   2
   3 Copyright (C) 2006,2012 Mario Rodriguez Riotorto
   4
   5 This program is free software; you can redistribute
   6 it and/or modify it under the terms of the
   7 GNU General Public License as published by
   8 the Free Software Foundation; either version 2
   9 of the License, or (at your option) any later version.
  10
  11 This program is distributed in the hope that it
  12 will be useful, but WITHOUT ANY WARRANTY;
  13 without even the implied warranty of MERCHANTABILITY
  14 or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 GNU General Public License for more details at
  16 http://www.gnu.org/copyleft/gpl.html
  17 */
  18
  19
  20 /*             INTRODUCTION
  21
  22 This is a Maxima package for some classical statistical inference
  23 procedures.
  24
  25 */
  26
  27 put('stats, 1, 'version) $
  28
  29 if not get('descriptive, 'version)
  30    then load("descriptive")$
  31
  32 if not get('distrib, 'version)
  33    then load("distrib")$
  34
  35 load("inference_result")$
  36 load("numstats")$
  37
  38
  39 stats_numer : true$
  40
  41 /* This is the mean test. The first argument 'x' is a list or a column matrix  */
  42 /* of expressions (one sample)                                                 */
  43 /* Admits the following options:                                               */
  44 /*   'mean=0: this is the value of the mean to be checked.                     */
  45 /*   'alternative='twosided: this is the alternative hypothesis H1; valid      */
  46 /*            values are: 'twosided, 'greater and 'less.                       */
  47 /*   'dev='unknown: this is the value of the standard deviation when it is     */
  48 /*            known; valid values are: 'unknown, an expression or a positive   */
  49 /*            number                                                           */
  50 /*   'conflevel=95/100: confidence level of the confidence interval; valid     */
  51 /*           values are: a symbol or an expression which takes a value in (0,1)*/
  52 /*   'asymptotic=false: whether it performs an exact t-test or an asymptotic   */
  53 /*           one; valid values are true and false                              */
  54 /*                                                                             */
  55 /* The output of this function is an 'inference_result' object with the        */
  56 /* following results:                                                          */
  57 /*   1. 'mean_estimate= sample estimate for the mean                           */
  58 /*   2. 'conf_level= confidence level                                          */
  59 /*   3. 'conf_interval= confidence interval                                    */
  60 /*   4. 'method= assumption about the standard deviations, asymptotic.         */
  61 /*   5. 'hypotheses= null hypothesis and alternative                           */
  62 /*   6. 'statistic= statistic used in the procedure                            */
  63 /*   7. 'distribution= distribution and its parameters                         */
  64 /*   8. 'p_value= p-value of the sample statistic                              */
  65 test_mean(x,[select]):=
  66 block([numer:stats_numer, options, defaults, m, n, coef, cinterval, aux,
  67        statistic, method, distribution, hypo, pvalue, listarith:true],
  68
  69   /* controlling sample format */
  70   if not listofexpr(x) and not (matrixp(x) and length(x[1]) = 1 and every('identity,map('listofexpr,args(x))))
  71      then error("Sample 'x' should be a list with expressions or a column matrix")
  72      else n: length(x),
  73
  74   /* updating and controlling options */
  75   options:  ['mean, 'alternative, 'dev,     'conflevel, 'asymptotic],
  76   defaults: [0,     'twosided,    'unknown,  95/100,     false],
  77   for i in select do(
  78      aux: ?position(lhs(i),options),
  79      if numberp(aux) and aux <= length(options) and aux >= 1
  80         then defaults[aux]: rhs(i)),
  81   if not member(defaults[2],['twosided, 'greater, 'less])
  82      then error("Option 'alternative' is not correct"),
  83   if member(sign(defaults[3]), ['neg, 'zero, 'nz])
  84      then error("Option 'dev' can't be negative nor zero"),
  85   if numberp(defaults[4]) and (defaults[4] <= 0 or defaults[4] >= 1)
  86      then error("Option 'conflevel' can't be outside interval (0,1)"),
  87   if not member(defaults[5],[true, false])
  88      then error("Option 'asymptotic' must be true or false"),
  89
  90   /* mean estimate */
  91   m: mean(x),
  92   if listp(m) then m: m[1],
  93
  94   /* coef: standard_deviation / sqrt(n) */
  95   if /*standard_deviation*/ defaults[3] = 'unknown
  96      then (coef: std1(x) / sqrt(n),
  97            if listp(coef) then coef: coef[1])
  98      else coef: defaults[3] / sqrt(n),
  99
 100   /* method */
 101   method: concat(if /*asymptotic*/ defaults[5] = true
 102                      then "Large sample z-test. "
 103                      else "Exact t-test. ",
 104                   if /*standard_deviation*/ defaults[3] = 'unknown
 105                      then "Unknown variance."
 106                      else "Known variance."),
 107
 108   /* confidence interval (one and two-sided), distribution and */
 109   /* p-value for alternative=less                              */
 110   if float(coef) = 0.0
 111   then statistic: distribution: pvalue: cinterval: hypo: 'undefined
 112   else(
 113     /* statistic */
 114     statistic: (m - defaults[1]) / coef,
 115     if /*alternative*/ defaults[2] = 'twosided
 116        then statistic: abs(statistic),
 117     if /*asymptotic*/ defaults[5] = false and
 118        /*standard_deviation*/ defaults[3] = 'unknown
 119        then (distribution: ['student_t, n-1],
 120              pvalue: cdf_student_t(statistic,n-1),
 121              if /*alternative*/ defaults[2] = 'greater
 122                 then cinterval: [m - quantile_student_t(defaults[4], n-1) * coef,'inf]
 123                 else if /*alternative*/ defaults[2] = 'less
 124                         then cinterval: ['minf, m + quantile_student_t(defaults[4], n-1) * coef]
 125                         else cinterval: m + [-1,1] * quantile_student_t((1 + defaults[4])/2, n-1) * coef)
 126        else (distribution: ['normal, 0, 1],
 127              pvalue: cdf_normal(statistic,0,1),
 128              if /*alternative*/ defaults[2] = 'greater
 129                 then cinterval: [m - quantile_normal(defaults[4],0,1) * coef, 'inf]
 130                 else if /*alternative*/ defaults[2] = 'less
 131                         then cinterval: ['minf, m + quantile_normal(defaults[4],0,1) * coef]
 132                         else cinterval: m + [-1,1] * quantile_normal((1 + defaults[4])/2,0,1) * coef),
 133
 134     /* hypotheses, pvalue (for alternative=greater and alternative=twosided) */
 135     aux: string(defaults[1]),
 136     if /*alternative*/ defaults[2] = 'greater
 137        then (hypo: concat("H0: mean = ", aux," , H1: mean > ", aux),
 138              pvalue: 1 - pvalue )
 139        else if /*alternative*/ defaults[2] = 'less
 140                then  hypo: concat("H0: mean = ", aux, " , H1: mean < ", aux)
 141                else (hypo: concat("H0: mean = ", aux, " , H1: mean # ", aux),
 142                      pvalue: 2 * (1 - pvalue) ) ),
 143
 144   /* result as an 'inference_result' object*/
 145   inference_result("MEAN TEST",
 146                  [  ['mean_estimate, m],
 147                     ['conf_level, defaults[4]],
 148                     ['conf_interval, cinterval],
 149                     ['method, method],
 150                     ['hypotheses, hypo],
 151                     ['statistic, statistic],
 152                     ['distribution, distribution],
 153                     ['p_value, pvalue]  ],
 154                  [1,2,3,4,5,6,7,8])    )$
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164 /* This is the difference of means test. The first two arguments 'x1' and 'x2' */
 165 /* are lists or column matrices with expressions, generally numbers.           */
 166 /* Admits the following options:                                               */
 167 /*   'alternative='twosided: this is the alternative hypothesis H1; valid      */
 168 /*            values are: 'twosided, 'greater (m1>m2) and less (m1<m2).        */
 169 /*   'dev1='unknown: this is the value of the standard deviation of the x1     */
 170 /*            sample when it is known; valid values are: 'unknown, a symbol or */
 171 /*            a positive number                                                */
 172 /*   'dev2='unknown: this is the value of the standard deviation of the x2     */
 173 /*            sample when it is known; valid values are: 'unknown, a symbol or */
 174 /*            a positive number                                                */
 175 /*   'varequal=false: whether variances are equal or not                       */
 176 /*   'conflevel=95/100: confidence level of the confidence interval; valid     */
 177 /*           values are: a symbol or an expression which takes a value in (0,1)*/
 178 /*   'asymptotic=false: whether it performs an exact t-test or an asymptotic   */
 179 /*           one; valid values are true and false                              */
 180 /*                                                                             */
 181 /* The output of this function is an 'inference_result' object                 */
 182 /* with the following results:                                                 */
 183 /*   1. 'diff_estimate= difference of means estimate (m1-m2)                   */
 184 /*   2. 'conf_level= confidence level                                          */
 185 /*   3. 'conf_interval= confidence interval                                    */
 186 /*   4. 'method: assumptions about the standard deviations, asymptotic.         */
 187 /*   5. 'hypotheses: null hypothesis and alternative                            */
 188 /*   6. 'statistic: statistic used in the procedure                             */
 189 /*   7. 'distribution: distribution and its parameters                          */
 190 /*   8. 'p_value: p-value of the sample statistic                               */
 191 test_means_difference(x1,x2,[select]):=
 192 block([numer:stats_numer, options, defaults, dm, n1, n2, v1, v2, coef, cinterval, aux, df,
 193        statistic, method, distribution, hypo, pvalue, listarith:true],
 194
 195   /* controlling sample format */
 196   if not listofexpr(x1) and not (matrixp(x1) and length(x1[1]) = 1 and every('identity,map('listofexpr,args(x1))))
 197      then error("Sample 'x1' should be a list with expressions or a column matrix")
 198      else n1: length(x1),
 199   if not listofexpr(x2) and not (matrixp(x2) and length(x2[1]) = 1 and every('identity,map('listofexpr,args(x2))))
 200      then error("Sample 'x2' should be a list with expressions or a column matrix")
 201      else n2: length(x2),
 202
 203   /* updating and controlling options */
 204   options:  ['alternative, 'dev1,    'dev2,    'varequal, 'conflevel, 'asymptotic],
 205   defaults: ['twosided,    'unknown, 'unknown, false,      95/100,     false],
 206   for i in select do(
 207      aux: ?position(lhs(i),options),
 208      if numberp(aux) and aux <= length(options) and aux >= 1
 209         then defaults[aux]: rhs(i)),
 210   if not member(defaults[1],['twosided, 'greater, 'less])
 211      then error("Option 'alternative' is not correct"),
 212   if member(sign(defaults[2]), ['neg, 'zero, 'nz]) or
 213      member(sign(defaults[3]), ['neg, 'zero, 'nz])
 214      then error("Option 'dev' can't be negative nor zero"),
 215   /* in the next two lines, ignorance is contagious */
 216   if defaults[2] = 'unknown then defaults[3]: 'unknown,
 217   if defaults[3] = 'unknown then defaults[2]: 'unknown,
 218   if not member(defaults[4],[true, false])
 219      then error("Option 'varequal' must be true or false"),
 220   if numberp(defaults[5]) and (defaults[5] <= 0 or defaults[5] >= 1)
 221      then error("Option 'conflevel' can't be outside interval (0,1)"),
 222   if not member(defaults[6],[true, false])
 223      then error("Option 'asymptotic' must be true or false"),
 224
 225   /* difference of means estimate */
 226   dm: mean(x1) - mean(x2),
 227   if listp(dm) then dm: dm[1],
 228
 229   /* coef: standard_deviation / sqrt(n) */
 230   if /*standard deviations*/ not defaults[2] = 'unknown
 231      then coef: sqrt(defaults[2]^2 / n1 + defaults[3]^2 / n2)
 232      else (v1: var1(x1),
 233            if listp(v1) then v1: v1[1],
 234            v2: var1(x2),
 235            if listp(v2) then v2: v2[1],
 236            if /*varequal*/ defaults[4] = true and
 237               /*asymptotic*/ defaults[6] = false
 238               then coef: sqrt(((n1-1)*v1 + (n2-1)*v2) / (n1+n2-2) * (1/n1+1/n2))
 239               else coef: sqrt(v1 / n1 + v2 / n2)),
 240   if listp(coef) then coef: coef[1],
 241
 242   /* method and Welch approximation */
 243   method: concat(if /*asymptotic*/ defaults[6] = true
 244                      then "Asymptotic z-test (for large samples). "
 245                      else "Exact t-test. ",
 246                   if /*standard deviations*/ defaults[2] = 'unknown
 247                      then if /*varequal*/ defaults[4] = true
 248                              then "Unknown equal variances"
 249                              else "Welch approx."
 250                      else "Known variances."),
 251
 252   if float(coef) = 0.0
 253     then statistic: distribution: pvalue: cinterval: hypo: 'undefined
 254     else (
 255       /* statistic */
 256       statistic: dm / coef,
 257       if /*alternative*/ defaults[1] = 'twosided
 258          then statistic: abs(statistic),
 259
 260       if /*asymptotic*/ defaults[6] = false and
 261          /*standard deviations*/ defaults[2] = 'unknown and
 262          /*varequal*/ defaults[4] = false
 263          then /* Welch approximation */
 264               df: (v1/n1+v2/n2)^2 / ((v1/n1)^2/(n1-1) + (v2/n2)^2/(n2-1))
 265          else df: n1 + n2 - 2,
 266
 267       /* confidence interval, distribution and p-value */
 268       if /*asymptotic*/ defaults[6] = false and
 269          /*standard deviations*/ defaults[2] = 'unknown
 270          then (distribution: ['student_t, df],
 271                pvalue: cdf_student_t(statistic,df),
 272                /* two or one-sided confidence interval */
 273                if /*alternative*/ defaults[1] = 'greater
 274                  then cinterval: [dm-quantile_student_t(defaults[5], df) * coef,'inf]
 275                  else if /*alternative*/ defaults[1] = 'less
 276                         then cinterval: ['minf, dm+quantile_student_t(defaults[5], df) * coef]
 277                         else cinterval: dm + [-1,1] * quantile_student_t((1 + defaults[5])/2, df) * coef )
 278          else (distribution: ['normal, 0, 1],
 279                pvalue: cdf_normal(statistic, 0, 1),
 280                if /*alternative*/ defaults[1] = 'greater
 281                  then cinterval: [dm-quantile_normal(defaults[5],0,1) * coef,'inf]
 282                  else if /*alternative*/ defaults[1] = 'less
 283                         then cinterval: ['minf, dm+quantile_normal(defaults[5],0,1) * coef]
 284                         else cinterval: dm + [-1,1] * quantile_normal((1 + defaults[5])/2, 0, 1) * coef ),
 285
 286       /* hypotheses, pvalue (for alternative=greater and alternative=twosided) */
 287       if /*alternative*/ defaults[1] = 'greater
 288          then (hypo: "H0: mean1 = mean2 , H1: mean1 > mean2",
 289                pvalue: 1 - pvalue )
 290          else if /*alternative*/ defaults[1] = 'less
 291                  then hypo: "H0: mean1 = mean2 , H1: mean1 < mean2"
 292                  else (hypo: "H0: mean1 = mean2 , H1: mean1 # mean2",
 293                        pvalue: 2 * (1 - pvalue) ) ),
 294
 295   /* result as an 'inference_result' object*/
 296   inference_result("DIFFERENCE OF MEANS TEST",
 297                  [  ['diff_estimate, dm],
 298                     ['conf_level, defaults[5]],
 299                     ['conf_interval, cinterval],
 300                     ['method, method],
 301                     ['hypotheses, hypo],
 302                     ['statistic, statistic],
 303                     ['distribution, distribution],
 304                     ['p_value, pvalue]  ],
 305                  [1,2,3,4,5,6,7,8])     )$
 306
 307
 308
 309
 310
 311
 312
 313
 314 /* This is the variance test for a normal population. The first argument 'x'   */
 315 /* is a list or a column matrix of expressions (one sample)                    */
 316 /* Admits the following options:                                               */
 317 /*   'mean='unknown: this is the value of the population's mean when it is     */
 318 /*            known; valid values are: 'unknown, a symbol or a number          */
 319 /*   'alternative='twosided: this is the alternative hypothesis H1; valid      */
 320 /*            values are: 'twosided, 'greater and 'less.                       */
 321 /*   'variance=1: this is the value of the variance to be checked.             */
 322 /*   'conflevel=95/100: confidence level of the confidence interval; valid     */
 323 /*           values are: a symbol or an expression which takes a value in (0,1)*/
 324 /*                                                                             */
 325 /* The output of this function is an 'inference_result' object with the        */
 326 /* following results:                                                          */
 327 /*   1. 'var_estimate= variance estimate                                       */
 328 /*   2. 'conf_level= confidence level                                          */
 329 /*   3. 'conf_interval= confidence interval                                    */
 330 /*   4. 'method: method and assumptions                                         */
 331 /*   5. 'hypotheses: null hypothesis and alternative                            */
 332 /*   6. 'statistic: statistic used in the procedure                             */
 333 /*   7. 'distribution: distribution and its parameters                          */
 334 /*   8. 'p_value: p-value of the sample statistic                               */
 335 test_variance(x,[select]):=
 336 block([numer:stats_numer, options, defaults, s2, n, coef, df, cinterval, aux,
 337        statistic, method, distribution, hypo, pvalue, listarith:true],
 338
 339   /* controlling sample format */
 340   if not listofexpr(x) and not (matrixp(x) and length(x[1]) = 1 and every('identity,map('listofexpr,args(x))))
 341      then error("Sample 'x' should be a list with expressions or a column matrix")
 342      else n: length(x),
 343
 344   /* updating and controlling options */
 345   options:  ['mean,    'alternative, 'variance, 'conflevel],
 346   defaults: ['unknown, 'twosided,    1,          95/100],
 347   for i in select do(
 348      aux: ?position(lhs(i),options),
 349      if numberp(aux) and aux <= length(options) and aux >= 1
 350         then defaults[aux]: rhs(i)),
 351   if not member(defaults[2],['twosided, 'greater, 'less])
 352      then error("Option 'alternative' is not correct"),
 353   if member(sign(defaults[3]), ['neg, 'zero, 'nz])
 354      then error("Option 'variance' can't be nonpositive"),
 355   if numberp(defaults[4]) and (defaults[4] <= 0 or defaults[4] >= 1)
 356      then error("Option 'conflevel' can't be outside interval (0,1)"),
 357
 358   /* sample statistic's numerator (coef), degrees of freedom, */
 359   /* confidence interval, distribution and variance estimate  */
 360   if /*mean*/ defaults[1] = 'unknown
 361      then (s2: var1(x),
 362            if listp(s2) then s2: s2[1],
 363            df: n - 1)
 364      else (s2: mean((x - defaults[1])^2),
 365            df: n),
 366   coef: df * s2,
 367
 368   /* distribution, confidence interval and statistic */
 369   distribution: ['chi2, df],
 370   if /*alternative*/ defaults[2] = 'greater
 371      then cinterval: [coef / quantile_chi2(defaults[4],df), 'inf]
 372      else if /*alternative*/ defaults[2] = 'less
 373              then cinterval: [0, coef / quantile_chi2(1-defaults[4],df)]
 374              else cinterval: coef / [quantile_chi2((1+defaults[4])/2,df),
 375                                      quantile_chi2((1-defaults[4])/2,df)],
 376   statistic: coef / defaults[3],
 377
 378   /* method */
 379   method: concat("Variance Chi-square test. ",
 380                   if /*mean*/ defaults[1] = 'unknown
 381                      then "Unknown mean."
 382                      else "Known mean."),
 383
 384   /* hypotheses, pvalue */
 385   pvalue: cdf_chi2(statistic,df), /* pvalue for alternative=less */
 386   aux: string(defaults[3]),
 387   if /*alternative*/ defaults[2] = 'greater
 388      then (hypo: concat("H0: var = ", aux, " , H1: var > ", aux),
 389            pvalue: 1 - pvalue )
 390      else if /*alternative*/ defaults[2] = 'less
 391              then hypo: concat("H0: var = ", aux," , H1: var < ", aux)
 392              else (hypo: concat("H0: var = ", aux," , H1: var # ", aux),
 393                    if /* compares the sample statistics to the median */
 394                       statistic <= quantile_chi2(1/2,df)
 395                      then pvalue: 2 * pvalue
 396                      else pvalue: 2 * (1 - pvalue) ),
 397
 398   /* result as an 'inference_result' object*/
 399   inference_result("VARIANCE TEST",
 400                  [  ['var_estimate, s2],
 401                     ['conf_level, defaults[4]],
 402                     ['conf_interval, cinterval],
 403                     ['method, method],
 404                     ['hypotheses, hypo],
 405                     ['statistic, statistic],
 406                     ['distribution, distribution],
 407                     ['p_value, pvalue]  ],
 408                  [1,2,3,4,5,6,7,8])    )$
 409
 410
 411
 412
 413
 414
 415
 416 /* This is the variance ratio test. The first two arguments 'x1' and 'x2'       */
 417 /* are lists or column matrices with expressions, generally numbers.            */
 418 /* Admits the following options:                                                */
 419 /*   'alternative='twosided: this is the alternative hypothesis H1; valid       */
 420 /*            values are: 'twosided, 'greater (m1>m2) and less (m1<m2).         */
 421 /*   'mean1='unknown: this is the value of the mean in the x1 sample when it    */
 422 /*            is known; valid values are: 'unknown, a symbol or number          */
 423 /*   'mean2='unknown: this is the value of the mean in the x2 sample when it    */
 424 /*            is known; valid values are: 'unknown, a symbol or number          */
 425 /*   'conflevel=95/100: confidence level of the confidence interval; valid      */
 426 /*           values are: a symbol or an expression which takes a value in (0,1) */
 427 /*                                                                              */
 428 /* The output of this function is an 'inference_result' object                  */
 429 /* with the following results:                                                  */
 430 /*   1. 'ratio_estimate= variance ratio estimate (variance1/variance2)          */
 431 /*   2. 'conf_level= confidence level                                           */
 432 /*   3. 'conf_interval= confidence interval                                     */
 433 /*   4. 'method: assumptions about the means.                                   */
 434 /*   5. 'hypotheses: null hypothesis and alternative                            */
 435 /*   6. 'statistic: statistic used in the procedure                             */
 436 /*   7. 'distribution: distribution and its parameters                          */
 437 /*   8. 'p_value: p-value of the sample statistic                               */
 438 test_variance_ratio(x1,x2,[select]):=
 439 block([numer:stats_numer, options, defaults, v1, v2, vr, n1, n2, t1, t2, df1, df2,
 440        cinterval, aux, statistic, method, distribution, hypo, pvalue, listarith:true],
 441
 442   /* controlling sample format */
 443   if not listofexpr(x1) and not (matrixp(x1) and length(x1[1]) = 1 and every('identity,map('listofexpr,args(x1))))
 444      then error("Sample 'x1' should be a list with expressions or a column matrix")
 445      else (n1: length(x1)),
 446   if not listofexpr(x2) and not (matrixp(x2) and length(x2[1]) = 1 and every('identity,map('listofexpr,args(x2))))
 447      then error("Sample 'x2' should be a list with expressions or a column matrix")
 448      else n2: length(x2),
 449
 450   /* updating and controlling options */
 451   options:  ['alternative, 'mean1,   'mean2,  'conflevel],
 452   defaults: ['twosided,    'unknown, 'unknown, 95/100],
 453   for i in select do(
 454      aux: ?position(lhs(i),options),
 455      if numberp(aux) and aux <= length(options) and aux >= 1
 456         then defaults[aux]: rhs(i)),
 457   if not member(defaults[1],['twosided, 'greater, 'less])
 458      then error("Option 'alternative' is not correct"),
 459   /* in the next two lines, ignorance about the means is contagious */
 460   if defaults[2] = 'unknown then defaults[3]: 'unknown,
 461   if defaults[3] = 'unknown then defaults[2]: 'unknown,
 462   if numberp(defaults[4]) and (defaults[4] <= 0 or defaults[4] >= 1)
 463      then error("Option 'conflevel' can't be outside interval (0,1)"),
 464
 465   /* method */
 466   method: concat("Variance ratio F-test. ",
 467                   if /*means*/ defaults[2] = 'unknown
 468                      then "Unknown means."
 469                      else "Known means."),
 470   v1: var1(x1),
 471   v2: var1(x2),
 472
 473   if float(v2) = 0.0
 474     then vr: statistic: distribution: pvalue: cinterval: hypo: 'undefined
 475   else (
 476     /* variance ratio estimate, degrees of freedom, */
 477     if /*means*/ defaults[2] = 'unknown
 478        then (vr: v1 / v2,
 479              if listp(vr) then vr: vr[1],
 480              df1: n1 - 1,
 481              df2: n2 - 1)
 482        else (t1: mean((x1 - defaults[2])^2),
 483              t2: mean((x2 - defaults[3])^2),
 484              vr: t1 / t2,
 485              df1: n1,
 486              df2: n2),
 487
 488     /* distribution, confidence interval and statistic */
 489     distribution: ['f, df1, df2],
 490     if /*alternative*/ defaults[1] = 'greater
 491        then cinterval: [vr / quantile_f(defaults[4],df1,df2), 'inf]
 492        else if /*alternative*/ defaults[1] = 'less
 493                then cinterval: [0, vr / quantile_f(1-defaults[4],df1,df2)]
 494                else cinterval: vr / [quantile_f((1+defaults[4])/2,df1,df2),
 495                                      quantile_f((1-defaults[4])/2,df1,df2)],
 496     statistic: vr,
 497
 498     /* hypotheses, pvalue */
 499     pvalue: cdf_f(statistic,df1,df2), /* pvalue for alternative=less */
 500     if /*alternative*/ defaults[1] = 'greater
 501        then (hypo: "H0: var1 = var2 , H1: var1 > var2",
 502              pvalue: 1 - pvalue )
 503        else if /*alternative*/ defaults[1] = 'less
 504                then hypo: "H0: var1 = var2 , H1: var1 < var2"
 505                else (hypo: "H0: var1 = var2 , H1: var1 # var2",
 506                      if /* compares the sample statistics to the median */
 507                         statistic <= quantile_f(1/2,df1,df2)
 508                        then pvalue: 2 * pvalue
 509                        else pvalue: 2 * (1 - pvalue))),
 510
 511   /* result as an 'inference_result' object*/
 512   inference_result("VARIANCE RATIO TEST",
 513                  [  ['ratio_estimate, vr],
 514                     ['conf_level, defaults[4]],
 515                     ['conf_interval, cinterval],
 516                     ['method, method],
 517                     ['hypotheses, hypo],
 518                     ['statistic, statistic],
 519                     ['distribution, distribution],
 520                     ['p_value, pvalue]  ],
 521                  [1,2,3,4,5,6,7,8])     )$
 522
 523
 524
 525
 526
 527
 528
 529 /* This is the non parametric sign test for the median. Argument 'x' is a         */
 530 /* list or column matrix with expressions, generally numbers.                     */
 531 /* Admits the following option:                                                   */
 532 /*   'alternative='twosided: this is the alternative hypothesis H1; valid         */
 533 /*            values are: 'twosided, 'greater (med1>median) and less (med1<median)*/
 534 /*   'median=0: the median value to be checked                                    */
 535 /*                                                                                */
 536 /* The output of this function is an 'inference_result' object                    */
 537 /* with the following results:                                                    */
 538 /*   1. 'med_estimate= median estimate                                            */
 539 /*   2. 'method= assumptions about the means.                                     */
 540 /*   3. 'hypotheses= null hypothesis and alternative                              */
 541 /*   4. 'statistic= statistic used in the procedure                               */
 542 /*   5. 'distribution= distribution and its parameters                            */
 543 /*   6. 'p_value= p-value of the sample statistic                                 */
 544 test_sign(x,[select]):=
 545 block([numer:stats_numer, options, defaults, med, n, npos, aux, xm,
 546        statistic, method, distribution, hypo, pvalue, listarith:true],
 547
 548   /* controlling sample format */
 549   if not listofexpr(x) and not (matrixp(x) and length(x[1]) = 1 and every('identity,map('listofexpr,args(x))))
 550      then error("Sample 'x' should be a list with expressions or a column matrix"),
 551
 552   /* updating and controlling options */
 553   options:  ['alternative, 'median],
 554   defaults: ['twosided,    0],
 555   for i in select do(
 556      aux: ?position(lhs(i),options),
 557      if numberp(aux) and aux <= length(options) and aux >= 1
 558         then defaults[aux]: rhs(i)),
 559   if not member(defaults[1],['twosided, 'greater, 'less])
 560      then error("Option 'alternative' is not correct"),
 561
 562   /* median estimate */
 563   med: median(x),
 564   if listp(med) then med: med[1],
 565
 566   /* statistic */
 567   xm: x - defaults[2],
 568   if matrixp(xm) then xm: transpose(xm)[1],
 569   statistic: apply("+", map(lambda([z], if is(z<0) then 1 else 0), xm)),
 570   npos: apply("+", map(lambda([z], if is(z>0) then 1 else 0), xm)),
 571   n: statistic + npos,
 572
 573   /* method */
 574   method: "Non parametric sign test.",
 575
 576   /* distribution */
 577   distribution: ['binomial, n, 1/2],
 578
 579   /* hypotheses, pvalue */
 580   aux: string(defaults[2]),
 581   if /*alternative*/ defaults[1] = 'greater
 582      then (hypo: concat("H0: median = ", aux," , H1: median > ", aux),
 583            pvalue: 1 - cdf_binomial(statistic,n,1/2) )
 584      else if /*alternative*/ defaults[1] = 'less
 585              then (hypo: concat("H0: median = ", aux," , H1: median < ", aux),
 586                    pvalue: cdf_binomial(statistic,n,1/2) )
 587              else (hypo: concat("H0: median = ", aux," , H1: median # ", aux),
 588                    if statistic < n/2
 589                       then pvalue: 2 * cdf_binomial(statistic,n,1/2)
 590                       else pvalue: 2 * (1 - cdf_binomial(statistic,n,1/2)) ),
 591
 592   /* result as an 'inference_result' object*/
 593   inference_result("SIGN TEST",
 594                  [  ['med_estimate, med],
 595                     ['method, method],
 596                     ['hypotheses, hypo],
 597                     ['statistic, statistic],
 598                     ['distribution, distribution],
 599                     ['p_value, pvalue]  ],
 600                  [1,2,3,4,5,6]) )$
 601
 602
 603
 604
 605
 606
 607 /* This is the signed rank test to make inferences about the median of a        */
 608 /* continuous population. Argument 'x' is a list or column matrix with          */
 609 /* expressions, generally numbers. Performs normal approximation if the         */
 610 /* sample size is >20, or if there are zeroes or ties. (Cuadras, 13.2;          */
 611 /* R, wilcox.test.R)                                                            */
 612 /* Admits the following option:                                                 */
 613 /*   'median=0: this is the value of the median to be checked.                  */
 614 /*   'alternative='twosided: this is the alternative hypothesis H1; valid       */
 615 /*            values are: 'twosided, 'greater (med1>med2) and less (med1<med2). */
 616 /* The output of this function is an 'inference_result' object                  */
 617 /* with the following results:                                                  */
 618 /*   1. 'med_estimate= median estimate                                          */
 619 /*   2. 'method= assumptions about the means.                                   */
 620 /*   3. 'hypotheses= null hypothesis and alternative                            */
 621 /*   4. 'statistic= statistic used in the procedure                             */
 622 /*   5. 'distribution= distribution and its parameters                          */
 623 /*   6. 'p_value= p-value of the sample statistic                               */
 624 test_signed_rank(x,[select]):=
 625 block([numer:stats_numer, options, defaults, n, aux, med, zeroes:false,
 626        pos, nequal, npositive, rank, ties:[], sigma, mu, noties, statistic,
 627        method, distribution, hypo, pvalue, listarith:true],
 628
 629   /* controlling sample format */
 630   if not listofexpr(x) and not (matrixp(x) and length(x[1]) = 1 and every('identity,map('listofexpr,args(x))))
 631      then error("Sample 'x' should be a list with expressions or a column matrix")
 632      else n: length(x),
 633
 634   /* updating and controlling options */
 635   options:  ['median, 'alternative],
 636   defaults: [0,       'twosided],
 637   for i in select do(
 638      aux: ?position(lhs(i),options),
 639      if numberp(aux) and aux <= length(options) and aux >= 1
 640         then defaults[aux]: rhs(i)),
 641   if not member(defaults[2],['twosided, 'greater, 'less])
 642      then error("Option 'alternative' is not correct"),
 643
 644   /* sample size and median estimate */
 645   n: length(x),
 646   med: median(x),
 647   if listp(med) then med: med[1],
 648
 649   /* statistic */
 650   x: x - defaults[1], /* sustract the median to be checked */
 651   /* drop zeroes */
 652   x: sublist(x, lambda([z], is(float(z) # 0.0))),
 653   if length(x) < n
 654     then (zeroes: true,
 655           n: length(x)),
 656   x: sort(makelist([x[k],abs(x[k])],k,1,n),
 657           lambda([u,v], orderlessp(u[2], v[2]))),
 658   statistic: 0,
 659   pos:1,
 660   while pos <= n do
 661      (nequal: 1,
 662       if x[pos][1] > 0
 663          then npositive: 1
 664          else npositive: 0,
 665       rank: pos,
 666       while pos+nequal <= n and x[pos+nequal][2] = x[pos][2] do
 667         (if x[pos+nequal][1] > 0 then npositive: npositive + 1,
 668          rank: rank + pos + nequal,
 669          nequal: nequal + 1),
 670       statistic: statistic + npositive * rank / nequal,
 671       pos: pos + nequal,
 672       ties: cons(nequal, ties)),
 673
 674   /* pvalue, method, distribution */
 675   noties: every(lambda([z],z=1), ties),
 676   mu: n * (n + 1) / 4,
 677   if n < 20 and noties and not zeroes
 678     then (/* performs exact test */
 679           method: "Exact test",
 680           distribution: ['signed_rank, n],
 681           if /*alternative*/ defaults[2] = 'twosided
 682             then (if statistic > mu
 683                    then pvalue: 1 - cdf_signed_rank(statistic-1, n)
 684                    else pvalue: cdf_signed_rank(statistic, n),
 685                   pvalue: min(2*pvalue, 1) )
 686             else if /*alternative*/ defaults[2] = 'greater
 687                    then pvalue: 1 - cdf_signed_rank(statistic-1, n)
 688                    else pvalue: cdf_signed_rank(statistic, n) )
 689     else (/* asymptotic test */
 690           method: "Asymptotic test",
 691           if not noties then method: concat(method, ". Ties"),
 692           if zeroes then method: concat(method, ". Zeroes"),
 693           sigma: sqrt(mu * (2*n + 1) / 6 - apply("+",ties^3-ties) / 48),
 694           if /*alternative*/ defaults[2] = 'twosided
 695             then (pvalue: cdf_normal(statistic, mu + signum(statistic - mu)/2, sigma),
 696                   pvalue: 2 * min(pvalue, 1 - pvalue),
 697                   distribution: ['normal, mu + signum(statistic - mu)/2, sigma])
 698             else if /*alternative*/ defaults[2] = 'greater
 699                    then (pvalue: 1 - cdf_normal(statistic, mu+1/2, sigma),
 700                          distribution: ['normal, mu + 1/2, sigma] )
 701                    else pvalue: (cdf_normal(statistic, mu-1/2, sigma),
 702                                  distribution: ['normal, mu-1/2, sigma]   )),
 703
 704   /* hypotheses */
 705   aux: string(defaults[1]),
 706   if /*alternative*/ defaults[2] = 'greater
 707      then hypo: concat("H0: med = ", aux," , H1: med > ", aux)
 708      else if /*alternative*/ defaults[2] = 'less
 709              then hypo: concat("H0: med = ", aux," , H1: med < ", aux)
 710              else hypo: concat("H0: med = ", aux," , H1: med # ", aux),
 711
 712   /* result as an 'inference_result' object*/
 713   inference_result("SIGNED RANK TEST",
 714                  [  ['med_estimate, med],
 715                     ['method, method],
 716                     ['hypotheses, hypo],
 717                     ['statistic, statistic],
 718                     ['distribution, distribution],
 719                     ['p_value, pvalue] ],
 720                  [1,2,3,4,5,6])     )$
 721
 722
 723
 724
 725
 726
 727
 728 /* This is the Wilcoxon-Mann-Whitney test to compare the medians of two         */
 729 /* independent samples taken from two continuous populations. The first two     */
 730 /* arguments 'x1' and 'x2' are lists or column matrices with expressions,       */
 731 /* generally numbers. Performs normal approximation if any of the sample sizes  */
 732 /* is >10, or if there are ties. (Cuadras, 13.3; R, wilcox.test.R)              */
 733 /* Admits the following option:                                                 */
 734 /*   'alternative='twosided: this is the alternative hypothesis H1; valid       */
 735 /*            values are: 'twosided, 'greater (med1>med2) and less (med1<med2). */
 736 /* The output of this function is an 'inference_result' object                  */
 737 /* with the following results:                                                  */
 738 /*   1. 'method= type of test.                                                   */
 739 /*   2. 'hypotheses: null hypothesis and alternative                             */
 740 /*   3. 'statistic: statistic used in the procedure                              */
 741 /*   4. 'distribution: sample statistic distribution.                            */
 742 /*   5. 'p_value: p-value of the sample statistic                                */
 743 test_rank_sum(x1,x2,[select]):=
 744 block([numer:stats_numer, options, defaults, n1, n2, n, aux, ordered, pos, nequal, nfirst, rank, ties:[],
 745        sigma, mu, noties, statistic, method, distribution, hypo, pvalue, listarith:true],
 746
 747   /* controlling sample format */
 748   if not listofexpr(x1) and not (matrixp(x1) and length(x1[1]) = 1 and every('identity,map('listofexpr,args(x1))))
 749      then error("Sample 'x1' should be a list with expressions or a column matrix")
 750      else (n1: length(x1)),
 751   if not listofexpr(x2) and not (matrixp(x2) and length(x2[1]) = 1 and every('identity,map('listofexpr,args(x2))))
 752      then error("Sample 'x2' should be a list with expressions or a column matrix")
 753      else n2: length(x2),
 754
 755   /* updating and controlling options */
 756   options:  ['alternative],
 757   defaults: ['twosided],
 758   for i in select do(
 759      aux: ?position(lhs(i),options),
 760      if numberp(aux) and aux <= length(options) and aux >= 1
 761         then defaults[aux]: rhs(i)),
 762   if not member(defaults[1],['twosided, 'greater, 'less])
 763      then error("Option 'alternative' is not correct"),
 764
 765   /* sample sizes */
 766   n1: length(x1),
 767   n2: length(x2),
 768   n: n1 + n2,
 769
 770   /* statistic W: both samples are combined in one    */
 771   /* ordered list and ranks of the second sample are  */
 772   /* added. In case of ties, ranks of equal numbers   */
 773   /* are averaged and the p value is computed by the  */
 774   /* normal approximation. [Cuadras, p. 13.13]        */
 775   statistic: 0,
 776   ordered: sort(append(makelist([x1[i],1],i,1,n1),
 777                        makelist([x2[i],2],i,1,n2)),
 778                 lambda([u,v], orderlessp(u[1], v[1]))),
 779   pos:1,
 780   while pos <= n do
 781      (nequal: 1,
 782       if ordered[pos][2] = 2
 783          then nfirst: 0
 784          else nfirst: 1,
 785       rank: pos,
 786       while pos+nequal <= n and ordered[pos+nequal][1] = ordered[pos][1] do
 787          (if ordered[pos+nequal][2] = 1 then nfirst: nfirst + 1,
 788           rank: rank + pos + nequal,
 789           nequal: nequal + 1),
 790       statistic: statistic + nfirst * rank / nequal,
 791       pos: pos + nequal,
 792       ties: cons(nequal, ties)),
 793   /* convert to Mann-Whitney statistic */
 794   statistic: statistic - n1 * (n1 + 1) / 2,
 795
 796   /* pvalue, method, distribution */
 797   noties: every(lambda([z],z=1), ties),
 798   mu: n1 * n2 / 2,
 799   if n1 < 10 and n2 < 10 and noties
 800     then (/* performs exact test */
 801           method: "Exact test",
 802           distribution: ['rank_sum, n1, n2],
 803           if /*alternative*/ defaults[1] = 'twosided
 804             then (if statistic > mu
 805                    then pvalue: 1 - cdf_rank_sum(statistic-1, n1, n2)
 806                    else pvalue: cdf_rank_sum(statistic, n1, n2),
 807                   pvalue: min(2*pvalue, 1) )
 808             else if /*alternative*/ defaults[1] = 'greater
 809                    then pvalue: 1 - cdf_rank_sum(statistic-1, n1, n2)
 810                    else pvalue: cdf_rank_sum(statistic, n1, n2) )
 811     else (/* asymptotic test */
 812           method: "Asymptotic test",
 813           if not noties then method: concat(method, ". Ties"),
 814           sigma: sqrt(mu / 6 * (n + 1 - apply("+",ties^3-ties) / (n * (n-1)))),
 815           if /*alternative*/ defaults[1] = 'twosided
 816             then (pvalue: cdf_normal(statistic, mu + signum(statistic - mu)/2, sigma),
 817                   pvalue: 2 * min(pvalue, 1 - pvalue),
 818                   distribution: ['normal, mu + signum(statistic - mu)/2, sigma])
 819             else if /*alternative*/ defaults[1] = 'greater
 820                    then (pvalue: 1 - cdf_normal(statistic, mu+1/2, sigma),
 821                          distribution: ['normal, mu + 1/2, sigma] )
 822                    else (pvalue: cdf_normal(statistic, mu-1/2, sigma),
 823                          distribution: ['normal, mu-1/2, sigma]   )),
 824
 825   /* hypotheses */
 826   if /*alternative*/ defaults[1] = 'greater
 827      then hypo: "H0: med1 = med2 , H1: med1 > med2"
 828      else if /*alternative*/ defaults[1] = 'less
 829              then hypo: "H0: med1 = med2 , H1: med1 < med2"
 830              else hypo: "H0: med1 = med2 , H1: med1 # med2",
 831
 832   /* result as an 'inference_result' object*/
 833   inference_result("RANK SUM TEST",
 834                  [  ['method, method],
 835                     ['hypotheses, hypo],
 836                     ['statistic, statistic],
 837                     ['distribution, distribution],
 838                     ['p_value, pvalue] ],
 839                  [1,2,3,4,5])     )$
 840
 841
 842
 843
 844
 845
 846
 847
 848 /* This is the proportions test. The first argument 'x' is the number of        */
 849 /* successes, the second number 'n>=x' is the number of trials.                 */
 850 /* It admits the following options:                                             */
 851 /*   'proportion=1/2: this is the value of the proportion to be checked.        */
 852 /*   'alternative='twosided: this is the alternative hypothesis H1; valid       */
 853 /*            values are: 'twosided, 'greater and 'less.                        */
 854 /*   'conflevel=95/100: confidence level of the confidence interval; valid      */
 855 /*           values are: a symbol or an expression which takes a value in (0,1) */
 856 /*   'asymptotic=false: whether it performs an exact test based on the binomial */
 857 /*           distribution, or an asymptotic one based on the normal.            */
 858 /*   'correct=true: whether Yates correction must be applied in case of the     */
 859 /*           asymptotic calculation of the confidence interval.                 */
 860 /*                                                                             */
 861 /* The output of this function is an 'inference_result' object with the        */
 862 /* following results:                                                          */
 863 /*   1. 'sample_proportion= sample estimate for the mean                       */
 864 /*   2. 'conf_level= confidence level                                          */
 865 /*   3. 'conf_interval= confidence interval                                    */
 866 /*   4. 'method= type of test and whether Yates correction is applied.         */
 867 /*   5. 'hypotheses= null hypothesis and alternative                           */
 868 /*   6. 'statistic= statistic used in the procedure                            */
 869 /*   7. 'distribution= statistic's distribution and its parameters             */
 870 /*   8. 'p_value= p-value of this test                                         */
 871 test_proportion(x,n,[select]):=
 872 block([numer:stats_numer, options, defaults, aux, phat, method, statistic,
 873        distribution, cinterval, hypo, pvalue, alpha, pl:0, pu:1, stdev],
 874
 875   /* controlling input data */
 876   if integerp(x) and integerp(n) and (n < x or x < 0)
 877      then error("Input data must be 0 <= x <= n"),
 878
 879   /* updating and controlling options */
 880   options:  ['proportion, 'alternative, 'conflevel, 'correct, 'asymptotic],
 881   defaults: [1/2,         'twosided   ,  95/100,     true,    false],
 882   for i in select do(
 883      aux: ?position(lhs(i),options),
 884      if numberp(aux) and aux <= length(options) and aux >= 1
 885         then defaults[aux]: rhs(i)),
 886   if numberp(defaults[1]) and (defaults[1] <= 0 or defaults[1] >= 1)
 887      then error("Option 'proportion' is not correct"),
 888   if not member(defaults[2],['twosided, 'greater, 'less])
 889      then error("Option 'alternative' is not correct"),
 890   if numberp(defaults[3]) and (defaults[3] <= 0 or defaults[3] >= 1)
 891      then error("Option 'conflevel' can't be outside interval (0,1)"),
 892   if not member(defaults[4],[true, false])
 893      then error("Option 'correct' must be true or false"),
 894   if not member(defaults[5],[true, false])
 895      then error("Option 'asymptotic' must be true or false"),
 896
 897   /* proportion estimate */
 898   phat: x/n,
 899
 900   /* method */
 901   method: concat(if /*asymptotic*/ defaults[5] = true
 902                      then "Asymptotic test"
 903                      else "Exact binomial test",
 904                  if /*correct*/ defaults[4] and defaults[5]
 905                      then " with Yates correction."
 906                      else "."),
 907
 908   /* exact test */
 909   if /*asymptotic*/ defaults[5] = false
 910      then (statistic: x,
 911            distribution: ['binomial, n, defaults[1]],
 912            if /*alternative*/ defaults[2] = 'twosided
 913               then (alpha: (1 - defaults[3])/2,
 914                     if x # 0
 915                       then pl: quantile_beta(alpha, x, n-x+1),
 916                     if x # n
 917                       then pu: quantile_beta(1-alpha, x+1, n-x),
 918                     cinterval: [pl, pu],
 919                     block([m: n * defaults[1],
 920                            d: pdf_binomial(x, n, defaults[1]),
 921                            y: 0],
 922                       if sign(x - m) = 'zero
 923                         then pvalue: 1
 924                       elseif x < m
 925                         then (for k: ceiling(m) thru n do
 926                                 if pdf_binomial(k, n, defaults[1]) <= d
 927                                   then y: y+1,
 928                               pvalue: cdf_binomial(x, n, defaults[1]) +
 929                                       1 - cdf_binomial(n-y, n, defaults[1]))
 930                         else (for k: 0 thru floor(m) do
 931                                 if pdf_binomial(k, n, defaults[1]) <= d
 932                                   then y: y+1,
 933                               pvalue: cdf_binomial(y-1, n, defaults[1]) +
 934                                       1 - cdf_binomial(x-1, n, defaults[1]))))
 935            elseif /*alternative*/ defaults[2] = 'less
 936               then (if x # n
 937                       then pu: quantile_beta(defaults[3], x+1, n-x),
 938                     cinterval: [0, pu],
 939                     pvalue: cdf_binomial(x, n, defaults[1]))
 940               else ( /* alternative is greater */
 941                     if x # 0
 942                       then pl: quantile_beta(1 - defaults[3], x, n-x+1),
 943                     cinterval: [pl, 1],
 944                     pvalue: 1 - cdf_binomial(x-1, n, defaults[1])))
 945
 946      else (/* asymptotic test*/
 947            statistic: phat,
 948            stdev: sqrt((phat*(1-phat))/n),
 949            distribution: ['normal, defaults[1], stdev],
 950
 951            /* calculate pvalue */
 952            if defaults[2] = 'twosided
 953              then /* alternative is twosided */
 954                   pvalue: 2*(1-cdf_normal(defaults[1]+abs(phat-defaults[1]), defaults[1], stdev))
 955            elseif defaults[2] = 'less
 956              then /* alternative is less */
 957                   pvalue: cdf_normal(phat, defaults[1], stdev)
 958              else /* alternative is greater */
 959                    pvalue: 1 - cdf_normal(phat, defaults[1], stdev),
 960
 961            /* calculate Wilson score confidence interval */
 962            block([z, yates, z22n, pc, pl:0, pu:1],
 963              if defaults[2] = 'twosided
 964                then z: quantile_normal((1 + defaults[3])/2, 0 ,1)
 965                else z: quantile_normal(defaults[3], 0 ,1),
 966              if defaults[4] /* Yates correction */
 967                 then yates: min(1/2, abs(x - n * defaults[1]))
 968                 else yates: 0,
 969              z22n: z*z/(2*n),
 970              if defaults[2] = 'twosided or defaults[2] = 'greater
 971                then (pc: phat - yates/n,
 972                      if pc > 0
 973                        then pl: (pc+z22n-z*sqrt((pc*(1-pc))/n + z22n/(2*n))) / (1+2*z22n)),
 974              if defaults[2] = 'twosided or defaults[2] = 'less
 975                then (pc: phat + yates/n,
 976                      if pc < 1
 977                        then pu: (pc+z22n+z*sqrt((pc*(1-pc))/n + z22n/(2*n))) / (1+2*z22n)),
 978              cinterval: [pl, pu] )),
 979
 980   /* hypotheses */
 981   aux: string(defaults[1]),
 982   if /*alternative*/ defaults[2] = 'greater
 983      then hypo: concat("H0: p = ", aux," , H1: p > ", aux)
 984   elseif /*alternative*/ defaults[2] = 'less
 985      then hypo: concat("H0: p = ", aux," , H1: p < ", aux)
 986      else hypo: concat("H0: p = ", aux," , H1: p # ", aux),
 987
 988   /* result as an 'inference_result' object*/
 989   inference_result("PROPORTION TEST",
 990                  [  ['sample_proportion, phat],
 991                     ['conf_level, defaults[3]],
 992                     ['conf_interval, cinterval],
 993                     ['method, method],
 994                     ['hypotheses, hypo],
 995                     ['statistic, statistic],
 996                     ['distribution, distribution],
 997                     ['p_value, pvalue]  ],
 998                  [1,2,3,4,5,6,7,8])    )$
 999
1000
1001
1002
1003
1004
1005
1006 /* Test for the difference of two proportions. The first and second arguments,   */
1007 /* 'x1' and 'n1>=x1', are the number of successes and total number of trials in  */
1008 /* the first sample, respectively; and the third and fourth arguments,           */
1009 /* 'x2' and 'n2>=x2', are the corresponding numbers in the second sample.        */
1010 /* This is an asymptotic test which requieres n1 and n2 to be both >= 10,        */
1011 /* and both samples are considered independent.                                  */
1012 /* It admits the following options:                                              */
1013 /*   'alternative='twosided: this is the alternative hypothesis H1; valid        */
1014 /*            values are: 'twosided, 'greater and 'less.                         */
1015 /*   'conflevel=95/100: confidence level of the confidence interval; valid       */
1016 /*           values are: a symbol or an expression which takes a value in (0,1)  */
1017 /*   'correct=true: whether Yates correction must be applied or not              */
1018 /*                                                                               */
1019 /* The output of this function is an 'inference_result' object with the          */
1020 /* following results:                                                            */
1021 /*   1. 'proportions= list with the two sample proportions                       */
1022 /*   2. 'conf_level= confidence level                                            */
1023 /*   3. 'conf_interval= confidence interval                                      */
1024 /*   4. 'method= name of the test and/or warning message in case of n1 or n2<10  */
1025 /*   5. 'hypotheses= null hypothesis and alternative                             */
1026 /*   6. 'statistic= statistic used in the procedure, namely the difference p1-p2 */
1027 /*   7. 'distribution= statistic's asymptotic distribution and its parameters    */
1028 /*   8. 'p_value= p-value of this test                                           */
1029 test_proportions_difference(x1,n1,x2,n2,[select]) :=
1030 block([numer:stats_numer, options, defaults, aux, phat, difphat, method,
1031        cinterval, hypo, sd, yates: 1/2, width],
1032
1033   /* controlling input data */
1034   if integerp(x1) and integerp(n1) and (n1 < x1 or x1 < 0)
1035      then error("Input data must be 0 <= x1 <= n1"),
1036   if integerp(x2) and integerp(n2) and (n2 < x2 or x2 < 0)
1037      then error("Input data must be 0 <= x2 <= n2"),
1038
1039   /* updating and controlling options */
1040   options:  ['alternative, 'conflevel, 'correct],
1041   defaults: ['twosided   ,  95/100,    true],
1042   for i in select do(
1043      aux: ?position(lhs(i),options),
1044      if numberp(aux) and aux <= length(options) and aux >= 1
1045         then defaults[aux]: rhs(i)),
1046   if not member(defaults[1],['twosided, 'greater, 'less])
1047      then error("Option 'alternative' is not correct"),
1048   if numberp(defaults[2]) and (defaults[2] <= 0 or defaults[2] >= 1)
1049      then error("Option 'conflevel' can't be outside interval (0,1)"),
1050   if not member(defaults[3],[true, false])
1051      then error("Option 'correct' must be true or false"),
1052
1053   /* proportions estimates */
1054   phat: [x1/n1, x2/n2],
1055   difphat: phat[1] - phat[2],
1056
1057   /* method */
1058   method: concat("Asymptotic test.",
1059                  if defaults[3]
1060                    then " Yates correction."
1061                    else "",
1062                  if integerp(n1) and n1 < 10 or integerp(n2) and n2 < 10
1063                    then " Warning: small sample."
1064                    else ""),
1065
1066   /* confidence interval */
1067   if not defaults[3] /* don't apply Yates correction */
1068     then yates: 0,
1069   yates: min(yates, abs(difphat) / (1/n1 + 1/n2)),
1070   if /*alternative*/ defaults[1] = 'twosided
1071     then aux: quantile_normal((1 + defaults[2])/2, 0, 1)
1072     else aux: quantile_normal(defaults[2], 0, 1),
1073   width: aux * sqrt((phat[1]*(1-phat[1]))/n1 + (phat[2]*(1-phat[2]))/n2) +
1074            yates * (1/n1 + 1/n2),
1075   if /*alternative*/ defaults[1] = 'twosided
1076     then cinterval : [max(difphat - width, -1), min(difphat + width, 1)]
1077   elseif defaults[1] = 'greater
1078     then cinterval : [max(difphat - width, -1), 1]
1079     else cinterval : [-1, min(difphat + width, 1)],
1080
1081   /* hypotheses */
1082   if /*alternative*/ defaults[1] = 'greater
1083      then hypo: concat("H0: p1 = p2 , H1: p1 > p2")
1084   elseif /*alternative*/ defaults[1] = 'less
1085      then hypo: concat("H0: p1 = p2 , H1: p1 < p2")
1086      else hypo: concat("H0: p1 = p2 , H1: p1 # p2"),
1087
1088   /* distribution and p-value */
1089   aux: (x1+x2)/(n1+n2),
1090   sd: sqrt(aux * (1-aux) * (1/n1 + 1/n2)),
1091   distribution: ['normal, 0, sd],
1092   if /*alternative*/ defaults[1] = 'twosided
1093     then pvalue : 2 * (1 - cdf_normal(abs(difphat), 0, sd))
1094   elseif defaults[1] = 'greater
1095     then pvalue : 1 - cdf_normal(difphat, 0, sd)
1096     else pvalue : cdf_normal(difphat, 0, sd),
1097
1098   /* result as an 'inference_result' object*/
1099   inference_result("DIFFERENCE OF PROPORTIONS TEST",
1100                  [  ['proportions, phat],
1101                     ['conf_level, defaults[2]],
1102                     ['conf_interval, cinterval],
1103                     ['method, method],
1104                     ['hypotheses, hypo],
1105                     ['statistic, difphat],
1106                     ['distribution, distribution],
1107                     ['p_value, pvalue]  ],
1108                  [1,2,3,4,5,6,7,8])  ) $
1109
1110
1111
1112
1113
1114 simple_linear_regression(dat,[select]):=block([numer:stats_numer, options, defaults, n, aux,
1115          means, covar, corr, resvar, adc, a, b, pred, res, sig2, aconfint,
1116          coef, bconfint, vconfint,
1117          statistic, distribution, hypo, pvalue, listarith:true],
1118
1119   /* controlling sample format */
1120   if not matrixp(dat) then dat: apply('matrix,dat),
1121   if length(dat[1]) # 2  or not every('identity,map('listofexpr,args(dat)))
1122      then error("Sample must contain pairs of expressions")
1123      else (n: length(dat)),
1124
1125   if n < 3 then error("Sample size must be greater than 2"),
1126
1127   /* updating and controlling options */
1128   options:  ['alternative, 'conflevel, 'regressor],
1129   defaults: ['twosided,    95/100,     'x],
1130   for i in select do(
1131      aux: ?position(lhs(i),options),
1132      if numberp(aux) and aux <= length(options) and aux >= 1
1133         then defaults[aux]: rhs(i)),
1134   if not member(defaults[1],['twosided, 'greater, 'less])
1135      then error("Option 'alternative' is not correct"),
1136   if numberp(defaults[2]) and (defaults[2] <= 0 or defaults[2] >= 1)
1137      then error("Option 'conflevel' can't be outside interval (0,1)"),
1138   if not symbolp(defaults[3])
1139      then error("Name of independent variable must be a symbol"),
1140
1141   /* estimations */
1142   means: mean(dat),
1143   covar: cov(dat),
1144   corr: covar[1,2] / sqrt(covar[1,1] * covar[2,2]),
1145   b: covar[1,2] / covar[1,1],
1146   a: means[2] - b * means[1],
1147
1148   /* computing predictions, residuals, residual variance and adc */
1149   pred: transpose(a + b * col(dat,1))[1],
1150   res: transpose(col(dat,2))[1] - pred,
1151   sig2: mean(res^2),        /* ml estimator for sigma^2 */
1152   resvar: n * sig2 / (n-2), /* residual variance */
1153   adc: 1 - (1-1/n) * resvar / covar[2,2],
1154
1155   /* two sided confidence interval for a */
1156   aconfint: a + [-1,1] * quantile_student_t((1 + defaults[2])/2, n-2) *
1157                 sqrt(apply("+",transpose(col(dat,1))[1]^2) * resvar / (n^2 * covar[1,1])),
1158
1159   /* confidence interval for b and hypothesis test.            */
1160   /* Note that at this moment, there is not any option for the */
1161   /* alternative hypothesis, it is always considered two-sided.*/
1162   /* I maintain these conditionals here in case there are      */
1163   /* changes in future releases.                               */
1164   if float(resvar) = 0.0
1165     then /* data on the straight line */
1166          bconfint: statistic: hypo: statistic: distribution: pvalue: 'undefined
1167     else (
1168       coef: sqrt(resvar / (n * covar[1,1])),
1169       statistic: b / coef,
1170       if /*alternative*/ defaults[1] = 'twosided
1171          then statistic: abs(statistic),
1172       if /*alternative*/ defaults[1] = 'greater
1173          then (bconfint: [b - quantile_student_t(defaults[2], n-2) * coef,'inf],
1174                hypo: "H0: b = 0 ,H1: b > 0",
1175                pvalue: 1 - cdf_student_t(statistic,n-2) )
1176          else if /*alternative*/ defaults[1] = 'less
1177                  then (bconfint: ['minf, b + quantile_student_t(defaults[2], n-2) * coef],
1178                        hypo: "H0: b = 0 ,H1: b < 0",
1179                        pvalue: cdf_student_t(statistic,n-2) )
1180                  else /* twosided alternative */
1181                        (bconfint: b + [-1,1] * quantile_student_t((1 + defaults[2])/2, n-2) * coef,
1182                        hypo: "H0: b = 0 ,H1: b # 0",
1183                        pvalue: 2 * (1 - cdf_student_t(statistic,n-2))  ),
1184       distribution: ['student_t, n-2] ),
1185
1186   /* two sided confidence interval for sigma^2 */
1187   vconfint: (n-2) * resvar / [quantile_chi2((1+defaults[2])/2,n-2),
1188                                      quantile_chi2((1-defaults[2])/2,n-2)],
1189
1190   /* result as an 'inference_result' object*/
1191   inference_result("SIMPLE LINEAR REGRESSION",
1192                  [  ['model, a + b * defaults[3] /* = regressor name */],
1193                     ['means, means],
1194                     ['variances, [covar[1,1], covar[2,2]]],
1195                     ['correlation, corr],
1196                     ['adc, adc],
1197                     ['a_estimation, a],
1198                     ['a_conf_int, aconfint],
1199                     ['b_estimation, b],
1200                     ['b_conf_int, bconfint],
1201                     ['hypotheses, hypo],
1202                     ['statistic, statistic],
1203                     ['distribution, distribution],
1204                     ['p_value, pvalue],
1205                     ['v_estimation, resvar],
1206                     ['v_conf_int, vconfint],
1207                     ['cond_mean_conf_int,
1208                         a + b*defaults[3] +
1209                         [-1,1] * quantile_student_t((1 + defaults[2])/2, n-2)*
1210                                  sqrt(resvar * (1/n+(means[1]-defaults[3])^2/(covar[1,1]*n)))],
1211                     ['new_pred_conf_int,
1212                         a + b*defaults[3] +
1213                         [-1,1] * quantile_student_t((1 + defaults[2])/2, n-2)*
1214                                  sqrt(resvar * ((n+1)/n+(means[1]-defaults[3])^2/(covar[1,1]*n)))],
1215                     ['residuals, sort(args(transpose(matrix(pred,res))),
1216                                       lambda([x,y], orderlessp(x,y)) )] ],
1217                  [1,4,14,9,10,11,12,13])     )$
1218
1219
1220
1221
1222
1223
1224 /* Multivariate linear regression, y_i=b_0+b_1*x_1i+...+b_k*x_ki+u_i, where     */
1225 /* u_i are N(0,sigma) iid random variables. Argument dat must be a matrix       */
1226 /* with more than one column.                                                   */
1227 /* Admits the following options:                                                */
1228 /*   'conflevel=95/100: confidence level of the confidence intervals; valid     */
1229 /*           values are: a symbol or an expression which takes a value in (0,1) */
1230 /* The output of this function is an 'inference_result' object                  */
1231 /* with the following results:                                                  */
1232 /*   1. 'b_estimation= coefficient estimates                                    */
1233 /* - 2. 'b_covariances= covariance matrix of coefficient estimates              */
1234 /* - 3. 'b_conf_int= confidence intervals of coefficient estimates              */
1235 /*   4. 'b_statistics= statistics for testing coefficient                       */
1236 /*   5. 'b_p_values= p-values for coefficient tests                             */
1237 /*   6. 'b_distribution= probability distribution for coefficient tests         */
1238 /*   7. 'v_estimation= unbiased variance estimator                              */
1239 /*   8. 'v_conf_int= variance confidence interval                               */
1240 /*   9. 'v_distribution= probability distribution for variance test             */
1241 /* -10. 'covariances= data covariance matrix                                    */
1242 /* -11. 'residuals= residuals                                                   */
1243 /*  12. 'adc= adjusted determination coefficient                                */
1244 /* -13. 'aic= akaike's information criterion                                    */
1245 /* -14. 'bic= bayes's information criterion                                     */
1246 /* Items marked with the minus sign are kept hidden.                            */
1247 linear_regression(dat,[select]) :=
1248  block(
1249     [numer:stats_numer, options, defaults, n, p, aux, x, y, covb, q, b, r,
1250      df, sR2, sumsquares, Rbar, ci, t, pv, s2, aic, bic],
1251
1252   /* controlling sample format */
1253   if not matrixp(dat) then dat: apply('matrix,dat),
1254   if length(dat[1]) < 2  or not every('identity,map('listofexpr,args(dat)))
1255      then error("Sample must be a matrix or a list of lists of equal length"),
1256
1257   /* updating and controlling options */
1258   options:  ['conflevel],
1259   defaults: [95/100],
1260   for i in select do(
1261      aux: ?position(lhs(i),options),
1262      if numberp(aux) and aux <= length(options) and aux >= 1
1263         then defaults[aux]: rhs(i)),
1264   if numberp(defaults[1]) and (defaults[1] <= 0 or defaults[1] >= 1)
1265      then error("Option 'conflevel' can't be outside interval (0,1)"),
1266
1267   /* sample size */
1268   n: length(dat),
1269   /* number or regressors */
1270   p: length(dat[1]) - 1,
1271   /* responses */
1272   y: col(dat, p+1),
1273   /* regressor */
1274   x: addcol(apply(matrix, makelist([1],k,n)), submatrix(dat, p+1)),
1275   /* coefficients */
1276   b: block([xt: transpose(x)],
1277        covb: invert(xt . x),
1278        covb . (xt . y)),
1279   b: transpose(b)[1],
1280   /* residuals */
1281   r: y - x . b,
1282   r: transpose(r)[1],
1283   /* degrees of freedom */
1284   df: n-p-1,
1285   /* unbiased variance estimator */
1286   sumsquares: r . r,
1287   sR2: sumsquares / df,
1288   /* covariance matrix of estimators */
1289   covb: sR2 * covb,
1290   q: makelist(covb[j,j],j,length(covb)),
1291   /* adjusted coefficient of determination */
1292   Rbar: 1 - sR2/first(var1(y)),
1293   /* confidence intervals for coefficients */
1294   aux: quantile_student_t((1+defaults[1])/2, df) * sqrt(q),
1295   ci: makelist(b[k]+[-1,1]*aux[k],k,1,length(b)),
1296   /* statistic contrasts */
1297   t: b / sqrt(q),
1298   /* p-values */
1299   pv: 2*(1-map(lambda([z], cdf_student_t(abs(z),df)), t)),
1300   /* confidence interval for the variance */
1301   s2: df*sR2*[1,1] / [quantile_chi2((1+defaults[1])/2,df),
1302                       quantile_chi2((1-defaults[1])/2,df)],
1303   /* AIC and BIC */
1304   aic: n*log(sumsquares/n) + 2*(p+1),
1305   bic: n*log(sumsquares/n) + log(n)*(p+1),
1306
1307   if stats_numer then
1308     [b,covb,ci,t,pv,sR2,s2,r,Rbar,aic,bic]: float([b,covb,ci,t,pv,sR2,s2,r,Rbar,aic,bic]),
1309
1310   /* result as an 'inference_result' object */
1311   inference_result("LINEAR REGRESSION MODEL",
1312                  [  ['b_estimation, b],
1313                     ['b_covariances, covb],
1314                     ['b_conf_int, ci],
1315                     ['b_statistics, t],
1316                     ['b_p_values, pv],
1317                     ['b_distribution, ['student_t, df]],
1318                     ['v_estimation, sR2],
1319                     ['v_conf_int, s2],
1320                     ['v_distribution, ['chi2, df]],
1321                     ['residuals, r],
1322                     ['adc, Rbar], /* adjusted determination coefficient */
1323                     ['aic, aic],
1324                     ['bic, bic] ],
1325                  [1,4,5,6,7,8,9,11]) )$
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337 /***************************************/
1338 /*  SPECIAL PROBABILITY DISTRIBUTIONS  */
1339 /***************************************/
1340
1341
1342 /*        SOME AUXILIARY FUNCTIONS  TO BE          */
1343 /*  USED BY THE SPECIAL PROBABILITY DISTRIBUTIONS  */
1344
1345
1346 /* If m and n are positive integers, it returns 1.
1347    If at least one of them is not a positive integer, the
1348    function returns -1. If there is not enough information, the
1349    output is 0 */
1350 controlw(m,n):=
1351    if integerp(m) and sign(m)='pos and
1352       integerp(n) and sign(n)='pos
1353       then 1
1354       else if numberp(m) and not integerp(m) or
1355               numberp(n) and not integerp(n) or
1356               member(sign(m),['neg,'zero,'nz]) or
1357               member(sign(n),['neg,'zero,'nz])
1358               then -1
1359               else 0  $
1360
1361
1362 /*    SIGNED RANK DISTRIBUTION     */
1363
1364
1365 /* R: dsignrank(x,n) */
1366 pdf_signed_rank(x,n):=block([cp:controli(n),t],
1367    if cp=-1 then error("Illegal parameter"),
1368    if cp=0 then return(funmake('pdf_signed_rank,[x,n])),
1369    t: n * (n+1) / 2,
1370    if sign(x)='neg or numberp(x) and not integerp(x) or sign(x-t)='pos
1371       then return(0),
1372    if numberp(x) and integerp(x) and numberp(n)
1373       then (/* take advantage of the symmetry */
1374             if x > t/2 then x: t - x,
1375             return(?signed_rank_recursion(x,n) / 2^n)),
1376    funmake('pdf_signed_rank,[x,n])  )$
1377
1378 /* R: psignrank(x,n) */
1379 cdf_signed_rank(x,n):=block([cp:controli(n),t,xbis,sum:0],
1380    if cp=-1 then error("Illegal parameter"),
1381    if cp=0 then return(funmake('cdf_signed_rank,[x,n])),
1382    t: n * (n+1) / 2,
1383    if sign(x)='neg then return(0),
1384    if sign(x-t)='pos then return(1),
1385    if numberp(x) and numberp(n)
1386       then (/* take advantage of the symmetry */
1387             if x > t/2
1388               then xbis: t - floor(x) - 1
1389               else xbis: floor(x),
1390             for k:0 thru xbis do
1391               sum: sum + ?signed_rank_recursion(k,n),
1392             sum: sum / 2^n,
1393             if x > t/2 then sum: 1 - sum,
1394             return(sum)),
1395    funmake('cdf_signed_rank,[x,n])  )$
1396
1397
1398 /*    RANK SUM DISTRIBUTION     */
1399
1400
1401 /* R: dwilcox(x,m,n) */
1402 pdf_rank_sum(x,m,n):=block([cp:controlw(m,n),t],
1403    if cp=-1 then error("Illegal parameter"),
1404    if cp=0 then return(funmake('pdf_rank_sum,[x,m,n])),
1405    t: m * n,
1406    if sign(x)='neg or numberp(x) and not integerp(x) or sign(x-t)='pos
1407       then return(0),
1408    if numberp(x) and integerp(x) and numberp(m) and numberp(n)
1409       then (/* take advantage of the symmetry */
1410             if x > t/2 then x: t - x,
1411             return(?rank_sum_recursion(x,m+n,m) / binomial(m+n,m))),
1412    funmake('pdf_rank_sum,[x,m,n])  )$
1413
1414 /* R: pwilcox(x,m,n) */
1415 cdf_rank_sum(x,m,n):=block([cp:controlw(m,n),t,xbis,sum:0],
1416    if cp=-1 then error("Illegal parameter"),
1417    if cp=0 then return(funmake('cdf_rank_sum,[x,m,n])),
1418    t: m * n,
1419    if sign(x)='neg then return(0),
1420    if sign(x-t)='pos then return(1),
1421    if numberp(x) and numberp(m) and numberp(n)
1422       then (/* take advantage of the symmetry */
1423             if x > t/2
1424               then xbis: t - floor(x) - 1
1425               else xbis: floor(x),
1426             for k:0 thru xbis do
1427               sum: sum + ?rank_sum_recursion(k,m+n,m),
1428             sum: sum / binomial(m+n,m),
1429             if x > t/2 then sum: 1 - sum,
1430             return(sum)),
1431    funmake('cdf_rank_sum,[x,m,n])  )$