share/stats/numstats.lisp

   1 ;;                 COPYRIGHT NOTICE
   2 ;;
   3 ;;  Copyright (C) 2006 Mario Rodriguez Riotorto
   4 ;;
   5 ;;  This program is free software; you can redistribute
   6 ;;  it and/or modify it under the terms of the
   7 ;;  GNU General Public License as published by
   8 ;;  the Free Software Foundation; either version 2
   9 ;;  of the License, or (at your option) any later version.
  10 ;;
  11 ;;  This program is distributed in the hope that it
  12 ;;  will be useful, but WITHOUT ANY WARRANTY;
  13 ;;  without even the implied warranty of MERCHANTABILITY
  14 ;;  or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 ;;  GNU General Public License for more details at
  16 ;;  http://www.gnu.org/copyleft/gpl.html
  17
  18 ;;  This is a set of numerical routines used by package 'stats'
  19
  20 ;; For questions, suggestions, bugs and the like, feel free
  21 ;; to contact me at
  22 ;; mario @@@ edu DOT xunta DOT es
  23 ;; www.biomates.net
  24
  25
  26 (in-package :maxima)
  27
  28 ;; para traducir funciones de usuario a lisp:
  29 ;; (meval '($translate $random_normal))
  30
  31
  32
  33 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  34 ;;                                       ;;
  35 ;;    Wilcoxon-Mann-Whitney recursion    ;;
  36 ;;                                       ;;
  37 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  38
  39 ;;  This recursion is used when computing exact probabilities
  40 ;;  for the distribution of the rank statistic in the Wilcoxon-
  41 ;;  Mann-Whitney test, specially for small samples. It's defined as
  42 ;;             a(i,j,k) = a(i,j-1,j-k) + a(i-k,j-1,j-k-1)
  43 ;;  Reference:
  44 ;;     Klotz, Jerome (2006) 'A Computational Approach to Statistics',
  45 ;;          available at www.stat.wisc.edu/~klotz/Book.pdf
  46 (defun rank_sum_recursion (i j k)
  47     (cond ((or (< i 0) (< j 0) (< k 0)) 0)
  48           ((= i 0) 1)
  49           (t (+ (rank_sum_recursion i (- j 1) (- j k))
  50                 (rank_sum_recursion (- i k) (- j 1) (+ j (- k) (- 1)) )))))
  51
  52
  53
  54 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  55 ;;                                       ;;
  56 ;;        Signed rank recursion          ;;
  57 ;;                                       ;;
  58 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  59
  60 ;;  This recursion is used when computing exact probabilities
  61 ;;  for the true distribution of statistic in the signed rank test,
  62 ;;  specially for small samples. It's defined as
  63 ;;             a(i,j) = a(i,j-1) + a(i-j,j-1)
  64 ;;  Reference:
  65 ;;     Klotz, Jerome (2006) 'A Computational Approach to Statistics',
  66 ;;          available at www.stat.wisc.edu/~klotz/Book.pdf
  67 ;;     R statistical package. File signrank.c
  68 (defun signed_rank_recursion (i j)
  69   (let* ((u (/ (* j (+ j 1)) 2))
  70          (c (floor (/ u 4))))
  71     (if (> i c) (setf i (- u i)))
  72     (cond ((or (< i 0) (< j 0)) 0)
  73           ((= i 0) 1)
  74           (t (+ (signed_rank_recursion i (- j 1))
  75                 (signed_rank_recursion (- i j) (- j 1)))))))
  76
  77
  78
  79
  80 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  81 ;;                                       ;;
  82 ;;    Shapiro-Wilk test for normality    ;;
  83 ;;                                       ;;
  84 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  85
  86 ;; Calculates the algebraic polynomial of order (- (lengh cc) 1)
  87 ;; with array of coefficients cc. Zero order coefficient is (aref cc 0).
  88 (defun swpoly (cc x)
  89   (let ( (ret-val (aref cc 0))
  90          (nord (length cc))
  91          p)
  92     (when (> nord 1)
  93       (setf p (* x (aref cc (1- nord))))
  94       (do ((j (- nord 2) (1- j)))
  95           ((= j 0) 'done)
  96         (setf p (* x (+ p (aref cc j)))))
  97       (setf ret-val (+ ret-val p)) )
  98     ret-val ) )
  99
 100
 101 ;;  Calculates the Shapiro-Wilk test and its significance level
 102 ;;  This is a translation from Fortran to Lisp of swilk.f, which translation
 103 ;;  to C is used by the R statistical package. This is
 104 ;;  Algorithm AS R94, Applied Statistics (1995), vol.44, no.4, 547-551
 105 ;;  Argument 'x' is a Maxima list containing numbers. Optional argument 'n1'
 106 ;;  is the number of uncensored observations; this option is not documented
 107 ;;  in Maxima at this moment, since I haven't checked it; in fact, the R
 108 ;;  function 'shapiro.test' doesn't make use of this second argument (I don't know why).
 109 (defun $test_normality (x &optional n1)
 110   (declare (special $stats_numer))
 111    (unless (or ($listp x)
 112                (and ($matrixp x) (= ($length ($first x)) 1)))
 113      (merror "First argument of 'test_normality' must be a Maxima list"))
 114    (when ($matrixp x)
 115      (setq x ($first ($transpose x))))
 116    (setf x (sort (map 'list #'$float (rest x)) #'<))
 117    (if (null n1) (setf n1 (length x)))
 118    (let* (($numer $stats_numer)
 119           (n (length x))
 120           (n2 (floor (/ n 2)))
 121           w pw  ; W statistic and its p-value
 122           ; initialized data
 123           (z90 1.2816)  (z95 1.6449)
 124           (z99 2.3263)  (zm 1.7509)   (zss 0.56268)
 125           (bf1 0.8378)  (xx90 0.556)  (xx95 0.622)
 126           (sqrth 0.70711)  (small 1e-19)  (pi6 1.909859) (stqr 1.047198)
 127           ; polynomial coefficients
 128           (g (make-array 2 :element-type 'flonum
 129                :initial-contents '(-2.273 0.459)))
 130           (c1 (make-array 6 :element-type 'flonum
 131                :initial-contents '(0.0 0.221157 -0.147981 -2.07119 4.434685 -2.706056)))
 132           (c2 (make-array 6 :element-type 'flonum
 133                :initial-contents '(0.0 0.042981 -0.293762 -1.752461 5.682633 -3.582633)))
 134           (c3 (make-array 4 :element-type 'flonum
 135                :initial-contents '(0.544 -0.39978 0.025054 -6.714e-4)))
 136           (c4 (make-array 4 :element-type 'flonum
 137                :initial-contents '(1.3822 -0.77857 0.062767 -0.0020322)))
 138           (c5 (make-array 4 :element-type 'flonum
 139                :initial-contents '(-1.5861 -0.31082 -0.083751 0.0038915)))
 140           (c6 (make-array 3 :element-type 'flonum
 141                :initial-contents '(-0.4803 -0.082676 0.0030302)))
 142           (c7 (make-array 2 :element-type 'flonum
 143                :initial-contents '(0.164 0.533)))
 144           (c8 (make-array 2 :element-type 'flonum
 145                :initial-contents '(0.1736 0.315)))
 146           (c9 (make-array 2 :element-type 'flonum
 147                :initial-contents '(0.256 -0.00635)))
 148           ncens nn2 i1 range delta a2 a1 an25 an rsn fac ssumm2 summ2 xi xx y w1
 149           ssassx xsx asa sax ssa ssx sx sa zbar zsd zfm z99f z95f
 150           z90f bf ld s m gamma a)
 151
 152       (setf an (coerce n 'flonum))
 153       (setf ncens (- n n1))   ; number of censored observations
 154       (setf nn2 (truncate n 2))
 155       (if (or (< n 3) (> n 5000))
 156          (merror "Sample size must be between 3 and 5000"))
 157       (setf range (- (car (last x)) (first x)))
 158       (if (< range small)
 159          (merror "All observations are identical"))
 160       (if (< n1 3) (merror "Number of uncensored observations is too small"))
 161       (if (< ncens 0)
 162          (merror "Number of uncensored observations is greater than total sample size!!"))
 163       (if (and (> ncens 0) (< n 20))
 164          (merror "Sample size is too small, and censored data are not allowed"))
 165       (setf delta (/ ncens an))
 166       (if (> delta 0.8)
 167          (merror "Ratio of censored observations is too great (>80%)"))
 168
 169       ; calculate coefficients for statistic w
 170       (setf a (make-array (1+ n2) :element-type 'flonum :initial-element 0.0))
 171       (cond ((= n 3)
 172                 (setf (aref a 1) sqrth))
 173             (t  (setf an25 (+ 0.25 an))
 174                 (setf summ2 0.0)
 175                 (do ((i 1 (1+ i)))
 176                    ((> i n2) 'done)
 177                    (setf (aref a i)
 178                          (coerce (mul* 1.414213562373095
 179                                        (simplify
 180                                           (list '(%inverse_erf)
 181                                                 (add* (mul* 2.0 (/ (- i 0.375) an25)) -1))))
 182                                  'flonum))
 183                    (setf summ2 (+ summ2 (* (aref a i) (aref a i)))))
 184                 (setf summ2 (* summ2 2.0))
 185                 (setf ssumm2 (sqrt summ2))
 186                 (setf rsn (/ 1.0 (sqrt an)))
 187                 (setf a1 (- (swpoly c1 rsn) (/ (aref a 1) ssumm2)))
 188                 ; normalize a
 189                 (cond ((> n 5)
 190                          (setf i1 3)
 191                          (setf a2 (- (swpoly c2 rsn) (/ (aref a 2) ssumm2) ))
 192                          (setf fac (sqrt (/ (+ summ2
 193                                                (* (aref a 1) (aref a 1) -2.0)
 194                                                (* (aref a 2) (aref a 2) -2.0))
 195                                             (+ 1.0 (* a1 a1 -2.0) (* a2 a2 -2.0)))))
 196                          (setf (aref a 2) a2)  )
 197                       (t (setf i1 2)
 198                          (setf fac (sqrt (/ (- summ2 (* 2.0 (aref a 1) (aref a 1)))
 199                                             (- 1.0 (* 2.0 a1 a1)))))     ))
 200                 (setf (aref a 1) a1)
 201                 (do ((i i1 (1+ i)))
 202                      ((> i nn2) 'done)
 203                    (setf (aref a i) (/ (aref a i) (- fac))))     ))
 204
 205       ; check for correct sort order on range - scaled X
 206       (setf xx (/ (first x) range))
 207       (setf sx xx)
 208       (setf sa (- (aref a 1)))
 209       (do ((j (1- n) (1- j))
 210            (i 1 i))
 211           ((>= i n1) 'done)
 212         (setf xi (/ (nth i x) range))
 213         (setf sx (+ sx xi))
 214         (setf i (1+ i))
 215         (if (/= i j)
 216             (setf sa (+ sa (* (signum (- i j)) (aref a (min i j))))))
 217         (setf xx xi) )
 218
 219       ; Calculate W statistic as squared correlation
 220       ; between data and coefficients
 221       (setf sa (/ sa n1))
 222       (setf sx (/ sx n1))
 223       (setf ssa 0.0
 224             ssx 0.0
 225             sax 0.0)
 226       (do ((j (1- n) (1- j))
 227            (i 0 (1+ i)))
 228         ((>= i n1) 'done)
 229         (if (/= i j)
 230             (setf asa (- (* (signum (- i j)) (aref a (+ 1 (min i j)))) sa))
 231             (setf asa (- sa)))
 232         (setf xsx (- (/ (nth i x) range) sx))
 233         (setf ssa (+ ssa (* asa asa)))
 234         (setf ssx (+ ssx (* xsx xsx)))
 235         (setf sax (+ sax (* asa xsx)))  )
 236
 237       ; w1 equals (1-w) calculated to avoid excessive rounding error
 238       ; for W very near 1 (a potential problem in very large samples)
 239       (setf ssassx (sqrt (* ssa ssx)))
 240       (setf w1 (/ (* (- ssassx sax) (+ ssassx sax))
 241                   (* ssa ssx)))
 242       (setf w (- 1 w1))
 243
 244       ; calculate significance level for w
 245       (tagbody
 246          (when (= n 3)
 247             (setf pw (* pi6 (- ($asin (sqrt w)) stqr)))
 248             (go fin))
 249          (setf y  (log w1)
 250                xx (log an))
 251          (cond ((<= n 11)
 252                   (setf gamma (swpoly g an))
 253                   (when (>= y gamma)
 254                      (setf pw small)
 255                      (go fin))
 256                   (setf y (- (log (- gamma y)))
 257                         m (swpoly c3 an)
 258                         s (exp (swpoly c4 an))))
 259                (t ; n >= 12
 260                   (setf m (swpoly c5 xx)
 261                         s (exp (swpoly c6 xx)))))
 262
 263           ; censoring by proportion ncens/n
 264           ; calculate mean and sd of normal equivalent deviate of w
 265           (when (> ncens 0)
 266              (setf ld (- (log delta))
 267                    bf (+ 1.0 (* xx bf1)))
 268              (setf z90f (+ z90 (* bf (expt (swpoly c7 (expt xx90 xx)) ld))))
 269              (setf z95f (+ z95 (* bf (expt (swpoly c8 (expt xx95 xx)) ld))))
 270              (setf z99f (+ z99 (* bf (expt (swpoly c9 xx) ld))))
 271              ; regress z90F,...,z99F on normal deviates z90,...,z99 to get
 272              ; pseudo-mean and pseudo-sd of z as the slope and intercept
 273              (setf zfm (/ (+ z90f z95f z99f) 3.0))
 274              (setf zsd (/ (+ (* z90 (- z90f zfm))
 275                              (* z95 (- z95f zfm))
 276                              (* z99 (- z99f zfm)))
 277                           zss))
 278              (setf zbar (- zfm (* zsd zm)))
 279              (setf m (+ m (* zbar s)))
 280              (setf s (* s zsd)) )
 281
 282           (setf pw (- 0.5 (* 0.5 (mfuncall '%erf (/ (- y m)
 283                                                     (* 1.41421356 s))))))
 284         fin)
 285
 286       ; the result is an 'inference_result' Maxima object
 287       ($inference_result
 288          "SHAPIRO - WILK TEST"
 289          (list '(mlist) (list '(mlist) '$statistic w)
 290                         (list '(mlist) '$p_value pw))
 291          (list '(mlist) 1 2) )  ))
 292
 293
 294