2 ;; @description Basic statistics library
3 ;; @version 1.3 - comments redone for automatic documentation
4 ;; @author Lutz Mueller, 2001
6 ;; <h2>Functions for statistics and plotting with GNU plot</h2>
7 ;; To use this module it has to be loaded at the beginning of the
10 ;; (load "/usr/share/newlisp/stat.lsp")
12 ;; All functions work on integers and floats or a mix of both. <lists> are normal
13 ;; LISP lists. <matrices> are lists of lists, one list for each row in the
14 ;; two dimensional data matrix. See the function 'stat:matrix' on how to make matrices
17 ;; This file also shows how to use some of the built in matrix math functions
18 ;; like 'multiply', 'transpose', 'invert' and 'fft'.
20 ;; Note, that the plot functions need 'gnuplot' installed, a free graphing
21 ;; package available for most operating systems, see: @link http://www.gnuplot.org www.gnuplot.org
23 ;; This is the oldest module written in newLISP. Some parts could be rewritten for faster
24 ;; performance on big data sets using arrays instead of lists.
26 ;; <center><h2>Plot functions (requires 'gnuplot')</h2></center>
28 ;; The current directory for newLISP should be writable to use these functions.
30 ;; @syntax (stat:plot <p1> <p2> ... <pN>)
31 ;; @param <p1> First value to plot.
32 ;; @param <p2> Second value to plot.
33 ;; @param <pN> Nth value to plot.
34 ;; @return The process ID of the gnuplot process.
36 ;; @syntax (stat:plotXY X Y)
37 ;; @param <X> List of x-coordinates to plot.
38 ;; @param <Y> List of y-coordinates to plot.
39 ;; @return The process ID of the gnuplot process.
42 ;; <center><h2>General uni- and bi- variate statistics</h2></center>
44 ;; @syntax (stat:sum <X>)
45 ;; @param <X> A list of numbers,
46 ;; @return Sum of data in list <X>.
48 ;; @syntax (stat:mean <X>)
49 ;; @param <X> A list of numbers.
50 ;; @return The mean of data in list <X>.
52 ;; @syntax (stat:var <X>)
53 ;; @param <X> A list of numbers.
54 ;; @return The variance of the data in list <X>.
56 ;; @syntax (stat:sdev <X>)
57 ;; @param <X> A list of numbers.
58 ;; @return Standard deviation of data in list <X>.
60 ;; @syntax (stat:sum-sq <X>)
61 ;; @param <X> A list of numbers.
62 ;; @return Sum of <x*x> data elements in list <X>.
64 ;; @syntax (stat:sum-xy <X> <Y>)
65 ;; @param <X> A list of numbers.
66 ;; @param <Y> A list of numbers.
67 ;; @return Sum of products <x*y> data elements in lists <X> and <Y>.
69 ;; @syntax (stat:cov <X> <Y>)
70 ;; @param <X> A list of numbers.
71 ;; @param <Y> A list of numbers.
72 ;; @return Covariance of data in lists <X> and <Y>
74 ;; @syntax (stat:sum-d2 <X>)
75 ;; @param <X> A list of numbers.
76 ;; @return Sum of squared diffs <(x - mean(X))^2> in list <X>.
78 ;; @syntax (stat:corr <X> <Y>)
79 ;; @param <X> A list of numbers.
80 ;; @param <Y> A list of numbers.
81 ;; @return Correlation coefficient of lists <X> and <Y>.
83 ;; @syntax (stat:regression <X> <Y>)
84 ;; @param <X> A list of numbers.
85 ;; @param <Y> A list of numbers.
86 ;; returns <(b0 b1)> coefficients of regression <Y = b0 + b1*X>.
88 ;; @syntax (stat:fit <X> <Y>)
89 ;; @param <X> A list of numbers.
90 ;; @param <Y> A list of numbers.
91 ;; @return fitted line based on '(stat:regression X Y)'.
93 ;; @syntax (stat:sum-d2xy <X> <Y>)
94 ;; @return Sum of squared differences <(x - y)^2> of elements in lists <X> and <Y>.
96 ;; @syntax (stat:moments <X>)
97 ;; @param <X> A list of numbers.
98 ;; @return Calculates all moments of list <X>.
100 ;; @syntax (stat:f-prob <F> <df1> <df2>)
101 ;; @param <F> The variance ratio.
102 ;; @param <df1> Degrees of freedom.
103 ;; @param <df2> Degrees of freedom.
104 ;; @return Probablity of F variance ratio for <df1>, <df2> degress of freedom.
107 ;; <center><h2>Multi variate statistics</h2></center>
109 ;; @syntax (stat:multiple-reg <X> <offY>)
110 ;; @param <X> A list of numbers.
111 ;; @param <offY> Zero based offset into <Y>.
112 ;; @return Multiple regression of vars in <X> onto <Y> at <offsetY>.
114 ;; @syntax (stat:cov-matrix <X>)
115 ;; @param <X> A list of numbers.
116 ;; @return Covariance matrix of <X> with <N> rows and <k> columns.
118 ;; @syntax (stat:corr-matrix <X>)
119 ;; @param <X> A list of numbers.
120 ;; @return Correlation matrix of <X> with <N> rows and <k> columns.
123 ;; <center><h2>Time series</h2></center>
125 ;; @syntax (stat:smooth <X> <alpha>)
126 ;; @param <X> A list of numbers.
127 ;; @param <alpha> Smoothing coefficient <0 < alpha < 1>.
128 ;; @return Exponentially smoothed sequence in <X>.
130 ;; @syntax (stat:lag <X> <n>)
131 ;; @param <X> A list of numbers.
133 ;; @return A differenced list of <X> with a lag of <n>.
134 ;; If the length of list <X> is <l> then the length of the resulting
135 ;; differenced list is <l - n>.
137 ;; @syntax (stat:cumulate <X>)
138 ;; @param <X> A list of numbers.
139 ;; @return The cumulated list of <X>.
141 ;; @syntax (stat:power <TS>)
142 ;; @param <TS> A time series of numbers.
143 ;; @return The power spectrum of a time series
146 ;; <center><h2>Matrix and list utilities</h2></center>
148 ;; @syntax (stat:matrix <C1> .... <CN>)
149 ;; @param <C1> The first column list of values.
150 ;; @param <CN> The Nth column list of values.
151 ;; @return A matrix off <1> to <N> columns <C>.
153 ;; @syntax (stat:diagonal <item> <N>)
154 ;; @param <item> The diagonal element.
155 ;; @return A diagonal matrix of length <N> with <item> in the diagonal.
157 ;; @syntax (stat:get-diagonal <X>)
158 ;; @param <X> An matrix filled with numbers.
159 ;; @return A list from the diagonal elements of <X>.
161 ;; @syntax (stat:mat-map <op> <A> <B>)
162 ;; @return Matrix map, e.g. '(stat:mat-map + A B)'.
163 ;; Used for adding and subtracting matrices.
167 ;---------------------------- gnuplot interface -------------------------------
169 ; These plot functions need gnuplot installed and works fine under UNIX
172 ; The plot functions produce data files for the plot data and acommand files
173 ; for gnuplot, then execute gnuplot.
175 ; The routines need write access to the current directory to produce temporary files
178 ; The file 'plot' contains the generated plot commands for gnuplot
179 ; data is saved in files with the names of symbols containing the data
180 ; or in files named 'series-1', 'series-2' etc.
182 ; e.g.: (plot '(1 3 2 5 4)) will produce a 'series-1' ascii file
183 ; but (plot age height) will produce 'age' and 'height' data files
188 ; plot one or more lists of numbers 'args' is used to access the list of
189 ; input parameters to 'plot'
194 unix
(< (& 0xF (last (sys-info))) 5)
196 ; for each list write an ascii file
197 (dolist (elmnt (args))
202 (set 'fileName
(string elmnt
))
204 (set 'fileName
(append (env "HOME") "/tmp/series-" (string cnt
)))
205 (set 'fileName
(append "series-" (string cnt
)))))
207 (write-file fileName
(list2ascii elmnt
))
208 (push (append "'" fileName
"'") fileList
))))
210 (set 'fileList
(join (reverse fileList
) ","))
211 ; write file with plot commands depending on OS - Windows doesn't have PWD
213 (write-file (append (env "HOME") "/tmp/plot") (append "set data style lines; plot " fileList
"\r\n"))
214 (write-file "plot" (append "set data style lines; plot " fileList
";pause -1;\r\n")))
217 (process (append "gnuplot -persist " (env "HOME") "/tmp/plot"))
218 (process "gnuplot plot"))))
222 ; plot to lists of numbers in XY-fashion
224 ; optionally define a style e.g: "line"
225 ; if style is unused dots are plotted
227 (define (plotXY X Y style
, lst st fle unix tempDir
)
228 (set 'tempDir
(append (env "HOME") "/tmp"))
229 (set 'unix
(< (& 0xF (last (sys-info))) 5 ))
230 (set 'lst
(map list
(map string X
) (map string Y
)))
232 (set 'fle
(open (append (env "HOME") "/tmp/plot-XY") "write"))
233 (set 'fle
(open "plot-XY" "write")))
235 (write-line (join x
" ") fle
))
239 (set 'st
(append "set data style " style
"; ")))
242 (write-file (append tempDir
"/plot") (append st
"plot '" tempDir
"/plot-XY';"))
243 (write-file "plot" (append st
"plot 'plot-XY'; pause -1;")))
245 (process (append "gnuplot -persist " tempDir
"/plot"))
246 (process "gnuplot plot")))
249 ;------------------- General uni and bi-variate statistics --------------------
251 ; sum of a data vector X
255 ; mean of a data vector X
257 (div (sum X
) (length X
)))
259 ; variance of a data vecor X
261 (div (sum-d2 X
) (sub (length X
) 1)))
263 ; standard deviation of a data vector X
267 ; sum of squares of a data vector X
269 (apply add
(map mul X X
)))
271 ; sum of the product of data vectors X*Y
273 (apply add
(map mul X Y
)))
275 ; covariance of data vectors X Y
277 (sub (sum-xy X Y
) (div (mul (sum X
) (sum Y
)) (length X
))))
279 ; sum of sqared differenses of X to mean of X
281 (sub (sum-sq X
) (div (mul (sum X
) (sum X
)) (length X
))))
283 ; Pearson r, product moment correlation of data vectors X and Y
285 (div (cov X Y
) (sqrt (mul (sum-d2 X
) (sum-d2 Y
)))))
287 ; regression Yest = b0 + b1*X calculates intercept b0 and slope b1
288 (define (regression X Y
)
289 (set 'b1
(div (cov X Y
) (sum-d2 X
)))
290 (set 'b0
(sub (mean Y
) (mul b1
(mean X
))))
293 ; fitted line using regression Y on X
294 (define (fit X Y
, coeffs b0 b1
)
295 (set 'coeffs
(regression X Y
))
296 (set 'b0
(first coeffs
))
297 (set 'b1
(last coeffs
))
298 (map (lambda (x) (add b0
(mul x b1
))) X
))
300 ; sum of squared differences of X and Y
301 (define (sum-d2xy X Y
)
302 (apply add
(map (lambda (x y
) (mul (sub x y
) (sub x y
))) X Y
)))
306 ; moments of a vector of numbers
308 (define (moments vector
, n median mean avg-dev std-dev var skew kurtosis dev sum
)
309 (set 'n
(length vector
))
311 (set 'sum
(apply add vector
))
312 (set 'mean
(div sum n
))
314 (set 'avg-dev
0 'std-dev
0 'var
0 'skew
0 'kurtosis
0)
316 (set 'dev
(map sub vector
(dup mean n
)))
317 (set 'avg-dev
(div (apply add
(map abs dev
)) n
))
318 (set 'var
(div (apply add
(map mul dev dev
)) (- n
1)))
319 (set 'skew
(apply add
(map mul dev dev dev
)))
320 (set 'kurtosis
(apply add
(map mul dev dev dev dev
)))
322 (set 'std-dev
(sqrt var
))
326 (set 'skew
(div skew
(mul n var std-dev
)))
327 (set 'kurtosis
(sub (div kurtosis
(mul n var var
)) 3.0))))
333 (set 'median
(div (add (nth mid vector
) (nth (- mid
1) vector
)) 2))
334 (set 'median
(nth mid vector
)))
336 (list n median mean avg-dev std-dev var skew kurtosis
)
338 ; (println (format "n: %d" n))
339 ; (println (format "median: %f" median))
340 ; (println (format "mean: %f" mean))
341 ; (println (format "average_deviation: %f" avg-dev))
342 ; (println (format "standard_deviation: %f" std-dev))
343 ; (println (format "variance: %f" var))
344 ; (println (format "skew: %f" skew))
345 ; (println (format "kurtosis: %f" kurtosis))
348 ;-------------------------------- Time Series ----------------------------------
350 ; expontial smoothing with 0 < alpha <= 1
351 (define (smooth lst alpha
, previous slist
)
352 (set 'previous
(first lst
))
355 (set 'previous
(add (mul alpha elmnt
) (mul (sub 1 alpha
) previous
)))
356 (push previous slist
))
361 ; seasonal difference list with variable lag
362 ; the resulting list is lag shorterm than the original
364 (define (lag lst n
, sLst
)
366 (dotimes (i n
) (pop lst
))
367 (set 'sLst
(slice sLst
0 (length lst
)))
374 (define (cumulate lst
, sc cum
)
378 (push (inc 'sc x
) cum
))
384 ; takes a rows by 2 columns matrix with real part in the first and
385 ; imagenary part the in the second column. If all numbers are real
386 ; then the second column is just 0's.
388 ; returns a matrix with two rows. First row contains power numbers
389 ; and second row contains the respective frequencies
392 (define (power ts
, lenOrg fts n n2 ps mid frqs
)
393 ; remember original length
394 (set 'lenOrg
(length ts
))
395 ; do discrete fourier transform
396 (set 'fts
(transpose (fft ts
)))
397 ; calc power spectrum
398 (set 'n
(length (transpose fts
)))
400 (set 'ps
(map (lambda (x y
) (add (mul x x
) (mul y y
))) (nth 0 fts
) (nth 1 fts
)))
401 (set 'ps
(map (lambda (x) (mul (div x n2
) 2)) ps
))
402 ; the first and last are not multiplied by 2, divide them back
403 (nth-set (ps 0) (div (first ps
) 2))
404 (set 'mid
(sub (div n
2) 1))
405 (replace mid ps
(div (nth mid ps
) 2))
406 ; calc a vector with frequencies, adjusted for the new power-2 length
407 ; which came back from the FFT
408 (set 'frqs
(sequence 1 n
))
409 (set 'frqs
(map (lambda (x) (mul (div x n
) lenOrg
)) frqs
))
410 (transpose (matrix ps frqs
)))
414 ;------------------------- multivariate statistics -----------------------------
417 ; multiple regression of variables in X onto one of varsiables in X, Y
418 ; indicated by column offset offY
420 ; X is N rows by k columns, the column at offset offY is Y
422 ; returns a matrix with two rows:
423 ; first row is regression coefficients and multiple R: b0, b1, b2 ....., R
424 ; second row is sum of squares: regression-SQ, error-SQ, total-SQ
425 ; (the unused part of the second row is zero padded)
427 ; the SQs can be used to calculate mean sqares for regression and error:
429 ; regression-MSQ = regression-SQ / (k - 1)
430 ; error-MSQ = error-sq / (n - k - 1)
432 ; F-ratio = regression-MSQ / error-MSQ
433 ; with k and (n - k - 1) df degreees of freedom
438 (define (multiple-reg X offY
, Y Ycoffs b b0 R2 Yest sqErr sqTotal sqReg sq d
)
439 (set 'covX
(cov-matrix X
))
440 (set 'Y
(extract-col X offY
))
441 ; covX is the covariance matrix
443 (set 'cvX
(transpose covX
))
444 ; the covariance matrix is reduced to cvX and the
445 ; extracted values put in Ycoffs
446 (set 'Ycoffs
(matrix (pop cvX offY
)))
447 ; b contains the regression coefficients except for b0
448 (set 'b
(multiply (invert cvX
) Ycoffs
))
449 ; calculate multiple R2 as b'*b / sqTotal
450 (set 'sqTotal
(sum-d2 Y
))
451 (set 'R2
(div (first (first (multiply (transpose b
) Ycoffs
))) sqTotal
))
452 ; estimate Y without b0
453 (set 'Yest
(multiply (reduce-col X offY
) b
))
454 ; calculate b0, d is the difference between Y and the Y estimate
455 ; b0 is the mean of differences between Y and Yest
456 (set 'd
(mat-map sub
(matrix Y
) Yest
))
457 (set 'b0
(mean (first (transpose d
))))
458 ; estimate Y including b0
459 (set 'Yest
(mat-map add Yest
(matrix (dup b0
(length Yest
)))))
460 ; error sum of squares
461 (set 'sqErr
(sum-d2xy Y
(first (transpose Yest
))))
462 ; regression sum of squares
463 (set 'sqReg
(sub sqTotal sqErr
))
464 ; make list b out of b0, b1, b2 ... sqrt(R2)
465 (set 'b
(append (list b0
) (first (transpose b
)) (list (sqrt R2
))))
466 ; make list sq out of sqReg, sqErr and sqTotal
467 (set 'sq
(list sqReg sqErr sqTotal
))
468 ; return matrix with two rows:
469 (transpose (matrix b sq
)))
474 ; covariance matrix cov
476 ; matrix x with N rows and k columns
479 (define (cov-matrix X
, XtX N I sumX sumX2
)
480 (set 'XtX
(multiply (transpose X
) X
))
482 (set 'I
(matrix (dup t1 N
)))
483 (set 'sumX
(multiply (transpose X
) I
))
484 (set 'sumX2
(multiply sumX
(transpose sumX
)))
485 (set 'sumX2
(multiply sumX2
(diagonal (div 1 N
) (length sumX2
))))
486 (mat-map sub XtX sumX2
))
492 ; matrix X with N rows and k columns
495 (define (corr-matrix X
, covX N d dd
)
496 (set 'covX
(cov-matrix X
))
497 (set 'd
(matrix (get-diagonal covX
)))
498 (set 'dd
(multiply d
(transpose d
)))
499 (set 'dd
(map (lambda (z) (map sqrt z
)) dd
))
500 (mat-map div covX dd
))
504 ; probablity of F variance ratio with degrees of freedom df1 df2
507 (define (f-prob F df1 df2
)
508 (let (prob (mul 2 (betai (div df2
(add df2
(mul df1 F
)))
511 (div (if (> prob
1) (sub 2 prob
) prob
) 2)))
514 ;----------------------------- utility functions -------------------------------
517 ; make a matrix from 1 up to 16 lists
524 ; make a diagonal matrix n by n and elmnt in the diagonal
527 (define (diagonal elmnt n
, m lst
)
531 (nth-set (lst i
) elmnt
)
536 ; get the diagonal from a square matrix
538 (define (get-diagonal X
, d x
)
540 (dotimes (idx (length X
))
541 (push (nth idx
(nth idx X
)) d
))
547 ; e.g.: (mat-map sub A B) ;; for matrix subtraction
549 (define (mat-map op A B
)
550 (map (lambda (x y
) (map op x y
)) A B
))
553 ; reduce matrix by a column at offset
555 ; returns the reduced matrix
557 (define (reduce-col mat off
, X
)
558 (set 'mat
(transpose mat
))
563 ; extract a column from a matrix
565 ; returns the extracted column
567 (define (extract-col mat off
, X
)
568 (pop (transpose mat
) off
))
571 ; convert list to ascii lines terminated by CR-LF
572 ; for storage in files usable by Gnuplot, R, Excel etc.
576 ; (write-file "MyData.txt" (list2ascii mydata-list))
578 (define (list2ascii lst
)
579 (append (join (map string lst
) "\r\n") "\r\n"))