1 /* Original version of this file copyright 1999 by Michael Wester,
2 * and retrieved from http://www.math.unm.edu/~wester/demos/Statistics/problems.macsyma
5 * Released under the terms of the GNU General Public License, version 2,
6 * per message dated 2007-06-03 from Michael Wester to Robert Dodier
7 * (contained in the file wester-gpl-permission-message.txt).
9 * See: "A Critique of the Mathematical Abilities of CA Systems"
10 * by Michael Wester, pp 25--60 in
11 * "Computer Algebra Systems: A Practical Guide", edited by Michael J. Wester
12 * and published by John Wiley and Sons, Chichester, United Kingdom, 1999.
14 /* ----------[ M a c s y m a ]---------- */
15 /* ---------- Initialization ---------- */
18 /* ---------- Statistics ---------- */
19 /* Compute the mean of a list of numbers => 9 */
20 sample_mean([3, 7, 11, 5, 19]);
21 /* Compute the median of a list of numbers => 7 */
22 sample_median([3, 7, 11, 5, 19]);
23 /* Compute the first quartile (25% quantile) of a list of numbers => 2 or 5/2 */
24 xx: [1, 2, 3, 4, 5, 6, 7, 8]$
26 /* Compute the mode (the most frequent item) of a list of numbers => 7 */
27 [3, 7, 11, 7, 3, 5, 7];
28 /* Compute the unbiased sample standard deviation of a list of numbers
30 sample_standard_deviation([1, 2, 3, 4, 5]);
31 /* Discrete distributions---what is the probability of finding exactly 12
32 switches that work from a group of 15 where the probability of a single one
33 working is 75%? (Need to use the probability density function [PDF] of the
34 binomial distribution.) => 0.22520 */
35 binomial_density(12, 15, .75);
36 /* Replace `exactly' by `up through' in the above. (Need to use the cumulative
37 probability density function [CDF] of the binomial distribution.) => 0.76391
39 binomial_distrib(12, 15, .75);
40 /* Continuous distributions---if a radiation emission can be modeled by a
41 normal distribution with a mean of 4.35 mrem and a standard deviation of
42 0.59 mrem, what is the probability of being exposed to anywhere from 4 to 5
44 normal_distrib(5, 4.35, 0.59^2) - normal_distrib(4, 4.35, 0.59^2);
45 /* Hypothesis testing---how good of a guess is 5 for the mean of xx? */
46 xx: [1, -2, 3, -4, 5, -6, 7, -8, 9, 10]$
47 /* Using Student's T distribution (preferred) => 0.057567 */
48 students_t_distrib((sample_mean(xx) - 5)/(sample_standard_deviation(xx) /
49 sqrt(length(xx))), length(xx) - 1);
51 /* Using the normal distribution (as an alternative) => 0.040583 */
52 standard_normal_distrib((sample_mean(xx) - 5)/(sample_standard_deviation(xx) /
56 /* Chi-square test---what is the expectation that row characteristics are
57 independent of column characteristics for a two dimensional array of data?
58 => 0.469859 (chi2 = 1153/252) */
59 x: matrix([41, 27, 22], [79, 53, 78])$
62 rowSum: makelist(sum(x[i, j], j, 1, n), i, 1, m)$
63 colSum: makelist(sum(x[i, j], i, 1, m), j, 1, n)$
64 matSum: sum(rowSum[i], i, 1, m)$
65 e: genmatrix(lambda([i, j], rowSum[i]*colSum[j]/matSum), m, n)$
66 chi2: sum(sum((x[i, j] - e[i, j])^2/e[i, j], i, 1, m), j, 1, n);
67 1 - chi_square_distrib(chi2, m*n - 1);
69 remvalue(chi2, colSum, matSum, rowSum, e, m, n, x)$
70 /* Linear regression (age as a function of developmental score). See Lambert
71 H. Koopmans, _Introduction to Contemporary Statistical Methods_, Second
72 Edition, Duxbury Press, 1987, p. 459 => y' = 0.7365 x + 6.964 */
73 t: [[3.33, 3.25, 3.92, 3.50, 4.33, 4.92, 6.08, 7.42, 8.33, 8.00, 9.25,
75 [8.61, 9.40, 9.86, 9.91, 10.53, 10.61, 10.59, 13.28, 12.76, 13.44, 14.27,
77 lsq1(t[1], t[2], 1, x);
78 lsq_linear(t[1], t[2], [1, x], [x]);
80 /* Multiple linear regression (income as a function of age and years of
81 college) => y = -16278.7 + 960.925 x1 + 2975.66 x2 */
82 [[37, 45, 38, 42, 31], [4, 0, 5, 2, 4], [31200, 26800, 35000, 30300, 25400]]$
83 lsq_linear(makelist([%[1][i], %[2][i]], i, 1, length(%[1])), %[3],
84 [1, x1, x2], [x1, x2]);
85 /* Multiple linear regression using the L1 or Least Absolute Deviations
86 technique rather than the Least Squares technique (minimizing the sum of the
87 absolute values of the residuals rather than the sum of the squares of the
88 residuals). Here, the Stack-loss Data is used (percentage of ammonia lost
89 times 10 from the operation of a plant over 21 days as a function of air
90 flow to the plant, cooling water inlet temperature and acid concentration).
91 See W. N. Venables and B. D. Ripley, _Modern Applied Statistics with
92 S-plus_, Springer, 1994, p. 218.
93 => y = 0.83188 x1 + 0.57391 x2 - 0.06086 x3 - 39.68984 */
94 [[80, 80, 75, 62, 62, 62, 62, 62, 58, 58, 58, 58, 58, 58, 50, 50, 50, 50, 50,
96 [27, 27, 25, 24, 22, 23, 24, 24, 23, 18, 18, 17, 18, 19, 18, 18, 19, 19, 20,
98 [89, 88, 90, 87, 87, 87, 93, 93, 87, 80, 89, 88, 82, 93, 89, 86, 72, 79, 80,
100 [42, 37, 37, 28, 18, 18, 19, 20, 15, 14, 14, 13, 11, 12, 8, 7, 8, 8, 9,
102 lsq_linear(makelist([%[1][i], %[2][i], %[3][i]], i, 1, length(%[1])), %[4],
103 [1, x1, x2, x3], [x1, x2, x3]);
104 /* Nonlinear regression (Weight Loss Data from an Obese Patient consisting of
105 the time in days and the weight in kilograms of a patient undergoing a
106 weight rehabilitation program). Fit this using least squares to weight =
107 b0 + b1 2^(- days/th), starting at (b0, b1, th) = (90, 95, 120) [Venables
108 and Ripley, p. 225] => weight = 81.37375 + 102.6842 2^(- days/141.9105) */
110 [[ 0, 4, 7, 7, 11, 18, 24, 30, 32, 43, 46, 60, 64, 70, 71,
111 71, 73, 74, 84, 88, 95, 102, 106, 109, 115, 122, 133, 137, 140, 143,
112 147, 148, 149, 150, 153, 156, 161, 164, 165, 165, 170, 176, 179, 198, 214,
113 218, 221, 225, 233, 238, 241, 246],
114 [184.35, 182.51, 180.45, 179.91, 177.91, 175.81, 173.11, 170.06, 169.31,
115 165.10, 163.11, 158.30, 155.80, 154.31, 153.86, 154.20, 152.20, 152.80,
116 150.30, 147.80, 146.10, 145.60, 142.50, 142.30, 139.40, 137.90, 133.70,
117 133.70, 133.30, 131.20, 133.00, 132.20, 130.80, 131.30, 129.00, 127.90,
118 126.90, 127.70, 129.50, 128.40, 125.40, 124.90, 124.90, 118.20, 118.20,
119 115.30, 115.70, 116.00, 115.50, 112.60, 114.00, 112.60]]$
120 errcatch(lsq_nonlinear(q[1], q[2], b0 + b1*2^(- days/th), [days], [b0, b1, th],
122 lsq_nonlinear(q[1], q[2], b0 + b1*2.0^(- days/th), [days], [b0, b1, th],