4 // A simple, literate statistics system. The code below uses the
5 // [Javascript module pattern](http://www.adequatelygood.com/2010/3/JavaScript-Module-Pattern-In-Depth),
6 // eventually assigning `simple-statistics` to `ss` in browsers or the
7 // `exports` object for node.js
11 if (typeof module
!== 'undefined') {
12 // Assign the `ss` object to exports, so that you can require
13 // it in [node.js](http://nodejs.org/)
16 // Otherwise, in a browser, we assign `ss` to the window object,
17 // so you can simply refer to it as `ss`.
21 // # [Linear Regression](http://en.wikipedia.org/wiki/Linear_regression)
23 // [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
24 // is a simple way to find a fitted line
25 // between a set of coordinates.
26 function linear_regression() {
30 // Assign data to the model. Data is assumed to be an array.
31 linreg
.data = function(x
) {
32 if (!arguments
.length
) return data
;
37 // Calculate the slope and y-intercept of the regression line
38 // by calculating the least sum of squares
39 linreg
.mb = function() {
42 // Store data length in a local variable to reduce
43 // repeated object property lookups
44 var data_length
= data
.length
;
46 //if there's only one point, arbitrarily choose a slope of 0
47 //and a y-intercept of whatever the y of the initial point is
48 if (data_length
=== 1) {
52 // Initialize our sums and scope the `m` and `b`
53 // variables that define the line.
54 var sum_x
= 0, sum_y
= 0,
55 sum_xx
= 0, sum_xy
= 0;
57 // Use local variables to grab point values
58 // with minimal object property lookups
61 // Gather the sum of all x values, the sum of all
62 // y values, and the sum of x^2 and (x*y) for each
65 // In math notation, these would be SS_x, SS_y, SS_xx, and SS_xy
66 for (var i
= 0; i
< data_length
; i
++) {
78 // `m` is the slope of the regression line
79 m
= ((data_length
* sum_xy
) - (sum_x
* sum_y
)) /
80 ((data_length
* sum_xx
) - (sum_x
* sum_x
));
82 // `b` is the y-intercept of the line.
83 b
= (sum_y
/ data_length
) - ((m
* sum_x
) / data_length
);
86 // Return both values as an object.
87 return { m
: m
, b
: b
};
90 // a shortcut for simply getting the slope of the regression line
91 linreg
.m = function() {
95 // a shortcut for simply getting the y-intercept of the regression
97 linreg
.b = function() {
101 // ## Fitting The Regression Line
103 // This is called after `.data()` and returns the
104 // equation `y = f(x)` which gives the position
105 // of the regression line at each point in `x`.
106 linreg
.line = function() {
108 // Get the slope, `m`, and y-intercept, `b`, of the line.
109 var mb
= linreg
.mb(),
113 // Return a function that computes a `y` value for each
114 // x value it is given, based on the values of `b` and `a`
115 // that we just computed.
124 // # [R Squared](http://en.wikipedia.org/wiki/Coefficient_of_determination)
126 // The r-squared value of data compared with a function `f`
127 // is the sum of the squared differences between the prediction
128 // and the actual value.
// The r-squared value of data compared with a function `f`
// is the sum of the squared differences between the prediction
// and the actual value.
function r_squared(data, f) {
    // With fewer than two points the fit is trivially perfect.
    if (data.length < 2) return 1;

    // Compute the average y value for the actual
    // data set in order to compute the
    // _total sum of squares_
    var sum = 0, average;
    for (var i = 0; i < data.length; i++) {
        sum += data[i][1];
    }
    average = sum / data.length;

    // Compute the total sum of squares - the
    // squared difference between each point
    // and the average of all points.
    var sum_of_squares = 0;
    for (var j = 0; j < data.length; j++) {
        sum_of_squares += Math.pow(average - data[j][1], 2);
    }

    // Finally estimate the error: the squared
    // difference between the estimate and the actual data
    // value at each point.
    var err = 0;
    for (var k = 0; k < data.length; k++) {
        err += Math.pow(data[k][1] - f(data[k][0]), 2);
    }

    // As the error grows larger, its ratio to the
    // sum of squares increases and the r squared
    // value grows lower.
    return 1 - (err / sum_of_squares);
}
164 // # [Bayesian Classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
166 // This is a naïve bayesian classifier that takes
167 // singly-nested objects.
168 function bayesian() {
169 // The `bayes_model` object is what will be exposed
170 // by this closure, with all of its extended methods, and will
171 // have access to all scope variables, like `total_count`.
172 var bayes_model
= {},
173 // The number of items that are currently
174 // classified in the model
176 // Every item classified in the model
180 // Train the classifier with a new item, which has a single
181 // dimension of Javascript literal keys and values.
182 bayes_model
.train = function(item
, category
) {
183 // If the data object doesn't have any values
184 // for this category, create a new object for it.
185 if (!data
[category
]) data
[category
] = {};
187 // Iterate through each key in the item.
188 for (var k
in item
) {
190 // Initialize the nested object `data[category][k][item[k]]`
191 // with an object of keys that equal 0.
192 if (data
[category
][k
] === undefined) data
[category
][k
] = {};
193 if (data
[category
][k
][v
] === undefined) data
[category
][k
][v
] = 0;
195 // And increment the key for this key/value combination.
196 data
[category
][k
][item
[k
]]++;
198 // Increment the number of items classified
203 // Generate a score of how well this item matches all
204 // possible categories based on its attributes
205 bayes_model
.score = function(item
) {
206 // Initialize an empty array of odds per category.
207 var odds
= {}, category
;
208 // Iterate through each key in the item,
209 // then iterate through each category that has been used
210 // in previous calls to `.train()`
211 for (var k
in item
) {
213 for (category
in data
) {
214 // Create an empty object for storing key - value combinations
215 // for this category.
216 if (odds
[category
] === undefined) odds
[category
] = {};
218 // If this item doesn't even have a property, it counts for nothing,
219 // but if it does have the property that we're looking for from
220 // the item to categorize, it counts based on how popular it is
221 // versus the whole population.
222 if (data
[category
][k
]) {
223 odds
[category
][k
+ '_' + v
] = (data
[category
][k
][v
] || 0) / total_count
;
225 odds
[category
][k
+ '_' + v
] = 0;
230 // Set up a new object that will contain sums of these odds by category
233 for (category
in odds
) {
234 // Tally all of the odds for each category-combination pair -
235 // the non-existence of a category does not add anything to the
237 for (var combination
in odds
[category
]) {
238 if (odds_sums
[category
] === undefined) odds_sums
[category
] = 0;
239 odds_sums
[category
] += odds
[category
][combination
];
246 // Return the completed model.
252 // is simply the result of adding all numbers
253 // together, starting from zero.
255 // This runs on `O(n)`, linear time in respect to the array
258 for (var i
= 0; i
< x
.length
; i
++) {
266 // is the sum over the number of values
268 // This runs on `O(n)`, linear time in respect to the array
270 // The mean of no numbers is null
271 if (x
.length
=== 0) return null;
273 return sum(x
) / x
.length
;
278 // a mean function that is more useful for numbers in different
281 // this is the nth root of the input numbers multiplied by each other
283 // This runs on `O(n)`, linear time in respect to the array
function geometric_mean(x) {
    // The mean of no numbers is null
    if (x.length === 0) return null;

    // the starting value: 1 is the identity for multiplication
    var value = 1;

    for (var i = 0; i < x.length; i++) {
        // the geometric mean is only valid for positive numbers
        if (x[i] <= 0) return null;

        // repeatedly multiply the value by each number
        value *= x[i];
    }

    // the geometric mean is the nth root of the accumulated product
    return Math.pow(value, 1 / x.length);
}
305 // a mean function typically used to find the average of rates
307 // this is the reciprocal of the arithmetic mean of the reciprocals
308 // of the input numbers
310 // This runs on `O(n)`, linear time in respect to the array
function harmonic_mean(x) {
    // The mean of no numbers is null
    if (x.length === 0) return null;

    // accumulate the sum of the reciprocal of every value
    var reciprocal_sum = 0;
    for (var i = 0; i < x.length; i++) {
        // the harmonic mean is only valid for positive numbers
        if (x[i] <= 0) return null;

        reciprocal_sum += 1 / x[i];
    }

    // divide n by the reciprocal sum
    return x.length / reciprocal_sum;
}
328 // root mean square (RMS)
330 // a mean function used as a measure of the magnitude of a set
331 // of numbers, regardless of their sign
333 // this is the square root of the mean of the squares of the
336 // This runs on `O(n)`, linear time in respect to the array
function root_mean_square(x) {
    // The RMS of no numbers is null
    if (x.length === 0) return null;

    // accumulate the squares of every value
    var sum_of_squares = 0;
    for (var i = 0; i < x.length; i++) {
        sum_of_squares += Math.pow(x[i], 2);
    }

    // the RMS is the square root of the mean of the squares
    return Math.sqrt(sum_of_squares / x.length);
}
350 // This is simply the minimum number in the set.
352 // This runs on `O(n)`, linear time in respect to the array
355 for (var i
= 0; i
< x
.length
; i
++) {
356 // On the first iteration of this loop, min is
357 // undefined and is thus made the minimum element in the array
358 if (x
[i
] < value
|| value
=== undefined) value
= x
[i
];
365 // This is simply the maximum number in the set.
367 // This runs on `O(n)`, linear time in respect to the array
370 for (var i
= 0; i
< x
.length
; i
++) {
371 // On the first iteration of this loop, max is
372 // undefined and is thus made the maximum element in the array
373 if (x
[i
] > value
|| value
=== undefined) value
= x
[i
];
378 // # [variance](http://en.wikipedia.org/wiki/Variance)
380 // is the sum of squared deviations from the mean
382 // depends on `mean()`
// The population variance: the mean of the squared
// deviations of every value from the mean.
// Depends on `mean()`.
function variance(x) {
    // The variance of no numbers is null
    if (x.length === 0) return null;

    var mean_value = mean(x),
        deviations = [];

    // Make a list of squared deviations from the mean.
    for (var i = 0; i < x.length; i++) {
        deviations.push(Math.pow(x[i] - mean_value, 2));
    }

    // Find the mean value of that list
    return mean(deviations);
}
399 // # [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
401 // is just the square root of the variance.
403 // depends on `variance()`
// The population standard deviation: the square root
// of the variance. Depends on `variance()`.
function standard_deviation(x) {
    // The standard deviation of no numbers is null
    if (x.length === 0) return null;

    return Math.sqrt(variance(x));
}
411 // The sum of deviations to the Nth power.
412 // When n=2 it's the sum of squared deviations.
413 // When n=3 it's the sum of cubed deviations.
415 // depends on `mean()`
// The sum of deviations to the Nth power.
// When n=2 it's the sum of squared deviations;
// when n=3 it's the sum of cubed deviations.
// Depends on `mean()`.
function sum_nth_power_deviations(x, n) {
    var mean_value = mean(x),
        sum = 0;

    for (var i = 0; i < x.length; i++) {
        sum += Math.pow(x[i] - mean_value, n);
    }

    return sum;
}
427 // # [variance](http://en.wikipedia.org/wiki/Variance)
429 // is the sum of squared deviations from the mean
431 // depends on `sum_nth_power_deviations`
// The sample variance: squared deviations from the mean,
// with Bessel's correction (dividing by n - 1).
// Depends on `sum_nth_power_deviations()`.
function sample_variance(x) {
    // The sample variance of zero or one numbers is null
    if (x.length <= 1) return null;

    var sum_squared_deviations_value = sum_nth_power_deviations(x, 2);

    // Divide by the sample size minus one (Bessel's correction)
    return sum_squared_deviations_value / (x.length - 1);
}
442 // # [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
444 // is just the square root of the variance.
446 // depends on `sample_variance()`
// The sample standard deviation: the square root of the
// sample variance. Depends on `sample_variance()`.
function sample_standard_deviation(x) {
    // The sample standard deviation of zero or one numbers is null
    if (x.length <= 1) return null;

    return Math.sqrt(sample_variance(x));
}
454 // # [covariance](http://en.wikipedia.org/wiki/Covariance)
456 // sample covariance of two datasets:
457 // how much do the two datasets move together?
458 // x and y are two datasets, represented as arrays of numbers.
460 // depends on `mean()`
// Sample covariance of two datasets:
// how much do the two datasets move together?
// `x` and `y` are two datasets, represented as arrays of numbers.
// Depends on `mean()`.
function sample_covariance(x, y) {
    // The two datasets must have the same length which must be more than 1
    if (x.length <= 1 || x.length != y.length) {
        return null;
    }

    // determine the mean of each dataset so that we can judge each
    // value of the dataset fairly as the difference from the mean. this
    // way, if one dataset is [1, 2, 3] and [2, 3, 4], their covariance
    // does not suffer because of the difference in absolute values
    var xmean = mean(x),
        ymean = mean(y),
        sum = 0;

    // for each pair of values, the covariance increases when their
    // difference from the mean is associated - if both are well above
    // or if both are well below
    // the mean, the covariance increases significantly.
    for (var i = 0; i < x.length; i++) {
        sum += (x[i] - xmean) * (y[i] - ymean);
    }

    // the covariance is weighted by the length of the datasets.
    return sum / (x.length - 1);
}
488 // # [correlation](http://en.wikipedia.org/wiki/Correlation_and_dependence)
490 // Gets a measure of how correlated two datasets are, between -1 and 1
492 // depends on `sample_standard_deviation()` and `sample_covariance()`
// A measure of how correlated two datasets are, between -1 and 1.
// Depends on `sample_standard_deviation()` and `sample_covariance()`.
function sample_correlation(x, y) {
    var cov = sample_covariance(x, y),
        xstd = sample_standard_deviation(x),
        ystd = sample_standard_deviation(y);

    // If any component is undefined (datasets too short or of
    // mismatched lengths), the correlation is undefined too.
    if (cov === null || xstd === null || ystd === null) {
        return null;
    }

    // correlation is covariance normalized by both standard deviations
    return cov / xstd / ystd;
}
505 // # [median](http://en.wikipedia.org/wiki/Median)
507 // The middle number of a list. This is often a good indicator of 'the middle'
508 // when there are outliers that skew the `mean()` value.
510 // The median of an empty list is null
511 if (x
.length
=== 0) return null;
513 // Sorting the array makes it easy to find the center, but
514 // use `.slice()` to ensure the original array `x` is not modified
515 var sorted
= x
.slice().sort(function (a
, b
) { return a
- b
; });
517 // If the length of the list is odd, it's the central number
518 if (sorted
.length
% 2 === 1) {
519 return sorted
[(sorted
.length
- 1) / 2];
520 // Otherwise, the median is the average of the two numbers
521 // at the center of the list
523 var a
= sorted
[(sorted
.length
/ 2) - 1];
524 var b
= sorted
[(sorted
.length
/ 2)];
529 // # [mode](http://bit.ly/W5K4Yt)
531 // The mode is the number that appears in a list the highest number of times.
532 // There can be multiple modes in a list: in the event of a tie, this
533 // algorithm will return the most recently seen mode.
535 // This implementation is inspired by [science.js](https://github.com/jasondavies/science.js/blob/master/src/stats/mode.js)
537 // This runs on `O(n)`, linear time in respect to the array
540 // Handle edge cases:
541 // The median of an empty list is null
542 if (x
.length
=== 0) return null;
543 else if (x
.length
=== 1) return x
[0];
545 // Sorting the array lets us iterate through it below and be sure
546 // that every time we see a new number it's new and we'll never
547 // see the same number twice
548 var sorted
= x
.slice().sort(function (a
, b
) { return a
- b
; });
550 // This assumes it is dealing with an array of size > 1, since size
551 // 0 and 1 are handled immediately. Hence it starts at index 1 in the
553 var last
= sorted
[0],
554 // store the mode as we find new modes
556 // store how many times we've seen the mode
558 // how many times the current candidate for the mode
562 // end at sorted.length + 1 to fix the case in which the mode is
563 // the highest number that occurs in the sequence. the last iteration
564 // compares sorted[i], which is undefined, to the highest number
566 for (var i
= 1; i
< sorted
.length
+ 1; i
++) {
567 // we're seeing a new number pass by
568 if (sorted
[i
] !== last
) {
569 // the last number is the new mode since we saw it more
570 // often than the old one
571 if (seen_this
> max_seen
) {
572 max_seen
= seen_this
;
577 // if this isn't a new number, it's one more occurrence of
578 // the potential mode
579 } else { seen_this
++; }
584 // # [t-test](http://en.wikipedia.org/wiki/Student's_t-test)
586 // This is to compute a one-sample t-test, comparing the mean
587 // of a sample to a known value, x.
589 // in this case, we're trying to determine whether the
590 // population mean is equal to the value that we know, which is `x`
591 // here. usually the results here are used to look up a
592 // [p-value](http://en.wikipedia.org/wiki/P-value), which, for
593 // a certain level of significance, will let you determine that the
594 // null hypothesis can or cannot be rejected.
596 // Depends on `standard_deviation()` and `mean()`
// One-sample t-test: compares the mean of `sample` to the known
// value `x`, returning the t statistic.
// Depends on `standard_deviation()` and `mean()`.
function t_test(sample, x) {
    // The mean of the sample
    var sample_mean = mean(sample);

    // The standard deviation of the sample
    var sd = standard_deviation(sample);

    // Square root the length of the sample
    var rootN = Math.sqrt(sample.length);

    // Compute the known value against the sample,
    // returning the t value
    return (sample_mean - x) / (sd / rootN);
}
612 // # [2-sample t-test](http://en.wikipedia.org/wiki/Student's_t-test)
614 // This is to compute two sample t-test.
615 // Tests whether "mean(X)-mean(Y) = difference", (
616 // in the most common case, we often have `difference == 0` to test if two samples
617 // are likely to be taken from populations with the same mean value) with
618 // no prior knowledge on standard deviations of both samples
619 // other than the fact that they have the same standard deviation.
621 // Usually the results here are used to look up a
622 // [p-value](http://en.wikipedia.org/wiki/P-value), which, for
623 // a certain level of significance, will let you determine that the
624 // null hypothesis can or cannot be rejected.
626 // `diff` can be omitted if it equals 0.
628 // [This is used to confirm or deny](http://www.monarchlab.org/Lab/Research/Stats/2SampleT.aspx)
629 // a null hypothesis that the two populations that have been sampled into
630 // `sample_x` and `sample_y` are equal to each other.
632 // Depends on `sample_variance()` and `mean()`
// Two-sample t-test with pooled variance: tests whether
// `mean(sample_x) - mean(sample_y) === difference`, assuming both
// samples share the same (unknown) standard deviation.
// `difference` may be omitted, in which case it defaults to 0.
// Depends on `sample_variance()` and `mean()`.
function t_test_two_sample(sample_x, sample_y, difference) {
    var n = sample_x.length,
        m = sample_y.length;

    // If either sample doesn't actually have any values, we can't
    // compute this at all, so we return `null`.
    if (!n || !m) return null;

    // default difference (mu) is zero
    if (!difference) difference = 0;

    var meanX = mean(sample_x),
        meanY = mean(sample_y);

    // pooled (weighted) variance of the two samples
    var weightedVariance = ((n - 1) * sample_variance(sample_x) +
        (m - 1) * sample_variance(sample_y)) / (n + m - 2);

    // the t statistic: observed difference in means, minus the
    // hypothesized difference, over the pooled standard error
    return (meanX - meanY - difference) /
        Math.sqrt(weightedVariance * (1 / n + 1 / m));
}
656 // Split an array into chunks of a specified size. This function
657 // has the same behavior as [PHP's array_chunk](http://php.net/manual/en/function.array-chunk.php)
658 // function, and thus will insert smaller-sized chunks at the end if
659 // the input size is not divisible by the chunk size.
661 // `sample` is expected to be an array, and `chunkSize` a number.
662 // The `sample` array can contain any kind of data.
// Split an array into chunks of a specified size, like
// PHP's array_chunk: the final chunk may be smaller when the
// input length is not divisible by `chunkSize`.
// `sample` is expected to be an array, and `chunkSize` a number.
function chunk(sample, chunkSize) {
    // a list of result chunks, as arrays in an array
    var output = [];

    // `chunkSize` must be zero or higher - otherwise the loop below,
    // in which we call `start += chunkSize`, will loop infinitely.
    // So, we'll detect and return null in that case to indicate
    // invalid input.
    if (chunkSize <= 0) {
        return null;
    }

    // `start` is the index at which `.slice` will start selecting
    // new array elements
    for (var start = 0; start < sample.length; start += chunkSize) {
        // for each chunk, slice that part of the array and add it
        // to the output. The `.slice` function does not change
        // the original array.
        output.push(sample.slice(start, start + chunkSize));
    }

    return output;
}
688 // # shuffle_in_place
690 // A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
691 // in-place - which means that it will change the order of the original
692 // array by reference.
// A Fisher-Yates shuffle performed in place: the order of the
// original array is changed by reference, and the same array is
// returned.
function shuffle_in_place(sample, randomSource) {
    // a custom random number source can be provided if you want to use
    // a fixed seed or another random number generator, like
    // [random-js](https://www.npmjs.org/package/random-js)
    randomSource = randomSource || Math.random;

    // store the current length of the sample to determine
    // when no elements remain to shuffle.
    var length = sample.length;

    // temporary is used to hold an item when it is being
    // swapped between indices.
    var temporary;

    // The index to swap at each stage.
    var index;

    // While there are still items to shuffle
    while (length > 0) {
        // chose a random index within the subset of the array
        // that is not yet shuffled
        index = Math.floor(randomSource() * length--);

        // store the value that we'll move temporarily
        temporary = sample[length];

        // swap the value at `sample[length]` with `sample[index]`
        sample[length] = sample[index];
        sample[index] = temporary;
    }

    return sample;
}
730 // A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
731 // is a fast way to create a random permutation of a finite set.
// A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
// that does not modify its input: a shallow copy is shuffled instead.
function shuffle(sample, randomSource) {
    // slice the original array so that it is not modified -
    // `shuffle_in_place` reorders its argument by reference, so we
    // hand it a shallow copy. (The previous code sliced twice,
    // producing a redundant intermediate copy; one copy suffices.)
    return shuffle_in_place(sample.slice(), randomSource);
}
742 // Create a [simple random sample](http://en.wikipedia.org/wiki/Simple_random_sample)
743 // from a given array of `n` elements.
// Create a [simple random sample](http://en.wikipedia.org/wiki/Simple_random_sample)
// of `n` elements from `array`, without modifying the input.
// Depends on `shuffle()`.
function sample(array, n, randomSource) {
    // shuffle the original array using a fisher-yates shuffle
    var shuffled = shuffle(array, randomSource);

    // and then return a subset of it - the first `n` elements.
    return shuffled.slice(0, n);
}
754 // This is a population quantile, since we assume to know the entire
755 // dataset in this library. Thus I'm trying to follow the
756 // [Quantiles of a Population](http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population)
757 // algorithm from wikipedia.
759 // Sample is a one-dimensional array of numbers,
760 // and p is either a decimal number from 0 to 1 or an array of decimal
761 // numbers from 0 to 1.
762 // In terms of a k/q quantile, p = k/q - it's just dealing with fractions or dealing
763 // with decimal values.
764 // When p is an array, the result of the function is also an array containing the appropriate
765 // quantiles in input order
// Population quantile. `sample` is a one-dimensional array of
// numbers and `p` is either a decimal number from 0 to 1 or an
// array of such numbers; when `p` is an array, an array of
// quantiles is returned in input order.
// Depends on `quantile_sorted()`.
function quantile(sample, p) {
    // We can't derive quantiles from an empty list
    if (sample.length === 0) return null;

    // Sort a copy of the array. We'll need a sorted array to index
    // the values in sorted order.
    var sorted = sample.slice().sort(function (a, b) { return a - b; });

    if (p.length) {
        // Initialize the result array
        var results = [];

        // For each requested quantile
        for (var i = 0; i < p.length; i++) {
            results[i] = quantile_sorted(sorted, p[i]);
        }

        return results;
    } else {
        // A single quantile was requested
        return quantile_sorted(sorted, p);
    }
}
790 // This is the internal implementation of quantiles: when you know
791 // that the order is sorted, you don't need to re-sort it, and the computations
// Internal implementation of quantiles: assumes `sample` is
// already sorted, so it is not re-sorted here.
function quantile_sorted(sample, p) {
    var idx = (sample.length) * p;
    if (p < 0 || p > 1) {
        // a quantile outside [0, 1] is invalid
        return null;
    } else if (p === 1) {
        // If p is 1, directly return the last element
        return sample[sample.length - 1];
    } else if (p === 0) {
        // If p is 0, directly return the first element
        return sample[0];
    } else if (idx % 1 !== 0) {
        // If p is not integer, return the next element in array
        return sample[Math.ceil(idx) - 1];
    } else if (sample.length % 2 === 0) {
        // If the list has even-length, we'll take the average of this number
        // and the next value, if there is one
        return (sample[idx - 1] + sample[idx]) / 2;
    } else {
        // Finally, in the simple case of an integer value
        // with an odd-length list, return the sample value at the index.
        return sample[idx];
    }
}
817 // # [Interquartile range](http://en.wikipedia.org/wiki/Interquartile_range)
819 // A measure of statistical dispersion, or how scattered, spread, or
820 // concentrated a distribution is. It's computed as the difference between
821 // the third quartile and first quartile.
// Interquartile range: a measure of statistical dispersion,
// computed as the difference between the third and first quartiles.
// Depends on `quantile()`.
function iqr(sample) {
    // We can't derive quantiles from an empty list
    if (sample.length === 0) return null;

    // Interquartile range is the span between the upper quartile,
    // at `0.75`, and lower quartile, `0.25`
    return quantile(sample, 0.75) - quantile(sample, 0.25);
}
831 // # [Median Absolute Deviation](http://en.wikipedia.org/wiki/Median_absolute_deviation)
833 // The Median Absolute Deviation (MAD) is a robust measure of statistical
834 // dispersion. It is more resilient to outliers than the standard deviation.
836 // The mad of nothing is null
837 if (!x
|| x
.length
=== 0) return null;
839 var median_value
= median(x
),
840 median_absolute_deviations
= [];
842 // Make a list of absolute deviations from the median
843 for (var i
= 0; i
< x
.length
; i
++) {
844 median_absolute_deviations
.push(Math
.abs(x
[i
] - median_value
));
847 // Find the median value of that list
848 return median(median_absolute_deviations
);
851 // ## Compute Matrices for Jenks
853 // Compute the matrices required for Jenks breaks. These matrices
854 // can be used for any classing of data with `classes <= n_classes`
855 function jenksMatrices(data
, n_classes
) {
857 // in the original implementation, these matrices are referred to
860 // * lower_class_limits (LC): optimal lower class limits
861 // * variance_combinations (OP): optimal variance combinations for all classes
862 var lower_class_limits
= [],
863 variance_combinations
= [],
866 // the variance, as computed at each step in the calculation
869 // Initialize and fill each matrix with zeroes
870 for (i
= 0; i
< data
.length
+ 1; i
++) {
871 var tmp1
= [], tmp2
= [];
872 // despite these arrays having the same values, we need
873 // to keep them separate so that changing one does not change
875 for (j
= 0; j
< n_classes
+ 1; j
++) {
879 lower_class_limits
.push(tmp1
);
880 variance_combinations
.push(tmp2
);
883 for (i
= 1; i
< n_classes
+ 1; i
++) {
884 lower_class_limits
[1][i
] = 1;
885 variance_combinations
[1][i
] = 0;
886 // in the original implementation, 9999999 is used but
887 // since Javascript has `Infinity`, we use that.
888 for (j
= 2; j
< data
.length
+ 1; j
++) {
889 variance_combinations
[j
][i
] = Infinity
;
893 for (var l
= 2; l
< data
.length
+ 1; l
++) {
895 // `SZ` originally. this is the sum of the values seen thus
896 // far when calculating variance.
898 // `ZSQ` originally. the sum of squares of values seen
901 // `WT` originally. This is the number of
906 // in several instances, you could say `Math.pow(x, 2)`
907 // instead of `x * x`, but this is slower in some browsers
908 // introduces an unnecessary concept.
909 for (var m
= 1; m
< l
+ 1; m
++) {
912 var lower_class_limit
= l
- m
+ 1,
913 val
= data
[lower_class_limit
- 1];
915 // here we're estimating variance for each potential classing
916 // of the data, for each potential number of classes. `w`
917 // is the number of data points considered so far.
920 // increase the current sum and sum-of-squares
922 sum_squares
+= val
* val
;
924 // the variance at this point in the sequence is the difference
925 // between the sum of squares and the total x 2, over the number
927 variance
= sum_squares
- (sum
* sum
) / w
;
929 i4
= lower_class_limit
- 1;
932 for (j
= 2; j
< n_classes
+ 1; j
++) {
933 // if adding this element to an existing class
934 // will increase its variance beyond the limit, break
935 // the class at this point, setting the `lower_class_limit`
937 if (variance_combinations
[l
][j
] >=
938 (variance
+ variance_combinations
[i4
][j
- 1])) {
939 lower_class_limits
[l
][j
] = lower_class_limit
;
940 variance_combinations
[l
][j
] = variance
+
941 variance_combinations
[i4
][j
- 1];
947 lower_class_limits
[l
][1] = 1;
948 variance_combinations
[l
][1] = variance
;
951 // return the two matrices. for just providing breaks, only
952 // `lower_class_limits` is needed, but variances can be useful to
953 // evaluate goodness of fit.
955 lower_class_limits
: lower_class_limits
,
956 variance_combinations
: variance_combinations
960 // ## Pull Breaks Values for Jenks
962 // the second part of the jenks recipe: take the calculated matrices
963 // and derive an array of n breaks.
// The second part of the jenks recipe: take the calculated
// `lower_class_limits` matrix and derive an array of n breaks.
// NOTE(review): the init/return lines were reconstructed from the
// visible indexing logic — confirm against the matrices producer.
function jenksBreaks(data, lower_class_limits, n_classes) {
    var k = data.length - 1,
        kclass = [],
        countNum = n_classes;

    // the calculation of classes will never include the upper and
    // lower bounds, so we need to explicitly set them
    kclass[n_classes] = data[data.length - 1];
    kclass[0] = data[0];

    // the lower_class_limits matrix is used as indices into itself
    // here: the `k` variable is reused in each iteration.
    while (countNum > 1) {
        kclass[countNum - 1] = data[lower_class_limits[k][countNum] - 2];
        k = lower_class_limits[k][countNum] - 1;
        countNum--;
    }

    return kclass;
}
986 // # [Jenks natural breaks optimization](http://en.wikipedia.org/wiki/Jenks_natural_breaks_optimization)
988 // Implementations: [1](http://danieljlewis.org/files/2010/06/Jenks.pdf) (python),
989 // [2](https://github.com/vvoovv/djeo-jenks/blob/master/main.js) (buggy),
990 // [3](https://github.com/simogeo/geostats/blob/master/lib/geostats.js#L407) (works)
992 // Depends on `jenksBreaks()` and `jenksMatrices()`
// Jenks natural breaks optimization: classify `data` into
// `n_classes` classes whose break points minimize within-class
// variance. Depends on `jenksBreaks()` and `jenksMatrices()`.
function jenks(data, n_classes) {
    // more classes than data points is not a valid classing
    if (n_classes > data.length) return null;

    // sort data in numerical order, since this is expected
    // by the matrices function
    data = data.slice().sort(function (a, b) { return a - b; });

    // get our basic matrices
    var matrices = jenksMatrices(data, n_classes),
        // we only need lower class limits here
        lower_class_limits = matrices.lower_class_limits;

    // extract n_classes out of the computed matrices
    return jenksBreaks(data, lower_class_limits, n_classes);
}
1011 // # [Skewness](http://en.wikipedia.org/wiki/Skewness)
1013 // A measure of the extent to which a probability distribution of a
1014 // real-valued random variable "leans" to one side of the mean.
1015 // The skewness value can be positive or negative, or even undefined.
1017 // Implementation is based on the adjusted Fisher-Pearson standardized
1018 // moment coefficient, which is the version found in Excel and several
1019 // statistical packages including Minitab, SAS and SPSS.
1021 // Depends on `sum_nth_power_deviations()` and `sample_standard_deviation`
// Sample skewness: the adjusted Fisher-Pearson standardized moment
// coefficient (the version found in Excel, Minitab, SAS and SPSS).
// Depends on `sum_nth_power_deviations()` and `sample_standard_deviation()`.
function sample_skewness(x) {
    // The skewness of less than three arguments is null
    if (x.length < 3) return null;

    var n = x.length,
        cubed_s = Math.pow(sample_standard_deviation(x), 3),
        sum_cubed_deviations = sum_nth_power_deviations(x, 3);

    // the adjusted Fisher-Pearson coefficient
    return n * sum_cubed_deviations / ((n - 1) * (n - 2) * cubed_s);
}
// # Standard Normal Table
// A standard normal table, also called the unit normal table or Z table,
// is a mathematical table for the values of Φ (phi), which are the values of
// the cumulative distribution function of the normal distribution.
// It is used to find the probability that a statistic is observed below,
// above, or between values on the standard normal distribution, and by
// extension, any normal distribution.
//
// The probabilities are taken from http://en.wikipedia.org/wiki/Standard_normal_table
// The table used is the cumulative, and not cumulative from 0 to mean
// (even though the latter has 5 digits precision, instead of 4).
//
// Layout: 31 rows of 10 values; row r holds Φ for z = r/10 … r/10 + 0.09.
var standard_normal_table = [
    /* z      0.00    0.01    0.02    0.03    0.04    0.05    0.06    0.07    0.08    0.09 */
    /* 0.0 */ 0.5000, 0.5040, 0.5080, 0.5120, 0.5160, 0.5199, 0.5239, 0.5279, 0.5319, 0.5359,
    /* 0.1 */ 0.5398, 0.5438, 0.5478, 0.5517, 0.5557, 0.5596, 0.5636, 0.5675, 0.5714, 0.5753,
    /* 0.2 */ 0.5793, 0.5832, 0.5871, 0.5910, 0.5948, 0.5987, 0.6026, 0.6064, 0.6103, 0.6141,
    /* 0.3 */ 0.6179, 0.6217, 0.6255, 0.6293, 0.6331, 0.6368, 0.6406, 0.6443, 0.6480, 0.6517,
    /* 0.4 */ 0.6554, 0.6591, 0.6628, 0.6664, 0.6700, 0.6736, 0.6772, 0.6808, 0.6844, 0.6879,
    /* 0.5 */ 0.6915, 0.6950, 0.6985, 0.7019, 0.7054, 0.7088, 0.7123, 0.7157, 0.7190, 0.7224,
    /* 0.6 */ 0.7257, 0.7291, 0.7324, 0.7357, 0.7389, 0.7422, 0.7454, 0.7486, 0.7517, 0.7549,
    /* 0.7 */ 0.7580, 0.7611, 0.7642, 0.7673, 0.7704, 0.7734, 0.7764, 0.7794, 0.7823, 0.7852,
    /* 0.8 */ 0.7881, 0.7910, 0.7939, 0.7967, 0.7995, 0.8023, 0.8051, 0.8078, 0.8106, 0.8133,
    /* 0.9 */ 0.8159, 0.8186, 0.8212, 0.8238, 0.8264, 0.8289, 0.8315, 0.8340, 0.8365, 0.8389,
    /* 1.0 */ 0.8413, 0.8438, 0.8461, 0.8485, 0.8508, 0.8531, 0.8554, 0.8577, 0.8599, 0.8621,
    /* 1.1 */ 0.8643, 0.8665, 0.8686, 0.8708, 0.8729, 0.8749, 0.8770, 0.8790, 0.8810, 0.8830,
    /* 1.2 */ 0.8849, 0.8869, 0.8888, 0.8907, 0.8925, 0.8944, 0.8962, 0.8980, 0.8997, 0.9015,
    /* 1.3 */ 0.9032, 0.9049, 0.9066, 0.9082, 0.9099, 0.9115, 0.9131, 0.9147, 0.9162, 0.9177,
    /* 1.4 */ 0.9192, 0.9207, 0.9222, 0.9236, 0.9251, 0.9265, 0.9279, 0.9292, 0.9306, 0.9319,
    /* 1.5 */ 0.9332, 0.9345, 0.9357, 0.9370, 0.9382, 0.9394, 0.9406, 0.9418, 0.9429, 0.9441,
    /* 1.6 */ 0.9452, 0.9463, 0.9474, 0.9484, 0.9495, 0.9505, 0.9515, 0.9525, 0.9535, 0.9545,
    /* 1.7 */ 0.9554, 0.9564, 0.9573, 0.9582, 0.9591, 0.9599, 0.9608, 0.9616, 0.9625, 0.9633,
    /* 1.8 */ 0.9641, 0.9649, 0.9656, 0.9664, 0.9671, 0.9678, 0.9686, 0.9693, 0.9699, 0.9706,
    /* 1.9 */ 0.9713, 0.9719, 0.9726, 0.9732, 0.9738, 0.9744, 0.9750, 0.9756, 0.9761, 0.9767,
    /* 2.0 */ 0.9772, 0.9778, 0.9783, 0.9788, 0.9793, 0.9798, 0.9803, 0.9808, 0.9812, 0.9817,
    /* 2.1 */ 0.9821, 0.9826, 0.9830, 0.9834, 0.9838, 0.9842, 0.9846, 0.9850, 0.9854, 0.9857,
    /* 2.2 */ 0.9861, 0.9864, 0.9868, 0.9871, 0.9875, 0.9878, 0.9881, 0.9884, 0.9887, 0.9890,
    /* 2.3 */ 0.9893, 0.9896, 0.9898, 0.9901, 0.9904, 0.9906, 0.9909, 0.9911, 0.9913, 0.9916,
    /* 2.4 */ 0.9918, 0.9920, 0.9922, 0.9925, 0.9927, 0.9929, 0.9931, 0.9932, 0.9934, 0.9936,
    /* 2.5 */ 0.9938, 0.9940, 0.9941, 0.9943, 0.9945, 0.9946, 0.9948, 0.9949, 0.9951, 0.9952,
    /* 2.6 */ 0.9953, 0.9955, 0.9956, 0.9957, 0.9959, 0.9960, 0.9961, 0.9962, 0.9963, 0.9964,
    /* 2.7 */ 0.9965, 0.9966, 0.9967, 0.9968, 0.9969, 0.9970, 0.9971, 0.9972, 0.9973, 0.9974,
    /* 2.8 */ 0.9974, 0.9975, 0.9976, 0.9977, 0.9977, 0.9978, 0.9979, 0.9979, 0.9980, 0.9981,
    /* 2.9 */ 0.9981, 0.9982, 0.9982, 0.9983, 0.9984, 0.9984, 0.9985, 0.9985, 0.9986, 0.9986,
    /* 3.0 */ 0.9987, 0.9987, 0.9987, 0.9988, 0.9988, 0.9989, 0.9989, 0.9989, 0.9990, 0.9990
];
// # [Cumulative Standard Normal Probability](http://en.wikipedia.org/wiki/Standard_normal_table)
//
// Since probability tables cannot be
// printed for every normal distribution, as there are an infinite variety
// of normal distributions, it is common practice to convert a normal to a
// standard normal and then use the standard normal table to find probabilities
function cumulative_std_normal_probability(z) {
    // Calculate the position of this value.
    var absZ = Math.abs(z),
        // Each row begins with a different
        // significant digit: 0.5, 0.6, 0.7, and so on. So the row is simply
        // this value's significant digit: 0.567 will be in row 0, so row=0,
        // 0.643 will be in row 1, so row=10.
        row = Math.floor(absZ * 10),
        column = 10 * (Math.floor(absZ * 100) / 10 - Math.floor(absZ * 100 / 10)),
        // Clamp to the last cell so very large |z| values don't run
        // off the end of the table.
        index = Math.min((row * 10) + column, standard_normal_table.length - 1);

    // The index we calculate must be in the table as a positive value,
    // but we still pay attention to whether the input is positive
    // or negative, and flip the output value as a last step.
    if (z >= 0) {
        return standard_normal_table[index];
    } else {
        // due to floating-point arithmetic, values in the table with
        // 4 significant figures can nevertheless end up as repeating
        // fractions when they're computed here.
        return +(1 - standard_normal_table[index]).toFixed(4);
    }
}
// # [Z-Score, or Standard Score](http://en.wikipedia.org/wiki/Standard_score)
//
// The standard score is the number of standard deviations an observation
// or datum is above or below the mean. Thus, a positive standard score
// represents a datum above the mean, while a negative standard score
// represents a datum below the mean. It is a dimensionless quantity
// obtained by subtracting the population mean from an individual raw
// score and then dividing the difference by the population standard
// deviation.
//
// The z-score is only defined if one knows the population parameters;
// if one only has a sample set, then the analogous computation with
// sample mean and sample standard deviation yields the
// Student's t-statistic.
function z_score(x, mean, standard_deviation) {
    // how far this observation sits from the mean, in raw units
    var deviation_from_mean = x - mean;
    // express that distance in units of standard deviations
    return deviation_from_mean / standard_deviation;
}
// We use `ε`, epsilon, as a stopping criterion when we want to iterate
// until we're "close enough". The distribution generators below stop
// expanding once their accumulated probability is within `epsilon` of 1.0,
// so `epsilon` bounds the tail mass they leave uncomputed.
var epsilon = 0.0001;
// # [Factorial](https://en.wikipedia.org/wiki/Factorial)
//
// A factorial, usually written n!, is the product of all positive
// integers less than or equal to n. Often factorial is implemented
// recursively, but this iterative approach is significantly faster
// and simpler.
//
// Returns `null` for negative input, `1` for 0! and 1!.
function factorial(n) {
    // factorial is mathematically undefined for negative numbers
    if (n < 0) { return null; }

    // typically you'll expand the factorial function going down, like
    // 5! = 5 * 4 * 3 * 2 * 1. This is going in the opposite direction,
    // counting from 2 up to the number in question, and since anything
    // multiplied by 1 is itself, the loop only needs to start at 2.
    var accumulator = 1;
    for (var i = 2; i <= n; i++) {
        // for each number up to and including the number `n`, multiply
        // the accumulator by that number.
        accumulator = accumulator * i;
    }
    return accumulator;
}
// # Bernoulli Distribution
//
// The [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution)
// is the probability discrete
// distribution of a random variable which takes value 1 with success
// probability `p` and value 0 with failure
// probability `q` = 1 - `p`. It can be used, for example, to represent the
// toss of a coin, where "1" is defined to mean "heads" and "0" is defined
// to mean "tails" (or vice versa). It is
// a special case of a Binomial Distribution
// where `trials` = 1.
function bernoulli_distribution(p) {
    // Check that `p` is a valid probability (0 ≤ p ≤ 1)
    if (p < 0 || p > 1) { return null; }

    // delegate to the Binomial Distribution with a single trial
    return binomial_distribution(1, p);
}
// # Binomial Distribution
//
// The [Binomial Distribution](http://en.wikipedia.org/wiki/Binomial_distribution) is the discrete probability
// distribution of the number of successes in a sequence of n independent yes/no experiments, each of which yields
// success with probability `probability`. Such a success/failure experiment is also called a Bernoulli experiment or
// Bernoulli trial; when trials = 1, the Binomial Distribution is a Bernoulli Distribution.
//
// Returns an object mapping each outcome `x` to its probability, or
// `null` for invalid parameters.
function binomial_distribution(trials, probability) {
    // Check that `probability` is a valid probability (0 ≤ p ≤ 1),
    // and that `trials` is an integer, strictly positive.
    if (probability < 0 || probability > 1 ||
        trials <= 0 || trials % 1 !== 0) {
        return null;
    }

    // a [probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function):
    // P(X = x) = C(trials, x) * p^x * (1 - p)^(trials - x)
    function probability_mass(x, trials, probability) {
        return factorial(trials) /
            (factorial(x) * factorial(trials - x)) *
            (Math.pow(probability, x) * Math.pow(1 - probability, trials - x));
    }

    // We initialize `x`, the random variable, and `cumulative_probability`,
    // an accumulator for the cumulative distribution function, to 0.
    // `cells` is the object we'll return, mapping each outcome `x` to
    // its probability. We iterate until the accumulated probability is
    // within `epsilon` of 1.0.
    var x = 0,
        cumulative_probability = 0,
        cells = {};

    // This algorithm iterates through each potential outcome,
    // until the `cumulative_probability` is very close to 1, at
    // which point we've defined the vast majority of outcomes
    do {
        cells[x] = probability_mass(x, trials, probability);
        cumulative_probability += cells[x];
        x++;
    // when the cumulative_probability is nearly 1, we've calculated
    // the useful range of this distribution
    } while (cumulative_probability < 1 - epsilon);

    return cells;
}
// # Poisson Distribution
//
// The [Poisson Distribution](http://en.wikipedia.org/wiki/Poisson_distribution)
// is a discrete probability distribution that expresses the probability
// of a given number of events occurring in a fixed interval of time
// and/or space if these events occur with a known average rate and
// independently of the time since the last event.
//
// The Poisson Distribution is characterized by the strictly positive
// mean arrival or occurrence rate, `λ`.
//
// Returns an object mapping each count `x` to its probability, or
// `null` for invalid `lambda`.
function poisson_distribution(lambda) {
    // Check that lambda is strictly positive
    if (lambda <= 0) { return null; }

    // our current place in the distribution
    var x = 0,
        // and we keep track of the current cumulative probability, in
        // order to know when to stop calculating chances.
        cumulative_probability = 0,
        // the calculated cells to be returned
        cells = {};

    // a [probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function):
    // P(X = x) = (e^-λ * λ^x) / x!
    function probability_mass(x, lambda) {
        return (Math.pow(Math.E, -lambda) * Math.pow(lambda, x)) /
            factorial(x);
    }

    // This algorithm iterates through each potential outcome,
    // until the `cumulative_probability` is very close to 1, at
    // which point we've defined the vast majority of outcomes
    do {
        cells[x] = probability_mass(x, lambda);
        cumulative_probability += cells[x];
        x++;
    // when the cumulative_probability is nearly 1, we've calculated
    // the useful range of this distribution
    } while (cumulative_probability < 1 - epsilon);

    return cells;
}
// # Percentage Points of the χ2 (Chi-Squared) Distribution
// The [χ2 (Chi-Squared) Distribution](http://en.wikipedia.org/wiki/Chi-squared_distribution) is used in the common
// chi-squared tests for goodness of fit of an observed distribution to a theoretical one, the independence of two
// criteria of classification of qualitative data, and in confidence interval estimation for a population standard
// deviation of a normal distribution from a sample standard deviation.
//
// Values from Appendix 1, Table III of William W. Hines & Douglas C. Montgomery, "Probability and Statistics in
// Engineering and Management Science", Wiley (1980).
//
// Keyed first by degrees of freedom, then by significance level.
var chi_squared_distribution_table = {
    1: { 0.995: 0.00, 0.99: 0.00, 0.975: 0.00, 0.95: 0.00, 0.9: 0.02, 0.5: 0.45, 0.1: 2.71, 0.05: 3.84, 0.025: 5.02, 0.01: 6.63, 0.005: 7.88 },
    2: { 0.995: 0.01, 0.99: 0.02, 0.975: 0.05, 0.95: 0.10, 0.9: 0.21, 0.5: 1.39, 0.1: 4.61, 0.05: 5.99, 0.025: 7.38, 0.01: 9.21, 0.005: 10.60 },
    3: { 0.995: 0.07, 0.99: 0.11, 0.975: 0.22, 0.95: 0.35, 0.9: 0.58, 0.5: 2.37, 0.1: 6.25, 0.05: 7.81, 0.025: 9.35, 0.01: 11.34, 0.005: 12.84 },
    4: { 0.995: 0.21, 0.99: 0.30, 0.975: 0.48, 0.95: 0.71, 0.9: 1.06, 0.5: 3.36, 0.1: 7.78, 0.05: 9.49, 0.025: 11.14, 0.01: 13.28, 0.005: 14.86 },
    5: { 0.995: 0.41, 0.99: 0.55, 0.975: 0.83, 0.95: 1.15, 0.9: 1.61, 0.5: 4.35, 0.1: 9.24, 0.05: 11.07, 0.025: 12.83, 0.01: 15.09, 0.005: 16.75 },
    6: { 0.995: 0.68, 0.99: 0.87, 0.975: 1.24, 0.95: 1.64, 0.9: 2.20, 0.5: 5.35, 0.1: 10.65, 0.05: 12.59, 0.025: 14.45, 0.01: 16.81, 0.005: 18.55 },
    7: { 0.995: 0.99, 0.99: 1.25, 0.975: 1.69, 0.95: 2.17, 0.9: 2.83, 0.5: 6.35, 0.1: 12.02, 0.05: 14.07, 0.025: 16.01, 0.01: 18.48, 0.005: 20.28 },
    8: { 0.995: 1.34, 0.99: 1.65, 0.975: 2.18, 0.95: 2.73, 0.9: 3.49, 0.5: 7.34, 0.1: 13.36, 0.05: 15.51, 0.025: 17.53, 0.01: 20.09, 0.005: 21.96 },
    9: { 0.995: 1.73, 0.99: 2.09, 0.975: 2.70, 0.95: 3.33, 0.9: 4.17, 0.5: 8.34, 0.1: 14.68, 0.05: 16.92, 0.025: 19.02, 0.01: 21.67, 0.005: 23.59 },
    10: { 0.995: 2.16, 0.99: 2.56, 0.975: 3.25, 0.95: 3.94, 0.9: 4.87, 0.5: 9.34, 0.1: 15.99, 0.05: 18.31, 0.025: 20.48, 0.01: 23.21, 0.005: 25.19 },
    11: { 0.995: 2.60, 0.99: 3.05, 0.975: 3.82, 0.95: 4.57, 0.9: 5.58, 0.5: 10.34, 0.1: 17.28, 0.05: 19.68, 0.025: 21.92, 0.01: 24.72, 0.005: 26.76 },
    12: { 0.995: 3.07, 0.99: 3.57, 0.975: 4.40, 0.95: 5.23, 0.9: 6.30, 0.5: 11.34, 0.1: 18.55, 0.05: 21.03, 0.025: 23.34, 0.01: 26.22, 0.005: 28.30 },
    13: { 0.995: 3.57, 0.99: 4.11, 0.975: 5.01, 0.95: 5.89, 0.9: 7.04, 0.5: 12.34, 0.1: 19.81, 0.05: 22.36, 0.025: 24.74, 0.01: 27.69, 0.005: 29.82 },
    14: { 0.995: 4.07, 0.99: 4.66, 0.975: 5.63, 0.95: 6.57, 0.9: 7.79, 0.5: 13.34, 0.1: 21.06, 0.05: 23.68, 0.025: 26.12, 0.01: 29.14, 0.005: 31.32 },
    15: { 0.995: 4.60, 0.99: 5.23, 0.975: 6.27, 0.95: 7.26, 0.9: 8.55, 0.5: 14.34, 0.1: 22.31, 0.05: 25.00, 0.025: 27.49, 0.01: 30.58, 0.005: 32.80 },
    16: { 0.995: 5.14, 0.99: 5.81, 0.975: 6.91, 0.95: 7.96, 0.9: 9.31, 0.5: 15.34, 0.1: 23.54, 0.05: 26.30, 0.025: 28.85, 0.01: 32.00, 0.005: 34.27 },
    17: { 0.995: 5.70, 0.99: 6.41, 0.975: 7.56, 0.95: 8.67, 0.9: 10.09, 0.5: 16.34, 0.1: 24.77, 0.05: 27.59, 0.025: 30.19, 0.01: 33.41, 0.005: 35.72 },
    18: { 0.995: 6.26, 0.99: 7.01, 0.975: 8.23, 0.95: 9.39, 0.9: 10.87, 0.5: 17.34, 0.1: 25.99, 0.05: 28.87, 0.025: 31.53, 0.01: 34.81, 0.005: 37.16 },
    19: { 0.995: 6.84, 0.99: 7.63, 0.975: 8.91, 0.95: 10.12, 0.9: 11.65, 0.5: 18.34, 0.1: 27.20, 0.05: 30.14, 0.025: 32.85, 0.01: 36.19, 0.005: 38.58 },
    20: { 0.995: 7.43, 0.99: 8.26, 0.975: 9.59, 0.95: 10.85, 0.9: 12.44, 0.5: 19.34, 0.1: 28.41, 0.05: 31.41, 0.025: 34.17, 0.01: 37.57, 0.005: 40.00 },
    21: { 0.995: 8.03, 0.99: 8.90, 0.975: 10.28, 0.95: 11.59, 0.9: 13.24, 0.5: 20.34, 0.1: 29.62, 0.05: 32.67, 0.025: 35.48, 0.01: 38.93, 0.005: 41.40 },
    22: { 0.995: 8.64, 0.99: 9.54, 0.975: 10.98, 0.95: 12.34, 0.9: 14.04, 0.5: 21.34, 0.1: 30.81, 0.05: 33.92, 0.025: 36.78, 0.01: 40.29, 0.005: 42.80 },
    23: { 0.995: 9.26, 0.99: 10.20, 0.975: 11.69, 0.95: 13.09, 0.9: 14.85, 0.5: 22.34, 0.1: 32.01, 0.05: 35.17, 0.025: 38.08, 0.01: 41.64, 0.005: 44.18 },
    24: { 0.995: 9.89, 0.99: 10.86, 0.975: 12.40, 0.95: 13.85, 0.9: 15.66, 0.5: 23.34, 0.1: 33.20, 0.05: 36.42, 0.025: 39.36, 0.01: 42.98, 0.005: 45.56 },
    25: { 0.995: 10.52, 0.99: 11.52, 0.975: 13.12, 0.95: 14.61, 0.9: 16.47, 0.5: 24.34, 0.1: 34.28, 0.05: 37.65, 0.025: 40.65, 0.01: 44.31, 0.005: 46.93 },
    26: { 0.995: 11.16, 0.99: 12.20, 0.975: 13.84, 0.95: 15.38, 0.9: 17.29, 0.5: 25.34, 0.1: 35.56, 0.05: 38.89, 0.025: 41.92, 0.01: 45.64, 0.005: 48.29 },
    27: { 0.995: 11.81, 0.99: 12.88, 0.975: 14.57, 0.95: 16.15, 0.9: 18.11, 0.5: 26.34, 0.1: 36.74, 0.05: 40.11, 0.025: 43.19, 0.01: 46.96, 0.005: 49.65 },
    28: { 0.995: 12.46, 0.99: 13.57, 0.975: 15.31, 0.95: 16.93, 0.9: 18.94, 0.5: 27.34, 0.1: 37.92, 0.05: 41.34, 0.025: 44.46, 0.01: 48.28, 0.005: 50.99 },
    29: { 0.995: 13.12, 0.99: 14.26, 0.975: 16.05, 0.95: 17.71, 0.9: 19.77, 0.5: 28.34, 0.1: 39.09, 0.05: 42.56, 0.025: 45.72, 0.01: 49.59, 0.005: 52.34 },
    30: { 0.995: 13.79, 0.99: 14.95, 0.975: 16.79, 0.95: 18.49, 0.9: 20.60, 0.5: 29.34, 0.1: 40.26, 0.05: 43.77, 0.025: 46.98, 0.01: 50.89, 0.005: 53.67 },
    40: { 0.995: 20.71, 0.99: 22.16, 0.975: 24.43, 0.95: 26.51, 0.9: 29.05, 0.5: 39.34, 0.1: 51.81, 0.05: 55.76, 0.025: 59.34, 0.01: 63.69, 0.005: 66.77 },
    50: { 0.995: 27.99, 0.99: 29.71, 0.975: 32.36, 0.95: 34.76, 0.9: 37.69, 0.5: 49.33, 0.1: 63.17, 0.05: 67.50, 0.025: 71.42, 0.01: 76.15, 0.005: 79.49 },
    60: { 0.995: 35.53, 0.99: 37.48, 0.975: 40.48, 0.95: 43.19, 0.9: 46.46, 0.5: 59.33, 0.1: 74.40, 0.05: 79.08, 0.025: 83.30, 0.01: 88.38, 0.005: 91.95 },
    70: { 0.995: 43.28, 0.99: 45.44, 0.975: 48.76, 0.95: 51.74, 0.9: 55.33, 0.5: 69.33, 0.1: 85.53, 0.05: 90.53, 0.025: 95.02, 0.01: 100.42, 0.005: 104.22 },
    80: { 0.995: 51.17, 0.99: 53.54, 0.975: 57.15, 0.95: 60.39, 0.9: 64.28, 0.5: 79.33, 0.1: 96.58, 0.05: 101.88, 0.025: 106.63, 0.01: 112.33, 0.005: 116.32 },
    90: { 0.995: 59.20, 0.99: 61.75, 0.975: 65.65, 0.95: 69.13, 0.9: 73.29, 0.5: 89.33, 0.1: 107.57, 0.05: 113.14, 0.025: 118.14, 0.01: 124.12, 0.005: 128.30 },
    100: { 0.995: 67.33, 0.99: 70.06, 0.975: 74.22, 0.95: 77.93, 0.9: 82.36, 0.5: 99.33, 0.1: 118.50, 0.05: 124.34, 0.025: 129.56, 0.01: 135.81, 0.005: 140.17 }
};
// # χ2 (Chi-Squared) Goodness-of-Fit Test
//
// The [χ2 (Chi-Squared) Goodness-of-Fit Test](http://en.wikipedia.org/wiki/Goodness_of_fit#Pearson.27s_chi-squared_test)
// uses a measure of goodness of fit which is the sum of differences between observed and expected outcome frequencies
// (that is, counts of observations), each squared and divided by the number of observations expected given the
// hypothesized distribution. The resulting χ2 statistic, `chi_squared`, can be compared to the chi-squared distribution
// to determine the goodness of fit. In order to determine the degrees of freedom of the chi-squared distribution, one
// takes the total number of observed frequencies and subtracts the number of estimated parameters. The test statistic
// follows, approximately, a chi-square distribution with (k − c) degrees of freedom where `k` is the number of non-empty
// cells and `c` is the number of estimated parameters for the distribution.
//
// Returns `true` when the hypothesized distribution is rejected at the
// given `significance` level (χ2 exceeds the table's critical value).
function chi_squared_goodness_of_fit(data, distribution_type, significance) {
    // Estimate from the sample data, a weighted mean.
    var input_mean = mean(data),
        // Calculated value of the χ2 statistic.
        chi_squared = 0,
        // Degrees of freedom, calculated as (number of class intervals -
        // number of hypothesized distribution parameters estimated - 1)
        degrees_of_freedom,
        // Number of hypothesized distribution parameters estimated, expected to be supplied in the distribution test.
        // Lose one degree of freedom for estimating `lambda` from the sample data.
        c = 1,
        // The hypothesized distribution.
        // Generate the hypothesized distribution.
        hypothesized_distribution = distribution_type(input_mean),
        observed_frequencies = [],
        expected_frequencies = [],
        k;

    // Create an array holding a histogram from the sample data, of
    // the form `{ value: numberOfOcurrences }`
    for (var i = 0; i < data.length; i++) {
        if (observed_frequencies[data[i]] === undefined) {
            observed_frequencies[data[i]] = 0;
        }
        observed_frequencies[data[i]]++;
    }

    // The histogram we created might be sparse - there might be gaps
    // between values. So we iterate through the histogram, making
    // sure that instead of undefined, gaps have 0 values.
    for (i = 0; i < observed_frequencies.length; i++) {
        if (observed_frequencies[i] === undefined) {
            observed_frequencies[i] = 0;
        }
    }

    // Create an array holding a histogram of expected data given the
    // sample size and hypothesized distribution.
    for (k in hypothesized_distribution) {
        if (k in observed_frequencies) {
            expected_frequencies[k] = hypothesized_distribution[k] * data.length;
        }
    }

    // Working backward through the expected frequencies, collapse classes
    // if less than three observations are expected for a class.
    // This transformation is applied to the observed frequencies as well.
    for (k = expected_frequencies.length - 1; k >= 0; k--) {
        if (expected_frequencies[k] < 3) {
            expected_frequencies[k - 1] += expected_frequencies[k];
            expected_frequencies.pop();

            observed_frequencies[k - 1] += observed_frequencies[k];
            observed_frequencies.pop();
        }
    }

    // Iterate through the squared differences between observed & expected
    // frequencies, accumulating the `chi_squared` statistic.
    for (k = 0; k < observed_frequencies.length; k++) {
        chi_squared += Math.pow(
            observed_frequencies[k] - expected_frequencies[k], 2) /
            expected_frequencies[k];
    }

    // Calculate degrees of freedom for this test and look it up in the
    // `chi_squared_distribution_table` in order to
    // accept or reject the goodness-of-fit of the hypothesized distribution.
    degrees_of_freedom = observed_frequencies.length - c - 1;
    return chi_squared_distribution_table[degrees_of_freedom][significance] < chi_squared;
}
// Mixin simple_statistics to a single Array instance if provided
// or the Array native object if not. This is an optional
// feature that lets you treat simple_statistics as a native feature
// of Javascript. Throws when `Object.defineProperty` is unavailable.
function mixin(array) {
    var support = !!(Object.defineProperty && Object.defineProperties);
    if (!support) throw new Error('without defineProperty, simple-statistics cannot be mixed in');

    // only methods which work on basic arrays in a single step
    // are supported
    // NOTE(review): one entry of this list was dropped by the extraction
    // (source line 1435); `'sample_skewness'` restored per upstream —
    // verify against the canonical source.
    var arrayMethods = ['median', 'standard_deviation', 'sum',
        'sample_skewness',
        'mean', 'min', 'max', 'quantile', 'geometric_mean',
        'harmonic_mean', 'root_mean_square'];

    // create a closure with a method name so that a reference
    // like `arrayMethods[i]` doesn't follow the loop increment
    function wrap(method) {
        return function() {
            // cast any arguments into an array, since they're
            // natively objects
            var args = Array.prototype.slice.apply(arguments);
            // make the first argument the array itself
            args.unshift(this);
            // return the result of the ss method
            return ss[method].apply(ss, args);
        };
    }

    // select object to extend
    var extending;

    if (array) {
        // create a shallow copy of the array so that our internal
        // operations do not change it by reference
        extending = array.slice();
    } else {
        extending = Array.prototype;
    }

    // for each array function, define a function that gets
    // the array as the first argument.
    // We use [defineProperty](https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/Object/defineProperty)
    // because it allows these properties to be non-enumerable:
    // `for (var in x)` loops will not run into problems with this
    // implementation.
    for (var i = 0; i < arrayMethods.length; i++) {
        Object.defineProperty(extending, arrayMethods[i], {
            value: wrap(arrayMethods[i]),
            configurable: true,
            enumerable: false,
            writable: true
        });
    }

    return extending;
}
// ## Public API
// Attach every public function to the `ss` object so it is reachable
// from `require('simple-statistics')` or the browser global.
// NOTE(review): this extraction is missing several export lines (the
// embedded source numbering jumps, e.g. 1484-1489); only the visible
// assignments appear below.

// Regression and dispersion
ss.linear_regression = linear_regression;
ss.standard_deviation = standard_deviation;
ss.r_squared = r_squared;

// Quantiles
ss.quantile = quantile;
ss.quantile_sorted = quantile_sorted;

// Shuffling
ss.shuffle = shuffle;
ss.shuffle_in_place = shuffle_in_place;

// Sample statistics
ss.sample_covariance = sample_covariance;
ss.sample_correlation = sample_correlation;
ss.sample_variance = sample_variance;
ss.sample_standard_deviation = sample_standard_deviation;
ss.sample_skewness = sample_skewness;

// Means and variance
ss.geometric_mean = geometric_mean;
ss.harmonic_mean = harmonic_mean;
ss.root_mean_square = root_mean_square;
ss.variance = variance;

// Hypothesis testing
ss.t_test_two_sample = t_test_two_sample;

// Jenks natural breaks classification
ss.jenksMatrices = jenksMatrices;
ss.jenksBreaks = jenksBreaks;

// Bayesian classifier
ss.bayesian = bayesian;

// Distribution-related methods
ss.epsilon = epsilon; // We make ε available to the test suite.
ss.factorial = factorial;
ss.bernoulli_distribution = bernoulli_distribution;
ss.binomial_distribution = binomial_distribution;
ss.poisson_distribution = poisson_distribution;
ss.chi_squared_goodness_of_fit = chi_squared_goodness_of_fit;

// Normal distribution
ss.z_score = z_score;
ss.cumulative_std_normal_probability = cumulative_std_normal_probability;
ss.standard_normal_table = standard_normal_table;

// Alias this into its common name
ss.interquartile_range = iqr;

// Aliases for common abbreviations
ss.median_absolute_deviation = mad;
ss.rms = root_mean_square;