import numpy as np

x = np.array([23, 23, 25, 27, 31, 31, 32, 33, 35, 36, 38, 38, 39, 42])
y = np.array([0.58, 0.35, 0.26, 0.48, 0.21, 0.19, 0.69,
              0.65, 0.09, 0.14, 0.3, 0.22, 0.47, 0.26])
e = np.array([0.096794787, 0.075415516, 0.070237692, 0.052086884, 0.043921177,
              0.047927184, 0.115623311, 0.068138514, 0.0379057, 0.043373379,
              0.079772404, 0.068101673, 0.056511855, 0.067682733])
sampleid = '11044888_F'

# e = sqrt( p*(1 - p) / (n - 1) ): the standard error of a proportion p
# estimated from n observations
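# Minimal sanity check of the formula above, assuming each y[i] is a
# proportion estimated from n trials; n_hyp = 25 is hypothetical (the
# per-point sample size is not given here), so e_check is illustrative only.
n_hyp = 25
e_check = np.sqrt(y * (1 - y) / (n_hyp - 1))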
import matplotlib.pyplot as plt
from scipy import optimize

def squared_loss(theta, x=x, y=y, e=e):
    # Standard chi-square-style loss: residuals weighted by measurement error.
    dy = y - theta[0] - theta[1] * x
    return np.sum(0.5 * (dy / e) ** 2)
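# Optional sanity check (not in the original script): evaluate the loss once
# at a naive flat-line guess, theta = [intercept=mean(y), slope=0], before
# handing it to the optimizer.
print("squared loss at flat-line guess:", squared_loss([np.mean(y), 0.0]))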
theta1 = optimize.fmin(squared_loss, [np.mean(x), np.mean(y)], disp=False)

t = np.linspace(-20, 20)

def huber_loss(t, c=3):
    # Quadratic near zero, linear in the tails: less sensitive to outliers.
    return ((abs(t) < c) * 0.5 * t ** 2
            + (abs(t) >= c) * -c * (0.5 * c - abs(t)))
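# The array t defined above is otherwise unused; a small sketch of what it is
# presumably for: comparing the Huber loss against the squared loss. This
# comparison plot is an assumption, not part of the original output.
plt.figure()
plt.plot(t, 0.5 * t ** 2, label='squared loss')
plt.plot(t, huber_loss(t), label='Huber loss (c=3)')
plt.legend(loc='best', frameon=False)
plt.savefig("loss_comparison.pdf")  # hypothetical filename
plt.close()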
def total_huber_loss(theta, x=x, y=y, e=e, c=3):
    return huber_loss((y - theta[0] - theta[1] * x) / e, c).sum()

theta2 = optimize.fmin(total_huber_loss, [np.mean(x), np.mean(y)], disp=False)

xfit = np.linspace(10, 70)

# theta will be an array of length 2 + N, where N is the number of points
# theta[0] is the intercept, theta[1] is the slope,
# and theta[2 + i] is the weight g_i
def log_prior(theta):
    # g_i needs to be between 0 and 1
    if all(theta[2:] > 0) and all(theta[2:] < 1):
        return 0  # flat prior inside the bounds
    else:
        return -np.inf  # recall log(0) = -inf
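# Behavioral check of the flat prior (illustrative, not in the original):
# weights strictly inside (0, 1) are admissible, anything else gets -inf.
print(log_prior(np.array([0.0, 0.0, 0.5, 0.9])))  # 0
print(log_prior(np.array([0.0, 0.0, 1.5, 0.9])))  # -inf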
def log_likelihood(theta, x, y, e, sigma_B):
    dy = y - theta[0] - theta[1] * x
    g = np.clip(theta[2:], 0, 1)  # g < 0 or g > 1 leads to NaNs in logarithm
    logL1 = np.log(g) - 0.5 * np.log(2 * np.pi * e ** 2) - 0.5 * (dy / e) ** 2
    logL2 = (np.log(1 - g) - 0.5 * np.log(2 * np.pi * sigma_B ** 2)
             - 0.5 * (dy / sigma_B) ** 2)
    return np.sum(np.logaddexp(logL1, logL2))
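# Why np.logaddexp above: each point's likelihood is a two-component mixture,
#   L_i = g_i * N(dy_i; 0, e_i) + (1 - g_i) * N(dy_i; 0, sigma_B),
# and logaddexp(log a, log b) returns log(a + b) without underflow.
# Tiny illustrative check (values are arbitrary, not from the data):
la, lb = -1000.0, -1001.0
print(np.logaddexp(la, lb))             # ~ -999.69, numerically stable
print(np.log(np.exp(la) + np.exp(lb)))  # -inf: exp() underflows to zero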
def log_posterior(theta, x, y, e, sigma_B):
    return log_prior(theta) + log_likelihood(theta, x, y, e, sigma_B)

ndim = 2 + len(x)  # number of parameters in the model
nwalkers = 50      # number of MCMC walkers
nburn = 100000     # "burn-in" period to let chains stabilize
nsteps = 150000    # number of MCMC steps to take
# set theta near the maximum likelihood, with some scatter
starting_guesses = np.zeros((nwalkers, ndim))
starting_guesses[:, :2] = np.random.normal(theta1, 1, (nwalkers, 2))
starting_guesses[:, 2:] = np.random.normal(0.5, 0.1, (nwalkers, ndim - 2))
import emcee
import multiprocessing as mp

# Note: threads= is the emcee 2.x API; emcee 3.x replaced it with pool=.
sampler = emcee.EnsembleSampler(nwalkers, ndim, log_posterior,
                                args=[x, y, e, 50],  # sigma_B = 50
                                threads=mp.cpu_count())
sampler.run_mcmc(starting_guesses, nsteps)
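# Optional diagnostic (not in the original script): the mean acceptance
# fraction of the ensemble; values roughly in the 0.2-0.5 range are a
# common rule of thumb for emcee.
print("mean acceptance fraction:", np.mean(sampler.acceptance_fraction))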
sample = sampler.chain  # shape = (nwalkers, nsteps, ndim)
sample = sampler.chain[:, nburn:, :].reshape(-1, ndim)

theta3 = np.mean(sample[:, :2], axis=0)
g = np.mean(sample[:, 2:], axis=0)
outliers = (g < 0.5)  # points whose mean weight favors the background component
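# Illustrative summary (not in the original script): indices of points the
# mixture model flags as likely outliers.
print("outlier indices:", np.where(outliers)[0])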
plt.errorbar(x, y, e, fmt='.k', ecolor='gray')
plt.plot(xfit, theta1[0] + theta1[1] * xfit, color='gray',
         label="y = %.3fx + %.3f" % (theta1[1], theta1[0]))
plt.plot(xfit, theta2[0] + theta2[1] * xfit, color='green',
         label="Huber: y = %.3fx + %.3f" % (theta2[1], theta2[0]))
plt.plot(xfit, theta3[0] + theta3[1] * xfit, color='navy',
         label="MCMC: y = %.3fx + %.3f" % (theta3[1], theta3[0]))
plt.plot(x[outliers], y[outliers], 'ro', ms=20, mfc='none', mec='red')
plt.title(sampleid)
plt.legend(loc='best', frameon=False)
plt.savefig("%s.pdf" % sampleid)