#!/usr/bin/env python3

import numpy as np

x = np.array([23, 23, 25, 27, 31, 31, 32, 33, 35, 36, 38, 38, 39, 42])
y = np.array([0.58, 0.35, 0.26, 0.48, 0.21, 0.19, 0.69, 0.65, 0.09, 0.14, 0.3, 0.22, 0.47, 0.26])
e = np.array([0.096794787, 0.075415516, 0.070237692, 0.052086884, 0.043921177, 0.047927184, 0.115623311, 0.068138514, 0.0379057, 0.043373379, 0.079772404, 0.068101673, 0.056511855, 0.067682733])
sampleid = '11044888_F'
# per-point standard errors, e = sqrt( p*(1-p) / (n-1) ),
# i.e. the usual standard-error formula for a sample proportion
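# a minimal sketch of where e could come from, assuming each y[i] is a
# proportion estimated from n[i] trials (the trial counts n are NOT in
# this file; the array below is hypothetical):
#   n = np.array([...])                 # per-point trial counts
#   e = np.sqrt(y * (1 - y) / (n - 1))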
import matplotlib
matplotlib.use('pdf')  # non-interactive backend: render straight to PDF
import matplotlib.pyplot as plt

from scipy import optimize
def squared_loss(theta, x=x, y=y, e=e):
    dy = y - theta[0] - theta[1] * x
    return np.sum(0.5 * (dy / e) ** 2)
theta1 = optimize.fmin(squared_loss, [np.mean(x), np.mean(y)], disp=False)
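# optional cross-check (not in the original script): minimizing this
# squared loss is just weighted least squares, so np.polyfit with
# weights 1/e should recover nearly the same line; polyfit returns the
# coefficients highest-degree first, i.e. [slope, intercept]
slope_chk, intercept_chk = np.polyfit(x, y, 1, w=1 / e)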
t = np.linspace(-20, 20)  # residual grid (unused below; handy for plotting the losses)
def huber_loss(t, c=3):
    return ((abs(t) < c) * 0.5 * t ** 2
            + (abs(t) >= c) * -c * (0.5 * c - abs(t)))
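# spot check of the loss (values worked out by hand): it is quadratic
# inside |t| < c and linear outside, so large residuals are penalized
# far less than under the squared loss, e.g. with the default c = 3:
#   huber_loss(1.0)  -> 0.5    (identical to the squared loss)
#   huber_loss(10.0) -> 25.5   (vs. 50.0 for the squared loss)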
def total_huber_loss(theta, x=x, y=y, e=e, c=3):
    return huber_loss((y - theta[0] - theta[1] * x) / e, c).sum()
theta2 = optimize.fmin(total_huber_loss, [np.mean(x), np.mean(y)], disp=False)

xfit = np.linspace(10, 70)
# theta will be an array of length 2 + N, where N is the number of points:
# theta[0] is the intercept, theta[1] is the slope,
# and theta[2 + i] is the weight g_i
def log_prior(theta):
    # each g_i needs to be between 0 and 1
    if all(theta[2:] > 0) and all(theta[2:] < 1):
        return 0
    else:
        return -np.inf  # recall log(0) = -inf
def log_likelihood(theta, x, y, e, sigma_B):
    dy = y - theta[0] - theta[1] * x
    g = np.clip(theta[2:], 0, 1)  # g < 0 or g > 1 leads to NaNs in logarithm
    logL1 = np.log(g) - 0.5 * np.log(2 * np.pi * e ** 2) - 0.5 * (dy / e) ** 2
    logL2 = np.log(1 - g) - 0.5 * np.log(2 * np.pi * sigma_B ** 2) - 0.5 * (dy / sigma_B) ** 2
    return np.sum(np.logaddexp(logL1, logL2))
def log_posterior(theta, x, y, e, sigma_B):
    return log_prior(theta) + log_likelihood(theta, x, y, e, sigma_B)
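# quick smoke test (added as a sanity check, not part of the original
# script): the posterior should be finite for any parameter vector
# whose g_i all lie strictly inside (0, 1)
theta_test = np.concatenate([[0.0, 0.01], np.full(len(x), 0.5)])
assert np.isfinite(log_posterior(theta_test, x, y, e, 50))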
ndim = 2 + len(x)  # number of parameters in the model
nwalkers = 50      # number of MCMC walkers
nburn = 100000     # "burn-in" period to let chains stabilize
nsteps = 150000    # number of MCMC steps to take
# set theta near the maximum likelihood, with some random scatter
np.random.seed(0)
starting_guesses = np.zeros((nwalkers, ndim))
starting_guesses[:, :2] = np.random.normal(theta1, 1, (nwalkers, 2))
starting_guesses[:, 2:] = np.random.normal(0.5, 0.1, (nwalkers, ndim - 2))
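# optional safeguard (not in the original script): draws from
# N(0.5, 0.1) can occasionally land outside (0, 1), which would give a
# walker an infinite log-prior at step 0; clip to stay strictly inside
starting_guesses[:, 2:] = np.clip(starting_guesses[:, 2:], 0.01, 0.99)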
import emcee
import multiprocessing as mp

# sigma_B = 50 is the assumed width of the background (outlier) distribution.
# emcee >= 3 deprecates the old `threads` argument in favor of `pool`, so
# parallelize with an explicit pool (on spawn-based platforms such as
# Windows/macOS this needs an `if __name__ == "__main__":` guard)
with mp.Pool(mp.cpu_count()) as pool:
    sampler = emcee.EnsembleSampler(nwalkers, ndim, log_posterior,
                                    args=[x, y, e, 50], pool=pool)
    sampler.run_mcmc(starting_guesses, nsteps)
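# rough convergence diagnostic: emcee's rule of thumb is a mean
# acceptance fraction somewhere in the 0.2-0.5 range
print("mean acceptance fraction: %.3f" % np.mean(sampler.acceptance_fraction))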
# sampler.chain has shape (nwalkers, nsteps, ndim); discard burn-in, then
# flatten to shape (nwalkers * (nsteps - nburn), ndim)
# (with emcee >= 3 the equivalent is sampler.get_chain(discard=nburn, flat=True))
sample = sampler.chain[:, nburn:, :].reshape(-1, ndim)
theta3 = np.mean(sample[:, :2], axis=0)  # posterior-mean intercept and slope
g = np.mean(sample[:, 2:], axis=0)       # posterior-mean weight g_i per point
outliers = (g < 0.5)
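# report the flagged points (added for convenience); g[i] is the
# posterior probability that point i belongs to the foreground line
print("flagged as outliers (g < 0.5):")
for xi, yi, gi in zip(x[outliers], y[outliers], g[outliers]):
    print("  x=%g  y=%g  g=%.3f" % (xi, yi, gi))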
plt.errorbar(x, y, e, fmt='.k', ecolor='gray')
plt.plot(xfit, theta1[0] + theta1[1] * xfit, color='gray',
         label="y = %.4fx + %.4f" % (theta1[1], theta1[0]))
plt.plot(xfit, theta2[0] + theta2[1] * xfit, color='green',
         label="Huber: y = %.4fx + %.4f" % (theta2[1], theta2[0]))
plt.plot(xfit, theta3[0] + theta3[1] * xfit, color='navy',
         label="MCMC: y = %.4fx + %.4f" % (theta3[1], theta3[0]))
plt.plot(x[outliers], y[outliers], 'ro', ms=20, mfc='none', mec='red')  # circle flagged points
plt.title(sampleid)
plt.legend(loc='best', frameon=False)
plt.savefig("%s.pdf" % sampleid)