#!/usr/bin/env python3

import numpy as np

x = np.array([23, 23, 25, 27, 31, 31, 32, 33, 35, 36, 38, 38, 39, 42])
y = np.array([0.58, 0.35, 0.26, 0.48, 0.21, 0.19, 0.69, 0.65, 0.09, 0.14, 0.3, 0.22, 0.47, 0.26])
e = np.array([0.096794787, 0.075415516, 0.070237692, 0.052086884, 0.043921177, 0.047927184, 0.115623311, 0.068138514, 0.0379057, 0.043373379, 0.079772404, 0.068101673, 0.056511855, 0.067682733])
sampleid = '11044888_F'
# per-point standard errors, e = sqrt( p*(1-p) / (n-1) ),
# i.e. the usual standard-error formula for a sample proportion
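# a minimal sketch of where e could come from, assuming each y[i] is a
# proportion estimated from n[i] trials (the trial counts n are NOT in
# this file; the array below is hypothetical):
#   n = np.array([...])                 # per-point trial counts
#   e = np.sqrt(y * (1 - y) / (n - 1))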
import matplotlib
matplotlib.use('pdf')  # non-interactive backend: render straight to PDF
import matplotlib.pyplot as plt

from scipy import optimize
def squared_loss(theta, x=x, y=y, e=e):
    dy = y - theta[0] - theta[1] * x
    return np.sum(0.5 * (dy / e) ** 2)
theta1 = optimize.fmin(squared_loss, [np.mean(x), np.mean(y)], disp=False)
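# optional cross-check (not in the original script): minimizing this
# squared loss is just weighted least squares, so np.polyfit with
# weights 1/e should recover nearly the same line; polyfit returns the
# coefficients highest-degree first, i.e. [slope, intercept]
slope_chk, intercept_chk = np.polyfit(x, y, 1, w=1 / e)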
t = np.linspace(-20, 20)  # residual grid (unused below; handy for plotting the losses)
def huber_loss(t, c=3):
    return ((abs(t) < c) * 0.5 * t ** 2
            + (abs(t) >= c) * -c * (0.5 * c - abs(t)))
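# spot check of the loss (values worked out by hand): it is quadratic
# inside |t| < c and linear outside, so large residuals are penalized
# far less than under the squared loss, e.g. with the default c = 3:
#   huber_loss(1.0)  -> 0.5    (identical to the squared loss)
#   huber_loss(10.0) -> 25.5   (vs. 50.0 for the squared loss)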
def total_huber_loss(theta, x=x, y=y, e=e, c=3):
    return huber_loss((y - theta[0] - theta[1] * x) / e, c).sum()
theta2 = optimize.fmin(total_huber_loss, [np.mean(x), np.mean(y)], disp=False)

xfit = np.linspace(10, 70)
# theta will be an array of length 2 + N, where N is the number of points:
# theta[0] is the intercept, theta[1] is the slope,
# and theta[2 + i] is the weight g_i
def log_prior(theta):
    # each g_i needs to be between 0 and 1
    if all(theta[2:] > 0) and all(theta[2:] < 1):
        return 0
    else:
        return -np.inf  # recall log(0) = -inf
def log_likelihood(theta, x, y, e, sigma_B):
    dy = y - theta[0] - theta[1] * x
    g = np.clip(theta[2:], 0, 1)  # g < 0 or g > 1 leads to NaNs in logarithm
    logL1 = np.log(g) - 0.5 * np.log(2 * np.pi * e ** 2) - 0.5 * (dy / e) ** 2
    logL2 = np.log(1 - g) - 0.5 * np.log(2 * np.pi * sigma_B ** 2) - 0.5 * (dy / sigma_B) ** 2
    return np.sum(np.logaddexp(logL1, logL2))
def log_posterior(theta, x, y, e, sigma_B):
    return log_prior(theta) + log_likelihood(theta, x, y, e, sigma_B)
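# quick smoke test (added as a sanity check, not part of the original
# script): the posterior should be finite for any parameter vector
# whose g_i all lie strictly inside (0, 1)
theta_test = np.concatenate([[0.0, 0.01], np.full(len(x), 0.5)])
assert np.isfinite(log_posterior(theta_test, x, y, e, 50))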
ndim = 2 + len(x)  # number of parameters in the model
nwalkers = 50      # number of MCMC walkers
nburn = 100000     # "burn-in" period to let chains stabilize
nsteps = 150000    # number of MCMC steps to take
# set theta near the maximum likelihood, with some random scatter
np.random.seed(0)
starting_guesses = np.zeros((nwalkers, ndim))
starting_guesses[:, :2] = np.random.normal(theta1, 1, (nwalkers, 2))
starting_guesses[:, 2:] = np.random.normal(0.5, 0.1, (nwalkers, ndim - 2))
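# optional safeguard (not in the original script): draws from
# N(0.5, 0.1) can occasionally land outside (0, 1), which would give a
# walker an infinite log-prior at step 0; clip to stay strictly inside
starting_guesses[:, 2:] = np.clip(starting_guesses[:, 2:], 0.01, 0.99)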
import emcee
import multiprocessing as mp

# sigma_B = 50 is the assumed width of the background (outlier) distribution.
# emcee >= 3 deprecates the old `threads` argument in favor of `pool`, so
# parallelize with an explicit pool (on spawn-based platforms such as
# Windows/macOS this needs an `if __name__ == "__main__":` guard)
with mp.Pool(mp.cpu_count()) as pool:
    sampler = emcee.EnsembleSampler(nwalkers, ndim, log_posterior,
                                    args=[x, y, e, 50], pool=pool)
    sampler.run_mcmc(starting_guesses, nsteps)
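# rough convergence diagnostic: emcee's rule of thumb is a mean
# acceptance fraction somewhere in the 0.2-0.5 range
print("mean acceptance fraction: %.3f" % np.mean(sampler.acceptance_fraction))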
# sampler.chain has shape (nwalkers, nsteps, ndim); discard burn-in, then
# flatten to shape (nwalkers * (nsteps - nburn), ndim)
# (with emcee >= 3 the equivalent is sampler.get_chain(discard=nburn, flat=True))
sample = sampler.chain[:, nburn:, :].reshape(-1, ndim)
theta3 = np.mean(sample[:, :2], axis=0)  # posterior-mean intercept and slope
g = np.mean(sample[:, 2:], axis=0)       # posterior-mean weight g_i per point
outliers = (g < 0.5)
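# report the flagged points (added for convenience); g[i] is the
# posterior probability that point i belongs to the foreground line
print("flagged as outliers (g < 0.5):")
for xi, yi, gi in zip(x[outliers], y[outliers], g[outliers]):
    print("  x=%g  y=%g  g=%.3f" % (xi, yi, gi))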
plt.errorbar(x, y, e, fmt='.k', ecolor='gray')
plt.plot(xfit, theta1[0] + theta1[1] * xfit, color='gray',
         label="y = %.4fx + %.4f" % (theta1[1], theta1[0]))
plt.plot(xfit, theta2[0] + theta2[1] * xfit, color='green',
         label="Huber: y = %.4fx + %.4f" % (theta2[1], theta2[0]))
plt.plot(xfit, theta3[0] + theta3[1] * xfit, color='navy',
         label="MCMC: y = %.4fx + %.4f" % (theta3[1], theta3[0]))
plt.plot(x[outliers], y[outliers], 'ro', ms=20, mfc='none', mec='red')  # circle flagged points
plt.title(sampleid)
plt.legend(loc='best', frameon=False)
plt.savefig("%s.pdf" % sampleid)