BUG FIX: add needed imports to utils
[notebooks.git] / fmda / data_funcs.py
blob9e0e28158ebac169bc7539ef2806b8d46a857054
1 ## Set of Functions to process and format fuel moisture model inputs
2 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 import numpy as np, random
5 from numpy.random import rand
6 import tensorflow as tf
7 import pickle, os
8 from sklearn.metrics import mean_squared_error
9 import matplotlib.pyplot as plt
10 from moisture_models import model_decay, model_moisture
11 from datetime import datetime, timedelta
12 from utils import is_numeric_ndarray, hash2
13 import json
14 import copy
15 import subprocess
16 import os.path as osp
def compare_dicts(dict1, dict2, keys):
    # Return True iff the two dictionaries agree on every key in `keys`.
    # Missing keys compare as None via .get, so a key absent from both matches.
    return all(dict1.get(k) == dict2.get(k) for k in keys)
items = '_items_' # dictionary key under which check_data() temporarily stores the list of not-yet-reported keys
def check_data_array(dat,hours,a,s):
    # Report stats (length, min, max, hash, type) for array entry `a` of dict `dat`.
    # `s` is a human-readable label. Removes `a` from the bookkeeping list dat[items].
    # When `hours` is given, terminates the process if the array is shorter than `hours`.
    if a in dat[items]:
        dat[items].remove(a)
    if a not in dat:
        print(a + ' not present')
        return
    ar = dat[a]
    print("array %s %s length %i min %s max %s hash %s %s" %
          (a,s,len(ar),min(ar),max(ar),hash2(ar),type(ar)))
    if hours is not None and len(ar) < hours:
        print('len(%a) = %i does not equal to hours = %i' % (a,len(ar),hours))
        exit(1)
def check_data_scalar(dat,a):
    # Print scalar entry `a` of dict `dat` with its type, and drop it from the
    # bookkeeping list dat[items].
    if a in dat[items]:
        dat[items].remove(a)
    if a not in dat:
        print(a + ' not present' )
        return
    print('%s = %s' % (a,dat[a]),' ',type(dat[a]))
def check_data(dat,case=True,name=None):
    # Print a diagnostic summary of every entry in an fmda dictionary.
    # Known scalar/array entries are reported with standard labels; leftover
    # entries are summarized generically by type. Uses dat[items] as temporary
    # bookkeeping of not-yet-reported keys, removed again before returning.
    # Inputs:
    #   dat: (dict) fmda data dictionary (temporarily mutated: items key added, then deleted)
    #   case: (bool) when True, also report the standard case metadata entries
    #   name: (str) optional heading printed first
    # Returns: none
    dat[items] = list(dat.keys()) # add list of items to the dictionary
    if name is not None:
        print(name)
    if case:
        check_data_scalar(dat,'filename')
        check_data_scalar(dat,'title')
        check_data_scalar(dat,'note')
        check_data_scalar(dat,'hours')
        check_data_scalar(dat,'h2')
        check_data_scalar(dat,'case')
    if 'hours' in dat:
        hours = dat['hours']
    else:
        hours = None
    # hours is forwarded so check_data_array can flag arrays shorter than the run
    check_data_array(dat,hours,'E','drying equilibrium (%)')
    check_data_array(dat,hours,'Ed','drying equilibrium (%)')
    check_data_array(dat,hours,'Ew','wetting equilibrium (%)')
    check_data_array(dat,hours,'Ec','equilibrium equilibrium (%)')
    check_data_array(dat,hours,'rain','rain intensity (mm/h)')
    check_data_array(dat,hours,'fm','RAWS fuel moisture data (%)')
    check_data_array(dat,hours,'m','fuel moisture estimate (%)')
    if dat[items]:
        print('items:',dat[items])
        # iterate over a copy: check_data_scalar removes entries from dat[items]
        for a in dat[items].copy():
            ar=dat[a]
            if dat[a] is None or np.isscalar(dat[a]):
                check_data_scalar(dat,a)
            elif is_numeric_ndarray(ar):
                print(type(ar))
                print("array", a, "shape",ar.shape,"min",np.min(ar),
                      "max",np.max(ar),"hash",hash2(ar),"type",type(ar))
            elif isinstance(ar, tf.Tensor):
                print("array", a, "shape",ar.shape,"min",np.min(ar),
                      "max",np.max(ar),"type",type(ar))
            else:
                print('%s = %s' % (a,dat[a]),' ',type(dat[a]))
    del dat[items] # clean up
87 # Note: the project structure has moved towards pickle files, so these json funcs might not be needed
88 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def to_json(dic,filename):
    # Write given dictionary as json file.
    # This utility is used because the typical method fails on numpy.ndarray
    # Inputs:
    # dic: dictionary
    # filename: (str) output json filename, expect a ".json" file extension
    # Return: none

    print('writing ',filename)
    # check_data(dic)
    new={}
    for i in dic:
        if type(dic[i]) is np.ndarray:
            new[i]=dic[i].tolist() # because numpy.ndarray is not serializable
        else:
            new[i]=dic[i]
        # print('i',type(new[i]))
    new['filename']=filename
    print('Hash: ', hash2(new))
    # BUG FIX: use a context manager so the file handle is closed deterministically
    # even if json.dump raises (previously the handle was left open until GC).
    with open(filename,'w') as f:
        json.dump(new,f,indent=4)
def from_json(filename):
    # Read json file given a filename
    # Inputs: filename (str) expect a ".json" string
    # Return: (dict) contents of the json file, with lists converted back to np.ndarray

    print('reading ',filename)
    # BUG FIX: context manager closes the file handle deterministically
    # (previously json.load(open(...)) left the handle open until GC).
    with open(filename,'r') as f:
        dic=json.load(f)
    new={}
    for i in dic:
        if type(dic[i]) is list:
            new[i]=np.array(dic[i]) # because ndarray is not serializable
        else:
            new[i]=dic[i]
    check_data(new)
    print('Hash: ', hash2(new))
    return new
126 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
128 # Function to simulate moisture data and equilibrium for model testing
# Function to simulate moisture data and equilibrium for model testing
def create_synthetic_data(days=20,power=4,data_noise=0.02,process_noise=0.0,DeltaE=0.0):
    # Simulate a diurnal equilibrium curve and a fuel-moisture free run with noise.
    # Inputs:
    #   days: (int) number of simulated days (24 hours each)
    #   power: (int/float) exponent shaping the diurnal sine curve
    #   data_noise: (float) std dev of observation noise added to the FMC run
    #   process_noise: (float) std dev of per-step process noise in the FMC run
    #   DeltaE: (float) constant offset added to the returned equilibrium
    # Return: E, m_f, data, hour, h2, DeltaE
    hours = days*24
    h2 = int(hours/2)
    hour = np.array(range(hours))
    day = np.array(range(hours))/24.

    # artificial equilibrium data
    # BUG FIX: use the `power` parameter (was hard-coded to 4, silently ignoring it)
    E = 100.0*np.power(np.sin(np.pi*day),power) # diurnal curve
    E = 0.05+0.25*E
    # FMC free run
    m_f = np.zeros(hours)
    m_f[0] = 0.1 # initial FMC
    # BUG FIX: removed `process_noise=0.` which silently overrode the parameter,
    # making it impossible to simulate process noise from the call site
    for t in range(hours-1):
        m_f[t+1] = max(0.,model_decay(m_f[t],E[t]) + random.gauss(0,process_noise) )
    data = m_f + np.random.normal(loc=0,scale=data_noise,size=hours)
    E = E + DeltaE
    return E,m_f,data,hour,h2,DeltaE
148 # the following input or output dictionary with all model data and variables
def synthetic_data(days=20,power=4,data_noise=0.02,process_noise=0.0,
        DeltaE=0.0,Emin=5,Emax=30,p_rain=0.01,max_rain=10.0):
    # Build a synthetic fmda data dictionary: diurnal drying/wetting equilibria,
    # random rain events, and a noisy fuel-moisture free run driven by model_moisture.
    # Inputs:
    #   days: (int) number of simulated days (24 hours each)
    #   power: (int/float) exponent shaping the diurnal sine curve
    #   data_noise: (float) std dev of observation noise added to fm
    #   process_noise: (float) std dev of per-step process noise
    #   DeltaE: (float) constant offset added to the equilibrium
    #   Emin, Emax: (float) range (%) the diurnal equilibrium is scaled to
    #   p_rain: (float) per-hour probability of rain
    #   max_rain: (float) maximum rain intensity (mm/h)
    # Return: (dict) with keys E, Ed, Ew, fm, rain, hours, h2, DeltaE, title
    hours = days*24
    h2 = int(hours/2)
    hour = np.array(range(hours))
    day = np.array(range(hours))/24.
    # artificial equilibrium data
    E = np.power(np.sin(np.pi*day),power) # diurnal curve betwen 0 and 1
    E = Emin+(Emax - Emin)*E
    E = E + DeltaE
    Ed=E+0.5
    Ew=np.maximum(E-0.5,0)
    # rain: Bernoulli(p_rain) event each hour, with uniform intensity up to max_rain
    rain = np.multiply(rand(hours) < p_rain, rand(hours)*max_rain)
    # FMC free run
    fm = np.zeros(hours)
    fm[0] = 0.1 # initial FMC
    # process_noise=0.
    for t in range(hours-1):
        # NOTE(review): at t=0 the [t-1] indices wrap around to the LAST hour's
        # equilibria/rain; presumably [t] was intended (cf. create_synthetic_data,
        # which uses E[t]) — confirm before changing, as it alters generated data.
        fm[t+1] = max(0.,model_moisture(fm[t],Ed[t-1],Ew[t-1],rain[t-1]) + random.gauss(0,process_noise))
    fm = fm + np.random.normal(loc=0,scale=data_noise,size=hours)
    dat = {'E':E,'Ew':Ew,'Ed':Ed,'fm':fm,'hours':hours,'h2':h2,'DeltaE':DeltaE,'rain':rain,'title':'Synthetic data'}

    return dat
def plot_one(hmin,hmax,dat,name,linestyle,c,label, alpha=1,type='plot'):
    # Helper for plot_data: draw one named series of `dat` over hours [hmin, hmax).
    # Does nothing if `name` is absent. `type` selects plt.plot vs plt.scatter.
    if name not in dat:
        return
    lo = 0 if hmin is None else hmin
    hi = len(dat[name]) if hmax is None else hmax
    hour = np.array(range(lo,hi))
    series = dat[name][lo:hi]
    if type=='plot':
        plt.plot(hour,series,linestyle=linestyle,c=c,label=label, alpha=alpha)
    elif type=='scatter':
        plt.scatter(hour,series,linestyle=linestyle,c=c,label=label, alpha=alpha)
188 # Lookup table for plotting features
# Lookup table mapping feature name -> matplotlib kwargs used by plot_feature.
# BUG FIX: restore the missing closing brace of the dict literal.
plot_styles = {
    'Ed': {'color': '#EF847C', 'linestyle': '--', 'alpha':.8, 'label': 'drying EQ'},
    'Ew': {'color': '#7CCCEF', 'linestyle': '--', 'alpha':.8, 'label': 'wetting EQ'},
    'rain': {'color': 'b', 'linestyle': '-', 'alpha':.9, 'label': 'Rain'}
}
def plot_feature(x, y, feature_name):
    # Plot series y vs x using the kwargs registered for feature_name in
    # plot_styles (falls back to matplotlib defaults for unknown features).
    kwargs = plot_styles.get(feature_name, {})
    plt.plot(x, y, **kwargs)
def plot_features(hmin,hmax,dat,linestyle,c,label,alpha=1):
    # Plot every feature of dat['X'] that has an entry in plot_styles,
    # over the hour range [hmin, hmax).
    # NOTE(review): linestyle/c/label/alpha are accepted but unused here;
    # kept so the signature matches plot_one-style call sites.
    hour = np.array(range(hmin,hmax))
    for feat in dat.features_list:
        col = dat.all_features_list.index(feat) # column of this feature in the data matrix
        if feat not in plot_styles.keys():
            continue
        plot_feature(x=hour, y=dat['X'][:,col][hmin:hmax], feature_name=feat)
def plot_data(dat,title=None,title2=None,hmin=0,hmax=None,xlabel=None,ylabel=None):
    # Plot fmda dictionary of data and model if present
    # Inputs:
    # dat: FMDA dictionary
    # title, title2: (str) optional title text; an RMSE summary is appended
    # hmin, hmax: (int) hour range to plot; hmax is clipped to dat['hours']
    # xlabel, ylabel: (str) optional axis labels (defaults depend on data present)
    # Returns: none

    # dat = copy.deepcopy(dat0)

    if 'hours' in dat:
        if hmax is None:
            hmax = dat['hours']
        else:
            hmax = min(hmax, dat['hours'])

    # plt.figure(figsize=(16,4))
    plot_one(hmin,hmax,dat,'y',linestyle='-',c='#468a29',label='FM Observed')
    plot_one(hmin,hmax,dat,'m',linestyle='-',c='k',label='FM Model')
    plot_features(hmin,hmax,dat,linestyle='-',c='k',label='FM Model')

    if 'test_ind' in dat.keys():
        test_ind = dat["test_ind"]
    else:
        test_ind = None
    if (test_ind is not None) and ('m' in dat.keys()):
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Note: the code within the tildes here makes a more complex, annotated plot
        # Draw a vertical line at test_ind and annotate the training period (left)
        # and the forecast period (right) with horizontal arrows and labels.
        plt.axvline(dat['test_ind'], linestyle=':', c='k', alpha=.8)
        yy = plt.ylim() # used to format annotations
        plt.annotate('', xy=(0, yy[0]),xytext=(dat['test_ind'],yy[0]),
                arrowprops=dict(arrowstyle='<-', linewidth=2),
                annotation_clip=False)
        plt.annotate('(Training)',xy=(np.ceil(dat['test_ind']/2),yy[1]),xytext=(np.ceil(dat['test_ind']/2),yy[1]+1),
                annotation_clip=False, alpha=.8)
        plt.annotate('', xy=(dat['test_ind'], yy[0]),xytext=(dat['hours'],yy[0]),
                arrowprops=dict(arrowstyle='<-', linewidth=2),
                annotation_clip=False)
        plt.annotate('(Forecast)',xy=(np.ceil(dat['test_ind']+(dat['hours']-dat['test_ind'])/2),yy[1]),
                xytext=(np.ceil(dat['test_ind']+(dat['hours']-dat['test_ind'])/2),yy[1]+1),
                annotation_clip=False, alpha=.8)
        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # Assemble the title from explicit args or dat['title'], then append RMSE summary
    if title is not None:
        t = title
    elif 'title' in dat:
        t=dat['title']
        # print('title',type(t),t)
    else:
        t=''
    if title2 is not None:
        t = t + ' ' + title2
    t = t + ' (' + rmse_data_str(dat)+')'
    plt.title(t, y=1.1)

    if xlabel is None:
        plt.xlabel('Time (hours)')
    else:
        plt.xlabel(xlabel)
    # rain shares the axis with FM, so label both when rain is plotted
    if 'rain' in dat:
        plt.ylabel('FM (%) / Rain (mm/h)')
    elif ylabel is None:
        plt.ylabel('Fuel moisture content (%)')
    else:
        plt.ylabel(ylabel)
    plt.legend(loc="upper left")
def rmse(a, b):
    # Root mean squared error between two arrays; both are flattened first so
    # shapes like (n,1) vs (n,) compare elementwise.
    # Computed directly with numpy instead of sklearn.mean_squared_error —
    # numerically identical, and avoids the sklearn dependency for a one-liner.
    a = np.asarray(a).flatten()
    b = np.asarray(b).flatten()
    return np.sqrt(np.mean((a - b) ** 2))
def rmse_skip_nan(x, y):
    # RMSE computed only over positions where neither x nor y is NaN.
    # Returns NaN when no such position exists.
    valid = ~(np.isnan(x) | np.isnan(y))
    if not np.count_nonzero(valid):
        return np.nan
    diff = x[valid] - y[valid]
    return np.sqrt(np.mean(diff ** 2))
def rmse_str(a,b):
    # Format the NaN-skipping RMSE of a vs b as a short label string, e.g. "RMSE 1.234".
    # BUG FIX (hygiene): local variable renamed from `rmse`, which shadowed the
    # module-level rmse() function inside this scope.
    err = rmse_skip_nan(a,b)
    return "RMSE " + "{:.3f}".format(err)
def rmse_data_str(dat, predict=True, hours = None, test_ind = None):
    # Return RMSE for model object in formatted string. Used within plotting
    # Inputs:
    # dat: (dict) fmda dictionary
    # predict: (bool) Whether to return prediction period RMSE. Default True
    # hours: (int) total number of modeled time periods
    # test_ind: (int) start of test period
    # Return: (str) RMSE value, or '' when model/observations are missing

    # Fall back to values stored in the dictionary when args are not given
    if hours is None and 'hours' in dat:
        hours = dat['hours']
    if test_ind is None and 'test_ind' in dat:
        test_ind = dat['test_ind']

    if 'm' not in dat or 'y' not in dat:
        return ''
    if predict and hours is not None and test_ind is not None:
        # RMSE over the forecast window only
        return rmse_str(dat['m'][test_ind:hours],dat['y'].flatten()[test_ind:hours])
    return rmse_str(dat['m'],dat['y'].flatten())
314 # Calculate mean absolute error
def mape(a, b):
    # Mean absolute error of a vs b: average of |a - b| elementwise.
    # NOTE(review): despite the name, this is MAE, not mean absolute
    # *percentage* error — confirm intent before renaming.
    return abs(a - b).mean()
def rmse_data(dat, hours = None, h2 = None, simulation='m', measurements='fm'):
    # Print and return train/prediction/overall RMSE between a simulated series
    # and measurements stored in an fmda dictionary.
    # Inputs:
    #   dat: (dict) fmda dictionary with 'case', simulation and measurement keys
    #   hours: (int) total modeled hours (defaults to dat['hours'])
    #   h2: (int) end of training period (defaults to dat['h2'])
    #   simulation, measurements: (str) dict keys of the two series
    # Return: (dict) {'train', 'predict', 'all'} RMSE values
    hours = dat['hours'] if hours is None else hours
    h2 = dat['h2'] if h2 is None else h2

    sim = dat[simulation]
    obs = dat[measurements]
    case = dat['case']

    metrics = {
        'train': rmse(sim[:h2], obs[:h2]),
        'predict': rmse(sim[h2:hours], obs[h2:hours]),
        'all': rmse(sim[:hours], obs[:hours]),
    }
    print(case,'Training 1 to',h2,'hours RMSE: ' + str(np.round(metrics['train'], 4)))
    print(case,'Prediction',h2+1,'to',hours,'hours RMSE: ' + str(np.round(metrics['predict'], 4)))
    print(f"All predictions hash: {hash2(sim)}")

    return metrics
339 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def get_file(filename, data_dir='data'):
    # Check for file locally, retrieve with wget if not
    # Inputs:
    #   filename: (str) name of the file to find or download
    #   data_dir: (str) local directory checked first; wget -P writes there
    # Return: none
    if osp.exists(osp.join(data_dir, filename)):
        print(f"File {osp.join(data_dir, filename)} exists locally")
    elif not osp.exists(filename):
        base_url = "https://demo.openwfm.org/web/data/fmda/dicts/"
        print(f"Retrieving data {osp.join(base_url, filename)}")
        # SECURITY FIX: argument-list form with shell=False; the previous
        # f-string + shell=True call allowed shell injection via filename/data_dir.
        # (The redundant local `import subprocess` was removed — it is imported
        # at the top of the file.)
        subprocess.call(["wget", "-P", data_dir, osp.join(base_url, filename)])