use data wrapper
[notebooks.git] / fmda / test_notebooks / fmda_rnn_train_and_save.ipynb
blob9893b339cccb18012380dc2864b815a57004185c
2  "cells": [
3   {
4    "cell_type": "markdown",
5    "id": "83b774b3-ef55-480a-b999-506676e49145",
6    "metadata": {},
7    "source": [
8     "# v2.1 run RNN with Spatial Training\n",
9     "\n",
10     "This notebook is intended to set up a test where the RNN is run serially by location and compared to the spatial training scheme. Additionally, the ODE model with the augmented KF will be run as a comparison, but note that the RNN models will be predicting entirely without knowledge of the held-out locations, while the augmented KF will be run directly on the test locations.\n"
11    ]
12   },
13   {
14    "cell_type": "markdown",
15    "id": "bbd84d61-a9cd-47b4-b538-4986fb10b98d",
16    "metadata": {},
17    "source": [
18     "## Environment Setup"
19    ]
20   },
21   {
22    "cell_type": "code",
23    "execution_count": null,
24    "id": "83cc1dc4-3dcb-4325-9263-58101a3dc378",
25    "metadata": {},
26    "outputs": [],
27    "source": [
28     "import numpy as np\n",
29     "import sys\n",
30     "sys.path.append('..')\n",
31     "import pickle\n",
32     "import logging\n",
33     "import os.path as osp\n",
34     "import tensorflow as tf\n",
35     "from moisture_rnn_pkl import pkl2train\n",
36     "from moisture_rnn import RNNParams, RNNData, RNN, rnn_data_wrap\n",
37     "from utils import hash2, read_yml, read_pkl, retrieve_url, Dict, print_dict_summary, print_first, str2time, logging_setup\n",
38     "from moisture_rnn import RNN\n",
39     "import reproducibility\n",
40     "from data_funcs import rmse, to_json, combine_nested, subset_by_features, build_train_dict\n",
41     "from moisture_models import run_augmented_kf\n",
42     "import copy\n",
43     "import pandas as pd\n",
44     "import matplotlib.pyplot as plt\n",
45     "import yaml\n",
46     "import time"
47    ]
48   },
49   {
50    "cell_type": "code",
51    "execution_count": null,
52    "id": "17db9b90-a931-4674-a447-5b8ffbcdc86a",
53    "metadata": {},
54    "outputs": [],
55    "source": [
56     "logging_setup()"
57    ]
58   },
59   {
60    "cell_type": "code",
61    "execution_count": null,
62    "id": "35319c1c-7849-4b8c-8262-f5aa6656e0c7",
63    "metadata": {},
64    "outputs": [],
65    "source": [
66     "filename = \"fmda_rocky_202403-05_f05.pkl\"\n",
67     "retrieve_url(\n",
68     "    url = f\"https://demo.openwfm.org/web/data/fmda/dicts/{filename}\", \n",
69     "    dest_path = f\"../data/{filename}\")"
70    ]
71   },
72   {
73    "cell_type": "code",
74    "execution_count": null,
75    "id": "eabdbd9c-07d9-4bae-9851-cca79f321895",
76    "metadata": {},
77    "outputs": [],
78    "source": [
79     "file_paths = [f'../data/{filename}']"
80    ]
81   },
82   {
83    "cell_type": "code",
84    "execution_count": null,
85    "id": "dcca6185-e799-4dd1-8acb-87ad33c411d7",
86    "metadata": {},
87    "outputs": [],
88    "source": [
89     "# # read/write control\n",
90     "# train_file='../data/train.pkl'\n",
91     "# train_create=True   # if false, read\n",
92     "# train_write=False\n",
93     "# train_read=False"
94    ]
95   },
96   {
97    "cell_type": "code",
98    "execution_count": null,
99    "id": "604388de-11ab-45c3-9f0d-80bdff0cca60",
100    "metadata": {},
101    "outputs": [],
102    "source": [
103     "# Params used for data filtering\n",
104     "params_data = read_yml(\"../params_data.yaml\") \n",
105     "params_data"
106    ]
107   },
108   {
109    "cell_type": "code",
110    "execution_count": null,
111    "id": "211a1c2f-ba8d-40b8-b29c-daa38af97a26",
112    "metadata": {},
113    "outputs": [],
114    "source": [
115     "# Params used for setting up RNN\n",
116     "params = read_yml(\"../params.yaml\", subkey='rnn') \n",
117     "params"
118    ]
119   },
120   {
121    "cell_type": "code",
122    "execution_count": null,
123    "id": "38e6bc61-e123-4cc9-bdee-54b051bbb352",
124    "metadata": {},
125    "outputs": [],
126    "source": [
127     "feats = ['Ed', 'Ew', 'solar', 'wind', 'elev', 'lon', 'lat', 'rain']\n",
128     "params.update({'features_list': feats})"
129    ]
130   },
131   {
132    "cell_type": "code",
133    "execution_count": null,
134    "id": "ef84104f-9898-4cd9-be54-7c480536ee0e",
135    "metadata": {
136     "scrolled": true
137    },
138    "outputs": [],
139    "source": [
140     "train = build_train_dict(file_paths, atm_source=\"RAWS\", params_data = params_data,\n",
141     "                         features_subset = feats, spatial=True, verbose=True)"
142    ]
143   },
144   {
145    "cell_type": "code",
146    "execution_count": null,
147    "id": "bc0a775b-b587-42ef-8576-e36dc0be3a75",
148    "metadata": {
149     "scrolled": true
150    },
151    "outputs": [],
152    "source": [
153     "# if train_create:\n",
154     "#     params_data.update({'hours': 1440})\n",
155     "#     logging.info('creating the training cases from files %s',file_paths)\n",
156     "#     # osp.join works on windows too, joins paths using \\ or /\n",
157     "#     train = process_train_dict(file_paths, atm_dict = \"RAWS\", params_data = params_data, verbose=True)\n",
158     "#     train = subset_by_features(train, feats)\n",
159     "#     train = combine_nested(train)\n",
160     "# if train_write:\n",
161     "#     with open(train_file, 'wb') as file:\n",
162     "#         logging.info('Writing the train cases into file %s',train_file)\n",
163     "#         pickle.dump(train, file)\n",
164     "# if train_read:\n",
165     "#     logging.info('Reading the train cases from file %s',train_file)\n",
166     "#     train = read_pkl(train_file)"
167    ]
168   },
169   {
170    "cell_type": "markdown",
171    "id": "a24d76fc-6c25-43e7-99df-3cd5dbf84fc3",
172    "metadata": {},
173    "source": [
174     "## Spatial Data Training\n",
175     "\n",
176     "This method combines the training timeseries data into a single 3-d array, with timeseries at the same location arranged in the right order for a given `batch_size` hyperparameter. The hidden states of the recurrent layers are set up to reset when the location changes."
177    ]
178   },
179   {
180    "cell_type": "code",
181    "execution_count": null,
182    "id": "36823193-b93c-421e-b699-8c1ae5719309",
183    "metadata": {},
184    "outputs": [],
185    "source": [
186     "reproducibility.set_seed(123)"
187    ]
188   },
189   {
190    "cell_type": "code",
191    "execution_count": null,
192    "id": "66f40c9f-c1c2-4b12-bf14-2ada8c26113d",
193    "metadata": {},
194    "outputs": [],
195    "source": [
196     "params = RNNParams(params)\n",
197     "params.update({'epochs': 200, \n",
198     "               'learning_rate': 0.001,\n",
199     "               'activation': ['relu', 'relu'], # Activation for RNN Layers, Dense layers respectively.\n",
200     "               'recurrent_layers': 1, 'recurrent_units': 30, \n",
201     "               'dense_layers': 1, 'dense_units': 30,\n",
202     "               'early_stopping_patience': 30, # how many epochs of no validation accuracy gain to wait before stopping\n",
203     "               'batch_schedule_type': 'exp', # Hidden state batch reset schedule\n",
204     "               'bmin': 20, # Lower bound of hidden state batch reset, \n",
205     "               'bmax': params_data['hours'], # Upper bound of hidden state batch reset, using max hours\n",
206     "               'batch_size': 60,\n",
207     "               'space_fracs': [.8, .1, .1]\n",
208     "              })"
209    ]
210   },
211   {
212    "cell_type": "code",
213    "execution_count": null,
214    "id": "82bc407d-9d26-41e3-8b58-ab3f7238e105",
215    "metadata": {},
216    "outputs": [],
217    "source": [
218     "import importlib\n",
219     "import moisture_rnn\n",
220     "importlib.reload(moisture_rnn)\n",
221     "from moisture_rnn import RNNData"
222    ]
223   },
224   {
225    "cell_type": "code",
226    "execution_count": null,
227    "id": "c0c7f5fb-4c33-45f8-9a2e-38c9ab1cd4e3",
228    "metadata": {},
229    "outputs": [],
230    "source": [
231     "# rnn_dat_sp = RNNData(\n",
232     "#     train, # input dictionary\n",
233     "#     scaler=\"standard\",  # data scaling type\n",
234     "#     features_list = params['features_list'] # features for predicting outcome\n",
235     "# )\n",
236     "\n",
237     "\n",
238     "# rnn_dat_sp.train_test_split(   \n",
239     "#     time_fracs = [.8, .1, .1], # Percent of total time steps used for train/val/test\n",
240     "#     space_fracs = [.8, .1, .1] # Percent of total timeseries used for train/val/test\n",
241     "# )\n",
242     "# rnn_dat_sp.scale_data()\n",
243     "\n",
244     "# rnn_dat_sp.batch_reshape(\n",
245     "#     timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. \n",
246     "#     batch_size = params['batch_size'] # Number of samples of length timesteps for a single round of grad. descent\n",
247     "# )\n",
248     "# # Update Params specific to spatial training\n",
249     "# params.update({\n",
250     "#     'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch\n",
251     "# })"
252    ]
253   },
254   {
255    "cell_type": "code",
256    "execution_count": null,
257    "id": "924549ba-ea73-4fc9-91b3-8f1f0e32e831",
258    "metadata": {},
259    "outputs": [],
260    "source": [
261     "rnn_dat_sp = rnn_data_wrap(train, params)\n",
262     "params.update({\n",
263     "    'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch\n",
264     "})"
265    ]
266   },
267   {
268    "cell_type": "code",
269    "execution_count": null,
270    "id": "4bc11474-fed8-47f2-b9cf-dfdda0d3d3b2",
271    "metadata": {},
272    "outputs": [],
273    "source": [
274     "rnn_sp = RNN(params)\n",
275     "m_sp, errs = rnn_sp.run_model(rnn_dat_sp)"
276    ]
277   },
278   {
279    "cell_type": "code",
280    "execution_count": null,
281    "id": "704ad662-d81a-488d-be3d-e90bf775a5b8",
282    "metadata": {},
283    "outputs": [],
284    "source": [
285     "errs.mean()"
286    ]
287   },
288   {
289    "cell_type": "markdown",
290    "id": "62c1b049-304e-4c90-b1d2-b9b96b9a202f",
291    "metadata": {},
292    "source": [
293     "## Save Model"
294    ]
295   },
296   {
297    "cell_type": "code",
298    "execution_count": null,
299    "id": "f333521f-c724-40bf-8c1c-32735aea52cc",
300    "metadata": {},
301    "outputs": [],
302    "source": [
303     "outpath = \"../outputs/models\"\n",
304     "filename = osp.join(outpath, f\"model_predict_raws_rocky.keras\")\n",
305     "rnn_sp.model_predict.save(filename)"
306    ]
307   },
308   {
309    "cell_type": "markdown",
310    "id": "bc1c601f-23a9-41b0-b921-47f1340f2a47",
311    "metadata": {},
312    "source": [
313     "## Load and Check"
314    ]
315   },
316   {
317    "cell_type": "code",
318    "execution_count": null,
319    "id": "3c27b3c1-6f60-450e-82ea-18eaf012fece",
320    "metadata": {},
321    "outputs": [],
322    "source": [
323     "mod = tf.keras.models.load_model(filename)"
324    ]
325   },
326   {
327    "cell_type": "code",
328    "execution_count": null,
329    "id": "25bf5420-d681-40ec-9eb8-aed784ca4e5a",
330    "metadata": {},
331    "outputs": [],
332    "source": [
333     "from utils import hash_weights\n",
334     "\n",
335     "hash_weights(mod)"
336    ]
337   },
338   {
339    "cell_type": "code",
340    "execution_count": null,
341    "id": "d773b2ab-18de-4b13-a243-b6353c57f192",
342    "metadata": {},
343    "outputs": [],
344    "source": [
345     "type(rnn_dat_sp.X_test)"
346    ]
347   },
348   {
349    "cell_type": "code",
350    "execution_count": null,
351    "id": "253ba437-c3a2-452b-b8e6-078aa17c8408",
352    "metadata": {},
353    "outputs": [],
354    "source": [
355     "X_test = np.stack(rnn_dat_sp.X_test, axis=0)\n",
356     "y_array = np.stack(rnn_dat_sp.y_test, axis=0)"
357    ]
358   },
359   {
360    "cell_type": "code",
361    "execution_count": null,
362    "id": "f4332dd8-57cd-4f5b-a864-dc72f96d72b2",
363    "metadata": {},
364    "outputs": [],
365    "source": [
366     "preds = mod.predict(X_test)\n",
367     "preds.shape"
368    ]
369   },
370   {
371    "cell_type": "code",
372    "execution_count": null,
373    "id": "4e4cd809-6701-4bd7-b4fe-37c5e35d8999",
374    "metadata": {},
375    "outputs": [],
376    "source": [
377     "np.mean(np.sqrt(np.mean(np.square(preds - y_array), axis=(1,2))))"
378    ]
379   },
380   {
381    "cell_type": "code",
382    "execution_count": null,
383    "id": "4f4d80cb-edef-4720-b335-4af5a04992c3",
384    "metadata": {},
385    "outputs": [],
386    "source": []
387   },
388   {
389    "cell_type": "code",
390    "execution_count": null,
391    "id": "e9d7f913-b391-4e14-9b64-46a0a9786f4a",
392    "metadata": {},
393    "outputs": [],
394    "source": []
395   }
396  ],
397  "metadata": {
398   "kernelspec": {
399    "display_name": "Python 3 (ipykernel)",
400    "language": "python",
401    "name": "python3"
402   },
403   "language_info": {
404    "codemirror_mode": {
405     "name": "ipython",
406     "version": 3
407    },
408    "file_extension": ".py",
409    "mimetype": "text/x-python",
410    "name": "python",
411    "nbconvert_exporter": "python",
412    "pygments_lexer": "ipython3",
413    "version": "3.12.5"
414   }
415  },
416  "nbformat": 4,
417  "nbformat_minor": 5