cleanup commented code
[notebooks.git] / fmda / fmda_rnn_spatial.ipynb
blobcb1785b00aae82f2e19015deb709e6eeb7ac8aeb
2  "cells": [
3   {
4    "cell_type": "markdown",
5    "id": "83b774b3-ef55-480a-b999-506676e49145",
6    "metadata": {},
7    "source": [
8     "# v2.1 run RNN with Spatial Training\n",
9     "\n",
10     "This notebook sets up a test in which the RNN is run serially by location and compared to the spatial training scheme. Additionally, the ODE model with the augmented Kalman filter (KF) is run as a comparison. Note that the RNN models predict entirely without knowledge of the held-out locations, while the augmented KF is run directly on the test locations.\n"
11    ]
12   },
13   {
14    "cell_type": "markdown",
15    "id": "bbd84d61-a9cd-47b4-b538-4986fb10b98d",
16    "metadata": {},
17    "source": [
18     "## Environment Setup"
19    ]
20   },
21   {
22    "cell_type": "code",
23    "execution_count": null,
24    "id": "83cc1dc4-3dcb-4325-9263-58101a3dc378",
25    "metadata": {},
26    "outputs": [],
27    "source": [
28     "import numpy as np\n",
29     "from utils import print_dict_summary, print_first, str2time, logging_setup\n",
30     "import pickle\n",
31     "import logging\n",
32     "import os.path as osp\n",
33     "from moisture_rnn_pkl import pkl2train\n",
34     "from moisture_rnn import RNNParams, RNNData, RNN, rnn_data_wrap\n",
35     "from utils import hash2, read_yml, read_pkl, retrieve_url, Dict\n",
36     "from moisture_rnn import RNN\n",
37     "import reproducibility\n",
38     "from data_funcs import rmse, to_json, combine_nested, build_train_dict\n",
39     "from moisture_models import run_augmented_kf\n",
40     "import copy\n",
41     "import pandas as pd\n",
42     "import matplotlib.pyplot as plt\n",
43     "import yaml\n",
44     "import time"
45    ]
46   },
47   {
48    "cell_type": "code",
49    "execution_count": null,
50    "id": "17db9b90-a931-4674-a447-5b8ffbcdc86a",
51    "metadata": {},
52    "outputs": [],
53    "source": [
54     "logging_setup()"
55    ]
56   },
57   {
58    "cell_type": "code",
59    "execution_count": null,
60    "id": "35319c1c-7849-4b8c-8262-f5aa6656e0c7",
61    "metadata": {},
62    "outputs": [],
63    "source": [
64     "filename = \"fmda_rocky_202403-05_f05.pkl\"\n",
65     "retrieve_url(\n",
66     "    url = f\"https://demo.openwfm.org/web/data/fmda/dicts/{filename}\", \n",
67     "    dest_path = f\"data/{filename}\")"
68    ]
69   },
70   {
71    "cell_type": "code",
72    "execution_count": null,
73    "id": "eabdbd9c-07d9-4bae-9851-cca79f321895",
74    "metadata": {},
75    "outputs": [],
76    "source": [
77     "file_paths = [f\"data/{filename}\"]"
78    ]
79   },
80   {
81    "cell_type": "code",
82    "execution_count": null,
83    "id": "dcca6185-e799-4dd1-8acb-87ad33c411d7",
84    "metadata": {},
85    "outputs": [],
86    "source": [
87     "# read/write control\n",
88     "train_file='data/train.pkl'\n",
89     "train_create=True   # if false, read\n",
90     "train_write=False\n",
91     "train_read=False"
92    ]
93   },
94   {
95    "cell_type": "code",
96    "execution_count": null,
97    "id": "604388de-11ab-45c3-9f0d-80bdff0cca60",
98    "metadata": {},
99    "outputs": [],
100    "source": [
101     "# Params used for data filtering\n",
102     "params_data = read_yml(\"params_data.yaml\") \n",
103     "params_data"
104    ]
105   },
106   {
107    "cell_type": "code",
108    "execution_count": null,
109    "id": "211a1c2f-ba8d-40b8-b29c-daa38af97a26",
110    "metadata": {},
111    "outputs": [],
112    "source": [
113     "# Params used for setting up RNN\n",
114     "params = read_yml(\"params.yaml\", subkey='rnn') \n",
115     "params"
116    ]
117   },
118   {
119    "cell_type": "code",
120    "execution_count": null,
121    "id": "be81d76c-3123-4467-982b-d2da5b1c08bd",
122    "metadata": {
123     "scrolled": true
124    },
125    "outputs": [],
126    "source": [
127     "train = build_train_dict(file_paths, atm_source=\"HRRR\", params_data = params_data, spatial=False, verbose=True)"
128    ]
129   },
130   {
131    "cell_type": "code",
132    "execution_count": null,
133    "id": "bc0a775b-b587-42ef-8576-e36dc0be3a75",
134    "metadata": {
135     "scrolled": true
136    },
137    "outputs": [],
138    "source": [
139     "# if train_create:\n",
140     "#     logging.info('creating the training cases from files %s',file_paths)\n",
141     "#     # osp.join works on windows too, joins paths using \\ or /\n",
142     "#     train = process_train_dict(file_paths, atm_dict=\"HRRR\", params_data = params_data, verbose=True)\n",
143     "# if train_write:\n",
144     "#     with open(train_file, 'wb') as file:\n",
145     "#         logging.info('Writing the train cases into file %s',train_file)\n",
146     "#         pickle.dump(train, file)\n",
147     "# if train_read:\n",
148     "#     logging.info('Reading the train cases from file %s',train_file)\n",
149     "#     train = read_pkl(train_file)"
150    ]
151   },
152   {
153    "cell_type": "code",
154    "execution_count": null,
155    "id": "23cd60c0-9865-4314-9a96-948c3d400c08",
156    "metadata": {},
157    "outputs": [],
158    "source": [
159     "from itertools import islice\n",
160     "train = {k: train[k] for k in islice(train, 250)}"
161    ]
162   },
163   {
164    "cell_type": "markdown",
165    "id": "efc10cdc-f18b-4781-84da-b8e2eef39981",
166    "metadata": {},
167    "source": [
168     "## Setup Validation Runs"
169    ]
170   },
171   {
172    "cell_type": "markdown",
173    "id": "2d9cd5c5-87ed-41f9-b36c-e0c18d58c841",
174    "metadata": {},
175    "source": [
176     "The following parameters will be used for both serial and spatial models."
177    ]
178   },
179   {
180    "cell_type": "code",
181    "execution_count": null,
182    "id": "66f40c9f-c1c2-4b12-bf14-2ada8c26113d",
183    "metadata": {},
184    "outputs": [],
185    "source": [
186     "params = RNNParams(params)\n",
187     "params.update({'epochs': 200, \n",
188     "               'learning_rate': 0.001,\n",
189     "               'activation': ['tanh', 'tanh'], # Activation for RNN Layers, Dense layers respectively.\n",
190     "               'rnn_layers': 2, 'recurrent_units': 30, \n",
191     "               'dense_layers': 2, 'dense_units': 30,\n",
192     "               'early_stopping_patience': 30, # how many epochs of no validation accuracy gain to wait before stopping\n",
193     "               'batch_schedule_type': 'exp', # Hidden state batch reset schedule\n",
194     "               'bmin': 20, # Lower bound of hidden state batch reset, \n",
195     "               'bmax': params_data['hours'], # Upper bound of hidden state batch reset, using max hours\n",
196     "               'features_list': ['hod', 'Ed', 'Ew', 'rain', 'elev', 'lon', 'lat', 'solar', 'wind'],\n",
197     "               'timesteps': 12,\n",
198     "               'space_fracs': [.8, .1, .1]\n",
199     "              })"
200    ]
201   },
202   {
203    "cell_type": "code",
204    "execution_count": null,
205    "id": "36823193-b93c-421e-b699-8c1ae5719309",
206    "metadata": {},
207    "outputs": [],
208    "source": [
209     "reproducibility.set_seed(123)"
210    ]
211   },
212   {
213    "cell_type": "markdown",
214    "id": "a24d76fc-6c25-43e7-99df-3cd5dbf84fc3",
215    "metadata": {},
216    "source": [
217     "## Spatial Data Training\n",
218     "\n",
219     "This method combines the training timeseries data into a single 3-d array, with timeseries from the same location arranged in the correct order for a given `batch_size` hyperparameter. The hidden state of the recurrent layers is set up to reset when the location changes."
220    ]
221   },
222   {
223    "cell_type": "code",
224    "execution_count": null,
225    "id": "3b5371a9-c1e8-4df5-b360-210746f7cd52",
226    "metadata": {},
227    "outputs": [],
228    "source": [
229     "# Start timer for code \n",
230     "start_time = time.time()"
231    ]
232   },
233   {
234    "cell_type": "code",
235    "execution_count": null,
236    "id": "faf93470-b55f-4770-9fa9-3288a2f13fcc",
237    "metadata": {},
238    "outputs": [],
239    "source": [
240     "# Combine Nested Dictionary into Spatial Data\n",
241     "train_sp = Dict(combine_nested(train))"
242    ]
243   },
244   {
245    "cell_type": "code",
246    "execution_count": null,
247    "id": "af82c50e-bcc4-406d-b759-399119d1af81",
248    "metadata": {},
249    "outputs": [],
250    "source": [
251     "rnn_dat_sp = rnn_data_wrap(combine_nested(train), params)\n",
252     "params.update({\n",
253     "    'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch\n",
254     "})"
255    ]
256   },
257   {
258    "cell_type": "code",
259    "execution_count": null,
260    "id": "4bc11474-fed8-47f2-b9cf-dfdda0d3d3b2",
261    "metadata": {},
262    "outputs": [],
263    "source": [
264     "rnn_sp = RNN(params)\n",
265     "m_sp, errs = rnn_sp.run_model(rnn_dat_sp)"
266    ]
267   },
268   {
269    "cell_type": "code",
270    "execution_count": null,
271    "id": "704ad662-d81a-488d-be3d-e90bf775a5b8",
272    "metadata": {},
273    "outputs": [],
274    "source": [
275     "errs.mean()"
276    ]
277   },
278   {
279    "cell_type": "code",
280    "execution_count": null,
281    "id": "d53571e3-b6cf-49aa-9848-e3c77053283d",
282    "metadata": {},
283    "outputs": [],
284    "source": [
285     "# End Timer\n",
286     "end_time = time.time()\n",
287     "\n",
288     "# Calculate Code Runtime\n",
289     "elapsed_time_sp = end_time - start_time\n",
290     "print(f\"Spatial Training Elapsed time: {elapsed_time_sp:.4f} seconds\")"
291    ]
292   },
293   {
294    "cell_type": "markdown",
295    "id": "7d8292a2-418c-48ed-aff7-ccbe98b046d3",
296    "metadata": {},
297    "source": [
298     "## Run ODE + KF "
299    ]
300   },
301   {
302    "cell_type": "code",
303    "execution_count": null,
304    "id": "8eaa136b-b496-4543-8970-dac46cb88df8",
305    "metadata": {},
306    "outputs": [],
307    "source": [
308     "import importlib\n",
309     "import moisture_models\n",
310     "importlib.reload(moisture_models)\n",
311     "from moisture_models import run_augmented_kf"
312    ]
313   },
314   {
315    "cell_type": "code",
316    "execution_count": null,
317    "id": "cca12d8c-c0e1-4df4-b2ca-20440485f2f3",
318    "metadata": {},
319    "outputs": [],
320    "source": [
321     "# Get timeseries IDs from previous RNNData object\n",
322     "test_cases = rnn_dat_sp.loc['test_locs']\n",
323     "print(len(test_cases))"
324    ]
325   },
326   {
327    "cell_type": "code",
328    "execution_count": null,
329    "id": "997f2534-7e77-45b3-93bf-d988837dfc0b",
330    "metadata": {},
331    "outputs": [],
332    "source": [
333     "test_ind = rnn_dat_sp.test_ind # Time index for test period start\n",
334     "print(test_ind)"
335    ]
336   },
337   {
338    "cell_type": "code",
339    "execution_count": null,
340    "id": "1e4ffc68-c775-41c6-ac42-f49c76824b43",
341    "metadata": {
342     "scrolled": true
343    },
344    "outputs": [],
345    "source": [
346     "outputs_kf = {}\n",
347     "for case in test_cases:\n",
348     "    print(\"~\"*50)\n",
349     "    print(case)\n",
350     "    # Run Augmented KF\n",
351     "    print('Running Augmented KF')\n",
352     "    train[case]['h2'] = test_ind\n",
353     "    train[case]['scale_fm'] = 1\n",
354     "    m, Ec = run_augmented_kf(train[case])\n",
355     "    y = train[case]['y']        \n",
356     "    train[case]['m_kf'] = m\n",
357     "    print(f\"KF RMSE: {rmse(m[test_ind:],y[test_ind:])}\")\n",
358     "    outputs_kf[case] = {'case':case, 'errs': rmse(m[test_ind:],y[test_ind:])}"
359    ]
360   },
361   {
362    "cell_type": "code",
363    "execution_count": null,
364    "id": "57b19ec5-23f6-44ec-9f71-16d4d69aec68",
365    "metadata": {},
366    "outputs": [],
367    "source": [
368     "df_kf = pd.DataFrame.from_dict(outputs_kf).transpose()\n",
369     "df_kf.head()"
370    ]
371   },
372   {
373    "cell_type": "code",
374    "execution_count": null,
375    "id": "25a9d2fe-83f7-4ef3-a04b-14c970b6e2ba",
376    "metadata": {},
377    "outputs": [],
378    "source": [
379     "df_kf.errs.mean()"
380    ]
381   },
382   {
383    "cell_type": "markdown",
384    "id": "86795281-f8ea-4141-81ea-c53fae830e80",
385    "metadata": {},
386    "source": [
387     "## Compare"
388    ]
389   },
390   {
391    "cell_type": "code",
392    "execution_count": null,
393    "id": "508a6392-49bc-4471-ad8e-814f60119283",
394    "metadata": {},
395    "outputs": [],
396    "source": [
397     "print(f\"Total Test Cases: {len(test_cases)}\")\n",
398     "print(f\"Total Test Hours: {rnn_dat_sp.y_test.shape[0]}\")"
399    ]
400   },
401   {
402    "cell_type": "code",
403    "execution_count": null,
404    "id": "73e8ca05-d17b-4e72-8def-fa77664e7bb0",
405    "metadata": {},
406    "outputs": [],
407    "source": [
408     "print(f\"Spatial Training RMSE: {errs.mean()}\")\n",
409     "print(f\"Augmented KF RMSE: {df_kf.errs.mean()}\")"
410    ]
411   },
412   {
413    "cell_type": "code",
414    "execution_count": null,
415    "id": "a73d22ee-707b-44a3-80ab-ad6e671731cf",
416    "metadata": {},
417    "outputs": [],
418    "source": []
419   },
420   {
421    "cell_type": "code",
422    "execution_count": null,
423    "id": "272bfb32-e8e2-49dd-8f90-4b5b09c3a2a2",
424    "metadata": {},
425    "outputs": [],
426    "source": [
427     "print(f\"Spatial Training Elapsed time: {elapsed_time_sp:.4f} seconds\")"
428    ]
429   },
430   {
431    "cell_type": "code",
432    "execution_count": null,
433    "id": "38ab08fb-ac97-45be-8907-6f9cd124243b",
434    "metadata": {},
435    "outputs": [],
436    "source": []
437   }
438  ],
439  "metadata": {
440   "kernelspec": {
441    "display_name": "Python 3 (ipykernel)",
442    "language": "python",
443    "name": "python3"
444   },
445   "language_info": {
446    "codemirror_mode": {
447     "name": "ipython",
448     "version": 3
449    },
450    "file_extension": ".py",
451    "mimetype": "text/x-python",
452    "name": "python",
453    "nbconvert_exporter": "python",
454    "pygments_lexer": "ipython3",
455    "version": "3.12.5"
456   }
457  },
458  "nbformat": 4,
459  "nbformat_minor": 5