5 "execution_count": null,
6 "id": "9ddd1d89-abdb-4627-a0ca-23db006b62f4",
12 "import os.path as osp\n",
13 "import subprocess\n",
14 "from datetime import timedelta\n",
15 "from urllib.parse import urlparse\n",
16 "import numpy as np\n",
17 "import matplotlib.pyplot as plt\n",
18 "from utils import time_intp, str2time, filter_nan_values, read_pkl, read_yml, retrieve_url"
22 "cell_type": "markdown",
23 "id": "609ea544-ed92-40a6-892b-1943e9f6f620",
31 "execution_count": null,
32 "id": "41b0d403-7d6b-44f4-963f-8dc492ae0126",
36 "retrieve_url(\"https://demo.openwfm.org/web/data/fmda/dicts/fmda_nw_202401-05_f05.pkl\", \"data/fmda_nw_202401-05_f05.pkl\")"
41 "execution_count": null,
42 "id": "e69e37b9-73ef-45a1-9738-844f26dc3323",
46 "data_params = read_yml(\"params_data.yaml\")\n",
52 "execution_count": null,
53 "id": "6b5c3c82-84ba-426c-b8d9-f540b5026158",
57 "# dat = read_pkl(\"data/test_CA_202401.pkl\")\n",
58 "dat = read_pkl(\"data/fmda_nw_202401-05_f05.pkl\")"
62 "cell_type": "markdown",
63 "id": "00c9c7da-e036-48d1-9822-6eb54fda6eff",
68 "Any filters applied after interpolation."
73 "execution_count": null,
74 "id": "442c9661-c21e-457f-af32-e892dd0920a8",
80 "cases = list([*dat.keys()])\n",
81 "flags = np.zeros(len(cases))\n",
82 "for i, case in enumerate(cases):\n",
83 " time_raws=str2time(dat[case]['RAWS']['time_raws'])\n",
84 " time_hrrr=str2time(dat[case][\"HRRR\"]['time'])\n",
85 " fm = dat[case]['RAWS']['fm']\n",
86 " ynew = time_intp(time_raws,fm,time_hrrr)\n",
87 " dat[case]['y'] = ynew"
91 "cell_type": "markdown",
92 "id": "20d6de99-9ad9-4ba4-8d87-05a6f962f0c0",
95 "## Divide into Hours"
100 "execution_count": null,
101 "id": "c0211477-9a31-4be1-a7bd-9275f427115d",
107 "hours = 720 # 1 month\n",
108 "cases = list([*dat.keys()])\n",
109 "flags = np.zeros(len(cases))\n",
111 "for key, data in dat.items():\n",
113 " time_hrrr=str2time(data[\"HRRR\"]['time'])\n",
114 " X_array = data['HRRR']['f01']['temp']\n",
115 " y_array = data['y']\n",
117 " # Determine the start and end time for the 720-hour portions\n",
118 " start_time = time_hrrr[0]\n",
119 " end_time = time_hrrr[-1]\n",
120 " current_time = start_time\n",
121 " portion_index = 1\n",
122 " while current_time < end_time:\n",
123 " next_time = current_time + timedelta(hours=720)\n",
125 " # Create a mask for the time range\n",
126 " mask = (time_hrrr >= current_time) & (time_hrrr < next_time)\n",
128 " # Apply the mask to extract the portions\n",
129 " new_time = time_hrrr[mask]\n",
130 " new_X = X_array[mask]\n",
131 " new_y = y_array[mask]\n",
133 " # Save the portions in the new dictionary\n",
134 " new_key = f\"{key}_portion_{portion_index}\"\n",
135 " dat2[new_key] = {'time': new_time, 'X': new_X, 'y': new_y}\n",
137 " # Move to the next portion\n",
138 " current_time = next_time\n",
139 " portion_index += 1 "
144 "execution_count": null,
145 "id": "7be0f469-0733-4719-96b7-5b2d2a5b2657",
155 "cell_type": "markdown",
156 "id": "dae0e47b-02eb-4759-9b95-3cc1b281d41e",
164 "execution_count": null,
165 "id": "7b6b4347-6abe-4c21-8318-06a766d67d21",
170 " # NV040_202401: more raws observations than HRRR, interp should shorten\n",
171 " # NV026_202401: raws 10min obs, interp should shorten\n",
172 " # CGVC1_202401: missing only a few observations, interp should lengthen\n",
173 " # YNWC1_202401: only 2 observations, should be filtered entirely"
178 "execution_count": null,
179 "id": "fc3fbda3-5e93-4122-9278-4b95ec69d25f",
183 "def flag_lag_stretches(x, lag = 1, threshold = data_params['zero_lag_threshold']):\n",
184 " lags = np.round(np.diff(x, n=lag), 8)\n",
185 " zero_lag_indices = np.where(lags == 0)[0]\n",
186 " current_run_length = 1\n",
187 " for i in range(1, len(zero_lag_indices)):\n",
188 " if zero_lag_indices[i] == zero_lag_indices[i-1] + 1:\n",
189 " current_run_length += 1\n",
190 " if current_run_length > threshold:\n",
193 " current_run_length = 1\n",
200 "execution_count": null,
201 "id": "67689bfe-3971-495f-95ef-0d52f3c7c3b5",
207 "cases = list([*dat.keys()])\n",
208 "flags = np.zeros(len(cases))\n",
209 "data_params['max_intp_time'] = 48\n",
210 "data_params['zero_lag_threshold'] = 48\n",
211 "for i, case in enumerate(cases):\n",
212 " print(\"~\"*50)\n",
213 " print(f\"Case: {case}\")\n",
214 " time_raws=str2time(dat[case]['RAWS']['time_raws'])\n",
215 " time_hrrr=str2time(dat[case][\"HRRR\"]['time'])\n",
216 " fm = dat[case]['RAWS']['fm']\n",
217 " ynew = time_intp(time_raws,fm,time_hrrr)\n",
218 " dat[case]['y'] = ynew\n",
219 " if flag_lag_stretches(ynew):\n",
220 " print(f\"Flagging case {case} for zero lag stretches greater than `zero_lag_threshold` param {data_params['zero_lag_threshold']}\")\n",
222 " if flag_lag_stretches(ynew, lag=2):\n",
223 " print(f\"Flagging case {case} for constant linear stretches greater than `max_intp_time` param {data_params['max_intp_time']}\")\n",
225 " if np.any(ynew>=data_params['max_fm']) or np.any(ynew<=data_params['min_fm']):\n",
226 " print(f\"Flagging case {case} for FMC outside param range {data_params['min_fm'],data_params['max_fm']}. FMC range for {case}: {ynew.min(),ynew.max()}\")\n",
232 "execution_count": null,
233 "id": "246272bf-2f2e-4bab-97e2-b9d7f946618a",
237 "flagged_cases = [element for element, flag in zip(cases, flags) if flag == 1]\n",
238 "print(flagged_cases)"
243 "execution_count": null,
244 "id": "bc28bd0a-1673-4414-bbc6-31baf55618ae",
253 "execution_count": null,
254 "id": "1f97877c-89c9-49c1-a141-dac7ee2ea1a1",
263 "execution_count": null,
264 "id": "1e76d282-a8ad-4e65-9363-d3f64739686e",
273 "execution_count": null,
274 "id": "02ea706a-5974-4f89-8644-0488256727ae",
278 "# Try partitioned dict\n",
284 "execution_count": null,
285 "id": "d89c7afe-8c70-4062-9229-83261431c04c",
289 "def discard_keys_with_short_y(input_dict):\n",
290 " filtered_dict = {key: value for key, value in input_dict.items() if len(value['y']) >= 720}\n",
291 " return filtered_dict\n",
293 "# Discard shorter stretches\n",
294 "dat2 = discard_keys_with_short_y(dat2)"
299 "execution_count": null,
300 "id": "92135427-099e-49a1-bab7-286ac7995a7b",
309 "execution_count": null,
310 "id": "0f703c8a-a26b-444b-8b4f-54185e19fdf8",
316 "cases = list([*dat2.keys()])\n",
317 "flags = np.zeros(len(cases))\n",
318 "data_params['max_intp_time'] = 8\n",
319 "data_params['zero_lag_threshold'] = 8\n",
320 "for i, case in enumerate(cases):\n",
321 " print(\"~\"*50)\n",
322 " print(f\"Case: {case}\")\n",
323 " ynew = dat2[case]['y']\n",
324 " if flag_lag_stretches(ynew):\n",
325 " print(f\"Flagging case {case} for zero lag stretches greater than `zero_lag_threshold` param {data_params['zero_lag_threshold']}\")\n",
327 " if flag_lag_stretches(ynew, lag=2):\n",
328 " print(f\"Flagging case {case} for constant linear stretches greater than `max_intp_time` param {data_params['max_intp_time']}\")\n",
330 " if np.any(ynew>=data_params['max_fm']) or np.any(ynew<=data_params['min_fm']):\n",
331 " print(f\"Flagging case {case} for FMC outside param range {data_params['min_fm'],data_params['max_fm']}. FMC range for {case}: {ynew.min(),ynew.max()}\")\n",
337 "execution_count": null,
338 "id": "04a22d48-2ef1-46b4-ab2b-333b240c799f",
347 "execution_count": null,
348 "id": "16f30816-3f94-4238-a0f2-da69632415ba",
352 "flagged_cases = [element for element, flag in zip(cases, flags) if flag == 1]\n",
358 "execution_count": null,
359 "id": "af172dc4-0f92-4996-9c41-4b09b28ca738",
368 "execution_count": null,
369 "id": "d16498c5-040b-4d77-a36d-80e4c1d2d167",
376 "execution_count": null,
377 "id": "106056cc-1ea7-4385-af3e-8e510483c445",
385 "display_name": "Python 3 (ipykernel)",
386 "language": "python",
394 "file_extension": ".py",
395 "mimetype": "text/x-python",
397 "nbconvert_exporter": "python",
398 "pygments_lexer": "ipython3",