Update read_and_clean_tutorial.ipynb
[notebooks.git] / fmda / read_and_clean_tutorial.ipynb
blob6fd4b883062f26d6248540c532cab71f2863a873
2  "cells": [
3   {
4    "cell_type": "code",
5    "execution_count": null,
6    "id": "9ddd1d89-abdb-4627-a0ca-23db006b62f4",
7    "metadata": {},
8    "outputs": [],
9    "source": [
10     "import yaml\n",
11     "import pickle\n",
12     "import os.path as osp\n",
13     "import subprocess\n",
14     "from datetime import timedelta\n",
15     "from urllib.parse import urlparse\n",
16     "import numpy as np\n",
17     "import matplotlib.pyplot as plt\n",
18     "from utils import time_intp, str2time, filter_nan_values, read_pkl, read_yml, retrieve_url"
19    ]
20   },
21   {
22    "cell_type": "markdown",
23    "id": "609ea544-ed92-40a6-892b-1943e9f6f620",
24    "metadata": {},
25    "source": [
26     "## Setup"
27    ]
28   },
29   {
30    "cell_type": "code",
31    "execution_count": null,
32    "id": "41b0d403-7d6b-44f4-963f-8dc492ae0126",
33    "metadata": {},
34    "outputs": [],
35    "source": [
36     "retrieve_url(\"https://demo.openwfm.org/web/data/fmda/dicts/fmda_nw_202401-05_f05.pkl\", \"data/fmda_nw_202401-05_f05.pkl\")"
37    ]
38   },
39   {
40    "cell_type": "code",
41    "execution_count": null,
42    "id": "e69e37b9-73ef-45a1-9738-844f26dc3323",
43    "metadata": {},
44    "outputs": [],
45    "source": [
46     "data_params = read_yml(\"params_data.yaml\")\n",
47     "data_params"
48    ]
49   },
50   {
51    "cell_type": "code",
52    "execution_count": null,
53    "id": "6b5c3c82-84ba-426c-b8d9-f540b5026158",
54    "metadata": {},
55    "outputs": [],
56    "source": [
57     "# dat = read_pkl(\"data/test_CA_202401.pkl\")\n",
58     "dat = read_pkl(\"data/fmda_nw_202401-05_f05.pkl\")"
59    ]
60   },
61   {
62    "cell_type": "markdown",
63    "id": "00c9c7da-e036-48d1-9822-6eb54fda6eff",
64    "metadata": {},
65    "source": [
66     "## Interp\n",
67     "\n",
68     "Any filters applied after interpolation."
69    ]
70   },
71   {
72    "cell_type": "code",
73    "execution_count": null,
74    "id": "442c9661-c21e-457f-af32-e892dd0920a8",
75    "metadata": {
76     "scrolled": true
77    },
78    "outputs": [],
79    "source": [
80     "cases = list([*dat.keys()])\n",
81     "flags = np.zeros(len(cases))\n",
82     "for i, case in enumerate(cases):\n",
83     "    time_raws=str2time(dat[case]['RAWS']['time_raws'])\n",
84     "    time_hrrr=str2time(dat[case][\"HRRR\"]['time'])\n",
85     "    fm = dat[case]['RAWS']['fm']\n",
86     "    ynew = time_intp(time_raws,fm,time_hrrr)\n",
87     "    dat[case]['y'] = ynew"
88    ]
89   },
90   {
91    "cell_type": "markdown",
92    "id": "20d6de99-9ad9-4ba4-8d87-05a6f962f0c0",
93    "metadata": {},
94    "source": [
95     "## Divide into Hours"
96    ]
97   },
98   {
99    "cell_type": "code",
100    "execution_count": null,
101    "id": "c0211477-9a31-4be1-a7bd-9275f427115d",
102    "metadata": {
103     "scrolled": true
104    },
105    "outputs": [],
106    "source": [
107     "hours = 720 # 1 month\n",
108     "cases = list([*dat.keys()])\n",
109     "flags = np.zeros(len(cases))\n",
110     "dat2={}\n",
111     "for key, data in dat.items():\n",
112     "    print(key)\n",
113     "    time_hrrr=str2time(data[\"HRRR\"]['time'])\n",
114     "    X_array = data['HRRR']['f01']['temp']\n",
115     "    y_array = data['y']\n",
116     "    \n",
117     "    # Determine the start and end time for the 720-hour portions\n",
118     "    start_time = time_hrrr[0]\n",
119     "    end_time = time_hrrr[-1]\n",
120     "    current_time = start_time\n",
121     "    portion_index = 1\n",
122     "    while current_time < end_time:\n",
123     "        next_time = current_time + timedelta(hours=720)\n",
124     "        \n",
125     "        # Create a mask for the time range\n",
126     "        mask = (time_hrrr >= current_time) & (time_hrrr < next_time)\n",
127     "        \n",
128     "        # Apply the mask to extract the portions\n",
129     "        new_time = time_hrrr[mask]\n",
130     "        new_X = X_array[mask]\n",
131     "        new_y = y_array[mask]\n",
132     "        \n",
133     "        # Save the portions in the new dictionary\n",
134     "        new_key = f\"{key}_portion_{portion_index}\"\n",
135     "        dat2[new_key] = {'time': new_time, 'X': new_X, 'y': new_y}\n",
136     "        \n",
137     "        # Move to the next portion\n",
138     "        current_time = next_time\n",
139     "        portion_index += 1    "
140    ]
141   },
142   {
143    "cell_type": "code",
144    "execution_count": null,
145    "id": "7be0f469-0733-4719-96b7-5b2d2a5b2657",
146    "metadata": {
147     "scrolled": true
148    },
149    "outputs": [],
150    "source": [
151     "dat2.keys()"
152    ]
153   },
154   {
155    "cell_type": "markdown",
156    "id": "dae0e47b-02eb-4759-9b95-3cc1b281d41e",
157    "metadata": {},
158    "source": [
159     "## Filters"
160    ]
161   },
162   {
163    "cell_type": "code",
164    "execution_count": null,
165    "id": "7b6b4347-6abe-4c21-8318-06a766d67d21",
166    "metadata": {},
167    "outputs": [],
168    "source": [
169     "# Useful Cases:\n",
170     "    # NV040_202401: more raws observations than HRRR, interp should shorten\n",
171     "    # NV026_202401: raws 10min obs, interp should shorten\n",
172     "    # CGVC1_202401: missing only a few observations, interp should lengthen\n",
173     "    # YNWC1_202401: only 2 observations, should be filtered entirely"
174    ]
175   },
176   {
177    "cell_type": "code",
178    "execution_count": null,
179    "id": "fc3fbda3-5e93-4122-9278-4b95ec69d25f",
180    "metadata": {},
181    "outputs": [],
182    "source": [
183     "def flag_lag_stretches(x, lag = 1, threshold = data_params['zero_lag_threshold']):\n",
184     "    lags = np.round(np.diff(x, n=lag), 8)\n",
185     "    zero_lag_indices = np.where(lags == 0)[0]\n",
186     "    current_run_length = 1\n",
187     "    for i in range(1, len(zero_lag_indices)):\n",
188     "        if zero_lag_indices[i] == zero_lag_indices[i-1] + 1:\n",
189     "            current_run_length += 1\n",
190     "            if current_run_length > threshold:\n",
191     "                return True\n",
192     "        else:\n",
193     "            current_run_length = 1\n",
194     "    else:\n",
195     "        return False    "
196    ]
197   },
198   {
199    "cell_type": "code",
200    "execution_count": null,
201    "id": "67689bfe-3971-495f-95ef-0d52f3c7c3b5",
202    "metadata": {
203     "scrolled": true
204    },
205    "outputs": [],
206    "source": [
207     "cases = list([*dat.keys()])\n",
208     "flags = np.zeros(len(cases))\n",
209     "data_params['max_intp_time'] = 48\n",
210     "data_params['zero_lag_threshold'] = 48\n",
211     "for i, case in enumerate(cases):\n",
212     "    print(\"~\"*50)\n",
213     "    print(f\"Case: {case}\")\n",
214     "    time_raws=str2time(dat[case]['RAWS']['time_raws'])\n",
215     "    time_hrrr=str2time(dat[case][\"HRRR\"]['time'])\n",
216     "    fm = dat[case]['RAWS']['fm']\n",
217     "    ynew = time_intp(time_raws,fm,time_hrrr)\n",
218     "    dat[case]['y'] = ynew\n",
219     "    if flag_lag_stretches(ynew):\n",
220     "        print(f\"Flagging case {case} for zero lag stretches greater than `zero_lag_threshold` param {data_params['zero_lag_threshold']}\")\n",
221     "        flags[i]=1\n",
222     "    if flag_lag_stretches(ynew, lag=2):\n",
223     "        print(f\"Flagging case {case} for constant linear stretches greater than `max_intp_time` param {data_params['max_intp_time']}\")\n",
224     "        flags[i]=1\n",
225     "    if np.any(ynew>=data_params['max_fm']) or np.any(ynew<=data_params['min_fm']):\n",
226     "        print(f\"Flagging case {case} for FMC outside param range {data_params['min_fm'],data_params['max_fm']}. FMC range for {case}: {ynew.min(),ynew.max()}\")\n",
227     "        flags[i]=1"
228    ]
229   },
230   {
231    "cell_type": "code",
232    "execution_count": null,
233    "id": "246272bf-2f2e-4bab-97e2-b9d7f946618a",
234    "metadata": {},
235    "outputs": [],
236    "source": [
237     "flagged_cases = [element for element, flag in zip(cases, flags) if flag == 1]\n",
238     "print(flagged_cases)"
239    ]
240   },
241   {
242    "cell_type": "code",
243    "execution_count": null,
244    "id": "bc28bd0a-1673-4414-bbc6-31baf55618ae",
245    "metadata": {},
246    "outputs": [],
247    "source": [
248     "len(flagged_cases)"
249    ]
250   },
251   {
252    "cell_type": "code",
253    "execution_count": null,
254    "id": "1f97877c-89c9-49c1-a141-dac7ee2ea1a1",
255    "metadata": {},
256    "outputs": [],
257    "source": [
258     "len(cases)"
259    ]
260   },
261   {
262    "cell_type": "code",
263    "execution_count": null,
264    "id": "1e76d282-a8ad-4e65-9363-d3f64739686e",
265    "metadata": {},
266    "outputs": [],
267    "source": [
268     "168 / 235"
269    ]
270   },
271   {
272    "cell_type": "code",
273    "execution_count": null,
274    "id": "02ea706a-5974-4f89-8644-0488256727ae",
275    "metadata": {},
276    "outputs": [],
277    "source": [
278     "# Try partitioned dict\n",
279     "len(dat2.keys())"
280    ]
281   },
282   {
283    "cell_type": "code",
284    "execution_count": null,
285    "id": "d89c7afe-8c70-4062-9229-83261431c04c",
286    "metadata": {},
287    "outputs": [],
288    "source": [
289     "def discard_keys_with_short_y(input_dict):\n",
290     "    filtered_dict = {key: value for key, value in input_dict.items() if len(value['y']) >= 720}\n",
291     "    return filtered_dict\n",
292     "\n",
293     "# Discard shorter stretches\n",
294     "dat2 = discard_keys_with_short_y(dat2)"
295    ]
296   },
297   {
298    "cell_type": "code",
299    "execution_count": null,
300    "id": "92135427-099e-49a1-bab7-286ac7995a7b",
301    "metadata": {},
302    "outputs": [],
303    "source": [
304     "len(dat2.keys())"
305    ]
306   },
307   {
308    "cell_type": "code",
309    "execution_count": null,
310    "id": "0f703c8a-a26b-444b-8b4f-54185e19fdf8",
311    "metadata": {
312     "scrolled": true
313    },
314    "outputs": [],
315    "source": [
316     "cases = list([*dat2.keys()])\n",
317     "flags = np.zeros(len(cases))\n",
318     "data_params['max_intp_time'] = 8\n",
319     "data_params['zero_lag_threshold'] = 8\n",
320     "for i, case in enumerate(cases):\n",
321     "    print(\"~\"*50)\n",
322     "    print(f\"Case: {case}\")\n",
323     "    ynew = dat2[case]['y']\n",
324     "    if flag_lag_stretches(ynew):\n",
325     "        print(f\"Flagging case {case} for zero lag stretches greater than `zero_lag_threshold` param {data_params['zero_lag_threshold']}\")\n",
326     "        flags[i]=1\n",
327     "    if flag_lag_stretches(ynew, lag=2):\n",
328     "        print(f\"Flagging case {case} for constant linear stretches greater than `max_intp_time` param {data_params['max_intp_time']}\")\n",
329     "        flags[i]=1\n",
330     "    if np.any(ynew>=data_params['max_fm']) or np.any(ynew<=data_params['min_fm']):\n",
331     "        print(f\"Flagging case {case} for FMC outside param range {data_params['min_fm'],data_params['max_fm']}. FMC range for {case}: {ynew.min(),ynew.max()}\")\n",
332     "        flags[i]=1"
333    ]
334   },
335   {
336    "cell_type": "code",
337    "execution_count": null,
338    "id": "04a22d48-2ef1-46b4-ab2b-333b240c799f",
339    "metadata": {},
340    "outputs": [],
341    "source": [
342     "len(dat2.keys())"
343    ]
344   },
345   {
346    "cell_type": "code",
347    "execution_count": null,
348    "id": "16f30816-3f94-4238-a0f2-da69632415ba",
349    "metadata": {},
350    "outputs": [],
351    "source": [
352     "flagged_cases = [element for element, flag in zip(cases, flags) if flag == 1]\n",
353     "len(flagged_cases)"
354    ]
355   },
356   {
357    "cell_type": "code",
358    "execution_count": null,
359    "id": "af172dc4-0f92-4996-9c41-4b09b28ca738",
360    "metadata": {},
361    "outputs": [],
362    "source": [
363     "477 / 1175"
364    ]
365   },
366   {
367    "cell_type": "code",
368    "execution_count": null,
369    "id": "d16498c5-040b-4d77-a36d-80e4c1d2d167",
370    "metadata": {},
371    "outputs": [],
372    "source": []
373   },
374   {
375    "cell_type": "code",
376    "execution_count": null,
377    "id": "106056cc-1ea7-4385-af3e-8e510483c445",
378    "metadata": {},
379    "outputs": [],
380    "source": []
381   }
382  ],
383  "metadata": {
384   "kernelspec": {
385    "display_name": "Python 3 (ipykernel)",
386    "language": "python",
387    "name": "python3"
388   },
389   "language_info": {
390    "codemirror_mode": {
391     "name": "ipython",
392     "version": 3
393    },
394    "file_extension": ".py",
395    "mimetype": "text/x-python",
396    "name": "python",
397    "nbconvert_exporter": "python",
398    "pygments_lexer": "ipython3",
399    "version": "3.12.5"
400   }
401  },
402  "nbformat": 4,
403  "nbformat_minor": 5