Update data cleaning tutorial and params
[notebooks.git] / fmda / read_and_clean_tutorial.ipynb
blob9a9affd941058071c1966e6f22b671506899f0a2
2  "cells": [
3   {
4    "cell_type": "code",
5    "execution_count": null,
6    "id": "9ddd1d89-abdb-4627-a0ca-23db006b62f4",
7    "metadata": {},
8    "outputs": [],
9    "source": [
10     "import yaml\n",
11     "import pickle\n",
12     "import os.path as osp\n",
13     "import subprocess\n",
14     "from urllib.parse import urlparse\n",
15     "import numpy as np\n",
16     "import matplotlib.pyplot as plt\n",
17     "from utils import time_intp, str2time, filter_nan_values, read_pkl, read_yml, retrieve_url"
18    ]
19   },
20   {
21    "cell_type": "markdown",
22    "id": "609ea544-ed92-40a6-892b-1943e9f6f620",
23    "metadata": {},
24    "source": [
25     "## Setup"
26    ]
27   },
28   {
29    "cell_type": "code",
30    "execution_count": null,
31    "id": "41b0d403-7d6b-44f4-963f-8dc492ae0126",
32    "metadata": {},
33    "outputs": [],
34    "source": [
35     "retrieve_url(\"https://demo.openwfm.org/web/data/fmda/dicts/fmda_nw_202401-05_f05.pkl\", \"data/fmda_nw_202401-05_f05.pkl\")  # presumably fetches the URL (1st arg) into the local path (2nd arg) - confirm in utils.retrieve_url"
36    ]
37   },
38   {
39    "cell_type": "code",
40    "execution_count": null,
41    "id": "e69e37b9-73ef-45a1-9738-844f26dc3323",
42    "metadata": {},
43    "outputs": [],
44    "source": [
45     "data_params = read_yml(\"params_data.yaml\")  # the filter cells below read zero_lag_threshold, max_intp_time, min_fm and max_fm from this\n",
46     "data_params  # bare last expression, so the notebook displays the loaded parameters"
47    ]
48   },
49   {
50    "cell_type": "code",
51    "execution_count": null,
52    "id": "6b5c3c82-84ba-426c-b8d9-f540b5026158",
53    "metadata": {},
54    "outputs": [],
55    "source": [
56     "# dat = read_pkl(\"data/test_CA_202401.pkl\")\n",
57     "dat = read_pkl(\"data/test_NW_202401.pkl\")  # NOTE(review): the Setup cell retrieves data/fmda_nw_202401-05_f05.pkl, not this file - confirm this file exists locally"
58    ]
59   },
60   {
61    "cell_type": "markdown",
62    "id": "dae0e47b-02eb-4759-9b95-3cc1b281d41e",
63    "metadata": {},
64    "source": [
65     "## Filters"
66    ]
67   },
68   {
69    "cell_type": "code",
70    "execution_count": null,
71    "id": "7b6b4347-6abe-4c21-8318-06a766d67d21",
72    "metadata": {},
73    "outputs": [],
74    "source": [
75     "# Useful cases for checking the filters:\n",
76     "    # NV040_202401: more RAWS observations than HRRR times, so interpolation should shorten the series\n",
77     "    # NV026_202401: RAWS observations every 10 minutes, so interpolation should shorten the series\n",
78     "    # CGVC1_202401: only a few observations missing, so interpolation should lengthen the series\n",
79     "    # YNWC1_202401: only 2 observations, so the case should be filtered out entirely"
80    ]
81   },
82   {
83    "cell_type": "code",
84    "execution_count": null,
85    "id": "fc3fbda3-5e93-4122-9278-4b95ec69d25f",
86    "metadata": {},
87    "outputs": [],
88    "source": [
89     "def flag_lag_stretches(x, lag = 1, threshold = None):\n",
90     "    \"\"\"Return True if `x` has more than `threshold` consecutive zero differences, otherwise False.\n",
91     "    x: 1-D numeric sequence. lag: order `n` handed to np.diff (1 finds repeated values, 2 finds straight-line\n",
92     "    stretches). threshold: longest run allowed; None reads data_params['zero_lag_threshold'] at call time.\n",
93     "    \"\"\"\n",
94     "    if threshold is None:\n",
95     "        threshold = data_params['zero_lag_threshold']  # looked up per call, so a reloaded data_params is honoured\n",
96     "    run_length = 0\n",
97     "    for is_zero in (np.diff(x, n=lag) == 0):\n",
98     "        run_length = run_length + 1 if is_zero else 0  # any non-zero difference ends the current run\n",
99     "        if run_length > threshold:  # compared on every zero, so a run of length one is checked too\n",
100     "            return True\n",
101     "    return False"
102    ]
103   },
104   {
105    "cell_type": "code",
106    "execution_count": null,
107    "id": "67689bfe-3971-495f-95ef-0d52f3c7c3b5",
108    "metadata": {},
109    "outputs": [],
110    "source": [
111     "cases = list(dat.keys())  # filter pass: build `y` for every case and flag the suspect ones\n",
112     "flags = np.zeros(len(cases))  # flags[i] is set to 1 when cases[i] trips any check below\n",
113     "for i, case in enumerate(cases):\n",
114     "    print(\"~\"*50)\n",
115     "    print(f\"Case: {case}\")\n",
116     "    time_raws=str2time(dat[case]['RAWS']['time_raws'])\n",
117     "    time_hrrr=str2time(dat[case][\"HRRR\"]['time'])\n",
118     "    fm = dat[case]['RAWS']['fm']\n",
119     "    ynew = time_intp(time_raws,fm,time_hrrr)  # presumably resamples the RAWS fm series onto the HRRR times - confirm in utils.time_intp\n",
120     "    dat[case]['y'] = ynew\n",
121     "    if flag_lag_stretches(ynew, threshold=data_params['zero_lag_threshold']):  # runs of identical values\n",
122     "        print(f\"Flagging case {case} for zero lag stretches greater than `zero_lag_threshold` param {data_params['zero_lag_threshold']}\")\n",
123     "        flags[i]=1\n",
124     "    if flag_lag_stretches(ynew, lag=2, threshold=data_params['max_intp_time']):  # use the threshold the message reports, not the zero-lag default\n",
125     "        print(f\"Flagging case {case} for constant linear stretches greater than `max_intp_time` param {data_params['max_intp_time']}\")\n",
126     "        flags[i]=1\n",
127     "    if np.any(ynew>=data_params['max_fm']) or np.any(ynew<=data_params['min_fm']):  # bounds are inclusive: a value equal to a limit is flagged\n",
128     "        print(f\"Flagging case {case} for FMC outside param range {data_params['min_fm'],data_params['max_fm']}. FMC range for {case}: {ynew.min(),ynew.max()}\")\n",
129     "        flags[i]=1"
130    ]
131   },
132   {
133    "cell_type": "code",
134    "execution_count": null,
135    "id": "246272bf-2f2e-4bab-97e2-b9d7f946618a",
136    "metadata": {},
137    "outputs": [],
138    "source": [
139     "flagged_cases = [name for name, mark in zip(cases, flags) if mark == 1]  # keep the case keys whose flag is 1\n",
140     "print(flagged_cases)"
141    ]
142   },
143   {
144    "cell_type": "code",
145    "execution_count": null,
146    "id": "bc28bd0a-1673-4414-bbc6-31baf55618ae",
147    "metadata": {},
148    "outputs": [],
149    "source": []
150   },
151   {
152    "cell_type": "code",
153    "execution_count": null,
154    "id": "1f97877c-89c9-49c1-a141-dac7ee2ea1a1",
155    "metadata": {},
156    "outputs": [],
157    "source": []
158   },
159   {
160    "cell_type": "code",
161    "execution_count": null,
162    "id": "04a22d48-2ef1-46b4-ab2b-333b240c799f",
163    "metadata": {},
164    "outputs": [],
165    "source": []
166   },
167   {
168    "cell_type": "code",
169    "execution_count": null,
170    "id": "16f30816-3f94-4238-a0f2-da69632415ba",
171    "metadata": {},
172    "outputs": [],
173    "source": []
174   }
175  ],
176  "metadata": {
177   "kernelspec": {
178    "display_name": "Python 3 (ipykernel)",
179    "language": "python",
180    "name": "python3"
181   },
182   "language_info": {
183    "codemirror_mode": {
184     "name": "ipython",
185     "version": 3
186    },
187    "file_extension": ".py",
188    "mimetype": "text/x-python",
189    "name": "python",
190    "nbconvert_exporter": "python",
191    "pygments_lexer": "ipython3",
192    "version": "3.12.5"
193   }
194  },
195  "nbformat": 4,
196  "nbformat_minor": 5