fmda/read_and_clean_tutorial.ipynb

   1 {
   2  "cells": [
   3   {
   4    "cell_type": "code",
   5    "execution_count": null,
   6    "id": "9ddd1d89-abdb-4627-a0ca-23db006b62f4",
   7    "metadata": {},
   8    "outputs": [],
   9    "source": [
  10     "import yaml\n",
  11     "import pickle\n",
  12     "import os.path as osp\n",
  13     "import subprocess\n",
  14     "from urllib.parse import urlparse\n",
  15     "import numpy as np\n",
  16     "import matplotlib.pyplot as plt\n",
  17     "from utils import time_intp, str2time, filter_nan_values, read_pkl, read_yml, retrieve_url"
  18    ]
  19   },
  20   {
  21    "cell_type": "markdown",
  22    "id": "609ea544-ed92-40a6-892b-1943e9f6f620",
  23    "metadata": {},
  24    "source": [
  25     "## Setup"
  26    ]
  27   },
  28   {
  29    "cell_type": "code",
  30    "execution_count": null,
  31    "id": "41b0d403-7d6b-44f4-963f-8dc492ae0126",
  32    "metadata": {},
  33    "outputs": [],
  34    "source": [
  35     "# Fetch the fmda_nw_202401-05_f05 pickle from demo.openwfm.org into the local data/ folder\n",
  36     "# (retrieve_url comes from utils; presumably it downloads the URL to the given path).\n",
  37     "retrieve_url(\n",
  38     "    \"https://demo.openwfm.org/web/data/fmda/dicts/fmda_nw_202401-05_f05.pkl\",\n",
  39     "    \"data/fmda_nw_202401-05_f05.pkl\",\n",
  40     ")"
  36    ]
  37   },
  38   {
  39    "cell_type": "code",
  40    "execution_count": null,
  41    "id": "e69e37b9-73ef-45a1-9738-844f26dc3323",
  42    "metadata": {},
  43    "outputs": [],
  44    "source": [
  45     "data_params = read_yml(\"params_data.yaml\")  # settings read by the filters below, e.g. 'zero_lag_threshold', 'max_intp_time', 'min_fm', 'max_fm'\n",
  46     "data_params  # bare last expression so the notebook displays the loaded parameters"
  47    ]
  48   },
  49   {
  50    "cell_type": "code",
  51    "execution_count": null,
  52    "id": "6b5c3c82-84ba-426c-b8d9-f540b5026158",
  53    "metadata": {},
  54    "outputs": [],
  55    "source": [
  56     "# Alternative dataset (swap in to use it): dat = read_pkl(\"data/test_CA_202401.pkl\")\n",
  57     "dat = read_pkl(\"data/test_NW_202401.pkl\")  # NOTE(review): this is not the file fetched by retrieve_url in the Setup cell - confirm it exists locally"
  58    ]
  59   },
  60   {
  61    "cell_type": "markdown",
  62    "id": "dae0e47b-02eb-4759-9b95-3cc1b281d41e",
  63    "metadata": {},
  64    "source": [
  65     "## Filters"
  66    ]
  67   },
  68   {
  69    "cell_type": "code",
  70    "execution_count": null,
  71    "id": "7b6b4347-6abe-4c21-8318-06a766d67d21",
  72    "metadata": {},
  73    "outputs": [],
  74    "source": [
  75     "# Useful cases for checking the interpolation and the filters:\n",
  76     "    # NV040_202401: more RAWS observations than HRRR times, so interpolation should shorten the series\n",
  77     "    # NV026_202401: RAWS observations every 10 minutes, so interpolation should shorten the series\n",
  78     "    # CGVC1_202401: only a few observations missing, so interpolation should lengthen the series\n",
  79     "    # YNWC1_202401: only 2 observations, so the case should be filtered out entirely"
  80    ]
  81   },
  82   {
  83    "cell_type": "code",
  84    "execution_count": null,
  85    "id": "fc3fbda3-5e93-4122-9278-4b95ec69d25f",
  86    "metadata": {},
  87    "outputs": [],
  88    "source": [
  89     "def flag_lag_stretches(x, lag=1, threshold=None, atol=1e-8):\n",
  90     "    \"\"\"Flag a series that stays flat (or linear) for too many consecutive steps.\n",
  91     "\n",
  92     "    Takes the order-`lag` finite difference of `x` (`np.diff(x, n=lag)`) and looks\n",
  93     "    for runs of consecutive entries that are numerically zero. With `lag=1` a run\n",
  94     "    means repeated values; with `lag=2` a run means the values lie on a straight\n",
  95     "    line.\n",
  96     "\n",
  97     "    Args:\n",
  98     "        x: 1-D array-like of numeric values.\n",
  99     "        lag: Differencing order, passed to `np.diff` as `n`.\n",
 100     "        threshold: Longest run of zero differences that is still accepted.\n",
 101     "            When None, `data_params['zero_lag_threshold']` is read at call time,\n",
 102     "            so edits to `data_params` take effect without re-running this cell.\n",
 103     "        atol: Absolute tolerance for treating a difference as zero. An exact\n",
 104     "            `== 0` test misses stretches whose differences are zero only up to\n",
 105     "            floating-point rounding (typical for interpolated values).\n",
 106     "\n",
 107     "    Returns:\n",
 108     "        True if some run of consecutive zero differences is longer than\n",
 109     "        `threshold`, otherwise False (including for empty or too-short input).\n",
 110     "    \"\"\"\n",
 111     "    if threshold is None:\n",
 112     "        threshold = data_params['zero_lag_threshold']\n",
 113     "    is_zero = np.isclose(np.diff(x, n=lag), 0.0, rtol=0.0, atol=atol)\n",
 114     "    run_length = 0\n",
 115     "    for zero in is_zero:\n",
 116     "        # Extend the current run, or restart the count at the first non-zero difference.\n",
 117     "        run_length = run_length + 1 if zero else 0\n",
 118     "        if run_length > threshold:\n",
 119     "            return True\n",
 120     "    return False"
 102    ]
 103   },
 104   {
 105    "cell_type": "code",
 106    "execution_count": null,
 107    "id": "67689bfe-3971-495f-95ef-0d52f3c7c3b5",
 108    "metadata": {},
 109    "outputs": [],
 110    "source": [
 111     "# Resample each case's RAWS 'fm' series at the HRRR times via time_intp (from utils),\n",
 112     "# store the result under dat[case]['y'], and mark the case in `flags` if any filter trips.\n",
 113     "cases = list(dat.keys())\n",
 114     "flags = np.zeros(len(cases))  # 1 marks a flagged case; read by the next cell\n",
 115     "for i, case in enumerate(cases):\n",
 116     "    print(\"~\"*50)\n",
 117     "    print(f\"Case: {case}\")\n",
 118     "    time_raws = str2time(dat[case]['RAWS']['time_raws'])\n",
 119     "    time_hrrr = str2time(dat[case][\"HRRR\"]['time'])\n",
 120     "    fm = dat[case]['RAWS']['fm']\n",
 121     "    ynew = time_intp(time_raws, fm, time_hrrr)\n",
 122     "    dat[case]['y'] = ynew\n",
 123     "    # Filter 1: runs of repeated values (zero first differences).\n",
 124     "    if flag_lag_stretches(ynew, threshold=data_params['zero_lag_threshold']):\n",
 125     "        print(f\"Flagging case {case} for zero lag stretches greater than `zero_lag_threshold` param {data_params['zero_lag_threshold']}\")\n",
 126     "        flags[i] = 1\n",
 127     "    # Filter 2: runs of zero second differences (values on a straight line).\n",
 128     "    # Pass `max_intp_time` explicitly so the limit applied is the one the message reports.\n",
 129     "    # NOTE(review): assumes `max_intp_time` is expressed in HRRR time steps - confirm.\n",
 130     "    if flag_lag_stretches(ynew, lag=2, threshold=data_params['max_intp_time']):\n",
 131     "        print(f\"Flagging case {case} for constant linear stretches greater than `max_intp_time` param {data_params['max_intp_time']}\")\n",
 132     "        flags[i] = 1\n",
 133     "    # Filter 3: values at or beyond the configured FMC bounds.\n",
 134     "    if np.any(ynew>=data_params['max_fm']) or np.any(ynew<=data_params['min_fm']):\n",
 135     "        print(f\"Flagging case {case} for FMC outside param range {data_params['min_fm'],data_params['max_fm']}. FMC range for {case}: {ynew.min(),ynew.max()}\")\n",
 136     "        flags[i] = 1"
 130    ]
 131   },
 132   {
 133    "cell_type": "code",
 134    "execution_count": null,
 135    "id": "246272bf-2f2e-4bab-97e2-b9d7f946618a",
 136    "metadata": {},
 137    "outputs": [],
 138    "source": [
 139     "# Names of the cases that at least one filter marked (flag value 1).\n",
 140     "flagged_cases = [\n",
 141     "    case_name\n",
 142     "    for case_name, flag_value in zip(cases, flags)\n",
 143     "    if flag_value == 1\n",
 144     "]\n",
 145     "print(flagged_cases)"
 141    ]
 142   },
 143   {
 144    "cell_type": "code",
 145    "execution_count": null,
 146    "id": "bc28bd0a-1673-4414-bbc6-31baf55618ae",
 147    "metadata": {},
 148    "outputs": [],
 149    "source": []
 150   },
 151   {
 152    "cell_type": "code",
 153    "execution_count": null,
 154    "id": "1f97877c-89c9-49c1-a141-dac7ee2ea1a1",
 155    "metadata": {},
 156    "outputs": [],
 157    "source": []
 158   },
 159   {
 160    "cell_type": "code",
 161    "execution_count": null,
 162    "id": "04a22d48-2ef1-46b4-ab2b-333b240c799f",
 163    "metadata": {},
 164    "outputs": [],
 165    "source": []
 166   },
 167   {
 168    "cell_type": "code",
 169    "execution_count": null,
 170    "id": "16f30816-3f94-4238-a0f2-da69632415ba",
 171    "metadata": {},
 172    "outputs": [],
 173    "source": []
 174   }
 175  ],
 176  "metadata": {
 177   "kernelspec": {
 178    "display_name": "Python 3 (ipykernel)",
 179    "language": "python",
 180    "name": "python3"
 181   },
 182   "language_info": {
 183    "codemirror_mode": {
 184     "name": "ipython",
 185     "version": 3
 186    },
 187    "file_extension": ".py",
 188    "mimetype": "text/x-python",
 189    "name": "python",
 190    "nbconvert_exporter": "python",
 191    "pygments_lexer": "ipython3",
 192    "version": "3.12.5"
 193   }
 194  },
 195  "nbformat": 4,
 196  "nbformat_minor": 5
 197 }