From 6ebf723c2585bde3b6bf5d51d0967e7072920c00 Mon Sep 17 00:00:00 2001
From: jh-206
Date: Mon, 14 Oct 2024 10:35:30 -0600
Subject: [PATCH] cleanup commented code

---
 fmda/data_funcs.py          | 214 --------------------------------------------
 fmda/fmda_rnn_spatial.ipynb |  39 --------
 2 files changed, 253 deletions(-)

diff --git a/fmda/data_funcs.py b/fmda/data_funcs.py
index 6fce2db..98ce493 100644
--- a/fmda/data_funcs.py
+++ b/fmda/data_funcs.py
@@ -290,56 +290,6 @@ def shift_time(X_array, inds, forecast_step):
 
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# Wrapper Functions to Put it all together
-#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-# TODO: ENGINEERED TIME FEATURES:
-# hod = rnn_dat.time.astype('datetime64[h]').astype(int) % 24
-# doy = np.array([dt.timetuple().tm_yday - 1 for dt in rnn_dat.time])
-
-# def create_spatial_train(input_file_paths, params_data, atm_dict = "HRRR", verbose=False):
-#     train = process_train_dict(input_file_paths, params_data = params_data, verbose=verbose)
-#     train_sp = Dict(combine_nested(train))
-#     return train_sp
-
-# def process_train_dict(input_file_paths, params_data, atm_dict = "HRRR", spatial=False, verbose=False):
-#     if type(input_file_paths) is not list:
-#         raise ValueError(f"Argument `input_file_paths` must be list, received {type(input_file_paths)}")
-#     train = {}
-#     for file_path in input_file_paths:
-#         # Extract target and features
-#         di = build_train_dict(file_path, atm=atm_dict, features_all=params_data['features_all'], verbose=verbose)
-#         # Subset timeseries into shorter stretches
-#         di = split_timeseries(di, hours=params_data['hours'], verbose=verbose)
-#         di = discard_keys_with_short_y(di, hours=params_data['hours'], verbose=False)
-#         # Check for suspect data
-#         flags = flag_dict_keys(di, params_data['zero_lag_threshold'], params_data['max_intp_time'], max_y = params_data['max_fm'], min_y = params_data['min_fm'], verbose=verbose)
-#         # Remove flagged cases
-#         cases = list([*di.keys()])
-#         flagged_cases = [element for element, flag in zip(cases, flags) if flag == 1]
-#         remove_key_list(di, flagged_cases, verbose=verbose)
-#         train.update(di)
-#     if spatial:
-#         train = combine_nested(train)
-
-#     return Dict(train)
-
-
 def subset_by_features(nested_dict, input_features, verbose=True):
     """
     Subsets a nested dictionary to only include keys where all strings in the input_features
@@ -369,170 +319,6 @@ def subset_by_features(nested_dict, input_features, verbose=True):
     return result
 
 
-# feature_types = {
-#     # Static features are based on physical location, e.g. location of RAWS site
-#     'static': ['elev', 'lon', 'lat'],
-#     # Atmospheric weather features come from either RAWS subdict or HRRR
-#     'atm': ['temp', 'rh', 'wind', 'solar', 'soilm', 'canopyw', 'groundflux', 'Ed', 'Ew']
-# }
-
-# def build_train_dict(input_file_path,
-#                      forecast_step=1, atm="HRRR",features_all=['Ed', 'Ew', 'solar', 'wind', 'elev', 'lon', 'lat', 'doy', 'hod', 'rain'], verbose=False):
-#     # in:
-#     #   file_path       list of strings - files as in read_test_pkl
-#     #   forecast_step   int - which forecast step to take atmospheric data from (maybe 03, must be >0).
-#     #   atm             str - name of subdict where atmospheric vars are located
-#     #   features_list   list of strings - names of keys in subdicts to collect into features matrix. Default is everything collected
-#     # return:
-#     #   train           dictionary with structure
-#     #       {key : {'key' : key,     # copied subdict key
-#     #               'loc' : {...},   # copied from in dict = {key : {'loc': ... }...}
-#     #               'time' : time,   # datetime vector, spacing tres
-#     #               'X' : fm         # target fuel moisture from the RAWS, interpolated to time
-#     #               'Y' : feat       # features from atmosphere and location
-#
-#
-
-
-#     # TODO: fix this
-#     if 'rain' in features_all and (not features_all[-1]=='rain'):
-#         raise ValueError(f"Make rain in features list last element since (working on fix as of 24-6-24), given features list: {features_list}")
-
-#     if forecast_step > 0 and forecast_step < 100 and forecast_step == int(forecast_step):
-#         fstep='f'+str(forecast_step).zfill(2)
-#         fprev='f'+str(forecast_step-1).zfill(2)
-#         # logging.info('Using data from step %s',fstep)
-#         # logging.info('Using rain as the difference of accumulated precipitation between %s and %s',fstep,fprev)
-#     else:
-#         # logging.critical('forecast_step must be integer between 1 and 99')
-#         raise ValueError('bad forecast_step')
-
-#     train = {}
-#     with open(input_file_path, 'rb') as file:
-#         # logging.info("loading file %s", file_path)
-#         d = pickle.load(file)
-#     for key in d:
-#         atm_dict = atm
-#         features_list = features_all
-#         # logging.info('Processing subdictionary %s',key)
-#         if key in train:
-#             pass
-#             # logging.warning('skipping duplicate key %s',key)
-#         else:
-#             subdict=d[key] # subdictionary for this case
-#             loc=subdict['loc']
-#             train[key] = {
-#                 'id': key, # store the key inside the dictionary, subdictionary will be used separatedly
-#                 'case':key,
-#                 'filename': input_file_path,
-#                 'loc': loc
-#             }
-#             desc='descr'
-#             if desc in subdict:
-#                 train[desc]=subdict[desc]
-#             time_hrrr=str2time(subdict[atm_dict]['time'])
-#             # timekeeping
-#             hours=len(d[key][atm_dict]['time'])
-#             train[key]['hours']=hours
-#             # train[key]['h2'] =hours # not doing prediction yet
-#             hrrr_increment = check_increment(time_hrrr,id=key+f' {atm_dict}.time')
-#             # logging.info(f'{atm_dict} increment is %s h',hrrr_increment)
-#             if hrrr_increment < 1:
-#                 # logging.critical('HRRR increment is %s h must be at least 1 h',hrrr_increment)
-#                 raise(ValueError)
-
-#             # build matrix of features - assuming all the same length, if not column_stack will fail
-#             train[key]['time']=time_hrrr
-#             # logging.info(f"Created feature matrix train[{key}]['X'] shape {train[key]['X'].shape}")
-#             time_raws=str2time(subdict['RAWS']['time_raws']) # may not be the same as HRRR
-#             # logging.info('%s RAWS.time_raws length is %s',key,len(time_raws))
-#             check_increment(time_raws,id=key+' RAWS.time_raws')
-#             # print_first(time_raws,num=5,id='RAWS.time_raws')
-
-#             # Set up static vars
-#             columns=[]
-#             missing_features = []
-#             for feat in features_list:
-#                 # For atmospheric features,
-#                 if feat in feature_types['atm']:
-#                     if atm_dict == "HRRR":
-#                         vec = subdict['HRRR'][fstep][feat]
-#                         columns.append(vec)
-#                     elif atm_dict == "RAWS":
-#                         if feat in subdict['RAWS'].keys():
-#                             vec = time_intp(time_raws, subdict['RAWS'][feat], time_hrrr)
-#                             columns.append(vec)
-#                         else:
-#                             missing_features.append(feat)
-
-#                 # For static features, repeat to fit number of time observations
-#                 elif feat in feature_types['static']:
-#                     columns.append(np.full(hours,loc[feat]))
-#             # Add Engineered Time features, doy and hod
-#             # hod = time_hrrr.astype('datetime64[h]').astype(int) % 24
-#             # doy = np.array([dt.timetuple().tm_yday - 1 for dt in time_hrrr])
-#             # columns.extend([doy, hod])
-
-#             # compute rain as difference of accumulated precipitation
-#             if 'rain' in features_list:
-#                 if atm_dict == "HRRR":
-#                     rain = subdict[atm_dict][fstep]['precip_accum']- subdict[atm_dict][fprev]['precip_accum']
-#                     # logging.info('%s rain as difference %s minus %s: min %s max %s',
-#                     #     key,fstep,fprev,np.min(rain),np.max(rain))
-#                 elif atm_dict == "RAWS":
-#                     if 'rain' in subdict[atm_dict]:
-#                         rain = time_intp(time_raws,subdict[atm_dict]['rain'],time_hrrr)
-#                     else:
-#                         pass
-#                         # logging.info('No rain data found in RAWS subdictionary %s', key)
-#                 columns.append( rain ) # add rain feature
-#             else:
-#                 missing_features.append('rain')
-
-#             train[key]['X'] = np.column_stack(columns)
-#             train[key]['features_list'] = [item for item in features_list if item not in missing_features]
-
-#             fm=subdict['RAWS']['fm']
-#             # logging.info('%s RAWS.fm length is %s',key,len(fm))
-#             # interpolate RAWS sensors to HRRR time and over NaNs
-#             train[key]['y'] = time_intp(time_raws,fm,time_hrrr)
-#             # TODO: check endpoint interpolation when RAWS data sparse, and bail out if not enough data
-
-#             if train[key]['y'] is None:
-#                 pass
-#                 # logging.error('Cannot create target matrix for %s, using None',key)
-#             else:
-#                 pass
-#                 # logging.info(f"Created target matrix train[{key}]['y'] shape {train[key]['y'].shape}")
-
-#     # logging.info('Created a "train" dictionary with %s items',len(train))
-
-#     # clean up
-
-#     keys_to_delete = []
-#     for key in train:
-#         if train[key]['X'] is None or train[key]['y'] is None:
-#             # logging.warning('Deleting training item %s because features X or target Y are None', key)
-#             keys_to_delete.append(key)
-
-#     # Delete the items from the dictionary
-#     if len(keys_to_delete)>0:
-#         for key in keys_to_delete:
-#             del train[key]
-#     # logging.warning('Deleted %s items with None for data. %s items remain in the training dictionary.',
-#     #     len(keys_to_delete),len(train))
-
-#     # output
-
-#     # if output_file_path is not None:
-#     #     with open(output_file_path, 'wb') as file:
-#     #         logging.info('Writing pickle dump of the dictionary train into file %s',output_file_path)
-#     #         pickle.dump(train, file)
-
-#     # logging.info('pkl2train done')
-
-#     return train
-
 
 
 def remove_key_list(d, ls, verbose=False):
diff --git a/fmda/fmda_rnn_spatial.ipynb b/fmda/fmda_rnn_spatial.ipynb
index c7870a9..cb1785b 100644
--- a/fmda/fmda_rnn_spatial.ipynb
+++ b/fmda/fmda_rnn_spatial.ipynb
@@ -244,32 +244,6 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c0c7f5fb-4c33-45f8-9a2e-38c9ab1cd4e3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# rnn_dat_sp = RNNData(\n",
-    "#     train_sp, # input dictionary\n",
-    "#     scaler=\"standard\", # data scaling type\n",
-    "#     features_list = params['features_list'] # features for predicting outcome\n",
-    "# )\n",
-    "\n",
-    "\n",
-    "# rnn_dat_sp.train_test_split( \n",
-    "#     time_fracs = [.8, .1, .1], # Percent of total time steps used for train/val/test\n",
-    "#     space_fracs = [.8, .1, .1] # Percent of total timeseries used for train/val/test\n",
-    "# )\n",
-    "# rnn_dat_sp.scale_data()\n",
-    "\n",
-    "# rnn_dat_sp.batch_reshape(\n",
-    "#     timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. \n",
-    "#     batch_size = params['batch_size'] # Number of samples of length timesteps for a single round of grad. descent\n",
-    "# )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "id": "af82c50e-bcc4-406d-b759-399119d1af81",
    "metadata": {},
    "outputs": [],
@@ -283,19 +257,6 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7431bc95-d384-40fd-a622-bbc0ee68e5cd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# # Update Params specific to spatial training\n",
-    "# params.update({\n",
-    "#     'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch\n",
-    "# })"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "id": "4bc11474-fed8-47f2-b9cf-dfdda0d3d3b2",
    "metadata": {},
    "outputs": [],
-- 
2.11.4.GIT