From 6ebf723c2585bde3b6bf5d51d0967e7072920c00 Mon Sep 17 00:00:00 2001
From: jh-206
Date: Mon, 14 Oct 2024 10:35:30 -0600
Subject: [PATCH] cleanup commented code

---
 fmda/data_funcs.py          | 214 --------------------------------------------
 fmda/fmda_rnn_spatial.ipynb |  39 --------
 2 files changed, 253 deletions(-)

diff --git a/fmda/data_funcs.py b/fmda/data_funcs.py
index 6fce2db..98ce493 100644
--- a/fmda/data_funcs.py
+++ b/fmda/data_funcs.py
@@ -290,56 +290,6 @@ def shift_time(X_array, inds, forecast_step):
 
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# Wrapper Functions to Put it all together
-#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-# TODO: ENGINEERED TIME FEATURES:
-# hod = rnn_dat.time.astype('datetime64[h]').astype(int) % 24
-# doy = np.array([dt.timetuple().tm_yday - 1 for dt in rnn_dat.time])
-
-# def create_spatial_train(input_file_paths, params_data, atm_dict = "HRRR", verbose=False):
-#     train = process_train_dict(input_file_paths, params_data = params_data, verbose=verbose)
-#     train_sp = Dict(combine_nested(train))
-#     return train_sp
-
-# def process_train_dict(input_file_paths, params_data, atm_dict = "HRRR", spatial=False, verbose=False):
-#     if type(input_file_paths) is not list:
-#         raise ValueError(f"Argument `input_file_paths` must be list, received {type(input_file_paths)}")
-#     train = {}
-#     for file_path in input_file_paths:
-#         # Extract target and features
-#         di = build_train_dict(file_path, atm=atm_dict, features_all=params_data['features_all'], verbose=verbose)
-#         # Subset timeseries into shorter stretches
-#         di = split_timeseries(di, hours=params_data['hours'], verbose=verbose)
-#         di = discard_keys_with_short_y(di, hours=params_data['hours'], verbose=False)
-#         # Check for suspect data
-#         flags = flag_dict_keys(di, params_data['zero_lag_threshold'], params_data['max_intp_time'], max_y = params_data['max_fm'], min_y = params_data['min_fm'], verbose=verbose)
-#         # Remove flagged cases
-#         cases = list([*di.keys()])
-#         flagged_cases = [element for element, flag in zip(cases, flags) if flag == 1]
-#         remove_key_list(di, flagged_cases, verbose=verbose)
-#         train.update(di)
-#     if spatial:
-#         train = combine_nested(train)
-
-#     return Dict(train)
-
-
 def subset_by_features(nested_dict, input_features, verbose=True):
     """
     Subsets a nested dictionary to only include keys where all strings in the input_features
@@ -369,170 +319,6 @@ def subset_by_features(nested_dict, input_features, verbose=True):
     return result
 
 
-# feature_types = {
-#     # Static features are based on physical location, e.g. location of RAWS site
-#     'static': ['elev', 'lon', 'lat'],
-#     # Atmospheric weather features come from either RAWS subdict or HRRR
-#     'atm': ['temp', 'rh', 'wind', 'solar', 'soilm', 'canopyw', 'groundflux', 'Ed', 'Ew']
-# }
-
-# def build_train_dict(input_file_path,
-#                      forecast_step=1, atm="HRRR",features_all=['Ed', 'Ew', 'solar', 'wind', 'elev', 'lon', 'lat', 'doy', 'hod', 'rain'], verbose=False):
-#     # in:
-#     #   file_path       list of strings - files as in read_test_pkl
-#     #   forecast_step   int - which forecast step to take atmospheric data from (maybe 03, must be >0).
-#     #   atm             str - name of subdict where atmospheric vars are located
-#     #   features_list   list of strings - names of keys in subdicts to collect into features matrix. Default is everything collected
-#     # return:
-#     #   train           dictionary with structure
-#     #       {key : {'key' : key,     # copied subdict key
-#     #               'loc' : {...},   # copied from in dict = {key : {'loc': ... }...}
-#     #               'time' : time,   # datetime vector, spacing tres
-#     #               'X' : fm         # target fuel moisture from the RAWS, interpolated to time
-#     #               'Y' : feat       # features from atmosphere and location
-#
-#
-
-
-#     # TODO: fix this
-#     if 'rain' in features_all and (not features_all[-1]=='rain'):
-#         raise ValueError(f"Make rain in features list last element since (working on fix as of 24-6-24), given features list: {features_list}")
-
-#     if forecast_step > 0 and forecast_step < 100 and forecast_step == int(forecast_step):
-#         fstep='f'+str(forecast_step).zfill(2)
-#         fprev='f'+str(forecast_step-1).zfill(2)
-#         # logging.info('Using data from step %s',fstep)
-#         # logging.info('Using rain as the difference of accumulated precipitation between %s and %s',fstep,fprev)
-#     else:
-#         # logging.critical('forecast_step must be integer between 1 and 99')
-#         raise ValueError('bad forecast_step')
-
-#     train = {}
-#     with open(input_file_path, 'rb') as file:
-#         # logging.info("loading file %s", file_path)
-#         d = pickle.load(file)
-#     for key in d:
-#         atm_dict = atm
-#         features_list = features_all
-#         # logging.info('Processing subdictionary %s',key)
-#         if key in train:
-#             pass
-#             # logging.warning('skipping duplicate key %s',key)
-#         else:
-#             subdict=d[key] # subdictionary for this case
-#             loc=subdict['loc']
-#             train[key] = {
-#                 'id': key, # store the key inside the dictionary, subdictionary will be used separatedly
-#                 'case':key,
-#                 'filename': input_file_path,
-#                 'loc': loc
-#             }
-#             desc='descr'
-#             if desc in subdict:
-#                 train[desc]=subdict[desc]
-#             time_hrrr=str2time(subdict[atm_dict]['time'])
-#             # timekeeping
-#             hours=len(d[key][atm_dict]['time'])
-#             train[key]['hours']=hours
-#             # train[key]['h2'] =hours # not doing prediction yet
-#             hrrr_increment = check_increment(time_hrrr,id=key+f' {atm_dict}.time')
-#             # logging.info(f'{atm_dict} increment is %s h',hrrr_increment)
-#             if hrrr_increment < 1:
-#                 # logging.critical('HRRR increment is %s h must be at least 1 h',hrrr_increment)
-#                 raise(ValueError)
-
-#             # build matrix of features - assuming all the same length, if not column_stack will fail
-#             train[key]['time']=time_hrrr
-#             # logging.info(f"Created feature matrix train[{key}]['X'] shape {train[key]['X'].shape}")
-#             time_raws=str2time(subdict['RAWS']['time_raws']) # may not be the same as HRRR
-#             # logging.info('%s RAWS.time_raws length is %s',key,len(time_raws))
-#             check_increment(time_raws,id=key+' RAWS.time_raws')
-#             # print_first(time_raws,num=5,id='RAWS.time_raws')
-
-#             # Set up static vars
-#             columns=[]
-#             missing_features = []
-#             for feat in features_list:
-#                 # For atmospheric features,
-#                 if feat in feature_types['atm']:
-#                     if atm_dict == "HRRR":
-#                         vec = subdict['HRRR'][fstep][feat]
-#                         columns.append(vec)
-#                     elif atm_dict == "RAWS":
-#                         if feat in subdict['RAWS'].keys():
-#                             vec = time_intp(time_raws, subdict['RAWS'][feat], time_hrrr)
-#                             columns.append(vec)
-#                         else:
-#                             missing_features.append(feat)
-
-#                 # For static features, repeat to fit number of time observations
-#                 elif feat in feature_types['static']:
-#                     columns.append(np.full(hours,loc[feat]))
-#             # Add Engineered Time features, doy and hod
-#             # hod = time_hrrr.astype('datetime64[h]').astype(int) % 24
-#             # doy = np.array([dt.timetuple().tm_yday - 1 for dt in time_hrrr])
-#             # columns.extend([doy, hod])
-
-#             # compute rain as difference of accumulated precipitation
-#             if 'rain' in features_list:
-#                 if atm_dict == "HRRR":
-#                     rain = subdict[atm_dict][fstep]['precip_accum']- subdict[atm_dict][fprev]['precip_accum']
-#                     # logging.info('%s rain as difference %s minus %s: min %s max %s',
-#                     #     key,fstep,fprev,np.min(rain),np.max(rain))
-#                 elif atm_dict == "RAWS":
-#                     if 'rain' in subdict[atm_dict]:
-#                         rain = time_intp(time_raws,subdict[atm_dict]['rain'],time_hrrr)
-#                     else:
-#                         pass
-#                         # logging.info('No rain data found in RAWS subdictionary %s', key)
-#                 columns.append( rain ) # add rain feature
-#             else:
-#                 missing_features.append('rain')
-
-#             train[key]['X'] = np.column_stack(columns)
-#             train[key]['features_list'] = [item for item in features_list if item not in missing_features]
-
-#             fm=subdict['RAWS']['fm']
-#             # logging.info('%s RAWS.fm length is %s',key,len(fm))
-#             # interpolate RAWS sensors to HRRR time and over NaNs
-#             train[key]['y'] = time_intp(time_raws,fm,time_hrrr)
-#             # TODO: check endpoint interpolation when RAWS data sparse, and bail out if not enough data
-
-#             if train[key]['y'] is None:
-#                 pass
-#                 # logging.error('Cannot create target matrix for %s, using None',key)
-#             else:
-#                 pass
-#                 # logging.info(f"Created target matrix train[{key}]['y'] shape {train[key]['y'].shape}")
-
-#     # logging.info('Created a "train" dictionary with %s items',len(train))
-
-#     # clean up
-
-#     keys_to_delete = []
-#     for key in train:
-#         if train[key]['X'] is None or train[key]['y'] is None:
-#             # logging.warning('Deleting training item %s because features X or target Y are None', key)
-#             keys_to_delete.append(key)
-
-#     # Delete the items from the dictionary
-#     if len(keys_to_delete)>0:
-#         for key in keys_to_delete:
-#             del train[key]
-#     # logging.warning('Deleted %s items with None for data. %s items remain in the training dictionary.',
-#     #     len(keys_to_delete),len(train))
-
-#     # output
-
-#     # if output_file_path is not None:
-#     #     with open(output_file_path, 'wb') as file:
-#     #         logging.info('Writing pickle dump of the dictionary train into file %s',output_file_path)
-#     #         pickle.dump(train, file)
-
-#     # logging.info('pkl2train done')
-
-#     return train
-
 
 
 def remove_key_list(d, ls, verbose=False):
diff --git a/fmda/fmda_rnn_spatial.ipynb b/fmda/fmda_rnn_spatial.ipynb
index c7870a9..cb1785b 100644
--- a/fmda/fmda_rnn_spatial.ipynb
+++ b/fmda/fmda_rnn_spatial.ipynb
@@ -244,32 +244,6 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c0c7f5fb-4c33-45f8-9a2e-38c9ab1cd4e3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# rnn_dat_sp = RNNData(\n",
-    "#     train_sp, # input dictionary\n",
-    "#     scaler=\"standard\", # data scaling type\n",
-    "#     features_list = params['features_list'] # features for predicting outcome\n",
-    "# )\n",
-    "\n",
-    "\n",
-    "# rnn_dat_sp.train_test_split( \n",
-    "#     time_fracs = [.8, .1, .1], # Percent of total time steps used for train/val/test\n",
-    "#     space_fracs = [.8, .1, .1] # Percent of total timeseries used for train/val/test\n",
-    "# )\n",
-    "# rnn_dat_sp.scale_data()\n",
-    "\n",
-    "# rnn_dat_sp.batch_reshape(\n",
-    "#     timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. \n",
-    "#     batch_size = params['batch_size'] # Number of samples of length timesteps for a single round of grad. descent\n",
-    "# )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "id": "af82c50e-bcc4-406d-b759-399119d1af81",
    "metadata": {},
    "outputs": [],
@@ -283,19 +257,6 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7431bc95-d384-40fd-a622-bbc0ee68e5cd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# # Update Params specific to spatial training\n",
-    "# params.update({\n",
-    "#     'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch\n",
-    "# })"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "id": "4bc11474-fed8-47f2-b9cf-dfdda0d3d3b2",
    "metadata": {},
    "outputs": [],
-- 
2.11.4.GIT