From ce1e513b09fbd68d166f77543a1b25791d7086a1 Mon Sep 17 00:00:00 2001 From: jh-206 Date: Tue, 10 Sep 2024 10:06:08 -0600 Subject: [PATCH] Delete create_rnn_data2 Deprecated, use custom class RNNData now --- fmda/moisture_rnn.py | 147 --------------------------------------------------- 1 file changed, 147 deletions(-) diff --git a/fmda/moisture_rnn.py b/fmda/moisture_rnn.py index 888fce6..9b20a8a 100644 --- a/fmda/moisture_rnn.py +++ b/fmda/moisture_rnn.py @@ -185,153 +185,6 @@ scalers = { } -## DEPRECATED, use RNNData class instead -def create_rnn_data2(dict1, params, atm_dict="HRRR", verbose=False, train_ind=None, test_ind=None): - # Given fmda data and hyperparameters, return formatted dictionary to be used in RNN - # Inputs: - # d: (dict) fmda dictionary - # params: (dict) hyperparameters - # atm_dict: (str) string specifying name of subdictionary for atmospheric vars - # train_frac: (float) fraction of data to use for training (starting from time 0) - # val_frac: (float) fraction of data to use for validation data (starting from end of train) - # Returns: (dict) formatted data used in RNN - logging.info('create_rnn_data start') - # Copy Dictionary to avoid changing the input to this function - d=copy.deepcopy(dict1) - scale = params['scale'] - scaler= params['scaler'] - # Features list given by params dict to be used in training - features_list = params["features_list"] - # All available features list, corresponds to shape of X - features_all = d["features_list"] - # Indices to subset all features with based on params features - indices = [] - for item in features_list: - if item in features_all: - indices.append(features_all.index(item)) - else: - print(f"Warning: feature name '{item}' not found in list of all features from input data") - - # Extract desired features based on params, combine into matrix - # Extract response vector - y = d['y'] - y = np.reshape(y,(-1,1)) - # Extract Features matrix, subset to desired features - X_raw = d['X'][:, indices].copy() # saw untransformed features matrix - X = d['X'] - X = X[:, indices] - - # Check total observed hours - hours=d['hours'] - assert hours == y.shape[0] # Check that it matches response - - logging.info('create_rnn_data: total_hours=%s',hours) - logging.info('feature matrix X shape %s',np.shape(X)) - logging.info('target matrix Y shape %s',np.shape(y)) - logging.info('features_list: %s',features_list) - - logging.info('splitting train/val/test') - if train_ind is None: - train_ind = round(hours * params['train_frac']) # index of last training observation - test_ind= train_ind + round(hours * params['val_frac'])# index of first test observation, if no validation data it is equal to train_ind - logging.info('Final index of training data=%s',train_ind) - logging.info('First index of Test data=%s',test_ind) - # Training data from 0 to train_ind - X_train = X[:train_ind] - y_train = y[:train_ind].reshape(-1,1) - # Validation data from train_ind to test_ind - X_val = X[train_ind:test_ind] - y_val = y[train_ind:test_ind].reshape(-1,1) - # Test data from test_ind to end - X_test = X[test_ind:] - y_test = y[test_ind:].reshape(-1,1) - - # Scale Data if required - # TODO: - # Remove need for "scale_fm" param - # Reset reproducibility with this scaling - if scale: - logging.info('Scaling feature data with scaler: %s',scaler) - # scale=1 - if scaler=="reproducibility": - scale_fm = 17.076346687085564 - scale_rain = 0.01 - else: - scale_fm=1.0 - scale_rain=1.0 - # Fit scaler to training data - scalers[scaler].fit(X_train) - # Apply scaling to all data using in-place operations - X_train[:] = scalers[scaler].transform(X_train) - if X_val.shape[0] > 0: - X_val[:] = scalers[scaler].transform(X_val) - X_test[:] = scalers[scaler].transform(X_test) - - - else: - print("Not scaling data") - scale_fm=1.0 - scale_rain=1.0 - scaler=None - - logging.info('x_train shape=%s',X_train.shape) - logging.info('y_train shape=%s',y_train.shape) - if test_ind == train_ind: - logging.info('No validation data') - elif X_val.shape[0]!= 0: - logging.info('X_val shape=%s',X_val.shape) - logging.info('y_val shape=%s',y_val.shape) - logging.info('X_test shape=%s',X_test.shape) - logging.info('y_test shape=%s',y_test.shape) - - # Set up return dictionary - rnn_dat={ - 'case':d['case'], - 'hours':hours, - 'features_list':features_list, - 'n_features': len(features_list), - 'scaler':scaler, - 'train_ind':train_ind, - 'test_ind':test_ind, - 'X_raw': X_raw, - 'X':X, - 'y':y, - 'X_train': X_train, - 'y_train': y_train, - 'X_test': X_test, - 'y_test': y_test - } - - if X_val.shape[0] > 0: - rnn_dat.update({ - 'X_val': X_val, - 'y_val': y_val - }) - - # Update RNN params using data attributes - logging.info('Updating model params based on data') - timesteps = params['timesteps'] - batch_size = params['batch_size'] - logging.info('batch_size=%s',batch_size) - logging.info('timesteps=%s',timesteps) - features = len(features_list) - # params.update({ - # 'n_features': features, - # 'batch_shape': (params["batch_size"],params["timesteps"],features), - # 'pred_input_shape': (None, features), - # 'scaler': scaler, - # 'scale_fm': scale_fm, - # 'scale_rain': scale_rain - # }) - rnn_dat.update({ - 'scaler': scaler, - 'scale_fm': scale_fm, - 'scale_rain': scale_rain - }) - - logging.info('create_rnn_data2 done') - return rnn_dat - def batch_setup(ids, batch_size): """ Sets up stateful batched training data scheme for RNN training. -- 2.11.4.GIT