fmda/utils.py

   1 import numpy as np
   2 from functools import singledispatch
   3 import pandas as pd
   4 import numbers
   5 from datetime import datetime
   6 import logging
   7 import sys
   8 import inspect
   9
  10 def logging_setup():
  11     logging.basicConfig(
  12         level=logging.INFO,
  13         format='%(asctime)s - %(levelname)s - %(message)s',
  14         stream=sys.stdout
  15     )
  16
  17 numeric_kinds = {'i', 'u', 'f', 'c'}
  18
  19 def is_numeric_ndarray(array):
  20     if isinstance(array, np.ndarray):
  21         return array.dtype.kind in numeric_kinds
  22     else:
  23         return False
  24
  25 def vprint(*args):
  26     import inspect
  27
  28     frame = inspect.currentframe()
  29     if 'verbose' in frame.f_back.f_locals:
  30         verbose = frame.f_back.f_locals['verbose']
  31     else:
  32         verbose = False
  33
  34     if verbose:
  35         for s in args[:(len(args)-1)]:
  36             print(s, end=' ')
  37         print(args[-1])
  38
  39
  40 ## Generic function to hash dictionary of various types
  41
  42 @singledispatch
  43 ## Top level hash function with built-in hash function for str, float, int, etc
  44 def hash2(x):
  45     return hash(x)
  46
  47 @hash2.register(np.ndarray)
  48 ## Hash numpy array, hash array with pandas and return integer sum
  49 def _(x):
  50     # return hash(x.tobytes())
  51     return np.sum(pd.util.hash_array(x))
  52
  53 @hash2.register(list)
  54 ## Hash list, convert to tuple
  55 def _(x):
  56     return hash2(tuple(x))
  57
  58 @hash2.register(tuple)
  59 def _(x):
  60     r = 0
  61     for i in range(len(x)):
  62         r+=hash2(x[i])
  63     return r
  64
  65 @hash2.register(dict)
  66 ## Hash dict, loop through keys and hash each element via dispatch. Return hashed integer sum of hashes
  67 def _(x, keys = None, verbose = False):
  68     r = 0 # return value integer
  69     if keys is None: # allow user input of keys to hash, otherwise hash them all
  70         keys = [*x.keys()]
  71     keys.sort()
  72     for key in keys:
  73         if (verbose): print('Hashing', key)
  74         r += hash2(x[key])
  75     return hash(r)
  76
  77 def print_args(func, *args, **kwargs):
  78 # wrapper to trace function call and arguments
  79     print(f"Called: {func.__name__}")
  80     print("Arguments:")
  81     for arg in args:
  82         print(f"  {arg}")
  83     for key, value in kwargs.items():
  84         print(f"  {key}={value}")
  85     return func(*args, **kwargs)
  86
  87 def print_args_test():
  88     def my_function(a, b):
  89         # some code here
  90         return a + b
  91     print_args(my_function, a=1, b=2)
  92
  93 import inspect
  94 def get_item(dict,var,**kwargs):
  95     if var in dict:
  96         value = dict[var]
  97     elif 'default' in kwargs:
  98         value = kwargs['default']
  99     else:
 100         logging.error('Variable %s not in the dictionary and no default',var)
 101         raise NameError()
 102     logging.info('%s = %s',var,value)
 103     return value
 104
 105 def print_first(item_list,num=3,indent=0,id=None):
 106     """
 107     Print the first num items of the list followed by '...'
 108
 109     :param item_list: List of items to be printed
 110     :param num: number of items to list
 111     """
 112     indent_str = ' ' * indent
 113     if id is not None:
 114         print(indent_str, id)
 115     if len(item_list) > 0:
 116         print(indent_str,type(item_list[0]))
 117     for i in range(min(num,len(item_list))):
 118         print(indent_str,item_list[i])
 119     if len(item_list) > num:
 120         print(indent_str,'...')
 121
 122 def print_dict_summary(d,indent=0,first=[],first_num=3):
 123     """
 124     Prints a summary for each array in the dictionary, showing the key and the size of the array.
 125
 126     Arguments:
 127      d (dict): The dictionary to summarize.
 128      first_items (list): Print the first items for any arrays with these names
 129
 130     """
 131     indent_str = ' ' * indent
 132     for key, value in d.items():
 133         # Check if the value is list-like using a simple method check
 134         if isinstance(value, dict):
 135             print(f"{indent_str}{key}")
 136             print_dict_summary(value,first=first,indent=indent+5,first_num=first_num)
 137         elif isinstance(value,np.ndarray):
 138             if np.issubdtype(value.dtype, np.number):
 139                 print(f"{indent_str}{key}: NumPy array of shape {value.shape}, min: {value.min()}, max: {value.max()}")
 140             else:
 141                 # Handle non-numeric arrays differently
 142                 print(f"{indent_str}{key}: NumPy array of shape {value.shape}, type {value.dtype}")
 143         elif hasattr(value, "__iter__") and not isinstance(value, str):  # Check for iterable that is not a string
 144             print(f"{indent_str}{key}: Array of {len(value)} items")
 145         else:
 146             print(indent_str,key,":",value)
 147         if key in first:
 148             print_first(value,num=first_num,indent=indent+5)
 149
 150
 151 from datetime import datetime
 152
 153 def str2time(input):
 154     """
 155     Convert a single string timestamp or a list of string timestamps to corresponding datetime object(s).
 156     """
 157     if isinstance(input, str):
 158         return datetime.strptime(input.replace('Z', '+00:00'), '%Y-%m-%dT%H:%M:%S%z')
 159     elif isinstance(input, list):
 160         return [str2time(s) for s in input]
 161     else:
 162         raise ValueError("Input must be a string or a list of strings")
 163
 164
 165 # interpolate linearly over nans
 166
 167 def filter_nan_values(t1, v1):
 168     # Filter out NaN values from v1 and corresponding times in t1
 169     valid_indices = ~np.isnan(v1)  # Indices where v1 is not NaN
 170     t1_filtered = np.array(t1)[valid_indices]
 171     v1_filtered = np.array(v1)[valid_indices]
 172     return t1_filtered, v1_filtered
 173
 174 def time_intp(t1, v1, t2):
 175     # Check if t1 v1 t2 are 1D arrays
 176     if t1.ndim != 1:
 177         logging.error("Error: t1 is not a 1D array. Dimension: %s", t1.ndim)
 178         return None
 179     if v1.ndim != 1:
 180         logging.error("Error: v1 is not a 1D array. Dimension %s:", v1.ndim)
 181         return None
 182     if t2.ndim != 1:
 183         logging.errorr("Error: t2 is not a 1D array. Dimension: %s", t2.ndim)
 184         return None
 185     # Check if t1 and v1 have the same length
 186     if len(t1) != len(v1):
 187         logging.error("Error: t1 and v1 have different lengths: %s %s",len(t1),len(v1))
 188         return None
 189     t1_no_nan, v1_no_nan = filter_nan_values(t1, v1)
 190     # print('t1_no_nan.dtype=',t1_no_nan.dtype)
 191     # Convert datetime objects to timestamps
 192     t1_stamps = np.array([t.timestamp() for t in t1_no_nan])
 193     t2_stamps = np.array([t.timestamp() for t in t2])
 194
 195     # Interpolate using the filtered data
 196     v2_interpolated = np.interp(t2_stamps, t1_stamps, v1_no_nan)
 197     if np.isnan(v2_interpolated).any():
 198         logging.error('time_intp: interpolated output contains NaN')
 199
 200     return v2_interpolated
 201
 202 def str2time(strlist):
 203     # Convert array of strings to array of datetime objects
 204     return np.array([datetime.strptime(dt_str, '%Y-%m-%dT%H:%M:%SZ') for dt_str in strlist])
 205
 206 def check_increment(datetime_array,id=''):
 207     # Calculate time differences between consecutive datetime values
 208     diffs = [b - a for a, b in zip(datetime_array[:-1], datetime_array[1:])]
 209     diffs_hours = np.array([diff.total_seconds()/3600 for diff in diffs])
 210     # Check if all time differences are exactlyu 1 hour
 211     if all(diffs_hours == diffs_hours[0]):
 212         logging.info('%s time array increments are %s hours',id,diffs_hours[0])
 213         if diffs_hours[0] <= 0 :
 214             logging.error('%s time array increements are not positive',id)
 215         return diffs_hours[0]
 216     else:
 217         logging.info('%s time array increments are min %s max %s',id,
 218                         np.min(diffs_hours),np.max(diffs_hours))
 219         return -1