Update data_funcs.py
[notebooks.git] / fmda / utils.py
blob3cfd3cb5a1229e2154bb2e6755b15398b933ebd0
1 import numpy as np
2 from functools import singledispatch
3 import pandas as pd
4 import numbers
5 from datetime import datetime
6 import logging
7 import sys
8 import inspect
def logging_setup():
    """Configure the root logger to emit INFO-and-above records to stdout.

    Records are rendered as ``<time> - <level name> - <message>``.
    """
    record_layout = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        stream=sys.stdout,
        format=record_layout,
        level=logging.INFO,
    )
# NumPy dtype.kind codes accepted as numeric:
# signed integer, unsigned integer, floating point, complex floating point.
numeric_kinds = {'i', 'u', 'f', 'c'}


def is_numeric_ndarray(array):
    """Return True only if ``array`` is a NumPy ndarray whose dtype kind is in ``numeric_kinds``.

    Anything that is not an ndarray (lists, scalars, None, ...) yields False.
    """
    return isinstance(array, np.ndarray) and array.dtype.kind in numeric_kinds
def vprint(*args):
    """Print ``args`` only if the calling scope has a truthy local named ``verbose``.

    The caller's frame is inspected for a local variable called ``verbose``;
    when it is missing, output is suppressed. The arguments are printed on one
    line separated by single spaces, exactly like ``print(*args)``.

    Calling with no arguments while verbose prints an empty line (the previous
    implementation raised IndexError on ``args[-1]`` in that case).
    """
    # f_back is the frame of whoever called vprint; its locals hold 'verbose'.
    caller_locals = inspect.currentframe().f_back.f_locals
    if caller_locals.get('verbose', False):
        print(*args)
## Generic function to hash dictionary of various types

## Top level hash function with built-in hash function for str, float, int, etc
@singledispatch
def hash2(x):
    """Return an integer hash of ``x``, dispatching on the type of ``x``.

    Unregistered types fall back to the built-in ``hash``. Registered handlers
    below cover NumPy arrays, lists, tuples and dicts; containers are hashed
    as the sum of the hashes of their elements, so element order does not
    affect the result.

    NOTE(review): the ndarray handler returns a NumPy unsigned scalar while
    the other handlers return Python ints; sums that mix the two rely on
    NumPy's scalar promotion rules - verify for negative ``hash()`` values.
    """
    return hash(x)


## Hash numpy array, hash array with pandas and return integer sum
@hash2.register(np.ndarray)
def _(x):
    return np.sum(pd.util.hash_array(x))


## Hash list, convert to tuple
@hash2.register(list)
def _(x):
    return hash2(tuple(x))


## Hash tuple, sum of the hashes of its items (0 for an empty tuple)
@hash2.register(tuple)
def _(x):
    return sum(hash2(item) for item in x)


## Hash dict, loop through keys and hash each element via dispatch. Return hashed integer sum of hashes
@hash2.register(dict)
def _(x, keys=None, verbose=False):
    # Allow user input of keys to hash, otherwise hash them all.
    # sorted() builds a new list, so a caller-supplied key list is never
    # reordered in place and tuples / key views are accepted as well.
    ordered_keys = sorted(x.keys() if keys is None else keys)
    r = 0  # return value integer
    for key in ordered_keys:
        if verbose:
            print('Hashing', key)
        r += hash2(x[key])
    return hash(r)
def print_args(func, *args, **kwargs):
    """Trace a call: print the function's name and arguments, then invoke it.

    Positional arguments are listed one per line, keyword arguments as
    ``name=value``. Returns whatever ``func(*args, **kwargs)`` returns.
    """
    header = f"Called: {func.__name__}"
    print(header, "Arguments:", sep="\n")
    for positional in args:
        print(f" {positional}")
    for name in kwargs:
        print(f" {name}={kwargs[name]}")
    return func(*args, **kwargs)
def print_args_test():
    """Exercise print_args on a two-argument adder called with keyword arguments."""
    def my_function(a, b):
        # trivial function whose call gets traced
        return a + b

    sample_kwargs = {'a': 1, 'b': 2}
    print_args(my_function, **sample_kwargs)
93 import inspect
def get_item(dict, var, **kwargs):
    """Look up ``var`` in ``dict``, with an optional ``default`` keyword fallback.

    :param dict: mapping to read from
    :param var: key to look up
    :param default: (keyword only, optional) value returned when ``var`` is
        not in ``dict``; any value, including None, counts as a default
    :return: the value found, or the default
    :raises NameError: if ``var`` is missing and no ``default`` was given

    The resolved value is logged at INFO level.
    """
    if var in dict:
        value = dict[var]
    elif 'default' in kwargs:
        value = kwargs['default']
    else:
        logging.error('Variable %s not in the dictionary and no default', var)
        # Carry the reason in the exception too, so the failure is
        # understandable even when logging is not configured.
        raise NameError(f'Variable {var} not in the dictionary and no default')
    logging.info('%s = %s', var, value)
    return value
def print_first(item_list, num=3, indent=0, id=None):
    """
    Print the type of the first item and the first num items of the list, followed by '...' if more remain.

    :param item_list: List of items to be printed
    :param num: number of items to list
    :param indent: number of spaces placed before every printed line
    :param id: optional label printed first when it is not None
    """
    total = len(item_list)
    prefix = ' ' * indent
    if id is not None:
        print(prefix, id)
    if total > 0:
        # the type shown is that of the first element only
        print(prefix, type(item_list[0]))
    for position in range(min(num, total)):
        print(prefix, item_list[position])
    if total > num:
        print(prefix, '...')
def print_dict_summary(d, indent=0, first=None, first_num=3):
    """
    Prints a summary for each entry in the dictionary, recursing into nested dictionaries.

    NumPy arrays are summarized by shape (plus min/max when numeric and
    non-empty, otherwise dtype), other sized iterables by their length, and
    everything else is printed as is.

    Arguments:
        d (dict): The dictionary to summarize.
        indent (int): Number of spaces to indent each printed line; nested
            dictionaries are indented 5 more.
        first (list): Print the first items for any entries with these keys.
            None (the default) means no keys.
        first_num (int): How many leading items to print for keys in first.
    """
    if first is None:
        first = []
    indent_str = ' ' * indent
    for key, value in d.items():
        if isinstance(value, dict):
            print(f"{indent_str}{key}")
            print_dict_summary(value, first=first, indent=indent+5, first_num=first_num)
        elif isinstance(value, np.ndarray):
            # min()/max() raise on zero-size arrays, so empty arrays take the
            # shape/dtype branch together with the non-numeric ones.
            if value.size > 0 and np.issubdtype(value.dtype, np.number):
                print(f"{indent_str}{key}: NumPy array of shape {value.shape}, min: {value.min()}, max: {value.max()}")
            else:
                print(f"{indent_str}{key}: NumPy array of shape {value.shape}, type {value.dtype}")
        elif (hasattr(value, "__iter__") and hasattr(value, "__len__")
              and not isinstance(value, str)):
            # Sized iterable that is not a string; generators and other
            # iterators have no len() and fall through to the plain print.
            print(f"{indent_str}{key}: Array of {len(value)} items")
        else:
            print(indent_str, key, ":", value)
        if key in first:
            print_first(value, num=first_num, indent=indent+5)
151 from datetime import datetime
def str2time(input):
    """
    Convert a single string timestamp or a list of string timestamps to corresponding datetime object(s).

    Strings must look like ``YYYY-MM-DDTHH:MM:SS`` followed by a UTC offset;
    every 'Z' is rewritten to '+00:00' before parsing. A list is converted
    element by element and returned as a list. Any other input type raises
    ValueError.

    NOTE(review): this module defines ``str2time`` a second time further
    down; the later definition is the one bound to the name after import.
    """
    if isinstance(input, list):
        return [str2time(entry) for entry in input]
    if not isinstance(input, str):
        raise ValueError("Input must be a string or a list of strings")
    with_offset = input.replace('Z', '+00:00')
    return datetime.strptime(with_offset, '%Y-%m-%dT%H:%M:%S%z')
165 # interpolate linearly over nans
def filter_nan_values(t1, v1):
    """Drop the entries where ``v1`` is NaN from both ``v1`` and ``t1``.

    Returns a pair of new NumPy arrays ``(t1_filtered, v1_filtered)`` holding
    only the positions at which ``v1`` is not NaN.
    """
    keep = np.logical_not(np.isnan(v1))  # True where v1 holds a real value
    times = np.array(t1)
    values = np.array(v1)
    return times[keep], values[keep]
def time_intp(t1, v1, t2):
    """Linearly interpolate values ``v1`` given at times ``t1`` onto times ``t2``.

    NaN entries of ``v1`` (and the matching times) are dropped before
    interpolating, so the result is interpolated across those gaps.

    :param t1: 1D array of objects with a ``timestamp()`` method (e.g. datetime)
    :param v1: 1D array of values, same length as ``t1``
    :param t2: 1D array of objects with a ``timestamp()`` method
    :return: array of interpolated values at ``t2``, or None (after logging
        an error) when an input is not 1D or ``t1`` and ``v1`` differ in length

    NOTE(review): ``np.interp`` expects increasing sample points; ``t1`` is
    not checked for that here - confirm callers pass sorted times.
    """
    # Check if t1 v1 t2 are 1D arrays
    if t1.ndim != 1:
        logging.error("Error: t1 is not a 1D array. Dimension: %s", t1.ndim)
        return None
    if v1.ndim != 1:
        logging.error("Error: v1 is not a 1D array. Dimension %s:", v1.ndim)
        return None
    if t2.ndim != 1:
        # was logging.errorr, which raised AttributeError instead of logging
        logging.error("Error: t2 is not a 1D array. Dimension: %s", t2.ndim)
        return None
    # Check if t1 and v1 have the same length
    if len(t1) != len(v1):
        logging.error("Error: t1 and v1 have different lengths: %s %s", len(t1), len(v1))
        return None
    t1_no_nan, v1_no_nan = filter_nan_values(t1, v1)
    # Convert datetime objects to timestamps
    t1_stamps = np.array([t.timestamp() for t in t1_no_nan])
    t2_stamps = np.array([t.timestamp() for t in t2])

    # Interpolate using the filtered data
    v2_interpolated = np.interp(t2_stamps, t1_stamps, v1_no_nan)
    if np.isnan(v2_interpolated).any():
        # reported but not fatal: the array is still returned to the caller
        logging.error('time_intp: interpolated output contains NaN')

    return v2_interpolated
def str2time(strlist):
    """Convert 'YYYY-MM-DDTHH:MM:SSZ' timestamp strings to datetime objects.

    :param strlist: an iterable of timestamp strings, or a single string
    :return: NumPy array of datetime objects for an iterable input; a single
        datetime for a single string (previously a lone string was iterated
        character by character and failed to parse)
    :raises ValueError: if a string does not match the expected format

    The trailing 'Z' is matched literally, so the datetimes returned carry no
    tzinfo.
    """
    stamp_format = '%Y-%m-%dT%H:%M:%SZ'
    if isinstance(strlist, str):
        return datetime.strptime(strlist, stamp_format)
    # Convert array of strings to array of datetime objects
    return np.array([datetime.strptime(dt_str, stamp_format) for dt_str in strlist])
def check_increment(datetime_array, id=''):
    """Check whether consecutive entries of a datetime sequence are equally spaced.

    :param datetime_array: sequence of datetime objects
    :param id: label used as a prefix in the log messages
    :return: the common increment in hours when all increments are equal
        (an error is also logged if it is not positive); -1 when the
        increments differ or when there are fewer than two entries
    """
    # Calculate time differences between consecutive datetime values, in hours
    diffs_hours = np.array([
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(datetime_array[:-1], datetime_array[1:])
    ])
    if diffs_hours.size == 0:
        # Fewer than two time stamps: there is no increment to inspect, and
        # indexing diffs_hours[0] below would raise IndexError.
        logging.error('%s time array has fewer than two entries, cannot check increments', id)
        return -1
    first_step = diffs_hours[0]
    # Check if all time differences are the same
    if all(diffs_hours == first_step):
        logging.info('%s time array increments are %s hours', id, first_step)
        if first_step <= 0:
            logging.error('%s time array increements are not positive', id)
        return first_step
    logging.info('%s time array increments are min %s max %s', id,
                 np.min(diffs_hours), np.max(diffs_hours))
    return -1