# data_utils.py
import math
import numpy as np
from config_local import *
from scipy.stats import spearmanr
from scipy.integrate import simpson
from scipy.spatial.distance import cdist
from scipy.interpolate import interp1d


def magnitude(arr):
    """
    REQUIRES:
    - arr: 3-value array [x, y, z]
    EFFECTS: returns magnitude
    """
    return math.sqrt(float(arr[0])**2 + float(arr[1])**2 + float(arr[2])**2)

def multiplyArr(arr, constant):
    """
    REQUIRES:
    - arr: array of any length
    - constant: value to multiply all values in array by
    EFFECTS: returns arr with all values multiplied by constant (modifies arr in place)
    """
    for i in range(len(arr)):
        arr[i] = float(arr[i]) * constant
    return arr

class datafile:
    """
    Manages output file generation. Output files include rows of data (usually differences)
    """
    rotationFile = ""

    def __init__(self, rotationFile):
        self.rotationFile = configs["outputDataFolder"] + "/" + rotationFile
        self.clear()

    def clear(self):
        with open(self.rotationFile + '.txt', 'w') as file:
            pass

    def add(self, content):
        with open(self.rotationFile + '.txt', 'a') as file:
            file.write(str(content) + "\t")

    def newLine(self):
        with open(self.rotationFile + '.txt', 'rb+') as file:
            file.seek(0, 2)
            if file.tell() == 0:
                return
            # replace a trailing tab with a newline to end the current row
            file.seek(-1, 2)
            if file.read(1) == b'\t':
                file.seek(-1, 2)
                file.truncate()
                file.write(b'\n')

    def close(self):
        with open(self.rotationFile + '.txt', 'rb+') as file:
            file.seek(0, 2)
            if file.tell() == 0:
                return
            # strip a single trailing tab or newline
            # (read once; a second read(1) would consume a different byte)
            file.seek(-1, 2)
            lastChar = file.read(1)
            if lastChar == b'\t' or lastChar == b'\n':
                file.seek(-1, 2)
                file.truncate()
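
# Usage sketch for datafile (hypothetical file name; configs["outputDataFolder"]
# must point at an existing directory). Each add() appends a tab-separated cell,
# newLine() ends the current row, and close() trims the trailing separator.
#   df = datafile("example_rotation")
#   df.add(1.23)
#   df.add(4.56)
#   df.newLine()
#   df.close()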

def difference_sim_obs(simTimestamps, simValues, dataTimestamps, dataValues, method):
    """
    Calculates difference between sim and obs results using given method
    NOTE: timestamps must be converted to int format
    REQUIRES:
    - simTimestamps, simValues: arrays with simulation timestamps and values
    - dataTimestamps, dataValues: arrays with data timestamps and values
    - method: method to use (see METHODS)
    EFFECTS: returns value of difference between the two datasets
    -- METHODS --
    mse: mean squared error (default)
    mae: mean absolute error
    scc: spearman correlation coefficient
    curve_distance: curve distance
    """
    # remove all timestamps with no observation data (NaN)
    dataValues = np.array(dataValues)
    dataTimestamps = np.array(dataTimestamps)
    nanMask = ~np.isnan(dataValues)
    dataTimestamps = dataTimestamps[nanMask]
    dataValues = dataValues[nanMask]
    # interpolate the simulation onto the observation timestamps
    interpSimValues = interpolate(dataTimestamps, simTimestamps, simValues)
    if method == "mae":
        # mean absolute error
        n = len(dataTimestamps)
        absDiff = [np.abs(actual - predicted) for actual, predicted in zip(dataValues, interpSimValues)]
        mae = sum(absDiff) / n
        return mae
elif method == "scc":
# spearman correlation coefficient
scc, pvalue = spearmanr(dataValues, interpSimValues)
return scc
elif method == "curve_distance":
def curveDist(x1, y1, x2, y2):
# normalization factors
# 1 is observation, 2 is simulation
# 10 days in epoch time is 10.*24*3600
X = 10.*24*3600
# Normalization for OMNI data
Y = max(y1) - min(y1)
x1_normalized = np.array(x1) / X
x2_normalized = np.array(x2) / X
y1_normalized = np.array(y1) / Y
y2_normalized = np.array(y2) / Y
n1 = len(x1)
n2 = len(x2)
d1=0
d2=0
x1c = np.add(x1_normalized[1:n1],x1_normalized[0:n1-1])/2
x2c = np.add(x2_normalized[1:n2],x2_normalized[0:n2-1])/2
y1c = np.add(y1_normalized[1:n1],y1_normalized[0:n1-1])/2
y2c = np.add(y2_normalized[1:n2],y2_normalized[0:n2-1])/2
x1c = np.array(x1c)
x2c = np.array(x2c)
y1c = np.array(y1c)
y2c = np.array(y2c)
d1c = np.sqrt( (x1_normalized[1:n1] - x1_normalized[0:n1-1])**2 + (y1_normalized[1:n1] - y1_normalized[0:n1-1])**2 )
d2c = np.sqrt( (x2_normalized[1:n2] - x2_normalized[0:n2-1])**2 + (y2_normalized[1:n2] - y2_normalized[0:n2-1])**2 )
len1 = np.sum(d1c)
len2 = np.sum(d2c)
for i in range(0, n1-1):
d1 = d1 + d1c[i]*np.min( np.sqrt( (x1c[i] - x2c)**2 + (y1c[i] - y2c)**2 ) )
for i in range(0, n2-1):
d2 = d2 + d2c[i]*np.min( np.sqrt( (x2c[i] - x1c)**2 + (y2c[i] - y1c)**2 ) )
d = (d1/len1 + d2/len2)/2
return d
return curveDist(dataTimestamps, dataValues, simTimestamps, simValues)
    else:
        # mean squared error
        n = len(dataTimestamps)
        squaredDiff = [(actual - predicted)**2 for actual, predicted in zip(dataValues, interpSimValues)]
        mse = sum(squaredDiff) / n
        return mse
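
# Usage sketch for difference_sim_obs (illustrative values only, not from the
# repo). NaN observations are dropped before the simulation is interpolated
# onto the remaining observation timestamps, so the two series may have
# different lengths and spacings.
#   simT = [0, 3600, 7200]                      # epoch seconds
#   simV = [400.0, 420.0, 410.0]
#   obsT = [0, 1800, 3600, 5400, 7200]
#   obsV = [405.0, float("nan"), 415.0, 412.0, 408.0]
#   err = difference_sim_obs(simT, simV, obsT, obsV, "mse")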

def interpolate(x1, x2, y2):
    """
    REQUIRES:
    - x1: 1st dataset x axis
    - x2, y2: 2nd dataset
    EFFECTS: interpolates dataset 2 onto x1
    NOTE: np.interp expects x2 to be increasing
    """
    interpY2 = np.interp(x1, x2, y2)
    return np.array(interpY2)

def findPlotOpacities(arr):
    """
    REQUIRES:
    - arr: array to calculate opacity values for
    EFFECTS: interpolates min and max values to assign opacity values to each arr item. The opacity range is
    specified in config_local, and the min and max values will be assigned to the min and max values in the arr.
    """
    arr = np.array(arr)
    # replace nan with negative inf
    arr[np.isnan(arr)] = -np.inf
    # get indices that would sort the array
    if configs["diffCalcMethod"] == "scc":  # spearman cc has highest rank when closest to 1/-1
        arr = 1 - np.abs(arr)
    sorted_indices = np.argsort(arr)
    # create array of ranks based on the sorted indices
    ranks = np.empty_like(sorted_indices)
    ranks[sorted_indices] = np.arange(1, len(arr) + 1)
    # make nan values last place
    ranks[arr == -np.inf] = np.max(ranks) + 1
    # get opacity range
    minOpacity = configs["simLineOpacityRange"][0]
    maxOpacity = configs["simLineOpacityRange"][1]
    # calculate difference in opacity between 2 ranks
    opacityStep = (maxOpacity - minOpacity) / (np.max(ranks - 1))
    opacities = []
    # calculate individual opacities
    for rank in ranks:
        opacities.append(maxOpacity - opacityStep * (rank - 1))
    return opacities
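
# Usage sketch for findPlotOpacities (hypothetical config values). With
# configs["simLineOpacityRange"] = [0.2, 1.0] and diffCalcMethod "mse", the
# smallest difference gets the highest opacity and NaN entries get the lowest.
#   findPlotOpacities([0.4, 0.1, float("nan"), 0.9])
#   # -> [0.6, 0.8, 0.2, 0.4] (up to float rounding)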

def normalizeData(data):
    """
    REQUIRES:
    - data: input data
    EFFECTS: min-max normalizes data so the smallest value maps to 0 and the largest to 1
    """
    # spearman cc and curve distance should NOT be normalized
    if configs["diffCalcMethod"] == "scc" or configs["diffCalcMethod"] == "curve_distance":
        return data
    minValue = min(data)
    maxValue = max(data)
    return [(x - minValue) / (maxValue - minValue) for x in data]
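
# Usage sketch for normalizeData (assumes configs["diffCalcMethod"] is "mse"
# or "mae"; for "scc" and "curve_distance" the data is returned unchanged).
#   normalizeData([2.0, 4.0, 6.0])   # -> [0.0, 0.5, 1.0]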

def calculate2DArrayAverage(arr):
    """
    REQUIRES:
    - arr: input 2d array
      Nested arrays MUST be of equal length!
    EFFECTS: Averages all nested arrays in arr to form a single array
    """
    numRows = len(arr)
    numCols = len(arr[0])
    # normalizes spearman cc so best has lowest val and worst has highest
    if configs["diffCalcMethod"] == "scc":
        arr = 1 - np.abs(arr)
    averages = [0] * numCols
    for col in range(numCols):
        colSum = sum(row[col] for row in arr)
        averages[col] = colSum / numRows
    return averages
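
# Usage sketch for calculate2DArrayAverage: column-wise mean over equal-length
# rows (illustrative values; assumes configs["diffCalcMethod"] is not "scc").
#   calculate2DArrayAverage([[1, 2, 3], [3, 4, 5]])   # -> [2.0, 3.0, 4.0]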

def indexOfMinValue(arr):
    """
    REQUIRES:
    - arr: input array
    EFFECTS: Returns index of min value in array
    """
    # normalizes spearman cc so best has lowest val and worst has highest
    if configs["diffCalcMethod"] == "scc":
        arr = [1 - abs(val) for val in arr]
    minVal = min(arr)
    minIndex = arr.index(minVal)
    return minIndex

def filterDatasetByVarName(dataset, variables, varsToKeep):
    """
    REQUIRES:
    - dataset: array of values
    - variables: indices of variables MUST correlate to dataset
      ie. dataset[2] is the value for variables[2]
    - varsToKeep: variables whose corresponding value in dataset will be kept
    EFFECTS: Returns [newVarList, filteredDataset] where the indices of
    newVarList correspond to the items in filteredDataset
    """
    filteredDataset = []
    newVarList = []
    for i in range(len(varsToKeep)):
        filteredDataset.append(dataset[variables.index(varsToKeep[i])])
        newVarList.append(varsToKeep[i])
    return [newVarList, filteredDataset]
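
# Usage sketch for filterDatasetByVarName (hypothetical variable names):
#   varNames, values = filterDatasetByVarName(
#       [400.0, 5.0, 7.5],            # dataset
#       ["V", "n", "T"],              # variables, index-aligned with dataset
#       ["V", "T"],                   # varsToKeep
#   )
#   # varNames -> ["V", "T"], values -> [400.0, 7.5]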