我尝试将tensorflow数据集与lstm模型结合使用。这是我第一个集成tf数据集的项目,因此欢迎所有建议。
这是我所有的函数,它们从多个 CSV 文件中读取数据并将其处理为 dataset:
def csv_loader(path):
    """Build a CsvDataset reading one CSV file.

    Parses columns 0, 1, 2 and 5 of each row as float32 scalars.
    NOTE(review): `header` is read from enclosing scope — confirm it is
    defined before this function is first traced.
    """
    return tf.data.experimental.CsvDataset(
        path,
        # Bug fix: record_defaults must contain one entry per *selected*
        # column. select_cols picks 4 columns, but the original supplied
        # only 3 defaults, which CsvDataset rejects.
        record_defaults=[tf.float32] * 4,
        header=header,
        field_delim=',',
        select_cols=[0, 1, 2, 5],
        na_value='nan')
def split_feature_label(x, lbl_position, nb_attrib):
    """Split one windowed 2-D sample into ({'input1', 'input2'}, label).

    NOTE(review): written as a free function but reads
    `self.input_sequence_length` — presumably a method in the real
    file; confirm.
    """
    # Label: scalar at the last input timestep, column lbl_position + 1
    # (the +1 skips the leading csv-id column kept for
    # filter_mixed_csv_sample).
    output = x[self.input_sequence_length - 1, lbl_position + 1]
    # remove output from input set
    sub = list(range(x.shape[1]))
    sub.pop(lbl_position + 1)
    # remove id column used by function "filter_mixed_csv_sample"
    sub.pop(0)
    # Keep only the columns listed in `sub`:
    # transpose -> gather rows -> transpose back.
    x = tf.transpose(tf.gather(tf.transpose(x), sub))
    # input1: the full input sequence minus the trailing nb_attrib
    # attribute columns; input2: those attribute columns taken at the
    # last input timestep only.
    return {'input1': x[:self.input_sequence_length, :-nb_attrib],
            'input2': x[self.input_sequence_length - 1, -nb_attrib:]}, output
def filter_mixed_csv_sample(x):
    """Reject windows whose rows come from more than one source CSV.

    Column 0 carries a per-file id; a window is valid only when every
    row shares the same id. Returns a scalar boolean tensor, as
    required by Dataset.filter.
    """
    unique_ids, _ = tf.unique(x[:, 0])
    # Bug fix: Python `len(...)` / `if` on a tensor fails once this
    # predicate is traced into a tf.data graph; compare with tensor ops
    # and return the boolean tensor directly.
    return tf.size(unique_ids) <= 1
def filter_nan_missing_values(x):
    """Accept a sample only when it contains no NaN values.

    Returns a scalar boolean tensor: True if every element of `x` is a
    real number, False if any NaN is present.
    """
    # Bug fix: the original wrapped this in a bare `except: return False`,
    # which silently swallowed *every* error (including programming
    # mistakes) and is ineffective anyway once the predicate is traced
    # into a tf.data graph.
    has_nan = tf.math.reduce_any(tf.math.is_nan(x))
    return tf.math.logical_not(has_nan)
def to_timeseries(x):
    """Turn a per-row dataset into sliding fixed-length windows for an LSTM.

    Each output element is a block of (input_sequence_length +
    output_sequence_length) consecutive rows, advancing one row per
    window.
    NOTE(review): reads `self.*` although defined as a free function —
    presumably a method in the real file; confirm.
    """
    # Stack the per-column scalars of each CSV record into one 1-D row.
    x = x.map(lambda *items: tf.stack(items), num_parallel_calls=tf.data.AUTOTUNE)
    # Sliding windows of full sample length; drop_remainder guarantees
    # every window is complete.
    x = x.window(self.input_sequence_length + self.output_sequence_length, shift=1,
                 drop_remainder=True)
    # Flatten the nested window datasets, then re-batch by window size.
    # This reproduces the windows only because each window holds exactly
    # that many elements (drop_remainder=True above).
    x = x.flat_map(lambda i: i).batch(self.input_sequence_length + self.output_sequence_length)
    return x
def is_test(x, _):
    """Route every k-th enumerated sample to the test split.

    `x` is the index produced by Dataset.enumerate(); the element
    itself is ignored.
    NOTE(review): with val_split = 0.2 this keeps indices divisible by
    20, i.e. only 5% of the data — confirm the intended fraction isn't
    `x % int(1 / val_split) == 0` (1 in 5). Also note
    int(val_split * 100) == 0 for val_split < 0.01, which would raise a
    modulo-by-zero error.
    """
    # split dataset into test and training dataset
    return x % int(self.val_split * 100) == 0
def is_train(x, y):
    """Complement of is_test: keep samples not routed to the test split."""
    # Bug fix: Python's `not` on a tf.Tensor raises once this predicate
    # is traced into a tf.data graph ("using a tf.Tensor as a Python
    # bool"); use tf.math.logical_not instead.
    return tf.math.logical_not(is_test(x, y))
def apply_scaler(x, y):
    """Min-max scale both inputs and the label with pre-fitted scalers.

    Re-implements sklearn MinMaxScaler.transform with tensor-friendly
    arithmetic so it can run inside Dataset.map. Reads x1_scaler,
    x2_scaler and y_scaler from the enclosing scope.
    """
    def _minmax(values, scaler):
        # (v - data_min) / (data_max - data_min), rescaled to the
        # scaler's configured feature_range.
        lo, hi = scaler.feature_range
        std = (values - scaler.data_min_) / (scaler.data_max_ - scaler.data_min_)
        return std * (hi - lo) + lo

    return ({'input1': _minmax(x['input1'], x1_scaler),
             'input2': _minmax(x['input2'], x2_scaler)},
            _minmax(y, y_scaler))
以及我是如何编排这些处理步骤的:
# --- Dataset pipeline assembly -------------------------------------------
tf_list = tf.data.Dataset.list_files(list_files, shuffle=True)
# cycle_length=1 reads the files one after another; raise it to
# actually interleave records from several files at once.
dataset = tf_list.interleave(csv_loader, cycle_length=1)
with tf.device('/cpu:0'):
    # Bug fix: the original called to_timeseries(self.dataset), silently
    # ignoring the freshly built `dataset` above.
    dataset = to_timeseries(dataset)
    dataset = dataset.ignore_errors()
    dataset = dataset.filter(filter_nan_missing_values)
    dataset = dataset.filter(filter_mixed_csv_sample)
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.map(lambda x: split_feature_label(x, label_index, nb_attributs),
                          num_parallel_calls=tf.data.AUTOTUNE)
    # Load the pre-fitted scalers and normalise BEFORE the train/test
    # split. Bug fix: the original mapped apply_scaler onto `dataset`
    # only at the very end, so trainning_set/test_set were never scaled
    # when val_split > 0.
    x1_scaler = load('/artefacts/scalers/x1_scaler.sclr')
    x2_scaler = load('/artefacts/scalers/x2_scaler.sclr')
    y_scaler = load('/artefacts/scalers/y_scaler.sclr')
    dataset = dataset.map(apply_scaler, num_parallel_calls=tf.data.AUTOTUNE)
    # Split dataset into train/test sets by enumeration index.
    if val_split > 0:
        recover = lambda x, y: y
        test_set = dataset.enumerate().filter(is_test).map(recover)
        trainning_set = dataset.enumerate().filter(is_train).map(recover)
# Set up multi-GPU config if available. With MirroredStrategy the
# global batch is split across replicas, hence the scaling by
# num_replicas_in_sync.
if gpu:
    strategy = tf.distribute.MirroredStrategy()
    batch_size_per_replica = batch_size * strategy.num_replicas_in_sync
else:
    batch_size_per_replica = batch_size
if val_split == 0:
    dataset = dataset.batch(batch_size_per_replica).cache().prefetch(2)
else:
    trainning_set = trainning_set.batch(batch_size_per_replica).cache().prefetch(2)
    test_set = test_set.batch(batch_size_per_replica).cache().prefetch(2)
最后,这是我启动训练的方式:
# --- Training -------------------------------------------------------------
# NOTE(review): Keras ignores `shuffle` when fitting from a
# tf.data.Dataset; shuffling must happen in the pipeline (it does,
# above). Kept here for interface compatibility.
fit_kwargs = dict(verbose=verbose,
                  epochs=epochs,
                  callbacks=[checkpoint, early_stop],
                  shuffle=shuffle)
if val_split > 0:
    history = model.fit(trainning_set, validation_data=test_set, **fit_kwargs)
else:
    history = model.fit(dataset, **fit_kwargs)
这些代码在早期版本中可以正常工作;我做了一些"轻微"修改(例如添加多 GPU 优化),并重新组织了代码以清理无用的测试行。
Exception has occurred: TypeError
unhashable type: 'list'
File "/home/cy6112/CASTOR-S3/CASTOR-S3-cy6112/Code/timeseries_nn/core.py", line 797, in train
history = model.fit(dataset,
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/cy6112/CASTOR-S3/CASTOR-S3-cy6112/Code/timeseries_nn/workbench.py", line 33, in <module>
NN.train(ds, 200, 10, verbose=2, shuffle=True)
TypeError: unhashable type: 'list'
我的数据集是:
<_ParallelMapDataset element_spec=({'input1': TensorSpec(shape=(None, None, 3), dtype=tf.float32, name=None), 'input2': TensorSpec(shape=(None, 13), dtype=tf.float32, name=None)}, TensorSpec(shape=(None,), dtype=tf.float32, name=None))
感谢您的帮助和建议;如果您发现我使用数据集的方式有任何不妥,也请指出。
1条答案
按热度按时间ctrmrzij1#
您得到的错误源于预处理过程:在某个地方,您把一个列表(list)用作了需要可散列(hashable)的键,而列表不是可散列类型——字典的键必须是字符串、数字、元组等可散列对象。
我无法确定具体是哪个值用错了。建议您在各个函数中使用 breakpoint() 检查变量,逐步定位出错的位置。