tensorflow tf dataset“TypeError:unhashable type:'list'”错误

pobjuy32  于 7个月前  发布在  其他
关注(0)|答案(1)|浏览(130)

我尝试将tensorflow数据集与lstm模型结合使用。这是我第一个集成tf数据集的项目,因此欢迎所有建议。
这里是我所有的函数,它们从多个 csv 文件中获取数据并将其处理为 dataset:

def csv_loader(path):
            """Build a CsvDataset streaming the selected columns of one CSV file.

            `header` is a free variable captured from the enclosing scope —
            presumably a bool flag; confirm in the full class.

            NOTE(review): `record_defaults` must contain exactly one entry per
            *selected* column.  `select_cols=[0, 1, 2, 5]` selects FOUR
            columns, so four defaults are required; the original three caused
            a column/default mismatch.
            """
            return tf.data.experimental.CsvDataset(
                path,
                # one default per entry of select_cols (4 columns -> 4 defaults)
                record_defaults=[tf.float32, tf.float32, tf.float32, tf.float32],
                header=header,
                field_delim=',',
                select_cols=[0, 1, 2, 5],
                na_value='nan')

def split_feature_label(x, lbl_position, nb_attrib):
            """Split one window tensor into the model's two inputs and its label.

            NOTE(review): this snippet references `self` but declares no `self`
            parameter — presumably pasted out of a method; verify in context.
            Assumes `x` is 2-D (time, columns) with the csv-id in column 0, so
            the label lives at column `lbl_position + 1` — TODO confirm.
            """
            # Label: value at the last input timestep, in the label column.
            output = x[self.input_sequence_length - 1, lbl_position + 1]
            # remove output from input set
            sub = list(range(x.shape[1]))
            sub.pop(lbl_position + 1)  
            # remove the id column used by filter_mixed_csv_sample
            sub.pop(0)  

            # Keep only the surviving columns: gather selects along axis 0,
            # hence the transpose sandwich to select columns instead of rows.
            x = tf.transpose(tf.gather(tf.transpose(x), sub))
            # input1: the time series (all input steps, series columns);
            # input2: the trailing nb_attrib static attributes, taken from
            # the last input timestep only.
            return {'input1': x[:self.input_sequence_length, :-nb_attrib],
                    'input2': x[self.input_sequence_length - 1, -nb_attrib:]}, output

def filter_mixed_csv_sample(x):
            """Keep only windows whose id column (column 0) holds a single value.

            Windows slide over the concatenation of several CSVs, so a window
            can straddle a file boundary; such mixed windows are dropped.

            Implemented entirely with tensor ops: Dataset.filter traces this
            predicate into a graph, where `len()` on a tensor of unknown
            length and a Python `if` on a symbolic boolean both fail.
            """
            unique_ids, _ = tf.unique(x[:, 0])
            # Boolean tensor: True iff all rows share one id.
            return tf.size(unique_ids) <= 1

def filter_nan_missing_values(x):
            """Reject samples containing any NaN value.

            Returns a boolean tensor: True when the whole window is NaN-free.

            NOTE(review): the original bare `except:` swallowed every error.
            `tf.math.is_nan` only supports floating dtypes, so we narrow the
            handler to the exceptions that raise at trace time; a runtime
            error inside a traced graph would not be caught here anyway.
            """
            try:
                has_nan = tf.math.reduce_any(tf.math.is_nan(x))
                # Invert: filter keeps elements where the predicate is True.
                return tf.math.logical_not(has_nan)
            except (TypeError, ValueError):
                # Non-float input: reject the sample (best-effort, as before).
                return tf.constant(False)

def to_timeseries(x):
            """Turn a dataset of per-row column tuples into fixed-length windows.

            Each element's columns are stacked into one row tensor, then
            consecutive rows are grouped into overlapping windows of length
            input_sequence_length + output_sequence_length (shift=1) —
            the 3-D samples an LSTM expects once batched.
            """
            window_len = self.input_sequence_length + self.output_sequence_length
            rows = x.map(lambda *cols: tf.stack(cols),
                         num_parallel_calls=tf.data.AUTOTUNE)
            windows = rows.window(window_len, shift=1, drop_remainder=True)
            # Each window is itself a tiny dataset; re-batch it into a single
            # (window_len, n_cols) tensor. drop_remainder above guarantees
            # every window has exactly window_len rows.
            return windows.flat_map(lambda w: w.batch(window_len))

def is_test(x, _):
        """Route every `period`-th enumerated sample to the test split.

        `x` is the index produced by Dataset.enumerate(); the second
        argument (the sample itself) is unused.  NOTE(review): with
        val_split=0.2 the period is 20, i.e. 1/20 = 5% of samples become
        test data, not 20% — confirm this is intended.
        """
        period = int(self.val_split * 100)
        return x % period == 0
            
def is_train(x, y):
        """Complement of is_test: keep everything not routed to the test set.

        Uses tf.math.logical_not instead of Python `not`: Dataset.filter
        traces this predicate, where `is_test` yields a symbolic boolean
        tensor and `not tensor` raises OperatorNotAllowedInGraphError.
        """
        return tf.math.logical_not(is_test(x, y))

def apply_scaler(x, y):
            """Min-max scale both inputs and the label with pre-fitted scalers.

            Re-implements sklearn's MinMaxScaler.transform with tensor
            arithmetic: v_std = (v - data_min) / (data_max - data_min),
            then mapped into the scaler's feature_range.  The scalers
            (x1_scaler, x2_scaler, y_scaler) are free variables fitted
            elsewhere.
            """
            def scale(values, scaler):
                # [data_min, data_max] -> [0, 1] -> [range_lo, range_hi]
                range_lo = scaler.feature_range[0]
                range_hi = scaler.feature_range[1]
                std = (values - scaler.data_min_) / (scaler.data_max_ - scaler.data_min_)
                return std * (range_hi - range_lo) + range_lo

            scaled_inputs = {'input1': scale(x['input1'], x1_scaler),
                             'input2': scale(x['input2'], x2_scaler)}
            return scaled_inputs, scale(y, y_scaler)

字符串
以及我如何安排这些处理:

# Build the file-level pipeline: shuffle the csv paths, then interleave
# their rows into one stream (cycle_length=1 reads files sequentially).
tf_list = tf.data.Dataset.list_files(list_files, shuffle=True)
dataset = tf_list.interleave(csv_loader, cycle_length=1)

# Pin the windowing/filtering preprocessing to the CPU.
with tf.device('/cpu:0'):  
            # NOTE(review): this reads `self.dataset`, not the `dataset`
            # built just above — the interleaved CSV pipeline is discarded
            # unless self.dataset is the very same object.  Probably should
            # be to_timeseries(dataset); confirm against the full class.
            dataset = to_timeseries(self.dataset)
            dataset = dataset.ignore_errors()
            dataset = dataset.filter(filter_nan_missing_values)
            dataset = dataset.filter(filter_mixed_csv_sample)
            if shuffle:
                dataset = dataset.shuffle(shuffle_buffer)

            dataset = dataset.map(lambda x: split_feature_label(x, label_index, nb_attributs), num_parallel_calls=tf.data.AUTOTUNE)
            
            # Split dataset to train/test set: enumerate gives (index, sample);
            # `recover` drops the index again after filtering.
            if val_split > 0:
                recover = lambda x, y: y
                test_set = dataset.enumerate() \
                    .filter(is_test) \
                    .map(recover)

                trainning_set = dataset.enumerate() \
                    .filter(is_train) \
                    .map(recover)
            
            # set-up multi-GPUs config if availlable
            # NOTE(review): MirroredStrategy only distributes training if the
            # model is also created under strategy.scope(), which is not
            # visible here.  Also, batch_size * num_replicas is the *global*
            # batch size despite the "per_replica" name — verify intent.
            if gpu:
                strategy = tf.distribute.MirroredStrategy()
                batch_size_per_replica = batch_size * strategy.num_replicas_in_sync
            else:
                batch_size_per_replica = batch_size
            
            if val_split == 0:
                dataset = dataset.batch(batch_size_per_replica)
                dataset = dataset.cache()
                dataset = dataset.prefetch(2)  
    
            else:
                trainning_set = trainning_set.batch(batch_size_per_replica).cache().prefetch(2)
                test_set = test_set.batch(batch_size_per_replica).cache().prefetch(2)

# Load the pre-fitted MinMax scalers and apply them to the full dataset.
x1_scaler = load('/artefacts/scalers/x1_scaler.sclr')
x2_scaler = load('/artefacts/scalers/x2_scaler.sclr')
y_scaler = load('/artefacts/scalers/y_scaler.sclr')
# NOTE(review): when val_split > 0, training uses trainning_set/test_set,
# which were derived from `dataset` BEFORE this map — so the scaler is
# never applied to the data actually fed to model.fit.  Verify, and move
# this map ahead of the enumerate()/filter split if both paths need it.
dataset = dataset.map(apply_scaler, num_parallel_calls=tf.data.AUTOTUNE)


最后,我是如何开始训练的:

# NOTE: when the input is a tf.data.Dataset, Model.fit ignores the
# `shuffle` argument — shuffling must happen in the pipeline itself
# (dataset.shuffle), which is already done above when `shuffle` is set.
if val_split > 0:
    history = model.fit(trainning_set, validation_data=test_set,
                        verbose=verbose,
                        epochs=epochs,
                        callbacks=[checkpoint, early_stop],
                        shuffle=shuffle)
            
else:
    # NOTE(review): the reported "unhashable type: 'list'" surfaces here,
    # but fit merely triggers the traced pipeline — iterate the dataset
    # eagerly (e.g. next(iter(dataset))) to locate the map/filter step
    # that passes a list where a hashable key/tensor is expected.
    history = model.fit(dataset,
                        verbose=verbose,
                        epochs=epochs,
                        callbacks=[checkpoint, early_stop],
                        shuffle=shuffle)


这些代码在早期版本中可以正常工作;我做了一些“轻微”修改,例如添加多 GPU 优化,并重新组织了代码以清理无用的测试代码。

Exception has occurred: TypeError
unhashable type: 'list'
  File "/home/cy6112/CASTOR-S3/CASTOR-S3-cy6112/Code/timeseries_nn/core.py", line 797, in train
    history = model.fit(dataset,
              ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/cy6112/CASTOR-S3/CASTOR-S3-cy6112/Code/timeseries_nn/workbench.py", line 33, in <module>
    NN.train(ds, 200, 10, verbose=2, shuffle=True)
TypeError: unhashable type: 'list'


我的数据集是:

<_ParallelMapDataset element_spec=({'input1': TensorSpec(shape=(None, None, 3), dtype=tf.float32, name=None), 'input2': TensorSpec(shape=(None, 13), dtype=tf.float32, name=None)}, TensorSpec(shape=(None,), dtype=tf.float32, name=None))


感谢您的帮助和建议;如果您发现我使用数据集的方式有任何不妥之处,也请指出。

ctrmrzij

ctrmrzij1#

您得到的错误源于dataset示例。在预处理中,您在某处提供了一个列表作为要散列的键,但列表不是可散列类型。例如,

dictionary = {'key': 10}  # This works
dictionary = {['key']: 10}  # This does not

字符串
我不能确定你错误地使用了哪个值,但下面这行有点可疑:

sub = list(range(x.shape[1]))


我建议你在函数中使用breakpoint()来检查变量,或者看看它在哪里出错。

相关问题