pandas 基于df1中的0,1和df2中的值,从另外两个df构建一个df

ulydmbyx  于 6个月前  发布在  其他
关注(0)|答案(1)|浏览(72)

我有两个df。一个只包含1和0(**df_one_zero**),另一个包含各种不同的值(df_value_total)。这两个df都有上千行、上千列!
每个df的第一列是id,我们根本不想改变它。
我想用大小为5的滑动窗口沿列方向移动。因此,对于每个窗口,我要同时处理这两个子表:df_one_zero_window和df_value_window。
在每个窗口中,1开始和结束的列很重要。
然后我想创建另一个与df_one_zero形状相同的df_out(初始全为零)。假设在某一行中,1从列col开始、到列col_end结束,
则令 df_out(row, col-1) = df_value_window(row, col-1) - df_value_window(row, col),其余位置保持为零。(如果1从索引0开始,或者在最后一列结束,也没有关系,这种情况下不需要设置对应的值。)此外,如果df_one_zero_window中的1在col_end结束,则令 df_out(row, col_end+1) = df_value_window(row, col_end+1) - df_value_window(row, col_end)。对于下面的这些df,我希望得到 df_out = df2。df_value_total中的值实际上非常多样,这里只是从我的df中挑了一些简单的数字。

# Indicator frame: an 'id' column followed by columns '0'..'19' that hold
# only 0 or 1 (three rows: a, b, c).
df = pd.DataFrame({
    'id': ['a', 'b', 'c'],
    '0': [0, 0, 0],
    '1': [1, 0, 1],
    '2': [1, 1, 1],
    '3': [0, 0, 0],
    '4': [0, 0, 0],

    '5': [0, 0, 0],
    '6': [0, 1, 1],
    '7': [0, 0, 1],
    '8': [0, 0, 0],
    '9': [0, 0, 0],

    '10': [0, 0, 0],
    '11': [0, 0, 1],
    '12': [1, 1, 1],
    '13': [1, 0, 0],
    '14': [0, 0, 0],

    '15': [0, 0, 0],
    '16': [0, 1, 1],
    '17': [1, 1, 0],
    '18': [0, 0, 0],
    '19': [0, 0, 0],
})

# Value frame: same layout as the indicator frame, holding arbitrary numbers.
df1 = pd.DataFrame({
    'id': ['a', 'b', 'c'],
    '0': [4, 0, 9],
    '1': [0, 0, 1],
    '2': [1, 1, 3],
    '3': [6, 2, 0],
    '4': [0, 0, 0],

    '5': [0, 5, 0],
    '6': [0, 1, 2],
    '7': [0, 0, 1],
    '8': [0, 0, 3],
    '9': [0, 0, 0],

    '10': [0, 0, 0],
    '11': [0, 0, 1],
    '12': [1, 1, 1],
    '13': [1, 3, 4],
    '14': [9, 0, 0],

    '15': [0, 0, 0],
    '16': [2, 1, 1],
    '17': [1, 1, 4],
    '18': [0, 5, 0],
    '19': [0, 0, 0],
})

我尝试实现了其中一部分,但我无法追踪1在哪里结束,而且我觉得这种写法也不够高效!你能帮帮我吗?

def generate_df_out(df_one_zero, df_value_total, window_size=5):
    """Draft attempt: write one start-of-run difference per row and window.

    Steps through the columns in strides of ``window_size`` starting at
    position 1, slices both frames to the current window, and for each row
    looks for a 1 in the indicator window.

    NOTE(review): ``df_out`` is never created inside this function, so the
    writes below and the final ``return`` depend on a name from an enclosing
    scope.  Nothing here handles the column where a run of 1s ends.
    """

    for col in range(1, len(df_one_zero.columns), window_size):
        
        # Positional slices covering the current window of columns.
        df1_window = df_one_zero.iloc[:, col:col + window_size]
        
        df_value_window = df_value_total.iloc[:, col:col + window_size]
        
        for row in range(df1_window.shape[0]):
            
            # 0 doubles as "no 1 handled yet in this window for this row".
            start_idx = 0
            
            # NOTE(review): this loop reuses the name `col`, shadowing the
            # outer loop variable; inside it `col` is window-relative.
            for col in range(window_size):
            
                if df1_window.iloc[row, col] == 1 and start_idx==0:
                    
                    # The target position uses the window-relative `col`, not
                    # the position in the full frame, and `col-1` is -1 when
                    # `col` is 0.  The operands are also in the opposite order
                    # from the rule stated in the question text.
                    df_out.iloc[row, col-1] = df_value_window.iloc[row, col] - df_value_window.iloc[row, col-1]
                
                    # Adding 0 leaves the flag unset when the 1 sits at
                    # window position 0.
                    start_idx += col

    return df_out

# `df_out` is first bound here, i.e. only after the call above has returned.
df_out = generate_df_out(df, df1)


我想要的输出是这样的:

# Expected result: same layout as the input frames ('id' plus '0'..'19').
# The separator lines between column groups are plain blank lines; the
# zero-width-space characters that were there are a SyntaxError in Python.
df2 = pd.DataFrame()
df2['id'] = ['a', 'b', 'c']
df2['0'] = [4, 0, 8]
df2['1'] = [0, -1, 0]
df2['2'] = [0, 1, 0]
df2['3'] = [5, 1, -1]
df2['4'] = [0, 0, 0]

df2['5'] = [0, 4, -1]
df2['6'] = [0, 0, 0]
df2['7'] = [0, -1, 0]
df2['8'] = [0, 0, 2]
df2['9'] = [0, 0, 0]

df2['10'] = [0, 0, -1]
df2['11'] = [-1, -1, 0]
df2['12'] = [0, 0, 0]
df2['13'] = [0, 2, 3]
df2['14'] = [9, 0, 0]

df2['15'] = [0, -1, -1]
df2['16'] = [1, 0, 0]
df2['17'] = [0, 0, 3]
df2['18'] = [-1, 4, 0]
df2['19'] = [0, 0, 0]
df2
id  0   1   2   3   4   5   6   7   8   ... 10  11  12  13  14  15  16  17  18  19
0   a   4   0   0   5   0   0   0   0   0   ... 0   -1  1   1   9   0   1   0   -1  0
1   b   0   -1  1   1   0   4   0   -1  0   ... 0   -1  0   2   0   -1  0   0   4   0
2   c   8   0   0   -1  0   -1  0   0   2   ... -1  0   0   3   0   -1  0   3   0   0

yebdmbv4

yebdmbv41#

为了实现这一点,你需要一个函数来处理每个窗口并相应地更新df_out。该函数应该遍历每个窗口,跟踪df_one_zero中1序列的开始和结束,并根据这些索引计算df_value_total中的差异。以下是更新后的函数:

def generate_df_out(df_one_zero, df_value_total, window_size=5):
    """Return the value differences at the edges of runs of 1s.

    Both frames are read by position: column 0 is an identifier that is
    copied to the result unchanged, every later column is data.  The data
    columns are split into consecutive windows of ``window_size`` columns
    and, for every row:

    * the first 1 inside each window, at data column ``c``, sets
      ``out[c - 1] = value[c - 1] - value[c]`` (skipped when ``c`` is the
      first data column, so the identifier is never overwritten);
    * every 1 at data column ``c`` that is directly followed by a 0 sets
      ``out[c + 1] = value[c + 1] - value[c]``.

    All other data cells are 0.  If both rules target the same cell, the
    start-of-run value is the one that is kept.

    Args:
        df_one_zero: Indicator frame; cells equal to 1 mark a run, cells
            equal to 0 mark a gap.
        df_value_total: Value frame, positionally aligned with
            ``df_one_zero``.
        window_size: Number of data columns per window.

    Returns:
        A new DataFrame with the index and columns of ``df_one_zero``.  The
        data columns take the dtype of the values, so non-integer values are
        stored as they are.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
        IndexError: If ``df_value_total`` has fewer rows or columns than
            ``df_one_zero``.
    """
    # Local import: the surrounding snippet only brings pandas into scope.
    import numpy as np

    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    flags = df_one_zero.iloc[:, 1:].to_numpy()
    n_rows, n_cols = flags.shape
    values = df_value_total.iloc[:n_rows, 1:n_cols + 1].to_numpy()
    if values.shape != flags.shape:
        raise IndexError(
            "df_value_total has fewer rows or columns than df_one_zero"
        )

    ones = flags == 1
    zeros = flags == 0

    # Flag the first 1 of every (row, window): pad the columns up to a whole
    # number of windows, then keep the 1s whose in-window running count is 1.
    n_windows = -(-n_cols // window_size)
    padded = np.zeros((n_rows, n_windows * window_size), dtype=bool)
    padded[:, :n_cols] = ones
    windows = padded.reshape(n_rows, n_windows, window_size)
    first_in_window = windows & (windows.cumsum(axis=2) == 1)
    first_in_window = first_in_window.reshape(
        n_rows, n_windows * window_size
    )[:, :n_cols]

    out = np.zeros_like(values)

    # End rule: a 1 at column c followed by a 0 writes into column c + 1.
    run_ends = ones[:, :-1] & zeros[:, 1:]
    out[:, 1:][run_ends] = (values[:, 1:] - values[:, :-1])[run_ends]

    # Start rule, applied second so it wins when both rules hit one cell:
    # the first 1 of a window at column c writes into column c - 1.
    run_starts = first_in_window[:, 1:]
    out[:, :-1][run_starts] = (values[:, :-1] - values[:, 1:])[run_starts]

    df_out = pd.DataFrame(
        out, index=df_one_zero.index, columns=df_one_zero.columns[1:]
    )
    df_out.insert(0, df_one_zero.columns[0], df_one_zero.iloc[:, 0].to_numpy())
    return df_out

示例运行:

import pandas as pd

# Indicator frame: an 'id' column followed by columns '0'..'19' that hold
# only 0 or 1 (three rows: a, b, c).
df = pd.DataFrame({
    'id': ['a', 'b', 'c'],
    '0': [0, 0, 0],
    '1': [1, 0, 1],
    '2': [1, 1, 1],
    '3': [0, 0, 0],
    '4': [0, 0, 0],

    '5': [0, 0, 0],
    '6': [0, 1, 1],
    '7': [0, 0, 1],
    '8': [0, 0, 0],
    '9': [0, 0, 0],

    '10': [0, 0, 0],
    '11': [0, 0, 1],
    '12': [1, 1, 1],
    '13': [1, 0, 0],
    '14': [0, 0, 0],

    '15': [0, 0, 0],
    '16': [0, 1, 1],
    '17': [1, 1, 0],
    '18': [0, 0, 0],
    '19': [0, 0, 0],
})

# Value frame: same layout as the indicator frame, holding arbitrary numbers.
df1 = pd.DataFrame({
    'id': ['a', 'b', 'c'],
    '0': [4, 0, 9],
    '1': [0, 0, 1],
    '2': [1, 1, 3],
    '3': [6, 2, 0],
    '4': [0, 0, 0],

    '5': [0, 5, 0],
    '6': [0, 1, 2],
    '7': [0, 0, 1],
    '8': [0, 0, 3],
    '9': [0, 0, 0],

    '10': [0, 0, 0],
    '11': [0, 0, 1],
    '12': [1, 1, 1],
    '13': [1, 3, 4],
    '14': [9, 0, 0],

    '15': [0, 0, 0],
    '16': [2, 1, 1],
    '17': [1, 1, 4],
    '18': [0, 5, 0],
    '19': [0, 0, 0],
})



def generate_df_out(df_one_zero, df_value_total, window_size=5):
    """Return the value differences at the edges of runs of 1s.

    Both frames are read by position: column 0 is an identifier that is
    copied to the result unchanged, every later column is data.  The data
    columns are split into consecutive windows of ``window_size`` columns
    and, for every row:

    * the first 1 inside each window, at data column ``c``, sets
      ``out[c - 1] = value[c - 1] - value[c]`` (skipped when ``c`` is the
      first data column, so the identifier is never overwritten);
    * every 1 at data column ``c`` that is directly followed by a 0 sets
      ``out[c + 1] = value[c + 1] - value[c]``.

    All other data cells are 0.  If both rules target the same cell, the
    start-of-run value is the one that is kept.

    Args:
        df_one_zero: Indicator frame; cells equal to 1 mark a run, cells
            equal to 0 mark a gap.
        df_value_total: Value frame, positionally aligned with
            ``df_one_zero``.
        window_size: Number of data columns per window.

    Returns:
        A new DataFrame with the index and columns of ``df_one_zero``.  The
        data columns take the dtype of the values, so non-integer values are
        stored as they are.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
        IndexError: If ``df_value_total`` has fewer rows or columns than
            ``df_one_zero``.
    """
    # Local import: the surrounding snippet only brings pandas into scope.
    import numpy as np

    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    flags = df_one_zero.iloc[:, 1:].to_numpy()
    n_rows, n_cols = flags.shape
    values = df_value_total.iloc[:n_rows, 1:n_cols + 1].to_numpy()
    if values.shape != flags.shape:
        raise IndexError(
            "df_value_total has fewer rows or columns than df_one_zero"
        )

    ones = flags == 1
    zeros = flags == 0

    # Flag the first 1 of every (row, window): pad the columns up to a whole
    # number of windows, then keep the 1s whose in-window running count is 1.
    n_windows = -(-n_cols // window_size)
    padded = np.zeros((n_rows, n_windows * window_size), dtype=bool)
    padded[:, :n_cols] = ones
    windows = padded.reshape(n_rows, n_windows, window_size)
    first_in_window = windows & (windows.cumsum(axis=2) == 1)
    first_in_window = first_in_window.reshape(
        n_rows, n_windows * window_size
    )[:, :n_cols]

    out = np.zeros_like(values)

    # End rule: a 1 at column c followed by a 0 writes into column c + 1.
    run_ends = ones[:, :-1] & zeros[:, 1:]
    out[:, 1:][run_ends] = (values[:, 1:] - values[:, :-1])[run_ends]

    # Start rule, applied second so it wins when both rules hit one cell:
    # the first 1 of a window at column c writes into column c - 1.
    run_starts = first_in_window[:, 1:]
    out[:, :-1][run_starts] = (values[:, :-1] - values[:, 1:])[run_starts]

    df_out = pd.DataFrame(
        out, index=df_one_zero.index, columns=df_one_zero.columns[1:]
    )
    df_out.insert(0, df_one_zero.columns[0], df_one_zero.iloc[:, 0].to_numpy())
    return df_out

# Example usage: build the difference frame from the indicator frame `df`
# and the value frame `df1` defined above, then print it.
df_out = generate_df_out(df, df1)

print(df_out)


输出:

id  0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15  16  17  18  19
0  a  4  0  0  5  0  0  0  0  0  0   0  -1   0   0   8   0   1   0  -1   0
1  b  0 -1  0  1  0  4  0 -1  0  0   0  -1   0   2   0  -1   0   0   4   0
2  c  8  0  0 -3  0 -2  0  0  2  0  -1   0   0   3   0  -1   0   3   0   0

相关问题