在R中汇总分组行

rqmkfv5c  于 5个月前  发布在  其他
关注(0)|答案(4)|浏览(42)

我有一个像这样的大框架:

t_stamp relaxation  pit
2023-12-19T15:06:35Z    FALSE   -0.038911564
2023-12-19T15:06:51Z    FALSE   -0.037203242
2023-12-19T15:07:03Z    FALSE   -0.035922001
2023-12-19T15:07:19Z    FALSE   -0.034213679
2023-12-19T15:07:35Z    FALSE   -0.032505357
2023-12-19T15:07:46Z    FALSE   -0.031224116
2023-12-19T15:08:06Z    FALSE   -0.029483415
2023-12-19T15:08:14Z    FALSE   -0.029968068
2023-12-19T15:08:34Z    FALSE   -0.030740263
2023-12-19T15:08:46Z    FALSE   0.073617363
2023-12-19T15:09:06Z    TRUE    0.20609313249588013
2023-12-19T15:09:10Z    TRUE    0.21614002653179293
2023-12-19T15:09:33Z    TRUE    0.24940015375614166
2023-12-19T15:09:49Z    TRUE    0.11678649167975427
2023-12-19T15:10:05Z    FALSE   0.008699417
2023-12-19T15:10:17Z    FALSE   0.002999872
2023-12-19T15:10:33Z    FALSE   -0.003341056
2023-12-19T15:10:44Z    FALSE   -0.008096753
2023-12-19T15:11:04Z    FALSE   -0.016022914

字符串
我想在每次“relaxation”列中的标签发生变化时计算“pit”列中的值的平均值。例如,对于此代码段,将创建三个汇总值:两个用于两组“pit”值,一个用于一组TRUE值。
我试着使用了来自dubur的group_by and summarize技术,但最终只创建了两个组:一个是当“relaxation”为“pit”时的“pit”值,另一个是当“relaxation”值为TRUE时的“pit”值。我想创建另一个变量来有效地对值进行分组,但找不到一个合乎逻辑的方法来做到这一点。

gfttwv5a

gfttwv5a1#

data.table

library(data.table)

setDT(df)[, list(MEAN = mean(pit)), by = rleid(relaxation)]

   rleid         MEAN
1:     1 -0.022655434
2:     2  0.197104951
3:     3 -0.003152287

字符串

s8vozzvw

s8vozzvw2#

不如这样吧:

  • cumsum(relaxation != lag(relaxation, default = TRUE))创建一个数字变量,该变量以1开头,每次relaxationTRUE/FALSE之间切换时递增,允许您检测组
library(tidyverse)

df <- read_table("t_stamp relaxation  pit
2023-12-19T15:06:35Z    FALSE   -0.038911564
2023-12-19T15:06:51Z    FALSE   -0.037203242
2023-12-19T15:07:03Z    FALSE   -0.035922001
2023-12-19T15:07:19Z    FALSE   -0.034213679
2023-12-19T15:07:35Z    FALSE   -0.032505357
2023-12-19T15:07:46Z    FALSE   -0.031224116
2023-12-19T15:08:06Z    FALSE   -0.029483415
2023-12-19T15:08:14Z    FALSE   -0.029968068
2023-12-19T15:08:34Z    FALSE   -0.030740263
2023-12-19T15:08:46Z    FALSE   0.073617363
2023-12-19T15:09:06Z    TRUE    0.20609313249588013
2023-12-19T15:09:10Z    TRUE    0.21614002653179293
2023-12-19T15:09:33Z    TRUE    0.24940015375614166
2023-12-19T15:09:49Z    TRUE    0.11678649167975427
2023-12-19T15:10:05Z    FALSE   0.008699417
2023-12-19T15:10:17Z    FALSE   0.002999872
2023-12-19T15:10:33Z    FALSE   -0.003341056
2023-12-19T15:10:44Z    FALSE   -0.008096753
2023-12-19T15:11:04Z    FALSE   -0.016022914")

df |> 
  mutate(group = cumsum(relaxation != lag(relaxation, default = TRUE))) |> 
  group_by(group) |> 
  summarise(mean_pit = mean(pit))
#> # A tibble: 3 × 2
#>   group mean_pit
#>   <int>    <dbl>
#> 1     1 -0.0227 
#> 2     2  0.197  
#> 3     3 -0.00315

字符串

xn1cxnb4

xn1cxnb43#

使用tapply

> tapply(dat$pit, cumsum(c(0, diff(dat$relaxation)) != 0), mean)
           0            1            2 
-0.022655434  0.197104951 -0.003152287

字符串
使用ave,您也可以将其作为列。

> transform(dat, pit_mn=ave(pit, cumsum(c(0, diff(relaxation)) != 0), FUN=mean))
                t_stamp relaxation          pit       pit_mn
1  2023-12-19T15:06:35Z      FALSE -0.038911564 -0.022655434
2  2023-12-19T15:06:51Z      FALSE -0.037203242 -0.022655434
3  2023-12-19T15:07:03Z      FALSE -0.035922001 -0.022655434
4  2023-12-19T15:07:19Z      FALSE -0.034213679 -0.022655434
5  2023-12-19T15:07:35Z      FALSE -0.032505357 -0.022655434
6  2023-12-19T15:07:46Z      FALSE -0.031224116 -0.022655434
7  2023-12-19T15:08:06Z      FALSE -0.029483415 -0.022655434
8  2023-12-19T15:08:14Z      FALSE -0.029968068 -0.022655434
9  2023-12-19T15:08:34Z      FALSE -0.030740263 -0.022655434
10 2023-12-19T15:08:46Z      FALSE  0.073617363 -0.022655434
11 2023-12-19T15:09:06Z       TRUE  0.206093132  0.197104951
12 2023-12-19T15:09:10Z       TRUE  0.216140027  0.197104951
13 2023-12-19T15:09:33Z       TRUE  0.249400154  0.197104951
14 2023-12-19T15:09:49Z       TRUE  0.116786492  0.197104951
15 2023-12-19T15:10:05Z      FALSE  0.008699417 -0.003152287
16 2023-12-19T15:10:17Z      FALSE  0.002999872 -0.003152287
17 2023-12-19T15:10:33Z      FALSE -0.003341056 -0.003152287
18 2023-12-19T15:10:44Z      FALSE -0.008096753 -0.003152287
19 2023-12-19T15:11:04Z      FALSE -0.016022914 -0.003152287

  • 数据类型:*
> dput(dat)
structure(list(t_stamp = c("2023-12-19T15:06:35Z", "2023-12-19T15:06:51Z", 
"2023-12-19T15:07:03Z", "2023-12-19T15:07:19Z", "2023-12-19T15:07:35Z", 
"2023-12-19T15:07:46Z", "2023-12-19T15:08:06Z", "2023-12-19T15:08:14Z", 
"2023-12-19T15:08:34Z", "2023-12-19T15:08:46Z", "2023-12-19T15:09:06Z", 
"2023-12-19T15:09:10Z", "2023-12-19T15:09:33Z", "2023-12-19T15:09:49Z", 
"2023-12-19T15:10:05Z", "2023-12-19T15:10:17Z", "2023-12-19T15:10:33Z", 
"2023-12-19T15:10:44Z", "2023-12-19T15:11:04Z"), relaxation = c(FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE), pit = c(-0.038911564, 
-0.037203242, -0.035922001, -0.034213679, -0.032505357, -0.031224116, 
-0.029483415, -0.029968068, -0.030740263, 0.073617363, 0.20609313249588, 
0.216140026531793, 0.249400153756142, 0.116786491679754, 0.008699417, 
0.002999872, -0.003341056, -0.008096753, -0.016022914)), class = "data.frame", row.names = c(NA, 
-19L))

lqfhib0f

lqfhib0f4#

这里有一个使用ifelse和cumsum创建分组列的解决方案。然后可以如上所述使用group_by()

df <- read.table(header=TRUE, text="t_stamp relaxation  pit
2023-12-19T15:06:35Z    FALSE   -0.038911564
2023-12-19T15:06:51Z    FALSE   -0.037203242
2023-12-19T15:07:03Z    FALSE   -0.035922001
2023-12-19T15:07:19Z    FALSE   -0.034213679
2023-12-19T15:07:35Z    FALSE   -0.032505357
2023-12-19T15:07:46Z    FALSE   -0.031224116
2023-12-19T15:08:06Z    FALSE   -0.029483415
2023-12-19T15:08:14Z    FALSE   -0.029968068
2023-12-19T15:08:34Z    FALSE   -0.030740263
2023-12-19T15:08:46Z    FALSE   0.073617363
2023-12-19T15:09:06Z    TRUE    0.20609313249588013
2023-12-19T15:09:10Z    TRUE    0.21614002653179293
2023-12-19T15:09:33Z    TRUE    0.24940015375614166
2023-12-19T15:09:49Z    TRUE    0.11678649167975427
2023-12-19T15:10:05Z    FALSE   0.008699417
2023-12-19T15:10:17Z    FALSE   0.002999872
2023-12-19T15:10:33Z    FALSE   -0.003341056
2023-12-19T15:10:44Z    FALSE   -0.008096753
2023-12-19T15:11:04Z    FALSE   -0.016022914")
                                                         
library(dplyr)
#define the grouping column
df$groupings = cumsum(ifelse(df$relaxation== lag(df$relaxation, default=1), 0, 1))

#create the summary
df %>% group_by(groupings) %>% summarize(relaxation= first(relaxation), meanvalue= mean(pit))

字符串

相关问题