考虑使用 R 内置的 stats 库,通过 aggregate 函数进行完全聚合:
# Full aggregation: one row per (GROUP, group_1) combination, holding the
# number of num_col values that fall in that combination.
agg_df <- aggregate(
  num_col ~ GROUP + group_1,
  data = flow_group_df,
  FUN = length
)
或者使用 ave 函数进行内联聚合:
# Inline aggregation: ave() returns one value per input row, so the size of
# each (GROUP, group_1) cell is stored as a new column on the same data frame.
flow_group_df$group_1_count <- ave(
  flow_group_df$num_col,
  flow_group_df$GROUP,
  flow_group_df$group_1,
  FUN = length
)
用随机数据进行演示:
# Build a reproducible 60-row demo data frame.
# The three GROUP labels are repeated in order to fill all 60 rows; group_1
# is drawn before num_col, so the random stream is consumed in that order.
set.seed(72318)
flow_group_df <- data.frame(
  GROUP = rep(c("julia", "r", "pandas"), length.out = 60),
  group_1 = sample(c("flagged", "checked"), size = 60, replace = TRUE),
  num_col = runif(n = 60, min = 0, max = 100)
)
aggregate(完全聚合)
# Count the num_col values in every (GROUP, group_1) combination.
agg_df <- aggregate(
  num_col ~ GROUP + group_1,
  data = flow_group_df,
  FUN = length
)
# Sort by GROUP, then by group_1 within each GROUP.
agg_df <- agg_df[order(agg_df$GROUP, agg_df$group_1), ]
# Drop the permuted row names left over from sorting.
rownames(agg_df) <- NULL
# The third column holds the counts, so name it accordingly.
names(agg_df)[3] <- "count"
agg_df
# GROUP group_1 count
# 1 julia checked 10
# 2 julia flagged 10
# 3 pandas checked 8
# 4 pandas flagged 12
# 5 r checked 7
# 6 r flagged 13
ave(计数和百分比计算)
# Per-row size of the (GROUP, group_1) cell each row belongs to.
flow_group_df$group_1_count <- ave(
  flow_group_df$num_col,
  flow_group_df$GROUP,
  flow_group_df$group_1,
  FUN = length
)
# Share of the row's GROUP that falls in its group_1 value: the cell size
# divided by the size of the whole GROUP (a proportion between 0 and 1).
flow_group_df$group_1_pct <- flow_group_df$group_1_count /
  ave(flow_group_df$num_col, flow_group_df$GROUP, FUN = length)
# Sort by GROUP, then by group_1 within each GROUP.
flow_group_df <- flow_group_df[
  order(flow_group_df$GROUP, flow_group_df$group_1),
]
# Drop the permuted row names left over from sorting.
rownames(flow_group_df) <- NULL
tail(flow_group_df, n = 20)
# GROUP group_1 num_col group_1_count group_1_pct
# 41 r checked 8.128056 7 0.35
# 42 r checked 86.439911 7 0.35
# 43 r checked 75.488474 7 0.35
# 44 r checked 88.120510 7 0.35
# 45 r checked 43.058268 7 0.35
# 46 r checked 46.662674 7 0.35
# 47 r checked 42.329505 7 0.35
# 48 r flagged 94.959380 13 0.65
# 49 r flagged 64.817015 13 0.65
# 50 r flagged 61.118952 13 0.65
# 51 r flagged 69.104977 13 0.65
# 52 r flagged 98.078729 13 0.65
# 53 r flagged 74.857959 13 0.65
# 54 r flagged 83.813440 13 0.65
# 55 r flagged 99.069011 13 0.65
# 56 r flagged 62.298414 13 0.65
# 57 r flagged 14.335920 13 0.65
# 58 r flagged 70.404048 13 0.65
# 59 r flagged 18.744892 13 0.65
# 60 r flagged 21.598072 13 0.65