2

我有几个数据框,想用 `interval_left_join` 把它们连接起来。理论上我可以逐个连接这些数据框,但更希望用一个函数一次性完成所有连接:

数据:

# Left table: the rows every other table is joined onto, one interval each.
df1 <- data.frame(
  line = seq_len(4),
  key = c("a", "b", NA, "a"),
  start = c(75, 100, 170, 240),
  end = c(100, 150, 190, 300)
)

# Right table contributing column `v2`.
df2 <- data.frame(
  v2 = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "F"),
  start = c(0, 10, 30, 90, 120, 130, 154, 161, 175, 199, 205, 300),
  end = c(10, 20, 50, 110, 130, 140, 160, 165, 180, 250, 300, 305)
)

# Right table contributing column `v3`.
df3 <- data.frame(
  v3 = c("a", "b", "c", "d", "e", "f"),
  start = c(5, 90, 200, 333, 1000, 1500),
  end = c(75, 171, 210, 400, 1001, 1600)
)

# Right table contributing column `v4`; same starts as df3, but the first
# interval ends at 1005 instead of 75.
df4 <- data.frame(
  v4 = c("x", "y", "z", "xx", "yy", "zz"),
  start = c(55, 90, 200, 333, 1000, 1500),
  end = c(1005, 171, 210, 400, 1001, 1600)
)

我想根据 `start`–`end` 区间合并到 `df1` 中的变量是 `v2`、`v3`、`v4`。到目前为止我尝试过的是下面的代码:它对 `v2` 输出了部分不正确的数据,而对 `v3` 和 `v4` 则完全失败——这里有什么遗漏或错误?

# install package "IRanges":
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("IRanges")

library(BiocManager)
library(fuzzyjoin)
library(data.table)
library(dplyr)
# Interval-left-join `df2` onto `df1` by their start/end ranges, then collapse
# the joined rows to one row per run of consecutive identical `key` values.
#
# df1: left table; the code reads `line`, `key`, `start` and `end` from it.
# df2: right table; joined on `start` and `end`.
# Returns the collapsed table with columns `line`, `start`, `end`, `key`, `v2`.
# Only `v2` is collapsed here (matches pasted with ",", "*" where there is no
# match); any other column of the joined table is not carried through.
# NOTE(review): str_c() comes from stringr, which is not attached by the
# library() calls above -- confirm it is loaded elsewhere.
join_dataframes <- function(df1, df2) {
  joined <- interval_left_join(x = df1, y = df2, by = c("start", "end"))

  # rleid() gives each run of consecutive equal `key` values its own id.
  by_run <- group_by(joined, grp = rleid(key))

  collapsed <- summarise(
    by_run,
    across(c(line, start.x, end.x), first),
    key = unique(key),
    v2 = str_c(if_else(!is.na(v2), v2, "*"), collapse = ",")
  )

  # Restore the left table's interval names and drop the helper run id.
  restored <- rename(collapsed, start = start.x, end = end.x)
  select(restored, -grp)
}

# Fold join_dataframes over the four tables from left to right: the result of
# each step becomes the first argument (`df1`) of the next call.
list_df <- list(df1, df2, df3, df4)
Reduce(join_dataframes, list_df)

期望的结果是这样的:

# A tibble: 4 x 7
   line key   v2    v3    start   end v4   
  <int> <chr> <chr> <chr> <dbl> <dbl> <chr>
1     1 a     D     a,b      75   100 x,y  
2     2 b     D,E,F b       100   150 x,y  
3     3 NA    I     b       170   190 x,y  
4     4 a     J,K,F *       240   300 x 
4

1 回答 1

3

在 `Reduce` 中只执行连接;`v2`、`v3`、`v4` 列可以在连接完成之后再进行汇总。

library(dplyr)
library(fuzzyjoin)
library(data.table)

# Interval-left-join `df2` onto `df1` by their start/end ranges and keep only
# the left table's interval, so the result can be fed into the next join.
#
# df1: left table with `start` and `end` columns.
# df2: right table with `start` and `end` columns.
# Returns the joined table with the right-hand `start.y`/`end.y` dropped and
# the left-hand `start.x`/`end.x` renamed back to `start`/`end`.
join_dataframes <- function(df1, df2) {
  joined <- interval_left_join(x = df1, y = df2, by = c("start", "end"))
  left_interval_only <- select(joined, -c(start.y, end.y))
  rename(left_interval_only, start = start.x, end = end.x)
}

list_df <- list(df1, df2, df3, df4)

# Join all tables first, then collapse once: one row per run of consecutive
# identical `key` values, with the distinct matches of each v-column pasted
# together ("*" marks a missing match).
Reduce(join_dataframes, list_df) %>%
  group_by(grp = rleid(key)) %>%
  summarise(
    across(c(line, start, end), first),
    across(
      v2:v4,
      ~ paste(unique(if_else(is.na(.x), "*", .x)), collapse = ", ")
    ),
    key = unique(key)
  )


#    grp  line start   end v2      v3    v4    key  
#* <int> <int> <dbl> <dbl> <chr>   <chr> <chr> <chr>
#1     1     1    75   100 D       a, b  x, y  a    
#2     2     2   100   150 D, E, F b     x, y  b    
#3     3     3   170   190 I       b     x, y  NA   
#4     4     4   240   300 J, K, F *     x     a   
于 2021-03-02T11:49:28.957 回答