只是部分答案,因为我是 data.table 的新手。自联接对数字有效,但同样的写法对字符串却失败了。我相信某位 data.table 专家知道该怎么做。
library(data.table)
# Build the benchmark table: six code/description pairs repeated `n` times
# (6 * n rows in total). The last pair has an NA code.
n <- 1e6
cpt.desc <- data.table(
  cpt = rep(
    c(23456, 23456, 10000, 44555, 44555, NA),
    times = n
  ),
  description = rep(
    c(
      "tonsillectomy",
      "tonsillectomy in >12 year old",
      "brain transplant",
      "castration",
      "orchidectomy",
      "miscellaneous procedure"
    ),
    times = n
  )
)
# Added on revision: drop every row whose cpt is in a lookup set, comparing a
# keyed not-join (a) against a plain vector scan (b).
# The author's revision note reported roughly a 3x speed-up for the keyed
# form; both timings are printed below so this can be re-checked.
setkey(cpt.desc, cpt)
# J() takes one argument per key column, so all lookup values for the single
# key column go into ONE vector. J(23456, 45555) would build a two-column
# lookup of which only the first column is matched against cpt.
# Prefixing i with `!` returns the rows that do NOT match; unlike negating the
# row numbers from which = TRUE, it still keeps every row when nothing matches.
# NOTE(review): 45555 does not occur in cpt (the data holds 44555) -- confirm
# whether a non-matching lookup value is intended here.
system.time(a <- cpt.desc[!J(c(23456, 45555))])
system.time(b <- cpt.desc[!(cpt %in% c(23456, 45555))])
str(a)
str(b)
# Both results should hold exactly the same rows in the same (key) order.
identical(as.data.frame(a), as.data.frame(b))
# Keyed join on a numeric column: select the rows whose cpt is in a lookup
# set, once by vector scan (a) and once by a join on the key (b).
setkey(cpt.desc, cpt)
system.time(a <- cpt.desc[cpt %in% c(23456, 45555), ])
# J() takes one argument per key column, so both lookup values go into ONE
# vector. J(23456, 45555) would match only 23456 and append 45555 as an extra
# column. nomatch = 0L drops lookup values that match no row, so the join
# returns the same rows as the %in% filter above.
# NOTE(review): 45555 does not occur in cpt (the data holds 44555) -- confirm
# whether a non-matching lookup value is intended here.
system.time(b <- cpt.desc[J(c(23456, 45555)), nomatch = 0L])
str(a)
str(b)
# The join result has the same two columns as `a`, so compare them directly.
identical(as.data.frame(a), as.data.frame(b))
# Keyed join on a character column: select the rows whose description is in a
# lookup set, once by vector scan (a) and once by a join on the key (b).
setkey(cpt.desc, description)
system.time(a <- cpt.desc[description %in% c("castration", "orchidectomy"), ])
# J() takes one argument per key column, so both lookup strings go into ONE
# vector. J("castration", "orchidectomy") would match only "castration" and
# append "orchidectomy" as an extra column, which made the character join look
# broken. nomatch = 0L drops lookup values that match no row.
system.time(
  b <- cpt.desc[J(c("castration", "orchidectomy")), nomatch = 0L]
)
# The join result has the same two columns as `a`, so compare them directly.
identical(as.data.frame(a), as.data.frame(b))
str(a)
str(b)