0

下面的 R 脚本计算“names1”和“names2”两列中文本字符串之间的相似度百分比。但是,我需要对包含 6k–10k+ 项的列执行同样的操作。将下面的公式应用于这么大的列时,由于两两比较产生的行数以百万计,该方案难以运行,无法满足企业级交付的要求。除了“Percent”(百分比)列之外,我还需要再添加 6–7 个其他列,这会使结果的大小超过 1 GB。请帮我改进这个脚本,或者提供能够实现相同目标的其他可行方案。非常感谢。

library(stringdist)
library(RecordLinkage)
library(dplyr)
library(scales)
# Example inputs: two columns of names to be compared with each other.
names1 <- c("Adam Shaw", "Justin Bose", "Cydney Clide")
names2 <- c("Adam Shaw", "Justin Bose", "Cydney Clide")

# Coerce to character so that non-character input (e.g. factors) is compared
# by label.
names1 <- as.character(names1)
names2 <- as.character(names2)

# For every position x, compare names1[x] with all entries of names2 except
# the one at the same position, then format each similarity as a percentage
# string with one decimal place. seq_along() is used instead of
# 1:length() so that an empty input does not iterate over c(1, 0).
Percent <- paste0(
  round(
    unlist(lapply(seq_along(names1), function(x) {
      levenshteinSim(names1[x], names2[-x])
    })) * 100,
    1
  ),
  "%"
)
4

1 回答 1

1

您可能会发现向量化很有帮助:

# Create a large character vector: the iris species labels repeated 10 times.
names1 <- as.character(rep(iris$Species, 10))

# Baseline: use lapply() to compare every element with all other elements,
# format the similarities as percentage strings, and time the whole run.
# seq_along() avoids the c(1, 0) sequence that 1:length() yields on empty
# input.
system.time(
  Percent <- paste0(
    round(
      unlist(lapply(seq_along(names1), function(x) {
        levenshteinSim(names1[x], names1[-x])
      })) * 100,
      1
    ),
    "%"
  )
)

#Create Vectorized Function
# Compare one element of a character vector with every other element.
#
# names: character vector of strings to compare.
# x:     index of the element that is compared with the rest.
# Returns whatever levenshteinSim() yields for names[x] versus names[-x].
fun1 <- function(names, x) {
  target <- names[x]
  others <- names[-x]
  levenshteinSim(target, others)
}

# Wrap fun1 so that it accepts a whole vector of indices for `x`.
vecFun1 <- Vectorize(fun1, vectorize.args = "x")


# Execute the vectorized function over every index of names1 and time it.
# seq_along() replaces c(1:length(names1)), which would yield c(1, 0) on
# empty input.
system.time(percentVec <- vecFun1(names1, seq_along(names1)))

# c() flattens the result into a plain vector before the similarities are
# formatted as percentage strings with one decimal place.
percentVec <- paste0(round(c(percentVec) * 100, 1), "%")

以下是代码的执行结果:向量化版本的耗时不到原来的 1/3。

> names1<-as.character(rep(iris$Species,10))
> system.time(Percent <- paste(round(unlist(lapply(1:length(names1), function(x) { 
+   levenshteinSim(names1[x], names1[-x])}))*100, 1), "%", sep=""))
   user  system elapsed 
   5.07    0.02    5.09 
> 
> fun1<-function(names,x) {
+   return(levenshteinSim(names[x],names[-x]))
+ }
> 
> vecFun1<-Vectorize(fun1,vectorize.args = "x")
> 
> system.time(percentVec<-vecFun1(names1,c(1:length(names1))))
   user  system elapsed 
   1.62    0.00    1.62 

我还用您示例中包含 3 个元素的字符向量做了同样的测试:

> names2<-c("Adam Shaw","Justin Bose","Cydney Clide")
> names2 <- as.character(names2)
> system.time(Percent <- paste(round(unlist(lapply(1:length(names2), function(x) { 
+   levenshteinSim(names2[x], names2[-x])}))*100, 1), "%", sep=""))
   user  system elapsed 
      0       0       0 
> 
> fun1<-function(names,x) {
+   return(levenshteinSim(names[x],names[-x]))
+ }
> 
> vecFun1<-Vectorize(fun1,vectorize.args = "x")
> 
> system.time(percentVec<-vecFun1(names2,c(1:length(names2))))
   user  system elapsed 
      0       0       0 
> 
> percentVec<-paste(as.character(round(c(percentVec)*100,1)),"%",sep="")
> 
> Percent
[1] "9.1%"  "16.7%" "9.1%"  "16.7%" "16.7%" "16.7%"
> percentVec
[1] "9.1%"  "16.7%" "9.1%"  "16.7%" "16.7%" "16.7%"
于 2018-01-13T12:19:28.103 回答