这种方法对每个名字只会产生一个匹配(列 `match`),因为即使存在距离并列(ties)的情况,`which.min` 和 `max.col` 对每个名字也只返回一个位置。
因此手动检查并列情况很重要。可以在 data.frame `res` 的 `minMatchSeveral` 列,或下面的第二个脚本中检查并列情况。
require(stringdist)
{
  # Fuzzy-match every name in A$Name against the candidates in B$Employee
  # using the LCS string distance, and collect the results in `res`.
  #
  # Columns of `res` (one row per element of A$Name):
  #   firstvector     the name being looked up
  #   match           first candidate at the smallest distance (NA if none)
  #   divergence      that smallest distance (NA if none)
  #   lenMin          number of candidates tied at the smallest distance
  #                   (0 if no candidate is within the threshold)
  #   minMatchSeveral all tied candidates, comma separated (NA unless > 1)
  #   sortedmatches   all candidates within the threshold, closest first
  #                   ("" if none)
  firstvector <- A$Name
  secondvector <- B$Employee
  threshold <- 14 # max 14 characters of divergence

  # Preallocate the result vectors with their "no match" defaults so that
  # every column keeps a single, stable type.
  n <- length(firstvector)
  mindist <- rep(NA_real_, n)
  lenMin <- integer(n)
  match <- minMatchSeveral <- rep(NA_character_, n)
  sortedmatches <- character(n)

  # seq_along() is empty for empty input, whereas 1:length(x) would
  # iterate over c(1, 0).
  for (i in seq_along(firstvector)) {
    # several methods available
    matchdist <- stringdist::stringdist(
      firstvector[i], secondvector,
      method = "lcs"
    )
    # Candidates further away than the threshold are not considered at all.
    matchdist[which(matchdist > threshold)] <- NA

    # order(..., na.last = NA) drops the discarded candidates.
    sortedmatches[i] <- paste(
      secondvector[order(matchdist, na.last = NA)],
      collapse = ", "
    )

    # No usable candidate: keep the defaults instead of relying on
    # tryCatch()/suppressWarnings() around min() of an empty set.
    if (all(is.na(matchdist))) {
      next
    }

    best <- min(matchdist, na.rm = TRUE)
    tied <- which(matchdist == best)

    mindist[i] <- best
    lenMin[i] <- length(tied)
    # The first tied index is the one which.min() would return.
    match[i] <- secondvector[tied[1L]]
    if (length(tied) > 1L) {
      minMatchSeveral[i] <- paste(secondvector[tied], collapse = ", ")
    }
  }

  res <- data.frame(
    firstvector = firstvector,
    match = match,
    divergence = mindist,
    lenMin = lenMin,
    minMatchSeveral = minMatchSeveral,
    sortedmatches = sortedmatches,
    stringsAsFactors = FALSE
  )
}
res
firstvector match divergence lenMin minMatchSeveral sortedmatches
1 Mike Adall Micheal Adall 5 2 Micheal Adall, Micheol Adall Micheal Adall, Micheol Adall, Brian Adams, Semi Ajayi
2 Brian Adams Brian Adams 0 1 <NA> Brian Adams, Rohan Sudarsh, Micheal Adall, Adrián Silva, Semi Ajayi, Micheol Adall
3 Adrian Adrián Silva 8 1 <NA> Adrián Silva, Brian Adams, Lothar Fiend, Semi Ajayi, Micheal Adall, Micheol Adall, Jomü Ria Aké
4 Floyd Oid Floyd Öid Matheus 10 1 <NA> Floyd Öid Matheus, Lothar Fiend
5 Semi Ajayi Semi Ajayi 0 1 <NA> Semi Ajayi, Brian Adams, Jomü Ria Aké
6 Jomu Aké Jomü Ria Aké 6 1 <NA> Jomü Ria Aké, Semi Ajayi
# Store the best match next to each name; it becomes the join key below.
A[["match"]] <- match
# For large tables, consider using data.table::merge
# Left join: every row of A is kept, B's columns are attached where
# the matched name equals B$Employee.
C <- merge(
  x = A,
  y = B,
  by.x = "match",
  by.y = "Employee",
  all.x = TRUE
)
# Display the joined table without its first column (the join key).
C[, 2:ncol(C)]
ID Name Expense Category
1 3 Adrián 1 A1
2 2 Brian Adams 4 B2
3 4 Floyd Oid 1 B1
4 6 Jomu Aké 3 B1
5 1 Mike Adall 3 A1
6 5 Semi Ajayi 4 A1
以下内容来自 `?"stringdist-metrics"` 帮助页面:
最长公共子串(method='lcs')定义为在保持字符顺序不变的情况下,将a和b中的字符配对得到的最长字符串。lcs-distance 定义为不成对字符的数量。该距离相当于编辑距离,只允许删除和插入,每个权重为 1。
另外,你也可以看看 `stringi::stri_trans_general`(例如在比较之前先去除名字中的重音符号)。
编辑:另一种查看并列(ties)情况的方法
{
  # For every name in A, show all rows of B that are tied for the
  # smallest LCS distance, bound next to the columns of A.

  # Negated distances, one row per A$Name and one column per B$Employee,
  # so the closest candidate is the row maximum. stringdistmatrix()
  # returns a matrix even when A or B has a single row, which
  # t(sapply(...)) does not guarantee.
  mm <- -stringdist::stringdistmatrix(A$Name, B$Employee, method = "lcs")

  # Value of each row's maximum. Which tied column max.col() picks does
  # not matter for the value; "first" just makes the call deterministic.
  idx <- mm[cbind(seq_len(nrow(mm)), max.col(mm, ties.method = "first"))]

  # Column positions tied at the row maximum. lapply() always yields one
  # list element per row; sapply() would collapse to a matrix whenever all
  # rows had the same number (> 1) of ties and break the steps below.
  ties <- lapply(seq_len(nrow(mm)), function(x) which(mm[x, ] %in% idx[x]))

  # paste() on the selected rows of B gives one string per column of B;
  # several tied rows show up in deparsed form, e.g. c("a", "b").
  matched_rows <- lapply(ties, function(x) paste(B[x, ]))

  my <- as.matrix(do.call("rbind", matched_rows))
  dimnames(my)[[2]] <- c("closestMatch", "Category")
  cbind(A, my)
}
ID Name Expense closestMatch Category
1 1 Mike Adall 3 c("Micheal Adall", "Micheol Adall") c("A1", "A1")
2 2 Brian Adams 4 Brian Adams B2
3 3 Adrian 1 Adrián Silva A1
4 4 Floyd Oid 1 Floyd Öid Matheus B1
5 5 Semi Ajayi 4 Semi Ajayi A1
6 6 Jomu Aké 3 Jomü Ria Aké B1
数据
{
  # Example input: names to look up, with an expense value each.
  A <- read.table(
    header = TRUE,
    stringsAsFactors = FALSE,
    text = "ID Name Expense
1 \"Mike Adall\" 3
2 \"Brian Adams\" 4
3 \"Adrian\" 1
4 \"Floyd Oid\" 1
5 \"Semi Ajayi\" 4
6 \"Jomu Aké\" 3 "
  )

  # Example reference table: employee names and their category.
  # The "# testing ties" text inside the literal is dropped by read.table,
  # whose default comment character is "#".
  B <- read.table(
    header = TRUE,
    stringsAsFactors = FALSE,
    text = "Employee Category
\"Lothar Fiend\" B2
\"Rohan Sudarsh\" A2
\"Adrián Silva\" A1
\"Semi Ajayi\" A1
\"Micheal Adall\" A1
\"Micheol Adall\" A1 # testing ties
\"Jomü Ria Aké\" B1
\"Brian Adams\" B2
\"Floyd Öid Matheus\" B1"
  )
}