我发现在数据量很大时,stringdist 函数可能会变得很慢。因此,如果您遇到速度问题,还可以选择其他包(例如 RecordLinkage 包、agrep)以及其他匹配字符串的方法(例如其他距离度量)。此外,我不能 100% 确定您的问题是什么,但如果您是想测试名字和姓氏对调后的匹配情况,您始终可以使用 strsplit。
例如,
> library(stringdist)
>
> # Table A: the names we want to find a match for
> word <- c("PILLAY NOLAN VICTOR", "PILLAY NICHOLAS")
> # Master Table: the candidate names each word is compared against
> choices <- c("IGOR JOSE VICTOR","WILLIAM NICHOLAS","NOLAN PILLAY","NICHOLAS PILLAY")
>
> # Try # 1
> # For every word, compute the Levenshtein distance to all choices ONCE and
> # keep both the closest choice and its distance (calling stringdist() twice
> # per word doubles the most expensive step).
> best <- lapply(word, function(x) {
+   d <- stringdist(x, choices, method = "lv")
+   i <- which.min(d)  # index of the first closest choice
+   list(text = choices[i], dist = d[i])
+ })
>
> # vapply() guarantees one string / one number per word
> match_text <- vapply(best, function(b) b$text, character(1))
> match_dist <- vapply(best, function(b) b$dist, numeric(1))
>
> # One row per word: the word, its closest choice and the distance between them
> df <- data.frame("traveler name" = word,
+                  "people name" = match_text,
+                  "dist" = match_dist, stringsAsFactors = FALSE, row.names = NULL)
> # Checking results
> df
        traveler.name      people.name dist
1 PILLAY NOLAN VICTOR IGOR JOSE VICTOR    9
2     PILLAY NICHOLAS WILLIAM NICHOLAS    3
>
>
> # Reversing strings, assuming names are separated by a space
> # vapply() (rather than sapply()) guarantees a character vector result
> reversed <- vapply(strsplit(choices, " "),
+                    function(parts) paste(rev(parts), collapse = " "),
+                    character(1)) # reversing words
> # Add the reversed variants to the candidates and drop any duplicates
> choices <- unique(c(choices, reversed))
>
>
> # Try # 2
> # For every word, compute the Levenshtein distance to all choices (now
> # including the reversed variants) ONCE and keep both the closest choice and
> # its distance, instead of calling stringdist() twice per word.
> best <- lapply(word, function(x) {
+   d <- stringdist(x, choices, method = "lv")
+   i <- which.min(d)  # index of the first closest choice
+   list(text = choices[i], dist = d[i])
+ })
>
> # vapply() guarantees one string / one number per word
> match_text <- vapply(best, function(b) b$text, character(1))
> match_dist <- vapply(best, function(b) b$dist, numeric(1))
>
> # One row per word: the word, its closest choice and the distance between them
> df <- data.frame("traveler name" = word,
+                  "people name" = match_text,
+                  "dist" = match_dist, stringsAsFactors = FALSE, row.names = NULL)
>
> # Checking the new results
> df
        traveler.name     people.name dist
1 PILLAY NOLAN VICTOR    PILLAY NOLAN    7
2     PILLAY NICHOLAS PILLAY NICHOLAS    0
根据您的数据组织方式,您可能会发现删除中间名或以其他方式清理数据有帮助(也可能没有帮助),但这应该足以让您入手。
编辑:
我测试了几种不同的解决方案,但没有测试 agrep,因此它可能值得一试。我肯定会推荐 RecordLinkage,我甚至会考虑先把您的数据集拆分为完全匹配和不匹配两部分,然后只对不匹配的部分进行反转(或排序)。代码的瓶颈在于计算距离度量,因此任何能减少需要计算距离的名称数量的做法都可能对您有所帮助。
> library(stringdist)
> library(RecordLinkage)
> library(microbenchmark)
>
> # Table A: the names we want to find a match for
> word <- c("PILLAY NOLAN VICTOR", "PILLAY NICHOLAS", "WILLIAM NICHOLAS")
> # Master Table: the candidate names each word is compared against
> choices <- c("IGOR JOSE VICTOR","WILLIAM NICHOLAS","NOLAN PILLAY","NICHOLAS PILLAY")
>
> # Time three matching strategies. The expressions are named so that the
> # `expr` column of the printed timings (revers, reversRL, sort) identifies
> # each approach. times = 1 runs every expression once, so the timings are
> # only indicative.
> microbenchmark(
+   revers = {
+     # All reversed: stringdist Levenshtein against choices + reversed choices
+     reversed <- sapply(strsplit(choices, " "), function(x) paste(rev(x), collapse=" ")) #reversing words
+     choices1 <- c(choices, reversed)
+     choices1 <- unique(choices1)
+
+     match_dist <- sapply(word, function(x) min(stringdist(x, choices1, method = "lv")))
+     match_text <- sapply(word, function(x) choices1[which.min(stringdist(x, choices1, method = "lv"))])
+
+     df1 <- data.frame("traveler name" = word,
+                       "people name" = match_text,
+                       "dist" = match_dist,
+                       stringsAsFactors = FALSE, row.names = NULL)
+   },
+
+   reversRL = {
+     # Record linkage: same candidates, distances from RecordLinkage's levenshteinDist()
+     reversed <- sapply(strsplit(choices, " "), function(x) paste(rev(x), collapse=" ")) #reversing words
+     choices2 <- c(choices, reversed)
+     choices2 <- unique(choices2)
+
+     match_dist2 <- sapply(word, function(x) min(levenshteinDist(x, choices2)))
+     match_text2 <- sapply(word, function(x) choices2[which.min(levenshteinDist(x, choices2))])
+
+     df2 <- data.frame("traveler name" = word,
+                       "people name" = match_text2,
+                       "dist" = match_dist2,
+                       stringsAsFactors = FALSE, row.names = NULL)
+   },
+
+   sort = {
+     # Sorted: alphabetically sort the name parts of both the choices and the words
+
+     sorted <- sapply(strsplit(choices, " "), function(x) paste(sort(x), collapse=" ")) #sorting choices
+     choices3 <- c(choices, sorted)
+     choices3 <- unique(choices3)
+     word3 <- sapply(strsplit(word, " "), function(x) paste(sort(x), collapse=" ")) #sorting words
+
+     match_dist3 <- sapply(word3, function(x) min(stringdist(x, choices3, method = "lv")))
+     match_text3 <- sapply(word3, function(x) choices3[which.min(stringdist(x, choices3, method = "lv"))])
+
+     df3 <- data.frame("traveler name" = word3,
+                       "people name" = match_text3,
+                       "dist" = match_dist3,
+                       stringsAsFactors = FALSE, row.names = NULL)
+   },
+   times = 1)
Unit: milliseconds
     expr      min       lq     mean   median       uq      max neval
   revers 6.627258 6.627258 6.627258 6.627258 6.627258 6.627258     1
 reversRL 4.016632 4.016632 4.016632 4.016632 4.016632 4.016632     1
     sort 7.223453 7.223453 7.223453 7.223453 7.223453 7.223453     1
>
> all.equal(df1, df2)
[1] TRUE
>
> df2
        traveler.name      people.name dist
1 PILLAY NOLAN VICTOR     PILLAY NOLAN    7
2     PILLAY NICHOLAS  PILLAY NICHOLAS    0
3    WILLIAM NICHOLAS WILLIAM NICHOLAS    0
> df3
        traveler.name      people.name dist
1 NOLAN PILLAY VICTOR     NOLAN PILLAY    7
2     NICHOLAS PILLAY  NICHOLAS PILLAY    0
3    NICHOLAS WILLIAM NICHOLAS WILLIAM    0