0

我遇到了编码和部分匹配的问题。

我有两个数据帧,A 和 B。A 以 UTF-8 编码读入,B 以 Latin1 编码读入。我不确定,但这可能本身就是问题的一部分;这是我所知道的能正确导入它们的唯一方法。

编辑:我应该澄清一下。这只是样本数据。两个数据框都包含大量的行和其他列。

           A                                                        B
ID       Name    Expense                              Employee           Category
1    Mike Adall   3                                   Lothar Fiend          B2
2   Brian Adams   4                                   Rohan Sudarsh         A2
3        Adrián   1                                   Adrián Silva          A1
4     Floyd Oid   1                                   Semi Ajayi            A1
5    Semi Ajayi   4                                   Micheal Adall         A1
6      Jomu Aké   3                                   Jomü Ria Aké          B1
                                                      Brian Adams           B2
                                                      Floyd Öid Matheus     B1       

            

我一直在尝试提取 B$Employee 并将它们与 A$Name 部分匹配,以创建一个包含 B$Category 的新 df C。这是我想要的输出。

编辑:使用类别,我还想包括 A 和 B 的所有其他列,不包括员工。

             C
ID       Name    Expense   Category
1    Mike Adall   3        A1
2   Brian Adams   4        B2
3        Adrián   1        A1
4     Floyd Oid   1        B1
5    Semi Ajayi   4        A1
6      Jomu Aké   3        B1

到目前为止,我已经使用fuzzyjoin 包匹配了80% 的字符。

# Fuzzy inner join of A onto B, pairing A$Name with B$Employee.
# NOTE(review): no match_fun is passed in this call; presumably the real code
# used a matcher such as fuzzyjoin::stringdist_inner_join() - confirm.
C <- A %>% fuzzy_inner_join(B, by = c(Name = "Employee"))

主要问题似乎是这些奇怪的拉丁字符,例如 Ö、ß 等,或者有时出现在“Aké”等名称的末尾。结果似乎因名称而异。

我怎样才能让它部分匹配所有的名字?

4

2 回答 2

0

在基础 R 中,您可以同时使用 agrep 和 adist,如下所示:

# For each A$Name choose exactly one row of B:
#   1. agrep() collects approximate-substring hits in B$Employee;
#   2. among those hits (or among all of B when agrep() finds nothing) keep the
#      candidate with the smallest generalized Levenshtein distance (adist()).
# which.min() returns the first minimum, so ties are resolved deterministically
# and the result always has one index per row of A, which cbind() requires.
hits <- lapply(A$Name, agrep, x = B$Employee)
dists <- adist(A$Name, B$Employee)
d <- vapply(seq_along(hits), function(i) {
  candidates <- hits[[i]]
  if (length(candidates) == 0L) {
    # No approximate hit: fall back to every row of B.
    candidates <- seq_len(ncol(dists))
  }
  candidates[which.min(dists[i, candidates])]
}, integer(1))
cbind(A, B[d, ])

 ID        Name Expense          Employee Category
5  1  Mike Adall       3     Micheal Adall       A1
7  2 Brian Adams       4       Brian Adams       B2
3  3      Adrián       1      Adrián Silva       A1
8  4   Floyd Oid       1 Floyd Öid Matheus       B1
4  5  Semi Ajayi       4        Semi Ajayi       A1
6  6    Jomu Aké       3      Jomü Ria Aké       B1

编辑:

使用 stringdist 包,你可以这样做:

# Pair every A$Name with the B row at the smallest LCS distance.
# stringdistmatrix() returns one row per element of A$Name and one column per
# element of B$Employee. ties.method = "first" makes the choice reproducible
# (the max.col() default breaks ties at random).
cbind(
  A,
  B[max.col(-stringdist::stringdistmatrix(A$Name, B$Employee, method = "lcs"),
            ties.method = "first"), ]
)
  ID        Name Expense          Employee Category
5  1  Mike Adall       3     Micheal Adall       A1
7  2 Brian Adams       4       Brian Adams       B2
3  3      Adrián       1      Adrián Silva       A1
8  4   Floyd Oid       1 Floyd Öid Matheus       B1
4  5  Semi Ajayi       4        Semi Ajayi       A1
6  6    Jomu Aké       3      Jomü Ria Aké       B1
于 2020-10-16T16:52:36.117 回答
0

这种方法只会产生一个匹配(match 列),因为即使存在距离并列(ties)的情况,which.min 和 max.col 的返回值长度也都为一。

手动检查并列情况很重要。可以在 data.frame res 的 minMatchSeveral 列中,或在下面的第二个脚本中检查并列情况。

library(stringdist)
{
  # Build a per-name report of the closest entries of B$Employee.
  #
  # For every element of A$Name the loop records:
  #   match           - the closest candidate (first one when several tie)
  #   mindist         - its LCS distance (NA when no candidate is within
  #                     `threshold`)
  #   lenMin          - how many candidates share that minimum distance
  #                     (0 when there is no candidate)
  #   minMatchSeveral - the tied candidates, comma separated, only when
  #                     lenMin > 1
  #   sortedmatches   - all candidates within `threshold`, closest first
  # The results are collected in the data.frame `res`.
  firstvector <- A$Name
  secondvector <- B$Employee
  threshold <- 14 # max 14 characters of divergence

  n_names <- length(firstvector)
  # Preallocate with the final types so that a name without any candidate
  # cannot change a column's type.
  lenMin <- integer(n_names)
  mindist <- rep(NA_real_, n_names)
  match <- minMatchSeveral <- rep(NA_character_, n_names)
  sortedmatches <- character(n_names)

  for (i in seq_len(n_names)) {
    # several methods available
    matchdist <- stringdist::stringdist(firstvector[i], secondvector,
                                        method = "lcs")
    # Candidates further away than the threshold are not considered at all.
    matchdist[!is.na(matchdist) & matchdist > threshold] <- NA

    # order() is stable and na.last = NA drops discarded candidates, so
    # ranked[1] is the first candidate at the minimum distance.
    ranked <- order(matchdist, na.last = NA)
    sortedmatches[i] <- paste(secondvector[ranked], collapse = ", ")
    if (length(ranked) == 0L) {
      next # nothing within the threshold: keep the NA / 0 defaults
    }

    mindist[i] <- matchdist[ranked[1L]]
    match[i] <- secondvector[ranked[1L]]

    tied <- which(matchdist == mindist[i])
    lenMin[i] <- length(tied)
    if (lenMin[i] > 1L) {
      minMatchSeveral[i] <- paste(secondvector[tied], collapse = ", ")
    }
  }

  res <- data.frame(firstvector = firstvector,
                    match = match, divergence = mindist,
                    lenMin = lenMin,
                    minMatchSeveral = minMatchSeveral,
                    sortedmatches = sortedmatches,
                    stringsAsFactors = FALSE)
}
res
  firstvector             match divergence lenMin              minMatchSeveral                                                                                   sortedmatches
1  Mike Adall     Micheal Adall          5      2 Micheal Adall, Micheol Adall                                           Micheal Adall, Micheol Adall, Brian Adams, Semi Ajayi
2 Brian Adams       Brian Adams          0      1                         <NA>              Brian Adams, Rohan Sudarsh, Micheal Adall, Adrián Silva, Semi Ajayi, Micheol Adall
3      Adrian      Adrián Silva          8      1                         <NA> Adrián Silva, Brian Adams, Lothar Fiend, Semi Ajayi, Micheal Adall, Micheol Adall, Jomü Ria Aké
4   Floyd Oid Floyd Öid Matheus         10      1                         <NA>                                                                 Floyd Öid Matheus, Lothar Fiend
5  Semi Ajayi        Semi Ajayi          0      1                         <NA>                                                           Semi Ajayi, Brian Adams, Jomü Ria Aké
6    Jomu Aké      Jomü Ria Aké          6      1                         <NA>                                                                        Jomü Ria Aké, Semi Ajayi

# Store the best match next to each row of A, then use it as the join key to
# bring in the columns of B. all.x = TRUE keeps rows of A without a match.
# For large tables, consider using data.table::merge
A[["match"]] <- match
C <- merge(x = A, y = B, by.x = "match", by.y = "Employee", all.x = TRUE)
# The join key is the first column of the merged result; show everything else.
C[, -1]

  ID        Name Expense Category
1  3      Adrián       1       A1
2  2 Brian Adams       4       B2
3  4   Floyd Oid       1       B1
4  6    Jomu Aké       3       B1
5  1  Mike Adall       3       A1
6  5  Semi Ajayi       4       A1

来自?stringdist-metrics

最长公共子串(method='lcs')定义为在保持字符顺序不变的情况下,将a和b中的字符配对得到的最长字符串。lcs-distance 定义为不成对字符的数量。该距离相当于编辑距离,只允许删除和插入,每个权重为 1。

另外你可以看看stringi::stri_trans_general

编辑:另一种可视化关系的方法

{
  # Show, for every row of A, all rows of B that share the smallest LCS
  # distance, so that ties can be inspected by eye.

  # Negated distances: rows follow A$Name, columns follow B$Employee, so the
  # closest candidates of a row are the ones equal to the row maximum.
  mm <- -stringdist::stringdistmatrix(A$Name, B$Employee, method = "lcs")
  idx <- apply(mm, 1, max)

  # lapply() always yields a list, even when every row has the same number of
  # ties (sapply() would simplify that case to a vector or matrix).
  ties <- lapply(seq_len(nrow(mm)), function(x) which(mm[x, ] %in% idx[x]))

  # paste() on a data.frame slice gives one string per column; a column holding
  # several tied values is rendered as c("...", "...").
  tie_rows <- lapply(ties, function(x) paste(B[x, ]))
  my <- do.call("rbind", tie_rows)

  # The first column of B holds the matched name; the remaining columns keep
  # their own names, so B may have more than two columns.
  colnames(my) <- c("closestMatch", names(B)[-1])
  cbind(A, my)
}

  ID        Name Expense                        closestMatch      Category
1  1  Mike Adall       3 c("Micheal Adall", "Micheol Adall") c("A1", "A1")
2  2 Brian Adams       4                         Brian Adams            B2
3  3      Adrian       1                        Adrián Silva            A1
4  4   Floyd Oid       1                   Floyd Öid Matheus            B1
5  5  Semi Ajayi       4                          Semi Ajayi            A1
6  6    Jomu Aké       3                        Jomü Ria Aké            B1

数据

{
  # Sample data for the examples. Each element of `text` is one line of the
  # table; names are double-quoted so that the embedded spaces stay in a single
  # field.
  A <- read.table(
    header = TRUE,
    stringsAsFactors = FALSE,
    text = c(
      'ID Name Expense',
      '1 "Mike Adall" 3',
      '2 "Brian Adams" 4',
      '3 "Adrian" 1',
      '4 "Floyd Oid" 1',
      '5 "Semi Ajayi" 4',
      '6 "Jomu Aké" 3'
    )
  )

  # "Micheol Adall" is a deliberate near-duplicate of "Micheal Adall"; the text
  # after "#" on that line is skipped by read.table() as a comment.
  B <- read.table(
    header = TRUE,
    stringsAsFactors = FALSE,
    text = c(
      'Employee Category',
      '"Lothar Fiend" B2',
      '"Rohan Sudarsh" A2',
      '"Adrián Silva" A1',
      '"Semi Ajayi" A1',
      '"Micheal Adall" A1',
      '"Micheol Adall" A1 # testing ties',
      '"Jomü Ria Aké" B1',
      '"Brian Adams" B2',
      '"Floyd Öid Matheus" B1'
    )
  )
}
于 2020-10-16T16:06:29.583 回答