1

我正在尝试根据正确的响应(列 MEM_Correct)计算参与者响应(列 MEM_Response)的准确性。分组变量是参与者的 ID(在本例中为列 SERIAL --> 每个参与者 15 个案例)。

# Print a copy-pasteable definition of the `example` data frame.
dput(example)
# Output of the call above: 30 rows in three character columns
# (MEM_Correct, MEM_Response, SERIAL), 15 rows for each of the two SERIAL
# values "4444" and "5555". Row names run 1-15 and 17-31 (16 is absent).
structure(list(MEM_Correct = c("ZLHK", "RZKX", "DGWL", "BCJSP", 
"WRKTJ", "CHBXS", "HNDCWX", "SWVNDT", "WLDGPB", "DSHRKBV", "HCXLZWB", 
"HDNBVZC", "BCRHKVDM", "RVTBWKFS", "NWHVZFLD", "ZLHK", "RZKX", 
"DGWL", "BCJSP", "WRKTJ", "CHBXS", "HNDCWX", "SWVNDT", "WLDGPB", 
"DSHRKBV", "HCXLZWB", "HDNBVZC", "BCRHKVDM", "RVTBWKFS", "NWHVZFLD"
), MEM_Response = c("ZLHK", "RZKX", "DGWL", "BCJSP", "WRKLTJ", 
"CHBXS", "HNDCWX", "SWVDTN", "WLDGPB", "DSHRKBV", "HCXLZWB", 
"HDNBVZC", "BCRHKVDM", "RVTBWKFS", "NWHVZFLD", "ZLHK", "RZKX", 
"DGWL", "BCJSB", "WRKTJ", "CHBXA", "HDNDWX", "SWVNDT", "WLGPBD", 
"DSHKRBV", "WLGJHKK", "HDBNVZC", "BCHRKVBM", "RVGBKSNM", "NWHVZWHJ"
), SERIAL = c("4444", "4444", "4444", "4444", "4444", "4444", 
"4444", "4444", "4444", "4444", "4444", "4444", "4444", "4444", 
"4444", "5555", "5555", "5555", "5555", "5555", "5555", "5555", 
"5555", "5555", "5555", "5555", "5555", "5555", "5555", "5555"
)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 
12L, 13L, 14L, 15L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 
26L, 27L, 28L, 29L, 30L, 31L), class = "data.frame")

我尝试使用多种方法计算准确度(即正确响应和实际响应之间的距离),但到目前为止我没有收到令人满意的输出。

使用 stringdist 计算 Hamming 距离和 Levenshtein 距离:

Levenshtein(莱文斯坦)距离:

# Levenshtein (edit) distance between the correct string and the response,
# computed row by row.
example$MEM_Lev <- stringdist(
  example$MEM_Correct,
  example$MEM_Response,
  method = "lv"
)

汉明:

# Hamming distance between the correct string and the response, row by row.
# Rows whose two strings differ in length come back as Inf.
example$MEM_Ham <- stringdist(
  example$MEM_Correct,
  example$MEM_Response,
  method = "hamming"
)

问题:我已经得到每个案例的汉明距离,但该如何计算每个参与者的准确度,使结果落在 0 到 1 之间(即 0 到 100% 的准确度)?汉明距离还有一个问题:当两个字符串长度不同时(参见第 5 行:WRKTJ 与 WRKLTJ),结果为 Inf。所以改用 Levenshtein 距离可能更合适,对吗?

然后我结合 with() 尝试了用于计算 Levenshtein 相似度的 levenshteinSim() 函数:

# Levenshtein similarity between correct string and response, row by row;
# the two columns are looked up inside `example` by with().
with(example, levenshteinSim(MEM_Correct, MEM_Response))

这一次,值介于 0 和 1 之间,我认为这是向前迈出的一步。再看第 5 行:WRKTJ(5 个字母)与 WRKLTJ(6 个字母)的不同之处在于后者在中间多了一个“L”。因此,需要进行 1 次编辑(在这种情况下是删除)才能与正确的响应一致。它的 Levenshtein 相似度 0.8333 相当于 5/6 正确(尽管正确答案只有 5 个字母)。我使用的距离函数是否正确?

最后,我的最后一个问题是:

如何计算每个参与者的平均准确度并进行匹配?我还有另一个包含所有参与者的数据框(df,其中 1 行 = 1 个参与者),我想把 example 的计算结果合并到这个数据框中。

我希望这是有道理的 - 如果没有,我可以尝试包含更多信息。如果您认为我没有使用正确的方法,请随时提出其他方法。

先感谢您!

4

1 回答 1

0

如何定义“准确性”是一个必须由您自己做出的方法论决定,文献中可能有一些可供参考的做法,但这里先给出一个建议。

# Suggested workflow: score every trial with a Levenshtein-based measure,
# average the scores per participant (SERIAL), then merge the result into a
# one-row-per-participant table.
library(stringdist)

# Raw Levenshtein distance per trial. Columns are addressed by name so the
# code does not depend on the column order of `example`.
example$lv.dist <- stringdist(example$MEM_Correct, example$MEM_Response,
                              method = "lv")
head(example)
#   MEM_Correct MEM_Response SERIAL lv.dist
# 1        ZLHK         ZLHK   4444       0
# 2        RZKX         RZKX   4444       0
# 3        DGWL         DGWL   4444       0
# 4       BCJSP        BCJSP   4444       0
# 5       WRKTJ       WRKLTJ   4444       1
# 6       CHBXS        CHBXS   4444       0

# Mean edit distance per participant (0 means every response was perfect).
aggregate(lv.dist ~ SERIAL, example, mean)
#   SERIAL  lv.dist
# 1   4444 0.200000
# 2   5555 1.866667

# One ad-hoc percentage score: 100 / (1 + distance), so a perfect response
# scores 100, one edit scores 50, two edits 33.33, and so on.
aggregate(lv.dist ~ SERIAL, example,
          function(x) round(mean(100 / (1 + x)), 2))
#   SERIAL lv.dist
# 1   4444   92.22
# 2   5555   54.17

# Using stringsim(): a similarity in [0, 1]. For method = "lv" this is
# 1 - distance / (length of the longer string), e.g. WRKTJ vs WRKLTJ gives
# 1 - 1/6 = 0.8333.
example$lv.sim <- stringsim(example$MEM_Correct, example$MEM_Response,
                            method = "lv")

# Mean similarity per participant, expressed as a percentage.
(agg <- aggregate(lv.sim ~ SERIAL, example, function(x) round(mean(x) * 100, 2)))
#   SERIAL lv.sim
# 1   4444  96.67
# 2   5555  73.25

# Merging two data.frames is easy as long as they have a column in common
# (SERIAL in this case). example$SERIAL is character, so the key is stored
# as character here too instead of relying on implicit type coercion.
participants <- data.frame(age = 7:9, SERIAL = c("5555", "4444", "1234"))

# Inner join: only participants that have an accuracy score are kept.
merge(participants, agg)
#   SERIAL age lv.sim
# 1   4444   8  96.67
# 2   5555   7  73.25

# Full join: participants without a score are kept, with NA as accuracy.
merge(participants, agg, all = TRUE)
#   SERIAL age lv.sim
# 1   1234   9     NA
# 2   4444   8  96.67
# 3   5555   7  73.25
于 2019-06-25T10:46:27.597 回答