0

我对文本中的单词进行匹配以检索单词偏移开始和结束。当使用适当的 unicode 感知正则表达式时,这通常适用于 ascii 和 unicode 文本 '(?<=^|\\PL)$1(?=\\PL|$)'。当我混合文本(比如这里的韩文和英文)时,在标记化时会出现一些问题:

function aggressive_tokenizer(text) {
  // most punctuation
  text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g, " $1 ");
  // commas if followed by space
  text = text.replace(/(,\s)/g, " $1");
  // single quotes if followed by a space
  text = text.replace(/('\s)/g, " $1");
  // single quotes if last char
  text = text.replace(/('$)/, " $1");
  text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2")
  // periods before newline or end of string
  text = text.replace(/\. *(\n|$)/g, " . ");
  // replace punct
  // ignore "-" since may be in slang scream
  text = text.replace(/[\\?\^%<>=!&|+\~]/g, "");
  text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
  // finally split remainings into words
  text = text.split(/\s+/)
  return text;
}
var text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
var seen = new Map();
var indexes = tokens.map(token => { // for each token
  let item = {
    "word": token
  }
  var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
  var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
  var wordRegex = new XRegExp(pattern.replace('$1', escaped), "g");

  // calculate token begin end 
  var match = null;
  while ((match = wordRegex.exec(text)) !== null) {
    if (match.index > (seen.get(token) || -1)) {
      var wordStart = match.index;
      var wordEnd = wordStart + token.length - 1;
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
  return item;
});
indexes.forEach(index => {
  if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
    console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
  } else {
    console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
  }
});
<script src="https://unpkg.com/xregexp/xregexp-all.js"></script>

问题是我在标记化中做了一些清理,比如

text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g, " $1 ");

where\w不支持 unicode,但如果我将其替换为\p{Alnum}

text = text.replace(/([^\p{Alnum}\.\'\-\/\+\<\>,&])/g, " $1 ");

它应该是 Unicode 字的等价物,它不能正常工作。

注意 请注意,我确实使用XRegExp来支持 JavaScript 中的 Unicode 正则表达式。

更新 根据下面的评论,由于缺少对不同宽度后视模式的支持(见评论),我已经用 Wiktor Stribiżew 修改的模式正则表达式更新了代码,'(?<=^|\\PL)$1(?=\\PL|$)'内置的正则表达式替换了 XRegExp。此解决方案效果更好,但我发现了另一种情况,即给定输入文本的字符偏移开始和结束无法匹配:"점점 더 깊이 끌려가"输出将缺少偏移量/匹配

{
    "index": 2,
    "word": "점&quot;
}

function aggressive_tokenizer(text) {
  // most punctuation
  text = text.replace(/[^\w\.\-\/\+\<\>,&]/g, " $& ");
  // commas if followed by space
  text = text.replace(/(,\s)/g, " $1");
  // single quotes if followed by a space
  text = text.replace(/('\s)/g, " $1");
  // single quotes if last char
  text = text.replace(/('$)/, " $1");
  text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g, " $2")
  // periods before newline or end of string
  text = text.replace(/\. *(\n|$)/g, " . ");
  // replace punct
  // ignore "-" since may be in slang scream
  text = text.replace(/[\\?\^%<>=!&|+\~]/g, "");
  text = text.replace(/[…;,.:*#\)\({}\[\]]/g, "");
  // finally split remainings into words
  text = text.split(/\s+/)
  return text;
}
var text = "점점 더 깊이 끌려가"
var tokens = aggressive_tokenizer(text);
var seen = new Map();
var indexes = tokens.map(token => { // for each token
  let item = {
    "word": token
  }
  var pattern = '(?<!\\pL\\pM*)$1(?!\\pL)';
  var escaped = token.replace(/[\-\[\]{}()*+?.,\\\^$|#\s]/g, "\\$&");
  var wordRegex = new RegExp(pattern.replace('$1', escaped), "g");

  // calculate token begin end 
  var match = null;
  while ((match = wordRegex.exec(text)) !== null) {
    if (match.index > (seen.get(token) || -1)) {
      var wordStart = match.index;
      var wordEnd = wordStart + token.length - 1;
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token, wordEnd);
      break;
    }
  }
  return item;
});
indexes.forEach(index => {
  if (!index.characterOffsetBegin && !index.characterOffsetEnd) {
    console.log("MISSING INDEXES " + index.word);
  } else if (index.word != text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1)) {
    console.log("NOT MATCHING!!! " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
  } else {
    console.log("\tMATCHED " + index.word + " : " + text.slice(index.characterOffsetBegin, index.characterOffsetEnd + 1))
  }
});

4

0 回答 0