Add new fuzzy quote matching implementation

Implement a `matchQuote` function which will be used to replace `dom-anchor-text-quote` for finding the best match for annotation quotes in the document text. The new implementation is based on the `approx-string-match` library and provides several improvements over the existing one:

- Better performance when there are many differences between the quote and the closest document text.
- It will be easier for us to tune the degree of mismatch allowed between the quote and document text, and how candidate matches are ranked.

Add new fuzzy quote matching implementation
Implement a `matchQuote` function which will be used to replace `dom-anchor-text-quote` for finding the best match for annotation quotes in the document text. The new implementation is based on the `approx-string-match` library and provides several improvements over the existing one:

- Better performance when there are many differences between the quote and the closest document text.
- It will be easier for us to tune the degree of mismatch allowed between the quote and document text, and how candidate matches are ranked.
d2e9f195 · Robert Knight · f9ae90c0 · d2e9f195 · d2e9f195 · d2e9f195
Commit d2e9f195 authored Dec 10, 2020 by Robert Knight
Showing with 362 additions and 0 deletions

package.json package.json +1 -0

match-quote.js src/annotator/anchoring/match-quote.js +158 -0

match-quote-test.js src/annotator/anchoring/test/match-quote-test.js +198 -0

yarn.lock yarn.lock +5 -0

No files found.
--- a/package.json
+++ b/package.json
@@ -12,6 +12,7 @@
    "@babel/preset-react": "^7.0.0",
    "@octokit/rest": "^18.0.0",
    "@sentry/browser": "^5.6.2",
+    "approx-string-match": "^1.1.0",
    "autoprefixer": "^10.0.1",
    "aws-sdk": "^2.345.0",
    "axe-core": "^4.0.0",

--- a/src/annotator/anchoring/match-quote.js
+++ b/src/annotator/anchoring/match-quote.js
+import approxSearch from 'approx-string-match';
+
+/**
+ * @typedef {import('approx-string-match').Match} StringMatch
+ */
+
+/**
+ * @typedef Match
+ * @prop {number} start - Start offset of match in text
+ * @prop {number} end - End offset of match in text
+ * @prop {number} score -
+ *   Score for the match between 0 and 1.0, where 1.0 indicates a perfect match
+ *   for the quote and context.
+ */
+
+/**
+ * Find the best approximate matches for `str` in `text` allowing up to `maxErrors` errors.
+ *
+ * Exact occurrences are looked for first. If there are any, only those are
+ * returned (each with `errors: 0`, overlapping occurrences included) and
+ * `maxErrors` is not consulted. Otherwise the search is delegated to
+ * `approx-string-match`.
+ *
+ * @param {string} text - Text to search in
+ * @param {string} str - Pattern to look for
+ * @param {number} maxErrors - Error budget for the approximate search
+ * @return {StringMatch[]} Matches found; empty if `str` is empty
+ */
+function search(text, str, maxErrors) {
+  // An empty pattern has no meaningful match. It also must not reach the scan
+  // below: `indexOf('')` never returns -1, so that loop would never terminate.
+  if (str.length === 0) {
+    return [];
+  }
+
+  // Do a fast search for exact matches. The `approx-string-match` library
+  // doesn't currently incorporate this optimization itself.
+  /** @type {StringMatch[]} */
+  const exactMatches = [];
+  let matchPos = text.indexOf(str);
+  while (matchPos !== -1) {
+    exactMatches.push({
+      start: matchPos,
+      end: matchPos + str.length,
+      errors: 0,
+    });
+    // Resume one character later (not `str.length` later) so that overlapping
+    // occurrences are reported too.
+    matchPos = text.indexOf(str, matchPos + 1);
+  }
+  if (exactMatches.length > 0) {
+    return exactMatches;
+  }
+
+  // If there are no exact matches, do a more expensive search for matches
+  // with errors.
+  return approxSearch(text, str, maxErrors);
+}
+
+/**
+ * Compute a score between 0 and 1.0 for the similarity between `text` and `str`.
+ *
+ * The score is `1 - errors / str.length`, where `errors` is the error count of
+ * the first match that `search` reports for `str` in `text`.
+ *
+ * @param {string} text - Text actually found in the document
+ * @param {string} str - Text that was expected
+ * @return {number} Similarity score; 0 if either argument is empty
+ */
+function textMatchScore(text, str) {
+  // With an empty `str` the division below is undefined, and with an empty
+  // `text` there is nothing to compare against (this happens when a match
+  // sits at the very start or end of the document). Both count as "no
+  // similarity".
+  if (str.length === 0 || text.length === 0) {
+    return 0.0;
+  }
+
+  // Allow as many errors as `str` has characters so that even a very poor
+  // match can be scored.
+  const matches = search(text, str, str.length);
+
+  // Defensive: never dereference `matches[0]` if the search came back empty.
+  if (matches.length === 0) {
+    return 0.0;
+  }
+
+  // prettier-ignore
+  return 1 - (matches[0].errors / str.length);
+}
+
+/**
+ * Find the best approximate match for `quote` in `text`.
+ *
+ * Candidates are found by edit distance, then ranked by a weighted combination
+ * of quote similarity, prefix/suffix similarity and proximity to `hint`.
+ *
+ * Returns `null` if no match exceeding the minimum quality threshold was found.
+ *
+ * @param {string} text - Document text to search
+ * @param {string} quote - String to find within `text`
+ * @param {Object} context -
+ *   Context in which the quote originally appeared. This is used to choose the
+ *   best match.
+ *   @param {string} [context.prefix] - Expected text before the quote
+ *   @param {string} [context.suffix] - Expected text after the quote
+ *   @param {number} [context.hint] - Expected offset of match within text
+ * @return {Match|null}
+ */
+export function matchQuote(text, quote, context = {}) {
+  if (quote.length === 0) {
+    return null;
+  }
+
+  // Choose the maximum number of errors to allow for the initial search.
+  // This choice involves a tradeoff between:
+  //
+  //  - Recall (proportion of "good" matches found)
+  //  - Precision (proportion of matches found which are "good")
+  //  - Cost of the initial search and of processing the candidate matches [1]
+  //
+  // [1] Specifically, the expected-time complexity of the initial search is
+  //     `O((maxErrors / 32) * text.length)`. See `approx-string-match` docs.
+  const maxErrors = Math.min(256, quote.length / 2);
+
+  // Find closest matches for `quote` in `text` based on edit distance.
+  const matches = search(text, quote, maxErrors);
+
+  if (matches.length === 0) {
+    return null;
+  }
+
+  /**
+   * Score how well the text found next to a candidate agrees with the expected
+   * context. When there is no neighbouring text at all (candidate at the very
+   * start or end of `text`) the score is 0 and `textMatchScore` is not called.
+   *
+   * @param {string} actual
+   * @param {string} expected
+   */
+  const contextScore = (actual, expected) =>
+    actual.length === 0 ? 0.0 : textMatchScore(actual, expected);
+
+  /**
+   * Compute a score between 0 and 1.0 for a match candidate.
+   *
+   * @param {StringMatch} match
+   */
+  const scoreMatch = match => {
+    const quoteWeight = 50; // Similarity of matched text to quote.
+    const prefixWeight = 20; // Similarity of text before matched text to `context.prefix`.
+    const suffixWeight = 20; // Similarity of text after matched text to `context.suffix`.
+    const posWeight = 2; // Proximity to expected location. Used as a tie-breaker.
+
+    const quoteScore = 1 - match.errors / quote.length;
+
+    // Clamp the slice start at 0. A negative start would be interpreted by
+    // `slice` as an offset from the END of `text`, yielding the wrong (usually
+    // empty) context for candidates near the start of the document.
+    const prefixScore = context.prefix
+      ? contextScore(
+          text.slice(
+            Math.max(0, match.start - context.prefix.length),
+            match.start
+          ),
+          context.prefix
+        )
+      : 1.0;
+    const suffixScore = context.suffix
+      ? contextScore(
+          text.slice(match.end, match.end + context.suffix.length),
+          context.suffix
+        )
+      : 1.0;
+
+    let posScore = 1.0;
+    if (typeof context.hint === 'number') {
+      const offset = Math.abs(match.start - context.hint);
+      posScore = 1.0 - offset / text.length;
+    }
+
+    const rawScore =
+      quoteWeight * quoteScore +
+      prefixWeight * prefixScore +
+      suffixWeight * suffixScore +
+      posWeight * posScore;
+    const maxScore = quoteWeight + prefixWeight + suffixWeight + posWeight;
+    const normalizedScore = rawScore / maxScore;
+
+    return normalizedScore;
+  };
+
+  // Rank matches based on similarity of actual and expected surrounding text
+  // and actual/expected offset in the document text, keeping the candidate
+  // with the highest score. The comparison is strict, so when scores tie the
+  // candidate that comes first in `matches` wins.
+  /** @type {Match|null} */
+  let best = null;
+  for (const m of matches) {
+    const score = scoreMatch(m);
+    if (best === null || score > best.score) {
+      best = { start: m.start, end: m.end, score };
+    }
+  }
+  return best;
+}
--- a/src/annotator/anchoring/test/match-quote-test.js
+++ b/src/annotator/anchoring/test/match-quote-test.js
+import { matchQuote } from '../match-quote';
+
+// Sample passages used as the document text in the tests below. The line
+// breaks and indentation inside these template literals are part of the
+// strings as written here; they are collapsed by `normalize` further down
+// before any test runs.
+const fixtures = {
+  solitude: `Many years later, as he faced the firing squad,
+    Colonel Aureliano Buendía was to remember that distant afternoon
+    when his father took him to discover ice`,
+
+  twoCities: `It was the best of times, it was the worst of times,
+    it was the age of wisdom, it was the age of foolishness, it was the epoch of belief,
+    it was the epoch of incredulity, it was the season of Light, it was the
+    season of Darkness, it was the spring of hope, it was the winter of despair, we had
+    everything before us, we had nothing before us, we were all going direct to Heaven,
+    we were all going direct the other way.`,
+};
+
+/**
+ * Collapse every run of whitespace in `str` (spaces, tabs, newlines) into a
+ * single space character.
+ *
+ * @param {string} str
+ * @return {string}
+ */
+function normalize(str) {
+  const whitespaceRun = /\s+/g;
+  return str.replace(whitespaceRun, ' ');
+}
+
+// Replace each fixture, in place, with its whitespace-normalized form.
+for (const key of Object.keys(fixtures)) {
+  fixtures[key] = normalize(fixtures[key]);
+}
+
+// Tests for `matchQuote`: exact and approximate matching, how the score
+// responds to quote/prefix/suffix similarity, the `null` results, and the use
+// of context and `hint` to choose between several occurrences of a quote.
+describe('matchQuote', () => {
+  it('finds exact match', () => {
+    const match = matchQuote(fixtures.solitude, 'discover ice');
+    assert.equal(match.score, 1.0);
+    assert.equal(
+      fixtures.solitude.slice(match.start, match.end),
+      'discover ice'
+    );
+  });
+
+  it('finds best approximate match if there is no exact match', () => {
+    const match = matchQuote(fixtures.solitude, 'some years later');
+    assert.isTrue(match.score > 0);
+    assert.isTrue(match.score < 1);
+    assert.equal(
+      fixtures.solitude.slice(match.start, match.end),
+      'Many years later'
+    );
+  });
+
+  it('scores matches based on quote similarity', () => {
+    // List of quotes in descending order of similarity to the text.
+    const quotes = [
+      'Many years later',
+      'Many yers later',
+      'Some years later',
+      'Some years after',
+    ];
+
+    const scores = quotes.map(q => matchQuote(fixtures.solitude, q).score);
+
+    // Each score must be strictly below the score of the preceding entry.
+    for (let i = 1; i < scores.length; i++) {
+      assert.isBelow(scores[i], scores[i - 1]);
+    }
+  });
+
+  it('scores matches based on prefix similarity', () => {
+    // List of prefixes in descending order of similarity to the actual prefix
+    // of the quote.
+    const prefixes = [
+      'Many years later',
+      'Many yers later',
+      'Some years later',
+      'Some years after',
+    ];
+
+    const quote = ', as he faced the firing squad';
+    const scores = prefixes.map(
+      p => matchQuote(fixtures.solitude, quote, { prefix: p }).score
+    );
+
+    // Each score must be strictly below the score of the preceding entry.
+    for (let i = 1; i < scores.length; i++) {
+      assert.isBelow(scores[i], scores[i - 1]);
+    }
+  });
+
+  it('scores matches based on suffix similarity', () => {
+    // List of suffixes in descending order of similarity to the actual suffix
+    // of the quote.
+    const suffixes = [
+      ', as he faced the firing squad',
+      ', as she faced the firing squad',
+      ', as he awaited the firing squad',
+      ', as he awaited his death',
+    ];
+
+    const quote = 'Many years later';
+    const scores = suffixes.map(
+      s => matchQuote(fixtures.solitude, quote, { suffix: s }).score
+    );
+
+    // Each score must be strictly below the score of the preceding entry.
+    for (let i = 1; i < scores.length; i++) {
+      assert.isBelow(scores[i], scores[i - 1]);
+    }
+  });
+
+  it('returns `null` if there is no acceptable approximate match', () => {
+    // The whole `solitude` passage is used as the quote and searched for in
+    // the unrelated `twoCities` passage.
+    const match = matchQuote(fixtures.twoCities, fixtures.solitude);
+    assert.equal(match, null);
+  });
+
+  it('returns `null` if quote is empty', () => {
+    assert.equal(matchQuote('foobar', ''), null);
+  });
+
+  it('returns `null` if text is empty', () => {
+    assert.equal(matchQuote('', 'foobar'), null);
+  });
+
+  // Table-driven cases. `quote` occurs more than once in `twoCities`;
+  // `expected` is a longer string that begins at the occurrence which should
+  // be selected, so `text.indexOf(expected)` gives the expected match start.
+  [
+    // Exact prefix matches.
+    {
+      quote: 'before us',
+      prefix: 'we had everything',
+      expected: 'before us, we had nothing',
+    },
+    {
+      quote: 'before us',
+      prefix: 'we had nothing',
+      expected: 'before us, we were all going',
+    },
+
+    // Approximate prefix matches.
+    {
+      quote: 'before us',
+      prefix: 'we had every-thing',
+      expected: 'before us, we had nothing',
+    },
+    {
+      quote: 'before us',
+      prefix: 'we had nout',
+      expected: 'before us, we were all going',
+    },
+
+    // Exact suffix matches.
+    {
+      quote: 'we had',
+      suffix: 'everything',
+      expected: 'we had everything',
+    },
+    {
+      quote: 'we had',
+      suffix: 'nothing',
+      expected: 'we had nothing',
+    },
+
+    // Approximate suffix matches.
+    {
+      quote: 'we had',
+      suffix: 'ever ting',
+      expected: 'we had everything',
+    },
+    {
+      quote: 'we had',
+      suffix: 'nutting',
+      expected: 'we had nothing',
+    },
+  ].forEach(({ quote, prefix, suffix, expected }, i) => {
+    it(`finds match with best context match (${i})`, () => {
+      const text = fixtures.twoCities;
+      const match = matchQuote(text, quote, {
+        prefix,
+        suffix,
+      });
+      assert.ok(match);
+      assert.equal(text.slice(match.start, match.end), quote);
+      assert.equal(match.start, text.indexOf(expected));
+    });
+  });
+
+  it('uses `hint` as a tie-breaker to choose between matches with close scores', () => {
+    const text = fixtures.twoCities;
+    // Offsets of the two occurrences of "before us" in the text.
+    const posA = text.indexOf('everything before us') + 'everything '.length;
+    const posB = text.indexOf('nothing before us') + 'nothing '.length;
+
+    // Search for a quote that appears multiple times in the text. Since no
+    // context is provided, there will be several matches with equal scores to
+    // choose between.
+    const matchHintA = matchQuote(text, 'befor us', { hint: posA });
+    const matchHintB = matchQuote(text, 'befor us', { hint: posB });
+    const matchNoHint = matchQuote(text, 'befor us');
+
+    // When a hint is provided, `matchQuote` should choose between otherwise
+    // equal matches based on how close the match start is to `hint`.
+    assert.ok(matchHintA);
+    assert.equal(matchHintA.start, posA, 'Wrong match for hint `posA`');
+
+    assert.ok(matchHintB);
+    assert.equal(matchHintB.start, posB, 'Wrong match for hint `posB`');
+
+    // When no hint is provided, the first match (ie. lowest `match.start`)
+    // should be chosen.
+    assert.ok(matchNoHint);
+    assert.equal(matchNoHint.start, posA, 'Wrong match with no hint');
+  });
+});
--- a/yarn.lock
+++ b/yarn.lock
@@ -1340,6 +1340,11 @@ append-buffer@^1.0.2:
  dependencies:
    buffer-equal "^1.0.0"

+approx-string-match@^1.1.0:
+  version "1.1.0"
+  resolved "https://registry.yarnpkg.com/approx-string-match/-/approx-string-match-1.1.0.tgz#2fb8e1d6dcb640acc1c0d1ae9f0895348d06f4c0"
+  integrity sha512-j1yQB9XhfGWsvTfHEuNsR/SrUT4XQDkAc0PEjMifyi97931LmNQyLsO6HbuvZ3HeMx+3Dvk8m8XGkUF+8lCeqw==
+
 archy@^1.0.0:
  version "1.0.0"
  resolved "https://registry.yarnpkg.com/archy/-/archy-1.0.0.tgz#f9c8c13757cc1dd7bc379ac77b2c62a5c2868c40"