Commit 0d193d1f authored by Robert Knight's avatar Robert Knight

Check for context match before stopping search early

To reduce the chances of mis-anchoring quotes, check for a context match
before stopping early. This helps when the quote is a word or phrase
that is common in the document and the position hint is wrong, which can
occur if the quote selector was created when using a different PDF
viewer or version of PDF.js, which exacts text differently.

For quotes captured at the start or end of a page we expect a mismatch
in the prefix and suffix respectively because quote selectors can currently
capture parts of the surrounding pages / DOM in `describe`, but when
anchoring we only search for matches in one page of text at a time.
parent a8c3e38e
......@@ -437,12 +437,36 @@ async function anchorQuote(quoteSelector, positionHint) {
},
};
// If we found an exact match, stop early. This improves search performance
// in long documents compared to testing every page.
// If we find a very good match, stop early.
//
// A known limitation here is that the quote context (prefix and suffix)
// is not considered.
if (strippedText.slice(match.start, match.end) === strippedQuote) {
// There is a tradeoff here between optimizing search performance and
// ensuring that we have found the best match in the document.
//
// The current heuristics are that we require an exact match for the quote
// and either the preceding or following context. The context matching
// helps to avoid incorrectly stopping the search early if the quote is
// a word or phrase that is common in the document.
const exactQuoteMatch =
strippedText.slice(match.start, match.end) === strippedQuote;
const exactPrefixMatch =
strippedPrefix !== undefined &&
strippedText.slice(
Math.max(0, match.start - strippedPrefix.length),
match.start
) === strippedPrefix;
const exactSuffixMatch =
strippedSuffix !== undefined &&
strippedText.slice(match.end, strippedSuffix.length) === strippedSuffix;
const hasContext =
strippedPrefix !== undefined || strippedSuffix !== undefined;
if (
exactQuoteMatch &&
(exactPrefixMatch || exactSuffixMatch || !hasContext)
) {
break;
}
}
......
......@@ -320,6 +320,74 @@ describe('annotator/anchoring/pdf', () => {
});
});
[
{
// If there is only a prefix, that should match.
test: 'prefix-only',
prefix: 'that',
suffix: undefined,
expectedMatch: 'Netherfield Park is occupied again?',
},
{
// If there is only a suffix, that should match.
test: 'suffix-only',
prefix: undefined,
suffix: ' Park is occupied',
expectedMatch: 'Netherfield Park is occupied again?',
},
{
// If there is both a prefix and suffix, either can match
test: 'suffix-match',
prefix: 'DOES NOT MATCH',
suffix: ' Park is occupied',
expectedMatch: 'Netherfield Park is occupied again?',
},
{
// If there is both a prefix and suffix, either can match
test: 'prefix-match',
prefix: 'that',
suffix: 'DOES NOT MATCH',
expectedMatch: 'Netherfield Park is occupied again?',
},
{
// If there is neither a prefix or suffix, only the quote matters.
test: 'no-context',
prefix: undefined,
suffix: undefined,
expectedMatch: 'recent attacks at Netherfield Park',
},
].forEach(({ test, prefix, suffix, expectedMatch }) => {
it(`prefers a context match for quote selectors (${test})`, async () => {
const expectedPage = fixtures.pdfPages.findIndex(page =>
page.includes(expectedMatch)
);
assert.notEqual(expectedPage, -1);
// Ensure the page where we expect to find the match is rendered, otherwise
// the quote will be anchored to a placeholder.
viewer.pdfViewer.setCurrentPage(expectedPage);
// Create a quote selector where the `exact` phrase occurs on multiple
// pages.
const quote = {
type: 'TextQuoteSelector',
exact: 'Netherfield',
prefix,
suffix,
};
// Anchor the quote without providing a position selector, so pages are tried in order.
const range = await pdfAnchoring.anchor(container, [quote]);
// Check that we found the match on the expected page.
assert.equal(range.toString(), 'Netherfield');
assert.include(
range.startContainer.parentElement.textContent,
expectedMatch
);
});
});
// The above test does high-level checking that whitespace mismatches don't
// affect quote anchoring. This test checks calls to `matchQuote` in more detail.
it('ignores spaces when searching for quote matches', async () => {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment