Merge pull request #1449 from hypothesis/fix-pdf-position-anchoring-bug

Fix PDF anchoring when annotation refers to last text on a page

Merge pull request #1449 from hypothesis/fix-pdf-position-anchoring-bug
Fix PDF anchoring when annotation refers to last text on a page
1c80ed66 · Robert Knight · GitHub · 7cf89d61 · 8c45ac32 · 1c80ed66
Unverified Commit 1c80ed66 authored Oct 29, 2019 by Robert Knight Committed by GitHub Oct 29, 2019
4 changed files
--- a/src/annotator/anchoring/pdf.js
+++ b/src/annotator/anchoring/pdf.js
@@ -9,9 +9,9 @@ const seek = require('dom-seek');
 const createNodeIterator = require('dom-node-iterator/polyfill')();

 const xpathRange = require('./range');
-const html = require('./html');
 const RenderingStates = require('../pdfjs-rendering-states');
 const { TextPositionAnchor, TextQuoteAnchor } = require('./types');
+const { toRange: textPositionToRange } = require('./text-position');

 // Caches for performance.

@@ -210,10 +210,9 @@ function findPage(offset) {
 *
 * @param {number} pageIndex - The PDF page index
 * @param {TextPositionAnchor} anchor - Anchor to locate in page
- * @param {Object} options - Options for `anchor.toSelector`
 * @return {Promise<Range>}
 */
-async function anchorByPosition(pageIndex, anchor, options) {
+async function anchorByPosition(pageIndex, anchor) {
  const page = await getPageView(pageIndex);

  let renderingDone = false;
@@ -221,11 +220,9 @@ async function anchorByPosition(pageIndex, anchor, options) {
    renderingDone = page.textLayer.renderingDone;
  }
  if (page.renderingState === RenderingStates.FINISHED && renderingDone) {
-    // The page has been rendered. Use HTML anchoring to locate the quote in
-    // the text layer.
+    // The page has been rendered. Locate the position in the text layer.
    const root = page.textLayer.textLayerDiv;
-    const selector = anchor.toSelector(options);
-    return html.anchor(root, [selector]);
+    return textPositionToRange(root, anchor.start, anchor.end);
  }

  // The page has not been rendered yet. Create a placeholder element and
@@ -345,10 +342,9 @@ function prioritizePages(position) {
 *
 * @param {HTMLElement} root
 * @param {Array} selectors - Selector objects to anchor
- * @param {Object} options - Options to pass to selector anchoring
 * @return {Promise<Range>}
 */
-function anchor(root, selectors, options = {}) {
+function anchor(root, selectors) {
  const position = selectors.find(s => s.type === 'TextPositionSelector');
  const quote = selectors.find(s => s.type === 'TextQuoteSelector');

@@ -371,7 +367,7 @@ function anchor(root, selectors, options = {}) {
        checkQuote(textContent.substr(start, length));

        const anchor = new TextPositionAnchor(root, start, end);
-        return anchorByPosition(index, anchor, options);
+        return anchorByPosition(index, anchor);
      });
    });
  }
@@ -386,7 +382,7 @@ function anchor(root, selectors, options = {}) {
        const { pageIndex, anchor } = quotePositionCache[quote.exact][
          position.start
        ];
-        return anchorByPosition(pageIndex, anchor, options);
+        return anchorByPosition(pageIndex, anchor);
      }

      return prioritizePages(position).then(pageIndices => {

--- a/src/annotator/anchoring/test/pdf-test.js
+++ b/src/annotator/anchoring/test/pdf-test.js
@@ -188,6 +188,19 @@ describe('annotator/anchoring/pdf', function() {
      });
    });

+    // See https://github.com/hypothesis/client/issues/1329
+    it('anchors selectors that match the last text on the page', async () => {
+      viewer.pdfViewer.setCurrentPage(1);
+      const selectors = [
+        {
+          type: 'TextQuoteSelector',
+          exact: 'horde of the living dead.',
+        },
+      ];
+      const anchoredRange = await pdfAnchoring.anchor(container, selectors);
+      assert.equal(anchoredRange.toString(), selectors[0].exact);
+    });
+
    [
      {
        // Position on same page as quote but different text.

--- a/src/annotator/anchoring/test/text-position-test.js
+++ b/src/annotator/anchoring/test/text-position-test.js
+'use strict';
+
+const { toRange } = require('../text-position');
+
+describe('text-position', () => {
+  let container;
+
+  before(() => {
+    container = document.createElement('div');
+    container.innerHTML = `<h1>Test article</h1>
+<p>First paragraph.</p>
+<p>Second paragraph.</p>`;
+  });
+
+  after(() => {
+    container.remove();
+  });
+
+  describe('toRange', () => {
+    const testCase = (description, text) => ({
+      description,
+      text,
+      expected: text,
+    });
+
+    [
+      testCase('start text of root', 'Test article'),
+      testCase('a whole text node', 'First paragraph.'),
+      testCase('end text of root', 'Second paragraph.'),
+      testCase('part of a text node', 'rst paragraph'),
+      {
+        description: 'negative start offset',
+        start: -5,
+        end: 5,
+        expected: new Error('invalid start offset'),
+      },
+      {
+        description: 'invalid start offset',
+        start: 1000,
+        end: 1010,
+        expected: new Error('invalid start offset'),
+      },
+      {
+        description: 'invalid end offset',
+        start: 0,
+        end: 1000,
+        expected: new Error('invalid end offset'),
+      },
+      {
+        description: 'an empty range',
+        start: 0,
+        end: 0,
+        expected: '',
+      },
+      {
+        description: 'a range with end < start',
+        start: 10,
+        end: 5,
+        expected: '',
+      },
+    ].forEach(({ description, start, end, expected, text }) => {
+      it(`returns a range with the correct text (${description})`, () => {
+        if (text) {
+          start = container.textContent.indexOf(text);
+          end = start + text.length;
+        }
+
+        if (expected instanceof Error) {
+          assert.throws(() => {
+            toRange(container, start, end);
+          }, expected.message);
+        } else {
+          const range = toRange(container, start, end);
+          assert.equal(range.toString(), expected);
+        }
+      });
+    });
+  });
+});
--- a/src/annotator/anchoring/text-position.js
+++ b/src/annotator/anchoring/text-position.js
+'use strict';
+
+/**
+ * Functions to convert between DOM ranges and characters offsets within the
+ * `textContent` of HTML elements.
+ *
+ * These were added to work around issues in `dom-anchor-text-position`'s
+ * `toRange` implementation. When the issue is resolved upstream, we may still
+ * want to keep the test suite for this module.
+ *
+ * See https://github.com/hypothesis/client/issues/1329
+ */
+
+/**
+ * Convert `start` and `end` character offset positions within the `textContent`
+ * of a `root` element into a `Range`.
+ *
+ * Throws if the `start` or `end` offsets are outside of the range `[0,
+ * root.textContent.length]`.
+ *
+ * @param {HTMLElement} root
+ * @param {number} start - Character offset within `root.textContent`
+ * @param {number} end - Character offset within `root.textContent`
+ * @return {Range} Range spanning text from `start` to `end`
+ */
+function toRange(root, start, end) {
+  // The `filter` and `expandEntityReferences` arguments are mandatory in IE
+  // although optional according to the spec.
+  const nodeIter = root.ownerDocument.createNodeIterator(
+    root,
+    NodeFilter.SHOW_TEXT,
+    null, // filter
+    false // expandEntityReferences
+  );
+
+  let startContainer;
+  let startOffset;
+  let endContainer;
+  let endOffset;
+
+  let textLength = 0;
+
+  let node;
+  while ((node = nodeIter.nextNode()) && (!startContainer || !endContainer)) {
+    const nodeText = node.nodeValue;
+
+    if (
+      !startContainer &&
+      start >= textLength &&
+      start <= textLength + nodeText.length
+    ) {
+      startContainer = node;
+      startOffset = start - textLength;
+    }
+
+    if (
+      !endContainer &&
+      end >= textLength &&
+      end <= textLength + nodeText.length
+    ) {
+      endContainer = node;
+      endOffset = end - textLength;
+    }
+
+    textLength += nodeText.length;
+  }
+
+  if (!startContainer) {
+    throw new Error('invalid start offset');
+  }
+  if (!endContainer) {
+    throw new Error('invalid end offset');
+  }
+
+  const range = root.ownerDocument.createRange();
+  range.setStart(startContainer, startOffset);
+  range.setEnd(endContainer, endOffset);
+
+  return range;
+}
+
+module.exports = {
+  toRange,
+};