Fix range selector anchoring in XHTML documents

In HTML documents that have been served with an XML mime type, `document.evaluate` handles element names differently. In HTML documents the XPath segments do not require prefixes on element names. In XHTML documents however they do. Since the client always generates the same un-prefixed XPaths regardless of document type, evaluation always failed. There was a fallback path but it was only executed if `document.evaluate` threw an error, not if it returned `null` as in this case. This commit resolves the issue by first attempting to evaluate the XPath using custom logic that only handles simple XPaths and then using `document.evaluate` only for more complex XPaths. The _simple XPath_ logic behaves the same in HTML and XML documents and ignores namespaces. As a result it works with the XPaths that the client generates regardless of document type and also regardless of whether the XPath references only HTML elements or elements from other namespaces (eg. MathML or SVG). We could change the way that the client generates XPaths in future in XML documents or SVG/MathML content within HTML documents, but the client would still need to handle `RangeSelector` selectors on existing annotations. For more details, see https://github.com/hypothesis/client/pull/2590#issuecomment-702132563 Fixes #2592

Fix range selector anchoring in XHTML documents
In HTML documents that have been served with an XML mime type, `document.evaluate` handles element names differently. In HTML documents the XPath segments do not require prefixes on element names. In XHTML documents however they do. Since the client always generates the same un-prefixed XPaths regardless of document type, evaluation always failed. There was a fallback path but it was only executed if `document.evaluate` threw an error, not if it returned `null` as in this case. This commit resolves the issue by first attempting to evaluate the XPath using custom logic that only handles simple XPaths and then using `document.evaluate` only for more complex XPaths. The _simple XPath_ logic behaves the same in HTML and XML documents and ignores namespaces. As a result it works with the XPaths that the client generates regardless of document type and also regardless of whether the XPath references only HTML elements or elements from other namespaces (eg. MathML or SVG). We could change the way that the client generates XPaths in future in XML documents or SVG/MathML content within HTML documents, but the client would still need to handle `RangeSelector` selectors on existing annotations. For more details, see https://github.com/hypothesis/client/pull/2590#issuecomment-702132563 Fixes #2592
39a806fc · Robert Knight · 205b64df · 39a806fc · 39a806fc · 39a806fc
Commit 39a806fc authored Sep 30, 2020 by Robert Knight
4 changed files
--- a/src/annotator/anchoring/test/xpath-test.js
+++ b/src/annotator/anchoring/test/xpath-test.js
@@ -12,52 +12,107 @@ describe('annotator/anchoring/xpath', () => {
            <li id="li-1">text 1</li>
            <li id="li-2">text 2</li>
            <li id="li-3">text 3</li>
+            <custom-element>text 4</custom-element>
          </ul>
+          <math xmlns="http://www.w3.org/1998/Math/MathML">
+            <msqrt><mrow><mi>x</mi><mo>+</mo><mn>1</mn></mrow></msqrt>
+          </math>
+          <svg viewBox="0 0 240 80" xmlns="http://www.w3.org/2000/svg">
+            <text x="20" y="35">Hello</text>
+            <text x="40" y="35">world</text>
+          </svg>
        </span>`;

    beforeEach(() => {
      container = document.createElement('div');
      container.innerHTML = html;
      document.body.appendChild(container);
+
+      sinon.spy(document, 'evaluate');
    });

    afterEach(() => {
+      document.evaluate.restore();
+
      container.remove();
    });

    [
+      // Single element path
      {
        xpath: '/h1[1]',
        nodeName: 'H1',
      },
+      // Multi-element path
      {
-        xpath: '/p[1]/a[1]/text()[1]',
-        nodeName: '#text',
+        xpath: '/p[1]/a[1]',
+        nodeName: 'A',
      },
      {
        xpath: '/span[1]/ul[1]/li[2]',
        nodeName: 'LI',
      },
+      // Upper-case element names
      {
        xpath: '/SPAN[1]/UL[1]/LI[2]',
        nodeName: 'LI',
      },
+      // Element path with implicit `[1]` index
+      {
+        xpath: '/h1',
+        nodeName: 'H1',
+      },
+      // Custom element
+      {
+        xpath: '/span/ul/custom-element',
+        nodeName: 'CUSTOM-ELEMENT',
+      },
+      // Embedded MathML
+      {
+        xpath: '/span/math/msqrt/mrow/mi',
+        nodeName: 'mi',
+      },
+      {
+        xpath: '/SPAN/MATH/MSQRT/MROW/MI',
+        nodeName: 'mi',
+      },
+      // Embedded SVG
      {
-        xpath: '/SPAN[1]/UL[1]/LI[2]/text()',
-        nodeName: '#text',
+        xpath: '/span[1]/svg[1]/text[2]',
+        nodeName: 'text',
      },
    ].forEach(test => {
-      it('it returns the node associated with the XPath', () => {
+      it('evaluates simple XPaths without using `document.evaluate`', () => {
        const result = nodeFromXPath(test.xpath, container);
-        assert.equal(result.nodeName, test.nodeName);
+        assert.notCalled(document.evaluate);
+        assert.equal(result?.nodeName, test.nodeName);
+      });
    });

-      it('it returns the node associated with the XPath when `document.evaluate` throws an error', () => {
-        const result = nodeFromXPath(test.xpath, container);
-        sinon.stub(document, 'evaluate').throws(new Error());
-        const resultFallback = nodeFromXPath(test.xpath, container);
-        assert.equal(result, resultFallback);
-        document.evaluate.restore();
+    ['/missing/element', '/span[0]'].forEach(xpath => {
+      it('returns `null` if simple XPath evaluation fails', () => {
+        const result = nodeFromXPath(xpath, container);
+        assert.notCalled(document.evaluate);
+        assert.strictEqual(result, null);
+      });
+    });
+
+    [
+      ['/*[local-name()="h1"]', 'H1'],
+      ['/span[-1]', null],
+    ].forEach(([xpath, expectedNodeName]) => {
+      it('uses `document.evaluate` for non-simple XPaths', () => {
+        const result = nodeFromXPath(xpath, container);
+        assert.calledOnce(document.evaluate);
+        assert.strictEqual(result?.nodeName ?? result, expectedNodeName);
+      });
+    });
+
+    ['not-a-valid-xpath'].forEach(xpath => {
+      it('throws if XPath is invalid', () => {
+        assert.throws(() => {
+          nodeFromXPath(xpath, container);
+        }, /The string '.*' is not a valid XPath expression/);
      });
    });
  });

--- a/src/annotator/anchoring/test/xpath-util-test.js
+++ b/src/annotator/anchoring/test/xpath-util-test.js
 import {
-  findChild,
  getTextNodes,
  getLastTextNodeUpTo,
  getFirstTextNodeNotBefore,
@@ -7,55 +6,6 @@ import {
 } from '../xpath-util';

 describe('annotator/anchoring/xpath-util', () => {
-  describe('findChild', () => {
-    let container;
-
-    beforeEach(() => {
-      container = document.createElement('div');
-      document.body.appendChild(container);
-    });
-
-    afterEach(() => {
-      container.remove();
-    });
-
-    it('throws an error when the parent has no children', () => {
-      assert.throws(() => {
-        findChild(container, 'fake', 0);
-      }, 'XPath error: node has no children!');
-    });
-
-    it('throws an error when the desired child is not found', () => {
-      container.innerHTML = `<div>text</div>`;
-      assert.throws(() => {
-        findChild(container, 'fake', 0);
-      }, 'XPath error: wanted child not found.');
-    });
-
-    describe('with children', () => {
-      beforeEach(() => {
-        container.innerHTML = `text 1<span>text 2</span><span>text 3</span>text 4`;
-      });
-
-      it('finds the desired text() node at index 1', () => {
-        assert.equal(findChild(container, 'text()', 1).nodeValue, 'text 1');
-        document.body.removeChild(container);
-      });
-
-      it('finds the desired text() node at index 2', () => {
-        assert.equal(findChild(container, 'text()', 2).nodeValue, 'text 4');
-      });
-
-      it('finds the desired span node at index 1', () => {
-        assert.equal(findChild(container, 'span', 1).innerText, 'text 2');
-      });
-
-      it('finds the desired span node at index 2', () => {
-        assert.equal(findChild(container, 'span', 2).innerText, 'text 3');
-      });
-    });
-  });
-
  describe('getTextNodes', () => {
    let container;


--- a/src/annotator/anchoring/xpath-util.js
+++ b/src/annotator/anchoring/xpath-util.js
-/**
- * Finds the child node associated with the provided index and
- * type relative from a parent.
- */
-export function findChild(parent, type, index) {
-  if (!parent.hasChildNodes()) {
-    throw new Error('XPath error: node has no children!');
-  }
-  const children = parent.childNodes;
-  let found = 0;
-  for (let i = 0; i < children.length; i++) {
-    const child = children[i];
-    let name = getNodeName(child);
-    if (name === type) {
-      found += 1;
-      if (found === index) {
-        return child;
-      }
-    }
-  }
-  throw new Error('XPath error: wanted child not found.');
-}
-
 /**
 * Get the node name for use in generating an xpath expression.
 *

--- a/src/annotator/anchoring/xpath.js
+++ b/src/annotator/anchoring/xpath.js
-import { findChild } from './xpath-util';
-
 export { xpathFromNode } from './xpath-util';

 /**
- * Finds an element node using an XPath relative to the document root.
+ * Return the `index`'th immediate child of `element` whose tag name is
+ * `nodeName` (case insensitive).
+ *
+ * @param {Element} element
+ * @param {string} nodeName
+ * @param {number} index
 */
-function nodeFromXPathFallback(xpath, root) {
-  const steps = xpath.substring(1).split('/');
-  let node = root;
-  steps.forEach(step => {
-    let [name, idx] = step.split('[');
-    idx = idx ? parseInt(idx.split(']')[0]) : 1;
-    node = findChild(node, name.toLowerCase(), idx);
-  });
-  return node;
+function nthChildOfType(element, nodeName, index) {
+  nodeName = nodeName.toUpperCase();
+
+  let matchIndex = -1;
+  for (let i = 0; i < element.children.length; i++) {
+    const child = element.children[i];
+    if (child.nodeName.toUpperCase() === nodeName) {
+      ++matchIndex;
+      if (matchIndex === index) {
+        return child;
+      }
+    }
+  }
+
+  return null;
 }

 /**
- * Finds an element node using an XPath relative to the document root.
+ * Evaluate a _simple XPath_ relative to a `root` element and return the
+ * matching element.
 *
- * Example:
- *   node = nodeFromXPath('/html/body/div/p[2]')
+ * A _simple XPath_ is a sequence of one or more `/tagName[index]` strings.
+ *
+ * Unlike `document.evaluate` this function:
+ *
+ *  - Only supports simple XPaths
+ *  - Is not affected by the document's _type_ (HTML or XML/XHTML)
+ *  - Ignores element namespaces when matching element names in the XPath against
+ *    elements in the DOM tree
+ *  - Is case insensitive for all elements, not just HTML elements
+ *
+ * The matching element is returned or `null` if no such element is found.
+ * An error is thrown if `xpath` is not a simple XPath.
+ *
+ * @param {string} xpath
+ * @param {Element} root
+ * @return {Element|null}
 */
-export function nodeFromXPath(xpath, root = document) {
-  /**
-   * Attempt to evaluate a provided XPath. If the evaluation fails, then fall back to a
-   * manual evaluation using `nodeFromXPathFallback` that can evaluate very simple XPaths such
-   * as those generated by `xpathFromNode`
+function evaluateSimpleXPath(xpath, root) {
+  const isSimpleXPath =
+    xpath.match(/^(\/[A-Za-z0-9-]+(\[[0-9]+\])?)+$/) !== null;
+  if (!isSimpleXPath) {
+    throw new Error('Expression is not a simple XPath');
+  }
+
+  const segments = xpath.split('/');
+  let element = root;
+
+  // Remove leading empty segment. The regex above validates that the XPath
+  // has at least two segments, with the first being empty and the others non-empty.
+  segments.shift();
+
+  for (let segment of segments) {
+    let elementName;
+    let elementIndex;
+
+    const separatorPos = segment.indexOf('[');
+    if (separatorPos !== -1) {
+      elementName = segment.slice(0, separatorPos);
+
+      const indexStr = segment.slice(separatorPos + 1, segment.indexOf(']'));
+      elementIndex = parseInt(indexStr) - 1;
+      if (elementIndex < 0) {
+        return null;
+      }
+    } else {
+      elementName = segment;
+      elementIndex = 0;
+    }
+
+    const child = nthChildOfType(element, elementName, elementIndex);
+    if (!child) {
+      return null;
+    }
+
+    element = child;
+  }
+
+  return element;
+}
+
+/**
+ * Finds an element node using an XPath relative to `root`
+ *
+ * Example:
+ *   node = nodeFromXPath('/main/article[1]/p[3]', document.body)
+ *
+ * @param {string} xpath
+ * @param {Element} [root]
 */
-  function evaluateXPath(xp, root, nsResolver = null) {
+export function nodeFromXPath(xpath, root = document.body) {
  try {
+    return evaluateSimpleXPath(xpath, root);
+  } catch (err) {
    return document.evaluate(
-        '.' + xp,
+      '.' + xpath,
      root,
-        nsResolver,
+      null /* nsResolver */,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
-        null
+      null /* result */
    ).singleNodeValue;
-    } catch (e) {
-      // In the case of an XPath error, fall back to a manual evaluation
-      // that should be sufficient for simple expressions.
-      //
-      // See http://www.w3.org/TR/DOM-Level-3-XPath/xpath.html#XPathException
-      //
-      // In practice, it is unknown whether this exception occurs often. This may
-      // be a good place to insert analytics.
-      return nodeFromXPathFallback(xp, root);
-    }
-  }
-  return evaluateXPath(xpath, root);
+  }
 }