Commit 39a806fc authored by Robert Knight's avatar Robert Knight

Fix range selector anchoring in XHTML documents

In HTML documents that have been served with an XML mime type,
`document.evaluate` handles element names differently. In HTML documents
the XPath segments do not require prefixes on element names. In XHTML
documents however they do. Since the client always generates the same
un-prefixed XPaths regardless of document type, evaluation always
failed. There was a fallback path but it was only executed if
`document.evaluate` threw an error, not if it returned `null` as in this
case.

This commit resolves the issue by first attempting to evaluate the XPath
using custom logic that only handles simple XPaths and then using
`document.evaluate` only for more complex XPaths. The _simple XPath_
logic behaves the same in HTML and XML documents and ignores namespaces.
As a result it works with the XPaths that the client generates
regardless of document type and also regardless of whether the XPath
references only HTML elements or elements from other namespaces (eg.
MathML or SVG).

We could change the way that the client generates XPaths in future in
XML documents or SVG/MathML content within HTML documents, but the
client would still need to handle `RangeSelector` selectors on existing
annotations.

For more details, see https://github.com/hypothesis/client/pull/2590#issuecomment-702132563

Fixes #2592
parent 205b64df
......@@ -12,52 +12,107 @@ describe('annotator/anchoring/xpath', () => {
<li id="li-1">text 1</li>
<li id="li-2">text 2</li>
<li id="li-3">text 3</li>
<custom-element>text 4</custom-element>
</ul>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<msqrt><mrow><mi>x</mi><mo>+</mo><mn>1</mn></mrow></msqrt>
</math>
<svg viewBox="0 0 240 80" xmlns="http://www.w3.org/2000/svg">
<text x="20" y="35">Hello</text>
<text x="40" y="35">world</text>
</svg>
</span>`;
beforeEach(() => {
container = document.createElement('div');
container.innerHTML = html;
document.body.appendChild(container);
sinon.spy(document, 'evaluate');
});
afterEach(() => {
document.evaluate.restore();
container.remove();
});
[
// Single element path
{
xpath: '/h1[1]',
nodeName: 'H1',
},
// Multi-element path
{
xpath: '/p[1]/a[1]/text()[1]',
nodeName: '#text',
xpath: '/p[1]/a[1]',
nodeName: 'A',
},
{
xpath: '/span[1]/ul[1]/li[2]',
nodeName: 'LI',
},
// Upper-case element names
{
xpath: '/SPAN[1]/UL[1]/LI[2]',
nodeName: 'LI',
},
// Element path with implicit `[1]` index
{
xpath: '/h1',
nodeName: 'H1',
},
// Custom element
{
xpath: '/span/ul/custom-element',
nodeName: 'CUSTOM-ELEMENT',
},
// Embedded MathML
{
xpath: '/span/math/msqrt/mrow/mi',
nodeName: 'mi',
},
{
xpath: '/SPAN/MATH/MSQRT/MROW/MI',
nodeName: 'mi',
},
// Embedded SVG
{
xpath: '/SPAN[1]/UL[1]/LI[2]/text()',
nodeName: '#text',
xpath: '/span[1]/svg[1]/text[2]',
nodeName: 'text',
},
].forEach(test => {
it('it returns the node associated with the XPath', () => {
it('evaluates simple XPaths without using `document.evaluate`', () => {
const result = nodeFromXPath(test.xpath, container);
assert.equal(result.nodeName, test.nodeName);
assert.notCalled(document.evaluate);
assert.equal(result?.nodeName, test.nodeName);
});
});
it('it returns the node associated with the XPath when `document.evaluate` throws an error', () => {
const result = nodeFromXPath(test.xpath, container);
sinon.stub(document, 'evaluate').throws(new Error());
const resultFallback = nodeFromXPath(test.xpath, container);
assert.equal(result, resultFallback);
document.evaluate.restore();
['/missing/element', '/span[0]'].forEach(xpath => {
it('returns `null` if simple XPath evaluation fails', () => {
const result = nodeFromXPath(xpath, container);
assert.notCalled(document.evaluate);
assert.strictEqual(result, null);
});
});
[
['/*[local-name()="h1"]', 'H1'],
['/span[-1]', null],
].forEach(([xpath, expectedNodeName]) => {
it('uses `document.evaluate` for non-simple XPaths', () => {
const result = nodeFromXPath(xpath, container);
assert.calledOnce(document.evaluate);
assert.strictEqual(result?.nodeName ?? result, expectedNodeName);
});
});
['not-a-valid-xpath'].forEach(xpath => {
it('throws if XPath is invalid', () => {
assert.throws(() => {
nodeFromXPath(xpath, container);
}, /The string '.*' is not a valid XPath expression/);
});
});
});
......
import {
findChild,
getTextNodes,
getLastTextNodeUpTo,
getFirstTextNodeNotBefore,
......@@ -7,55 +6,6 @@ import {
} from '../xpath-util';
describe('annotator/anchoring/xpath-util', () => {
describe('findChild', () => {
let container;
beforeEach(() => {
container = document.createElement('div');
document.body.appendChild(container);
});
afterEach(() => {
container.remove();
});
it('throws an error when the parent has no children', () => {
assert.throws(() => {
findChild(container, 'fake', 0);
}, 'XPath error: node has no children!');
});
it('throws an error when the desired child is not found', () => {
container.innerHTML = `<div>text</div>`;
assert.throws(() => {
findChild(container, 'fake', 0);
}, 'XPath error: wanted child not found.');
});
describe('with children', () => {
beforeEach(() => {
container.innerHTML = `text 1<span>text 2</span><span>text 3</span>text 4`;
});
it('finds the desired text() node at index 1', () => {
assert.equal(findChild(container, 'text()', 1).nodeValue, 'text 1');
document.body.removeChild(container);
});
it('finds the desired text() node at index 2', () => {
assert.equal(findChild(container, 'text()', 2).nodeValue, 'text 4');
});
it('finds the desired span node at index 1', () => {
assert.equal(findChild(container, 'span', 1).innerText, 'text 2');
});
it('finds the desired span node at index 2', () => {
assert.equal(findChild(container, 'span', 2).innerText, 'text 3');
});
});
});
describe('getTextNodes', () => {
let container;
......
/**
* Finds the child node associated with the provided index and
* type relative from a parent.
*/
export function findChild(parent, type, index) {
if (!parent.hasChildNodes()) {
throw new Error('XPath error: node has no children!');
}
const children = parent.childNodes;
let found = 0;
for (let i = 0; i < children.length; i++) {
const child = children[i];
let name = getNodeName(child);
if (name === type) {
found += 1;
if (found === index) {
return child;
}
}
}
throw new Error('XPath error: wanted child not found.');
}
/**
* Get the node name for use in generating an xpath expression.
*
......
import { findChild } from './xpath-util';
export { xpathFromNode } from './xpath-util';
/**
* Finds an element node using an XPath relative to the document root.
* Return the `index`'th immediate child of `element` whose tag name is
* `nodeName` (case insensitive).
*
* @param {Element} element
* @param {string} nodeName
* @param {number} index
*/
function nodeFromXPathFallback(xpath, root) {
const steps = xpath.substring(1).split('/');
let node = root;
steps.forEach(step => {
let [name, idx] = step.split('[');
idx = idx ? parseInt(idx.split(']')[0]) : 1;
node = findChild(node, name.toLowerCase(), idx);
});
return node;
function nthChildOfType(element, nodeName, index) {
nodeName = nodeName.toUpperCase();
let matchIndex = -1;
for (let i = 0; i < element.children.length; i++) {
const child = element.children[i];
if (child.nodeName.toUpperCase() === nodeName) {
++matchIndex;
if (matchIndex === index) {
return child;
}
}
}
return null;
}
/**
* Finds an element node using an XPath relative to the document root.
* Evaluate a _simple XPath_ relative to a `root` element and return the
* matching element.
*
* Example:
* node = nodeFromXPath('/html/body/div/p[2]')
* A _simple XPath_ is a sequence of one or more `/tagName[index]` strings.
*
* Unlike `document.evaluate` this function:
*
* - Only supports simple XPaths
* - Is not affected by the document's _type_ (HTML or XML/XHTML)
* - Ignores element namespaces when matching element names in the XPath against
* elements in the DOM tree
* - Is case insensitive for all elements, not just HTML elements
*
* The matching element is returned or `null` if no such element is found.
* An error is thrown if `xpath` is not a simple XPath.
*
* @param {string} xpath
* @param {Element} root
* @return {Element|null}
*/
export function nodeFromXPath(xpath, root = document) {
/**
* Attempt to evaluate a provided XPath. If the evaluation fails, then fall back to a
* manual evaluation using `nodeFromXPathFallback` that can evaluate very simple XPaths such
* as those generated by `xpathFromNode`
function evaluateSimpleXPath(xpath, root) {
const isSimpleXPath =
xpath.match(/^(\/[A-Za-z0-9-]+(\[[0-9]+\])?)+$/) !== null;
if (!isSimpleXPath) {
throw new Error('Expression is not a simple XPath');
}
const segments = xpath.split('/');
let element = root;
// Remove leading empty segment. The regex above validates that the XPath
// has at least two segments, with the first being empty and the others non-empty.
segments.shift();
for (let segment of segments) {
let elementName;
let elementIndex;
const separatorPos = segment.indexOf('[');
if (separatorPos !== -1) {
elementName = segment.slice(0, separatorPos);
const indexStr = segment.slice(separatorPos + 1, segment.indexOf(']'));
elementIndex = parseInt(indexStr) - 1;
if (elementIndex < 0) {
return null;
}
} else {
elementName = segment;
elementIndex = 0;
}
const child = nthChildOfType(element, elementName, elementIndex);
if (!child) {
return null;
}
element = child;
}
return element;
}
/**
* Finds an element node using an XPath relative to `root`
*
* Example:
* node = nodeFromXPath('/main/article[1]/p[3]', document.body)
*
* @param {string} xpath
* @param {Element} [root]
*/
function evaluateXPath(xp, root, nsResolver = null) {
export function nodeFromXPath(xpath, root = document.body) {
try {
return evaluateSimpleXPath(xpath, root);
} catch (err) {
return document.evaluate(
'.' + xp,
'.' + xpath,
root,
nsResolver,
null /* nsResolver */,
XPathResult.FIRST_ORDERED_NODE_TYPE,
null
null /* result */
).singleNodeValue;
} catch (e) {
// In the case of an XPath error, fall back to a manual evaluation
// that should be sufficient for simple expressions.
//
// See http://www.w3.org/TR/DOM-Level-3-XPath/xpath.html#XPathException
//
// In practice, it is unknown whether this exception occurs often. This may
// be a good place to insert analytics.
return nodeFromXPathFallback(xp, root);
}
}
return evaluateXPath(xpath, root);
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment