Commit 2bc090cf authored by Robert Knight

Skip over `<link>` and `<meta>` tags with invalid URLs

Websites may put unparseable URLs in `<link>` and `<meta>` tags [1]. We
could choose to preserve the URL string as-is (which
`HTMLLinkElement#href` does, for example) or not include it at all. I've
opted to skip over such URLs to make life easier for code consuming the
parsed links, as it can then rely on them being parseable as URLs.

[1] https://github.com/hypothesis/product-backlog/issues/742
parent a672c0ce
......@@ -34,6 +34,7 @@ class DocumentMeta extends Plugin {
// Test seams.
this.baseURI = this.options.baseURI || baseURI;
this.document = this.options.document || document;
this.normalizeURI = this.options.normalizeURI || normalizeURI;
this.getDocumentMetadata();
}
......@@ -181,8 +182,12 @@ class DocumentMeta extends Plugin {
}
}
const href = this._absoluteUrl(link.href);
this.metadata.link.push({href, rel: link.rel, type: link.type});
try {
const href = this._absoluteUrl(link.href);
this.metadata.link.push({href, rel: link.rel, type: link.type});
} catch (e) {
// Ignore URIs which cannot be parsed.
}
}
// look for links in scholar metadata
......@@ -190,10 +195,14 @@ class DocumentMeta extends Plugin {
const values = this.metadata.highwire[name];
if (name === 'pdf_url') {
for (let url of values) {
this.metadata.link.push({
href: this._absoluteUrl(url),
type: 'application/pdf',
});
try {
this.metadata.link.push({
href: this._absoluteUrl(url),
type: 'application/pdf',
});
} catch (e) {
// Ignore URIs which cannot be parsed.
}
}
}
......@@ -242,13 +251,21 @@ class DocumentMeta extends Plugin {
/**
 * Store the page's favicon URL in `this.metadata.favicon`.
 *
 * Every `<link>` element in `this.document` is examined; when its `rel` is
 * exactly "shortcut icon" or "icon", the absolute form of its `href` is
 * recorded. The loop does not stop at the first match, so a later matching
 * link overwrites an earlier one.
 *
 * NOTE(review): this listing appears to come from a diff view, so the bare
 * assignment and the try/catch-wrapped assignment below are presumably the
 * old and new versions of the same statement — confirm against the file.
 */
_getFavicon() {
for (let link of Array.from(this.document.querySelectorAll('link'))) {
if (['shortcut icon', 'icon'].includes(link.rel)) {
this.metadata.favicon = this._absoluteUrl(link.href);
try {
this.metadata.favicon = this._absoluteUrl(link.href);
} catch (e) {
// Ignore URIs which cannot be parsed. Because the assignment never
// completes, any favicon stored earlier is left untouched.
}
}
}
}
/**
 * Convert a possibly relative URI to an absolute one, resolved against
 * `this.baseURI`. This will throw an exception if the URL cannot be parsed.
 *
 * @param url - The URI to resolve (presumably a string; confirm with callers).
 * @return Whatever the URI normalizer returns for `url` and `this.baseURI`.
 */
_absoluteUrl(url) {
return normalizeURI(url, this.baseURI);
return this.normalizeURI(url, this.baseURI);
}
// Get the true URI record when it's masked via a different protocol.
......
......@@ -15,8 +15,10 @@
const $ = require('jquery');
const DocumentMeta = require('../document');
const { normalizeURI } = require('../../util/url');
describe('DocumentMeta', function() {
let fakeNormalizeURI;
let tempDocument;
let tempDocumentHead;
let testDocument = null;
......@@ -27,8 +29,18 @@ describe('DocumentMeta', function() {
tempDocumentHead = document.createElement('head');
tempDocument.appendChild(tempDocumentHead);
fakeNormalizeURI = sinon.stub().callsFake((url, base) => {
if (url === 'http://a:b:c') {
// A modern browser would reject this URL, but PhantomJS's URL parser is
// more lenient.
throw new Error('Invalid URL');
}
return normalizeURI(url, base);
});
testDocument = new DocumentMeta(tempDocument, {
document: tempDocument,
normalizeURI: fakeNormalizeURI,
});
testDocument.pluginInit();
});
......@@ -159,6 +171,43 @@ describe('DocumentMeta', function() {
it('should have a documentFingerprint as the dc resource identifiers URN href', () => {
  // The fingerprint is expected to equal the href of the link at index 9.
  const { documentFingerprint, link } = metadata;
  assert.equal(documentFingerprint, link[9].href);
});
it('should ignore `<link>` tags with invalid URIs', () => {
  // One parseable link and one that the stubbed URI normalizer rejects.
  tempDocumentHead.innerHTML = `
<link rel="alternate" href="https://example.com/foo">
<link rel="alternate" href="http://a:b:c">
`;
  testDocument.getDocumentMetadata();

  // There should be one link with the document location and one for the
  // valid `<link>` tag. The length is a plain number, so `assert.equal` is
  // used here, matching the other link-count assertion in this suite.
  assert.equal(testDocument.metadata.link.length, 2);
  assert.deepEqual(testDocument.metadata.link[1], {
    rel: 'alternate',
    href: 'https://example.com/foo',
    type: '',
  });
});
it('should ignore favicons with invalid URIs', () => {
  // `rel` must be one of the values `_getFavicon` accepts ("icon" or
  // "shortcut icon"). With any other value the link is never considered as a
  // favicon and this test would pass without exercising the invalid-URL
  // handling at all.
  tempDocumentHead.innerHTML = `
<link rel="icon" href="http://a:b:c">
`;
  testDocument.getDocumentMetadata();

  // The stubbed normalizer throws for this URL, so no favicon is recorded.
  assert.isUndefined(testDocument.metadata.favicon);
});
it('should ignore `<meta>` PDF links with invalid URIs', () => {
  // A `citation_pdf_url` tag whose URL the stubbed normalizer rejects.
  tempDocumentHead.innerHTML = `
<meta name="citation_pdf_url" content="http://a:b:c">
`;
  testDocument.getDocumentMetadata();

  // Only the link for the document's own location should remain; the
  // invalid PDF link must have been dropped.
  const { link: collectedLinks } = testDocument.metadata;
  assert.equal(collectedLinks.length, 1);
});
});
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment