Merge pull request #826 from hypothesis/handle-invalid-urls-in-links

Ignore invalid URLs in `<link>` and `<meta>` tags

Merge pull request #826 from hypothesis/handle-invalid-urls-in-links
Ignore invalid URLs in `<link>` and `<meta>` tags
9d7ff2ea · Robert Knight · GitHub · 1e307a55 · 2bc090cf · 9d7ff2ea
Unverified commit 9d7ff2ea, authored Dec 11, 2018 by Robert Knight; committed by GitHub on Dec 11, 2018.
Hide whitespace changes
Inline Side-by-side

Showing with 130 additions and 52 deletions

document.js src/annotator/plugin/document.js +44 -28

document-test.js src/annotator/plugin/test/document-test.js +86 -24

No files found.
--- a/src/annotator/plugin/document.js
+++ b/src/annotator/plugin/document.js
@@ -34,6 +34,7 @@ class DocumentMeta extends Plugin {
    // Test seams.
    this.baseURI = this.options.baseURI || baseURI;
    this.document = this.options.document || document;
+    this.normalizeURI = this.options.normalizeURI || normalizeURI;

    this.getDocumentMetadata();
  }
@@ -160,40 +161,48 @@ class DocumentMeta extends Plugin {
  }

  _getLinks() {
-    // we know our current location is a link for the document
-    let href;
-    let type;
-    let values;
+    // We know our current location is a link for the document.
    this.metadata.link = [{href: this._getDocumentHref()}];

-    // look for some relevant link relations
-    for (let link of Array.from(this.document.querySelectorAll('link'))) {
-      href = this._absoluteUrl(link.href); // get absolute url
-      const { rel } = link;
-      ({ type } = link);
-      const lang = link.hreflang;
-
-      if (!['alternate', 'canonical', 'bookmark', 'shortlink'].includes(rel)) { continue; }
-
-      if (rel === 'alternate') {
-        // Ignore feeds resources
-        if (type && type.match(/^application\/(rss|atom)\+xml/)) { continue; }
-        // Ignore alternate languages
-        if (lang) { continue; }
+    // Extract links from certain `<link>` tags.
+    const linkElements = Array.from(this.document.querySelectorAll('link'));
+    for (let link of linkElements) {
+      if (!['alternate', 'canonical', 'bookmark', 'shortlink'].includes(link.rel)) {
+        continue;
+      }
+
+      if (link.rel === 'alternate') {
+        // Ignore RSS feed links.
+        if (link.type && link.type.match(/^application\/(rss|atom)\+xml/)) {
+          continue;
+        }
+        // Ignore alternate languages.
+        if (link.hreflang) {
+          continue;
+        }
      }

-      this.metadata.link.push({href, rel, type});
+      try {
+        const href = this._absoluteUrl(link.href);
+        this.metadata.link.push({href, rel: link.rel, type: link.type});
+      } catch (e) {
+        // Ignore URIs which cannot be parsed.
+      }
    }

    // look for links in scholar metadata
    for (let name of Object.keys(this.metadata.highwire)) {
-      values = this.metadata.highwire[name];
+      const values = this.metadata.highwire[name];
      if (name === 'pdf_url') {
        for (let url of values) {
-          this.metadata.link.push({
-            href: this._absoluteUrl(url),
-            type: 'application/pdf',
-          });
+          try {
+            this.metadata.link.push({
+              href: this._absoluteUrl(url),
+              type: 'application/pdf',
+            });
+          } catch (e) {
+            // Ignore URIs which cannot be parsed.
+          }
        }
      }

@@ -212,7 +221,7 @@ class DocumentMeta extends Plugin {

    // look for links in dublincore data
    for (let name of Object.keys(this.metadata.dc)) {
-      values = this.metadata.dc[name];
+      const values = this.metadata.dc[name];
      if (name === 'identifier') {
        for (let id of values) {
          if (id.slice(0, 4) === 'doi:') {
@@ -242,14 +251,21 @@ class DocumentMeta extends Plugin {
  _getFavicon() {
    for (let link of Array.from(this.document.querySelectorAll('link'))) {
      if (['shortcut icon', 'icon'].includes(link.rel)) {
-        this.metadata.favicon = this._absoluteUrl(link.href);
+        try {
+          this.metadata.favicon = this._absoluteUrl(link.href);
+        } catch (e) {
+          // Ignore URIs which cannot be parsed.
+        }
      }
    }
  }

-  // Hack to get a absolute url from a possibly relative one
+  /**
+   * Convert a possibly relative URI to an absolute one. This will throw an
+   * exception if the URL cannot be parsed.
+   */
  _absoluteUrl(url) {
-    return normalizeURI(url, this.baseURI);
+    return this.normalizeURI(url, this.baseURI);
  }

  // Get the true URI record when it's masked via a different protocol.

--- a/src/annotator/plugin/test/document-test.js
+++ b/src/annotator/plugin/test/document-test.js
@@ -15,43 +15,68 @@
 const $ = require('jquery');

 const DocumentMeta = require('../document');
+const { normalizeURI } = require('../../util/url');

 describe('DocumentMeta', function() {
+  let fakeNormalizeURI;
+  let tempDocument;
+  let tempDocumentHead;
  let testDocument = null;

  beforeEach(function() {
-    testDocument = new DocumentMeta($('<div></div>')[0], {});
+    tempDocument = document.createDocumentFragment();
+    tempDocument.location = { href: 'https://example.com' };
+    tempDocumentHead = document.createElement('head');
+    tempDocument.appendChild(tempDocumentHead);
+
+    fakeNormalizeURI = sinon.stub().callsFake((url, base) => {
+      if (url === 'http://a:b:c') {
+        // A modern browser would reject this URL, but PhantomJS's URL parser is
+        // more lenient.
+        throw new Error('Invalid URL');
+      }
+      return normalizeURI(url, base);
+    });
+
+    testDocument = new DocumentMeta(tempDocument, {
+      document: tempDocument,
+      normalizeURI: fakeNormalizeURI,
+    });
    testDocument.pluginInit();
  });

  afterEach(() => $(document).unbind());

  describe('annotation should have some metadata', function() {
-    // Add some metadata to the page
-    const head = $('head');
-    head.append('<link rel="alternate" href="foo.pdf" type="application/pdf"></link>');
-    head.append('<link rel="alternate" href="foo.doc" type="application/msword"></link>');
-    head.append('<link rel="bookmark" href="http://example.com/bookmark"></link>');
-    head.append('<link rel="shortlink" href="http://example.com/bookmark/short"></link>');
-    head.append('<link rel="alternate" href="es/foo.html" hreflang="es" type="text/html"></link>');
-    head.append('<meta name="citation_doi" content="10.1175/JCLI-D-11-00015.1">');
-    head.append('<meta name="citation_title" content="Foo">');
-    head.append('<meta name="citation_pdf_url" content="foo.pdf">');
-    head.append('<meta name="dc.identifier" content="doi:10.1175/JCLI-D-11-00015.1">');
-    head.append('<meta name="dc:identifier" content="foobar-abcxyz">');
-    head.append('<meta name="dc.relation.ispartof" content="isbn:123456789">');
-    head.append('<meta name="DC.type" content="Article">');
-    head.append('<meta property="og:url" content="http://example.com">');
-    head.append('<meta name="twitter:site" content="@okfn">');
-    head.append('<link rel="icon" href="http://example.com/images/icon.ico"></link>');
-    head.append('<meta name="eprints.title" content="Computer Lib / Dream Machines">');
-    head.append('<meta name="prism.title" content="Literary Machines">');
-    head.append('<link rel="alternate" href="feed" type="application/rss+xml"></link>');
-    head.append('<link rel="canonical" href="http://example.com/canonical"></link>');
-
    let metadata = null;

-    beforeEach(() => metadata = testDocument.metadata);
+    beforeEach(() => {
+      // Add some metadata to the page
+      tempDocumentHead.innerHTML = `
+        <link rel="alternate" href="foo.pdf" type="application/pdf"></link>
+        <link rel="alternate" href="foo.doc" type="application/msword"></link>
+        <link rel="bookmark" href="http://example.com/bookmark"></link>
+        <link rel="shortlink" href="http://example.com/bookmark/short"></link>
+        <link rel="alternate" href="es/foo.html" hreflang="es" type="text/html"></link>
+        <meta name="citation_doi" content="10.1175/JCLI-D-11-00015.1">
+        <meta name="citation_title" content="Foo">
+        <meta name="citation_pdf_url" content="foo.pdf">
+        <meta name="dc.identifier" content="doi:10.1175/JCLI-D-11-00015.1">
+        <meta name="dc:identifier" content="foobar-abcxyz">
+        <meta name="dc.relation.ispartof" content="isbn:123456789">
+        <meta name="DC.type" content="Article">
+        <meta property="og:url" content="http://example.com">
+        <meta name="twitter:site" content="@okfn">
+        <link rel="icon" href="http://example.com/images/icon.ico"></link>
+        <meta name="eprints.title" content="Computer Lib / Dream Machines">
+        <meta name="prism.title" content="Literary Machines">
+        <link rel="alternate" href="feed" type="application/rss+xml"></link>
+        <link rel="canonical" href="http://example.com/canonical"></link>
+      `;
+
+      testDocument.getDocumentMetadata();
+      metadata = testDocument.metadata;
+    });

    it('should have metadata', () => assert.ok(metadata));

@@ -146,6 +171,43 @@ describe('DocumentMeta', function() {
    it('should have a documentFingerprint as the dc resource identifiers URN href', () => {
      assert.equal(metadata.documentFingerprint, metadata.link[9].href);
    });
+
+    it('should ignore `<link>` tags with invalid URIs', () => {
+      tempDocumentHead.innerHTML = `
+        <link rel="alternate" href="https://example.com/foo">
+        <link rel="alternate" href="http://a:b:c">
+      `;
+
+      testDocument.getDocumentMetadata();
+
+      // There should be one link with the document location and one for the
+      // valid `<link>` tag.
+      assert.deepEqual(testDocument.metadata.link.length, 2);
+      assert.deepEqual(testDocument.metadata.link[1], {
+        rel: 'alternate',
+        href: 'https://example.com/foo',
+        type: '',
+      });
+    });
+
+    it('should ignore favicons with invalid URIs', () => {
+      tempDocumentHead.innerHTML = `
+        <link rel="icon" href="http://a:b:c">
+      `;
+      testDocument.getDocumentMetadata();
+      assert.isUndefined(testDocument.metadata.favicon);
+    });
+
+    it('should ignore `<meta>` PDF links with invalid URIs', () => {
+      tempDocumentHead.innerHTML = `
+        <meta name="citation_pdf_url" content="http://a:b:c">
+      `;
+      testDocument.getDocumentMetadata();
+
+      // There should only be one link for the document's location.
+      // The invalid PDF link should be ignored.
+      assert.equal(testDocument.metadata.link.length, 1);
+    });
  });