Remove `metadata` field from `HTMLMetadata`

The `HTMLMetadata` class provided two ways to get at the document metadata:

1. A `getDocumentMetadata` method which reads the current metadata from the document and returns it.
2. A `metadata` field which returns the last-read metadata.

The `metadata` field was not used outside the tests and shouldn't be used because it might return stale metadata (in a web page with client-side JS that updates `<meta>` and `<link>` tags etc. after the page loads). This field was also used internally by the various helper methods that gather metadata, with non-obvious constraints on the order in which the helpers are called.

Remove the field to prevent external misuse of the class and make the data flow and dependencies clearer internally.

Remove `metadata` field from `HTMLMetadata`
The `HTMLMetadata` class provided two ways to get at the document metadata:

1. A `getDocumentMetadata` method which reads the current metadata from the document and returns it.
2. A `metadata` field which returns the last-read metadata.

The `metadata` field was not used outside the tests and shouldn't be used because it might return stale metadata (in a web page with client-side JS that updates `<meta>` and `<link>` tags etc. after the page loads). This field was also used internally by the various helper methods that gather metadata, with non-obvious constraints on the order in which the helpers are called.

Remove the field to prevent external misuse of the class and make the data flow and dependencies clearer internally.
a5c112a0 · Robert Knight · 69b0bb50 · a5c112a0 · a5c112a0
Commit a5c112a0 authored Jun 07, 2021 by Robert Knight
Hide whitespace changes
Inline Side-by-side

Showing with 74 additions and 101 deletions

html-metadata.js src/annotator/integrations/html-metadata.js +66 -92

html-metadata-test.js src/annotator/integrations/test/html-metadata-test.js +8 -9

No files found.
--- a/src/annotator/integrations/html-metadata.js
+++ b/src/annotator/integrations/html-metadata.js
@@ -38,24 +38,6 @@ import { normalizeURI } from '../util/url';
 * @prop {string} [documentFingerprint]
 */
-/**
- * Create an empty `HTMLDocumentMetadata` object.
- *
- * @return {HTMLDocumentMetadata}
- */
-function createMetadata() {
-  return {
-    title: document.title,
-    link: [],
-    dc: {},
-    eprints: {},
-    facebook: {},
-    highwire: {},
-    prism: {},
-    twitter: {},
-  };
-}
 /**
 * HTMLMetadata reads metadata/links from the current HTML document.
 */
@@ -67,13 +49,9 @@ export class HTMLMetadata {
   *   @param {normalizeURI} [options.normalizeURI]
   */
  constructor(options = {}) {
-    this.metadata = createMetadata();
    this.document = options.document || document;
    this.baseURI = options.baseURI || this.document.baseURI;
    this.normalizeURI = options.normalizeURI || normalizeURI;
-    this.getDocumentMetadata();
  }
  /**
@@ -82,8 +60,9 @@ export class HTMLMetadata {
   * @return {string}
   */
  uri() {
+    const links = this._getLinks({ dc: {}, highwire: {} });
    let uri = decodeURIComponent(this._getDocumentHref());
-    for (let link of this.metadata.link) {
+    for (let link of links) {
      if (link.rel === 'canonical') {
        uri = link.href;
      }
@@ -97,8 +76,9 @@ export class HTMLMetadata {
   * @return {string[]}
   */
  uris() {
+    const metadata = this.getDocumentMetadata();
    const uniqueUrls = {};
-    for (let link of this.metadata.link) {
+    for (let link of metadata.link) {
      if (link.href) {
        uniqueUrls[link.href] = true;
      }
@@ -108,49 +88,37 @@ export class HTMLMetadata {
  /**
   * Return metadata for the current page.
+   *
+   * @return {HTMLDocumentMetadata}
   */
  getDocumentMetadata() {
-    this.metadata = createMetadata();
+    /** @type {HTMLDocumentMetadata} */
+    const metadata = {
-    // first look for some common metadata types
+      title: document.title,
-    // TODO: look for microdata/rdfa?
+      link: [],
-    this._getHighwire();
-    this._getDublinCore();
+      dc: this._getMetaTags('dc', 'name', '.'),
-    this._getFacebook();
+      eprints: this._getMetaTags('eprints', 'name', '.'),
-    this._getEprints();
+      facebook: this._getMetaTags('og', 'property', ':'),
-    this._getPrism();
+      highwire: this._getMetaTags('citation', 'name', '_'),
-    this._getTwitter();
+      prism: this._getMetaTags('prism', 'name', '.'),
-    this._getFavicon();
+      twitter: this._getMetaTags('twitter', 'name', ':'),
+    };
-    // extract out/normalize some things
-    this._getTitle();
+    const favicon = this._getFavicon();
-    this._getLinks();
+    if (favicon) {
+      metadata.favicon = favicon;
-    return this.metadata;
+    }
-  }
-  _getHighwire() {
-    this.metadata.highwire = this._getMetaTags('citation', 'name', '_');
-  }
-  _getFacebook() {
-    this.metadata.facebook = this._getMetaTags('og', 'property', ':');
-  }
-  _getTwitter() {
-    this.metadata.twitter = this._getMetaTags('twitter', 'name', ':');
-  }
-  _getDublinCore() {
+    metadata.title = this._getTitle(metadata);
-    this.metadata.dc = this._getMetaTags('dc', 'name', '.');
+    metadata.link = this._getLinks(metadata);
-  }
-  _getPrism() {
+    const dcLink = metadata.link.find(link => link.href.startsWith('urn:x-dc'));
-    this.metadata.prism = this._getMetaTags('prism', 'name', '.');
+    if (dcLink) {
-  }
+      metadata.documentFingerprint = dcLink.href;
+    }
-  _getEprints() {
+    return metadata;
-    this.metadata.eprints = this._getMetaTags('eprints', 'name', '.');
  }
  /**
@@ -183,27 +151,31 @@ export class HTMLMetadata {
    return tags;
  }
-  _getTitle() {
+  /** @param {HTMLDocumentMetadata} metadata */
-    if (this.metadata.highwire.title) {
+  _getTitle(metadata) {
-      this.metadata.title = this.metadata.highwire.title[0];
+    if (metadata.highwire.title) {
-    } else if (this.metadata.eprints.title) {
+      return metadata.highwire.title[0];
-      this.metadata.title = this.metadata.eprints.title[0];
+    } else if (metadata.eprints.title) {
-    } else if (this.metadata.prism.title) {
+      return metadata.eprints.title[0];
-      this.metadata.title = this.metadata.prism.title[0];
+    } else if (metadata.prism.title) {
-    } else if (this.metadata.facebook.title) {
+      return metadata.prism.title[0];
-      this.metadata.title = this.metadata.facebook.title[0];
+    } else if (metadata.facebook.title) {
-    } else if (this.metadata.twitter.title) {
+      return metadata.facebook.title[0];
-      this.metadata.title = this.metadata.twitter.title[0];
+    } else if (metadata.twitter.title) {
-    } else if (this.metadata.dc.title) {
+      return metadata.twitter.title[0];
-      this.metadata.title = this.metadata.dc.title[0];
+    } else if (metadata.dc.title) {
+      return metadata.dc.title[0];
    } else {
-      this.metadata.title = this.document.title;
+      return this.document.title;
    }
  }
-  _getLinks() {
+  /**
+   * @param {Pick<HTMLDocumentMetadata, 'highwire'|'dc'>} metadata
+   */
+  _getLinks(metadata) {
    // We know our current location is a link for the document.
-    this.metadata.link = [{ href: this._getDocumentHref() }];
+    const links = [{ href: this._getDocumentHref() }];
    // Extract links from certain `<link>` tags.
    const linkElements = Array.from(this.document.querySelectorAll('link'));
@@ -227,19 +199,19 @@ export class HTMLMetadata {
      try {
        const href = this._absoluteUrl(link.href);
-        this.metadata.link.push({ href, rel: link.rel, type: link.type });
+        links.push({ href, rel: link.rel, type: link.type });
      } catch (e) {
        // Ignore URIs which cannot be parsed.
      }
    }
    // look for links in scholar metadata
-    for (let name of Object.keys(this.metadata.highwire)) {
+    for (let name of Object.keys(metadata.highwire)) {
-      const values = this.metadata.highwire[name];
+      const values = metadata.highwire[name];
      if (name === 'pdf_url') {
        for (let url of values) {
          try {
-            this.metadata.link.push({
+            links.push({
              href: this._absoluteUrl(url),
              type: 'application/pdf',
            });
@@ -257,26 +229,26 @@ export class HTMLMetadata {
          if (doi.slice(0, 4) !== 'doi:') {
            doi = `doi:${doi}`;
          }
-          this.metadata.link.push({ href: doi });
+          links.push({ href: doi });
        }
      }
    }
    // look for links in dublincore data
-    for (let name of Object.keys(this.metadata.dc)) {
+    for (let name of Object.keys(metadata.dc)) {
-      const values = this.metadata.dc[name];
+      const values = metadata.dc[name];
      if (name === 'identifier') {
        for (let id of values) {
          if (id.slice(0, 4) === 'doi:') {
-            this.metadata.link.push({ href: id });
+            links.push({ href: id });
          }
        }
      }
    }
    // look for a link to identify the resource in dublincore metadata
-    const dcRelationValues = this.metadata.dc['relation.ispartof'];
+    const dcRelationValues = metadata.dc['relation.ispartof'];
-    const dcIdentifierValues = this.metadata.dc.identifier;
+    const dcIdentifierValues = metadata.dc.identifier;
    if (dcRelationValues && dcIdentifierValues) {
      const dcUrnRelationComponent =
        dcRelationValues[dcRelationValues.length - 1];
@@ -287,22 +259,24 @@ export class HTMLMetadata {
        encodeURIComponent(dcUrnRelationComponent) +
        '/' +
        encodeURIComponent(dcUrnIdentifierComponent);
-      this.metadata.link.push({ href: dcUrn });
+      links.push({ href: dcUrn });
-      // set this as the documentFingerprint as a hint to include this in search queries
-      this.metadata.documentFingerprint = dcUrn;
    }
+    return links;
  }
  _getFavicon() {
+    let favicon = null;
    for (let link of Array.from(this.document.querySelectorAll('link'))) {
      if (['shortcut icon', 'icon'].includes(link.rel)) {
        try {
-          this.metadata.favicon = this._absoluteUrl(link.href);
+          favicon = this._absoluteUrl(link.href);
        } catch (e) {
          // Ignore URIs which cannot be parsed.
        }
      }
    }
+    return favicon;
  }
  /**

--- a/src/annotator/integrations/test/html-metadata-test.js
+++ b/src/annotator/integrations/test/html-metadata-test.js
@@ -62,8 +62,7 @@ describe('HTMLMetadata', function () {
        <link rel="canonical" href="http://example.com/canonical"></link>
      `;
-      testDocument.getDocumentMetadata();
+      metadata = testDocument.getDocumentMetadata();
-      metadata = testDocument.metadata;
    });
    it('should have metadata', () => assert.ok(metadata));
@@ -167,12 +166,12 @@ describe('HTMLMetadata', function () {
        <link rel="alternate" href="http://a:b:c">
      `;
-      testDocument.getDocumentMetadata();
+      const metadata = testDocument.getDocumentMetadata();
      // There should be one link with the document location and one for the
      // valid `<link>` tag.
-      assert.deepEqual(testDocument.metadata.link.length, 2);
+      assert.deepEqual(metadata.link.length, 2);
-      assert.deepEqual(testDocument.metadata.link[1], {
+      assert.deepEqual(metadata.link[1], {
        rel: 'alternate',
        href: 'https://example.com/foo',
        type: '',
@@ -183,19 +182,19 @@ describe('HTMLMetadata', function () {
      tempDocumentHead.innerHTML = `
        <link rel="favicon" href="http://a:b:c">
      `;
-      testDocument.getDocumentMetadata();
+      const metadata = testDocument.getDocumentMetadata();
-      assert.isUndefined(testDocument.metadata.favicon);
+      assert.isUndefined(metadata.favicon);
    });
    it('should ignore `<meta>` PDF links with invalid URIs', () => {
      tempDocumentHead.innerHTML = `
        <meta name="citation_pdf_url" content="http://a:b:c">
      `;
-      testDocument.getDocumentMetadata();
+      const metadata = testDocument.getDocumentMetadata();
      // There should only be one link for the document's location.
      // The invalid PDF link should be ignored.
-      assert.equal(testDocument.metadata.link.length, 1);
+      assert.equal(metadata.link.length, 1);
    });
  });