Commit a5c112a0 authored by Robert Knight

Remove `metadata` field from `HTMLMetadata`

The `HTMLMetadata` class provided two ways to get at the document metadata:

 1. A `getDocumentMetadata` method which reads the current metadata from
    the document and returns it
 2. A `metadata` field which returns the last-read metadata

The `metadata` field was not used outside the tests and shouldn't be used
because it might return stale metadata (in a web page with client-side
JS that updates `<meta>` and `<link>` tags etc. after the page loads). This field
was also used internally by the various helper methods that gather metadata,
with non-obvious constraints on the order in which the helpers are called.

Remove the field to prevent external misuse of the class and make the data flow
and dependencies clearer internally.
parent 69b0bb50
......@@ -38,24 +38,6 @@ import { normalizeURI } from '../util/url';
* @prop {string} [documentFingerprint]
* Create an empty `HTMLDocumentMetadata` object.
* @return {HTMLDocumentMetadata}
function createMetadata() {
return {
title: document.title,
link: [],
dc: {},
eprints: {},
facebook: {},
highwire: {},
prism: {},
twitter: {},
* HTMLMetadata reads metadata/links from the current HTML document.
......@@ -67,13 +49,9 @@ export class HTMLMetadata {
* @param {normalizeURI} [options.normalizeURI]
constructor(options = {}) {
this.metadata = createMetadata();
this.document = options.document || document;
this.baseURI = options.baseURI || this.document.baseURI;
this.normalizeURI = options.normalizeURI || normalizeURI;
......@@ -82,8 +60,9 @@ export class HTMLMetadata {
* @return {string}
uri() {
const links = this._getLinks({ dc: {}, highwire: {} });
let uri = decodeURIComponent(this._getDocumentHref());
for (let link of this.metadata.link) {
for (let link of links) {
if (link.rel === 'canonical') {
uri = link.href;
......@@ -97,8 +76,9 @@ export class HTMLMetadata {
* @return {string[]}
uris() {
const metadata = this.getDocumentMetadata();
const uniqueUrls = {};
for (let link of this.metadata.link) {
for (let link of metadata.link) {
if (link.href) {
uniqueUrls[link.href] = true;
......@@ -108,49 +88,37 @@ export class HTMLMetadata {
* Return metadata for the current page.
* @return {HTMLDocumentMetadata}
getDocumentMetadata() {
this.metadata = createMetadata();
// first look for some common metadata types
// TODO: look for microdata/rdfa?
// extract out/normalize some things
return this.metadata;
_getHighwire() {
this.metadata.highwire = this._getMetaTags('citation', 'name', '_');
_getFacebook() {
this.metadata.facebook = this._getMetaTags('og', 'property', ':');
_getTwitter() {
this.metadata.twitter = this._getMetaTags('twitter', 'name', ':');
/** @type {HTMLDocumentMetadata} */
const metadata = {
title: document.title,
link: [],
dc: this._getMetaTags('dc', 'name', '.'),
eprints: this._getMetaTags('eprints', 'name', '.'),
facebook: this._getMetaTags('og', 'property', ':'),
highwire: this._getMetaTags('citation', 'name', '_'),
prism: this._getMetaTags('prism', 'name', '.'),
twitter: this._getMetaTags('twitter', 'name', ':'),
const favicon = this._getFavicon();
if (favicon) {
metadata.favicon = favicon;
_getDublinCore() {
this.metadata.dc = this._getMetaTags('dc', 'name', '.');
metadata.title = this._getTitle(metadata);
metadata.link = this._getLinks(metadata);
_getPrism() {
this.metadata.prism = this._getMetaTags('prism', 'name', '.');
const dcLink = metadata.link.find(link => link.href.startsWith('urn:x-dc'));
if (dcLink) {
metadata.documentFingerprint = dcLink.href;
_getEprints() {
this.metadata.eprints = this._getMetaTags('eprints', 'name', '.');
return metadata;
......@@ -183,27 +151,31 @@ export class HTMLMetadata {
return tags;
_getTitle() {
if (this.metadata.highwire.title) {
this.metadata.title = this.metadata.highwire.title[0];
} else if (this.metadata.eprints.title) {
this.metadata.title = this.metadata.eprints.title[0];
} else if (this.metadata.prism.title) {
this.metadata.title = this.metadata.prism.title[0];
} else if (this.metadata.facebook.title) {
this.metadata.title = this.metadata.facebook.title[0];
} else if (this.metadata.twitter.title) {
this.metadata.title = this.metadata.twitter.title[0];
} else if (this.metadata.dc.title) {
this.metadata.title = this.metadata.dc.title[0];
/** @param {HTMLDocumentMetadata} metadata */
_getTitle(metadata) {
if (metadata.highwire.title) {
return metadata.highwire.title[0];
} else if (metadata.eprints.title) {
return metadata.eprints.title[0];
} else if (metadata.prism.title) {
return metadata.prism.title[0];
} else if (metadata.facebook.title) {
return metadata.facebook.title[0];
} else if (metadata.twitter.title) {
return metadata.twitter.title[0];
} else if (metadata.dc.title) {
return metadata.dc.title[0];
} else {
this.metadata.title = this.document.title;
return this.document.title;
_getLinks() {
* @param {Pick<HTMLDocumentMetadata, 'highwire'|'dc'>} metadata
_getLinks(metadata) {
// We know our current location is a link for the document.
this.metadata.link = [{ href: this._getDocumentHref() }];
const links = [{ href: this._getDocumentHref() }];
// Extract links from certain `<link>` tags.
const linkElements = Array.from(this.document.querySelectorAll('link'));
......@@ -227,19 +199,19 @@ export class HTMLMetadata {
try {
const href = this._absoluteUrl(link.href);
this.metadata.link.push({ href, rel: link.rel, type: link.type });
links.push({ href, rel: link.rel, type: link.type });
} catch (e) {
// Ignore URIs which cannot be parsed.
// look for links in scholar metadata
for (let name of Object.keys(this.metadata.highwire)) {
const values = this.metadata.highwire[name];
for (let name of Object.keys(metadata.highwire)) {
const values = metadata.highwire[name];
if (name === 'pdf_url') {
for (let url of values) {
try {{
href: this._absoluteUrl(url),
type: 'application/pdf',
......@@ -257,26 +229,26 @@ export class HTMLMetadata {
if (doi.slice(0, 4) !== 'doi:') {
doi = `doi:${doi}`;
}
this.metadata.link.push({ href: doi });
links.push({ href: doi });
// look for links in dublincore data
for (let name of Object.keys(this.metadata.dc)) {
const values = this.metadata.dc[name];
for (let name of Object.keys(metadata.dc)) {
const values = metadata.dc[name];
if (name === 'identifier') {
for (let id of values) {
if (id.slice(0, 4) === 'doi:') {
this.metadata.link.push({ href: id });
links.push({ href: id });
// look for a link to identify the resource in dublincore metadata
const dcRelationValues = this.metadata.dc['relation.ispartof'];
const dcIdentifierValues = this.metadata.dc.identifier;
const dcRelationValues = metadata.dc['relation.ispartof'];
const dcIdentifierValues = metadata.dc.identifier;
if (dcRelationValues && dcIdentifierValues) {
const dcUrnRelationComponent =
dcRelationValues[dcRelationValues.length - 1];
......@@ -287,22 +259,24 @@ export class HTMLMetadata {
encodeURIComponent(dcUrnRelationComponent) +
'/' +
encodeURIComponent(dcUrnIdentifierComponent);
this.metadata.link.push({ href: dcUrn });
// set this as the documentFingerprint as a hint to include this in search queries
this.metadata.documentFingerprint = dcUrn;
links.push({ href: dcUrn });
return links;
_getFavicon() {
let favicon = null;
for (let link of Array.from(this.document.querySelectorAll('link'))) {
if (['shortcut icon', 'icon'].includes(link.rel)) {
try {
this.metadata.favicon = this._absoluteUrl(link.href);
favicon = this._absoluteUrl(link.href);
} catch (e) {
// Ignore URIs which cannot be parsed.
return favicon;
......@@ -62,8 +62,7 @@ describe('HTMLMetadata', function () {
<link rel="canonical" href=""></link>
metadata = testDocument.metadata;
metadata = testDocument.getDocumentMetadata();
it('should have metadata', () => assert.ok(metadata));
......@@ -167,12 +166,12 @@ describe('HTMLMetadata', function () {
<link rel="alternate" href="http://a:b:c">
const metadata = testDocument.getDocumentMetadata();
// There should be one link with the document location and one for the
// valid `<link>` tag.
assert.deepEqual(testDocument.metadata.link.length, 2);
assert.deepEqual(testDocument.metadata.link[1], {
assert.deepEqual(metadata.link.length, 2);
assert.deepEqual(metadata.link[1], {
rel: 'alternate',
href: '',
type: '',
......@@ -183,19 +182,19 @@ describe('HTMLMetadata', function () {
tempDocumentHead.innerHTML = `
<link rel="favicon" href="http://a:b:c">
const metadata = testDocument.getDocumentMetadata();
it('should ignore `<meta>` PDF links with invalid URIs', () => {
tempDocumentHead.innerHTML = `
<meta name="citation_pdf_url" content="http://a:b:c">
const metadata = testDocument.getDocumentMetadata();
// There should only be one link for the document's location.
// The invalid PDF link should be ignored.
assert.equal(testDocument.metadata.link.length, 1);
assert.equal(metadata.link.length, 1);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment