Commit a5c112a0 authored by Robert Knight

Remove `metadata` field from `HTMLMetadata`

The `HTMLMetadata` class provided two ways to get at the document
metadata:

 1. A `getDocumentMetadata` method which reads the current metadata from
    the document and returns it
 2. A `metadata` field which returns the last-read metadata

The `metadata` field was not used outside the tests and shouldn't be used
because it might return stale metadata (for example, in a web page whose
client-side JS updates `<meta>` and `<link>` tags after the page loads). This
field was also used internally by the various helper methods that gather
metadata, with non-obvious constraints on the order in which the helpers are
called (for example, `_getTitle` and `_getLinks` read the `highwire` and `dc`
entries, so they had to run after the helpers that populate those entries).

Remove the field to prevent external misuse of the class and to make the data
flow and dependencies clearer internally.
parent 69b0bb50
...@@ -38,24 +38,6 @@ import { normalizeURI } from '../util/url'; ...@@ -38,24 +38,6 @@ import { normalizeURI } from '../util/url';
* @prop {string} [documentFingerprint] * @prop {string} [documentFingerprint]
*/ */
/**
* Create an empty `HTMLDocumentMetadata` object.
*
* @return {HTMLDocumentMetadata}
*/
function createMetadata() {
return {
title: document.title,
link: [],
dc: {},
eprints: {},
facebook: {},
highwire: {},
prism: {},
twitter: {},
};
}
/** /**
* HTMLMetadata reads metadata/links from the current HTML document. * HTMLMetadata reads metadata/links from the current HTML document.
*/ */
...@@ -67,13 +49,9 @@ export class HTMLMetadata { ...@@ -67,13 +49,9 @@ export class HTMLMetadata {
* @param {normalizeURI} [options.normalizeURI] * @param {normalizeURI} [options.normalizeURI]
*/ */
constructor(options = {}) { constructor(options = {}) {
this.metadata = createMetadata();
this.document = options.document || document; this.document = options.document || document;
this.baseURI = options.baseURI || this.document.baseURI; this.baseURI = options.baseURI || this.document.baseURI;
this.normalizeURI = options.normalizeURI || normalizeURI; this.normalizeURI = options.normalizeURI || normalizeURI;
this.getDocumentMetadata();
} }
/** /**
...@@ -82,8 +60,9 @@ export class HTMLMetadata { ...@@ -82,8 +60,9 @@ export class HTMLMetadata {
* @return {string} * @return {string}
*/ */
uri() { uri() {
const links = this._getLinks({ dc: {}, highwire: {} });
let uri = decodeURIComponent(this._getDocumentHref()); let uri = decodeURIComponent(this._getDocumentHref());
for (let link of this.metadata.link) { for (let link of links) {
if (link.rel === 'canonical') { if (link.rel === 'canonical') {
uri = link.href; uri = link.href;
} }
...@@ -97,8 +76,9 @@ export class HTMLMetadata { ...@@ -97,8 +76,9 @@ export class HTMLMetadata {
* @return {string[]} * @return {string[]}
*/ */
uris() { uris() {
const metadata = this.getDocumentMetadata();
const uniqueUrls = {}; const uniqueUrls = {};
for (let link of this.metadata.link) { for (let link of metadata.link) {
if (link.href) { if (link.href) {
uniqueUrls[link.href] = true; uniqueUrls[link.href] = true;
} }
...@@ -108,49 +88,37 @@ export class HTMLMetadata { ...@@ -108,49 +88,37 @@ export class HTMLMetadata {
/** /**
* Return metadata for the current page. * Return metadata for the current page.
*
* @return {HTMLDocumentMetadata}
*/ */
getDocumentMetadata() { getDocumentMetadata() {
this.metadata = createMetadata(); /** @type {HTMLDocumentMetadata} */
const metadata = {
// first look for some common metadata types title: document.title,
// TODO: look for microdata/rdfa? link: [],
this._getHighwire();
this._getDublinCore();
this._getFacebook();
this._getEprints();
this._getPrism();
this._getTwitter();
this._getFavicon();
// extract out/normalize some things
this._getTitle();
this._getLinks();
return this.metadata;
}
_getHighwire() {
this.metadata.highwire = this._getMetaTags('citation', 'name', '_');
}
_getFacebook() { dc: this._getMetaTags('dc', 'name', '.'),
this.metadata.facebook = this._getMetaTags('og', 'property', ':'); eprints: this._getMetaTags('eprints', 'name', '.'),
} facebook: this._getMetaTags('og', 'property', ':'),
highwire: this._getMetaTags('citation', 'name', '_'),
prism: this._getMetaTags('prism', 'name', '.'),
twitter: this._getMetaTags('twitter', 'name', ':'),
};
_getTwitter() { const favicon = this._getFavicon();
this.metadata.twitter = this._getMetaTags('twitter', 'name', ':'); if (favicon) {
metadata.favicon = favicon;
} }
_getDublinCore() { metadata.title = this._getTitle(metadata);
this.metadata.dc = this._getMetaTags('dc', 'name', '.'); metadata.link = this._getLinks(metadata);
}
_getPrism() { const dcLink = metadata.link.find(link => link.href.startsWith('urn:x-dc'));
this.metadata.prism = this._getMetaTags('prism', 'name', '.'); if (dcLink) {
metadata.documentFingerprint = dcLink.href;
} }
_getEprints() { return metadata;
this.metadata.eprints = this._getMetaTags('eprints', 'name', '.');
} }
/** /**
...@@ -183,27 +151,31 @@ export class HTMLMetadata { ...@@ -183,27 +151,31 @@ export class HTMLMetadata {
return tags; return tags;
} }
_getTitle() { /** @param {HTMLDocumentMetadata} metadata */
if (this.metadata.highwire.title) { _getTitle(metadata) {
this.metadata.title = this.metadata.highwire.title[0]; if (metadata.highwire.title) {
} else if (this.metadata.eprints.title) { return metadata.highwire.title[0];
this.metadata.title = this.metadata.eprints.title[0]; } else if (metadata.eprints.title) {
} else if (this.metadata.prism.title) { return metadata.eprints.title[0];
this.metadata.title = this.metadata.prism.title[0]; } else if (metadata.prism.title) {
} else if (this.metadata.facebook.title) { return metadata.prism.title[0];
this.metadata.title = this.metadata.facebook.title[0]; } else if (metadata.facebook.title) {
} else if (this.metadata.twitter.title) { return metadata.facebook.title[0];
this.metadata.title = this.metadata.twitter.title[0]; } else if (metadata.twitter.title) {
} else if (this.metadata.dc.title) { return metadata.twitter.title[0];
this.metadata.title = this.metadata.dc.title[0]; } else if (metadata.dc.title) {
return metadata.dc.title[0];
} else { } else {
this.metadata.title = this.document.title; return this.document.title;
} }
} }
_getLinks() { /**
* @param {Pick<HTMLDocumentMetadata, 'highwire'|'dc'>} metadata
*/
_getLinks(metadata) {
// We know our current location is a link for the document. // We know our current location is a link for the document.
this.metadata.link = [{ href: this._getDocumentHref() }]; const links = [{ href: this._getDocumentHref() }];
// Extract links from certain `<link>` tags. // Extract links from certain `<link>` tags.
const linkElements = Array.from(this.document.querySelectorAll('link')); const linkElements = Array.from(this.document.querySelectorAll('link'));
...@@ -227,19 +199,19 @@ export class HTMLMetadata { ...@@ -227,19 +199,19 @@ export class HTMLMetadata {
try { try {
const href = this._absoluteUrl(link.href); const href = this._absoluteUrl(link.href);
this.metadata.link.push({ href, rel: link.rel, type: link.type }); links.push({ href, rel: link.rel, type: link.type });
} catch (e) { } catch (e) {
// Ignore URIs which cannot be parsed. // Ignore URIs which cannot be parsed.
} }
} }
// look for links in scholar metadata // look for links in scholar metadata
for (let name of Object.keys(this.metadata.highwire)) { for (let name of Object.keys(metadata.highwire)) {
const values = this.metadata.highwire[name]; const values = metadata.highwire[name];
if (name === 'pdf_url') { if (name === 'pdf_url') {
for (let url of values) { for (let url of values) {
try { try {
this.metadata.link.push({ links.push({
href: this._absoluteUrl(url), href: this._absoluteUrl(url),
type: 'application/pdf', type: 'application/pdf',
}); });
...@@ -257,26 +229,26 @@ export class HTMLMetadata { ...@@ -257,26 +229,26 @@ export class HTMLMetadata {
if (doi.slice(0, 4) !== 'doi:') { if (doi.slice(0, 4) !== 'doi:') {
doi = `doi:${doi}`; doi = `doi:${doi}`;
} }
this.metadata.link.push({ href: doi }); links.push({ href: doi });
} }
} }
} }
// look for links in dublincore data // look for links in dublincore data
for (let name of Object.keys(this.metadata.dc)) { for (let name of Object.keys(metadata.dc)) {
const values = this.metadata.dc[name]; const values = metadata.dc[name];
if (name === 'identifier') { if (name === 'identifier') {
for (let id of values) { for (let id of values) {
if (id.slice(0, 4) === 'doi:') { if (id.slice(0, 4) === 'doi:') {
this.metadata.link.push({ href: id }); links.push({ href: id });
} }
} }
} }
} }
// look for a link to identify the resource in dublincore metadata // look for a link to identify the resource in dublincore metadata
const dcRelationValues = this.metadata.dc['relation.ispartof']; const dcRelationValues = metadata.dc['relation.ispartof'];
const dcIdentifierValues = this.metadata.dc.identifier; const dcIdentifierValues = metadata.dc.identifier;
if (dcRelationValues && dcIdentifierValues) { if (dcRelationValues && dcIdentifierValues) {
const dcUrnRelationComponent = const dcUrnRelationComponent =
dcRelationValues[dcRelationValues.length - 1]; dcRelationValues[dcRelationValues.length - 1];
...@@ -287,22 +259,24 @@ export class HTMLMetadata { ...@@ -287,22 +259,24 @@ export class HTMLMetadata {
encodeURIComponent(dcUrnRelationComponent) + encodeURIComponent(dcUrnRelationComponent) +
'/' + '/' +
encodeURIComponent(dcUrnIdentifierComponent); encodeURIComponent(dcUrnIdentifierComponent);
this.metadata.link.push({ href: dcUrn }); links.push({ href: dcUrn });
// set this as the documentFingerprint as a hint to include this in search queries
this.metadata.documentFingerprint = dcUrn;
} }
return links;
} }
_getFavicon() { _getFavicon() {
let favicon = null;
for (let link of Array.from(this.document.querySelectorAll('link'))) { for (let link of Array.from(this.document.querySelectorAll('link'))) {
if (['shortcut icon', 'icon'].includes(link.rel)) { if (['shortcut icon', 'icon'].includes(link.rel)) {
try { try {
this.metadata.favicon = this._absoluteUrl(link.href); favicon = this._absoluteUrl(link.href);
} catch (e) { } catch (e) {
// Ignore URIs which cannot be parsed. // Ignore URIs which cannot be parsed.
} }
} }
} }
return favicon;
} }
/** /**
......
...@@ -62,8 +62,7 @@ describe('HTMLMetadata', function () { ...@@ -62,8 +62,7 @@ describe('HTMLMetadata', function () {
<link rel="canonical" href="http://example.com/canonical"></link> <link rel="canonical" href="http://example.com/canonical"></link>
`; `;
testDocument.getDocumentMetadata(); metadata = testDocument.getDocumentMetadata();
metadata = testDocument.metadata;
}); });
it('should have metadata', () => assert.ok(metadata)); it('should have metadata', () => assert.ok(metadata));
...@@ -167,12 +166,12 @@ describe('HTMLMetadata', function () { ...@@ -167,12 +166,12 @@ describe('HTMLMetadata', function () {
<link rel="alternate" href="http://a:b:c"> <link rel="alternate" href="http://a:b:c">
`; `;
testDocument.getDocumentMetadata(); const metadata = testDocument.getDocumentMetadata();
// There should be one link with the document location and one for the // There should be one link with the document location and one for the
// valid `<link>` tag. // valid `<link>` tag.
assert.deepEqual(testDocument.metadata.link.length, 2); assert.deepEqual(metadata.link.length, 2);
assert.deepEqual(testDocument.metadata.link[1], { assert.deepEqual(metadata.link[1], {
rel: 'alternate', rel: 'alternate',
href: 'https://example.com/foo', href: 'https://example.com/foo',
type: '', type: '',
...@@ -183,19 +182,19 @@ describe('HTMLMetadata', function () { ...@@ -183,19 +182,19 @@ describe('HTMLMetadata', function () {
tempDocumentHead.innerHTML = ` tempDocumentHead.innerHTML = `
<link rel="favicon" href="http://a:b:c"> <link rel="favicon" href="http://a:b:c">
`; `;
testDocument.getDocumentMetadata(); const metadata = testDocument.getDocumentMetadata();
assert.isUndefined(testDocument.metadata.favicon); assert.isUndefined(metadata.favicon);
}); });
it('should ignore `<meta>` PDF links with invalid URIs', () => { it('should ignore `<meta>` PDF links with invalid URIs', () => {
tempDocumentHead.innerHTML = ` tempDocumentHead.innerHTML = `
<meta name="citation_pdf_url" content="http://a:b:c"> <meta name="citation_pdf_url" content="http://a:b:c">
`; `;
testDocument.getDocumentMetadata(); const metadata = testDocument.getDocumentMetadata();
// There should only be one link for the document's location. // There should only be one link for the document's location.
// The invalid PDF link should be ignored. // The invalid PDF link should be ignored.
assert.equal(testDocument.metadata.link.length, 1); assert.equal(metadata.link.length, 1);
}); });
}); });
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment