Commit a5c112a0 authored by Robert Knight's avatar Robert Knight

Remove `metadata` field from `HTMLMetadata`

The `HTMLMetadata` class provided two ways to get at the document
metadata:

 1. A `getDocumentMetadata` method which reads the current metadata from
    the document and returns it
 2. A `metadata` field which returns the last-read metadata

The `metadata` field was not used outside the tests and shouldn't be used
because it might return stale metadata (in a web page with client-side
JS that updates `<meta>` and `<link>` tags etc. after the page loads). This field
was also used internally by the various helper methods that gather metadata,
with non-obvious constraints on the order in which the helpers are called.

Remove the field to prevent external mis-use of the class and make the data flow
and dependencies clearer internally.
parent 69b0bb50
......@@ -38,24 +38,6 @@ import { normalizeURI } from '../util/url';
* @prop {string} [documentFingerprint]
*/
/**
* Create an empty `HTMLDocumentMetadata` object.
*
* @return {HTMLDocumentMetadata}
*/
function createMetadata() {
return {
title: document.title,
link: [],
dc: {},
eprints: {},
facebook: {},
highwire: {},
prism: {},
twitter: {},
};
}
/**
* HTMLMetadata reads metadata/links from the current HTML document.
*/
......@@ -67,13 +49,9 @@ export class HTMLMetadata {
* @param {normalizeURI} [options.normalizeURI]
*/
constructor(options = {}) {
this.metadata = createMetadata();
this.document = options.document || document;
this.baseURI = options.baseURI || this.document.baseURI;
this.normalizeURI = options.normalizeURI || normalizeURI;
this.getDocumentMetadata();
}
/**
......@@ -82,8 +60,9 @@ export class HTMLMetadata {
* @return {string}
*/
uri() {
const links = this._getLinks({ dc: {}, highwire: {} });
let uri = decodeURIComponent(this._getDocumentHref());
for (let link of this.metadata.link) {
for (let link of links) {
if (link.rel === 'canonical') {
uri = link.href;
}
......@@ -97,8 +76,9 @@ export class HTMLMetadata {
* @return {string[]}
*/
uris() {
const metadata = this.getDocumentMetadata();
const uniqueUrls = {};
for (let link of this.metadata.link) {
for (let link of metadata.link) {
if (link.href) {
uniqueUrls[link.href] = true;
}
......@@ -108,49 +88,37 @@ export class HTMLMetadata {
/**
* Return metadata for the current page.
*
* @return {HTMLDocumentMetadata}
*/
getDocumentMetadata() {
this.metadata = createMetadata();
// first look for some common metadata types
// TODO: look for microdata/rdfa?
this._getHighwire();
this._getDublinCore();
this._getFacebook();
this._getEprints();
this._getPrism();
this._getTwitter();
this._getFavicon();
// extract out/normalize some things
this._getTitle();
this._getLinks();
return this.metadata;
}
_getHighwire() {
this.metadata.highwire = this._getMetaTags('citation', 'name', '_');
}
_getFacebook() {
this.metadata.facebook = this._getMetaTags('og', 'property', ':');
}
_getTwitter() {
this.metadata.twitter = this._getMetaTags('twitter', 'name', ':');
}
/** @type {HTMLDocumentMetadata} */
const metadata = {
title: document.title,
link: [],
dc: this._getMetaTags('dc', 'name', '.'),
eprints: this._getMetaTags('eprints', 'name', '.'),
facebook: this._getMetaTags('og', 'property', ':'),
highwire: this._getMetaTags('citation', 'name', '_'),
prism: this._getMetaTags('prism', 'name', '.'),
twitter: this._getMetaTags('twitter', 'name', ':'),
};
const favicon = this._getFavicon();
if (favicon) {
metadata.favicon = favicon;
}
_getDublinCore() {
this.metadata.dc = this._getMetaTags('dc', 'name', '.');
}
metadata.title = this._getTitle(metadata);
metadata.link = this._getLinks(metadata);
_getPrism() {
this.metadata.prism = this._getMetaTags('prism', 'name', '.');
}
const dcLink = metadata.link.find(link => link.href.startsWith('urn:x-dc'));
if (dcLink) {
metadata.documentFingerprint = dcLink.href;
}
_getEprints() {
this.metadata.eprints = this._getMetaTags('eprints', 'name', '.');
return metadata;
}
/**
......@@ -183,27 +151,31 @@ export class HTMLMetadata {
return tags;
}
_getTitle() {
if (this.metadata.highwire.title) {
this.metadata.title = this.metadata.highwire.title[0];
} else if (this.metadata.eprints.title) {
this.metadata.title = this.metadata.eprints.title[0];
} else if (this.metadata.prism.title) {
this.metadata.title = this.metadata.prism.title[0];
} else if (this.metadata.facebook.title) {
this.metadata.title = this.metadata.facebook.title[0];
} else if (this.metadata.twitter.title) {
this.metadata.title = this.metadata.twitter.title[0];
} else if (this.metadata.dc.title) {
this.metadata.title = this.metadata.dc.title[0];
/** @param {HTMLDocumentMetadata} metadata */
_getTitle(metadata) {
if (metadata.highwire.title) {
return metadata.highwire.title[0];
} else if (metadata.eprints.title) {
return metadata.eprints.title[0];
} else if (metadata.prism.title) {
return metadata.prism.title[0];
} else if (metadata.facebook.title) {
return metadata.facebook.title[0];
} else if (metadata.twitter.title) {
return metadata.twitter.title[0];
} else if (metadata.dc.title) {
return metadata.dc.title[0];
} else {
this.metadata.title = this.document.title;
return this.document.title;
}
}
_getLinks() {
/**
* @param {Pick<HTMLDocumentMetadata, 'highwire'|'dc'>} metadata
*/
_getLinks(metadata) {
// We know our current location is a link for the document.
this.metadata.link = [{ href: this._getDocumentHref() }];
const links = [{ href: this._getDocumentHref() }];
// Extract links from certain `<link>` tags.
const linkElements = Array.from(this.document.querySelectorAll('link'));
......@@ -227,19 +199,19 @@ export class HTMLMetadata {
try {
const href = this._absoluteUrl(link.href);
this.metadata.link.push({ href, rel: link.rel, type: link.type });
links.push({ href, rel: link.rel, type: link.type });
} catch (e) {
// Ignore URIs which cannot be parsed.
}
}
// look for links in scholar metadata
for (let name of Object.keys(this.metadata.highwire)) {
const values = this.metadata.highwire[name];
for (let name of Object.keys(metadata.highwire)) {
const values = metadata.highwire[name];
if (name === 'pdf_url') {
for (let url of values) {
try {
this.metadata.link.push({
links.push({
href: this._absoluteUrl(url),
type: 'application/pdf',
});
......@@ -257,26 +229,26 @@ export class HTMLMetadata {
if (doi.slice(0, 4) !== 'doi:') {
doi = `doi:${doi}`;
}
this.metadata.link.push({ href: doi });
links.push({ href: doi });
}
}
}
// look for links in dublincore data
for (let name of Object.keys(this.metadata.dc)) {
const values = this.metadata.dc[name];
for (let name of Object.keys(metadata.dc)) {
const values = metadata.dc[name];
if (name === 'identifier') {
for (let id of values) {
if (id.slice(0, 4) === 'doi:') {
this.metadata.link.push({ href: id });
links.push({ href: id });
}
}
}
}
// look for a link to identify the resource in dublincore metadata
const dcRelationValues = this.metadata.dc['relation.ispartof'];
const dcIdentifierValues = this.metadata.dc.identifier;
const dcRelationValues = metadata.dc['relation.ispartof'];
const dcIdentifierValues = metadata.dc.identifier;
if (dcRelationValues && dcIdentifierValues) {
const dcUrnRelationComponent =
dcRelationValues[dcRelationValues.length - 1];
......@@ -287,22 +259,24 @@ export class HTMLMetadata {
encodeURIComponent(dcUrnRelationComponent) +
'/' +
encodeURIComponent(dcUrnIdentifierComponent);
this.metadata.link.push({ href: dcUrn });
// set this as the documentFingerprint as a hint to include this in search queries
this.metadata.documentFingerprint = dcUrn;
links.push({ href: dcUrn });
}
return links;
}
_getFavicon() {
let favicon = null;
for (let link of Array.from(this.document.querySelectorAll('link'))) {
if (['shortcut icon', 'icon'].includes(link.rel)) {
try {
this.metadata.favicon = this._absoluteUrl(link.href);
favicon = this._absoluteUrl(link.href);
} catch (e) {
// Ignore URIs which cannot be parsed.
}
}
}
return favicon;
}
/**
......
......@@ -62,8 +62,7 @@ describe('HTMLMetadata', function () {
<link rel="canonical" href="http://example.com/canonical"></link>
`;
testDocument.getDocumentMetadata();
metadata = testDocument.metadata;
metadata = testDocument.getDocumentMetadata();
});
it('should have metadata', () => assert.ok(metadata));
......@@ -167,12 +166,12 @@ describe('HTMLMetadata', function () {
<link rel="alternate" href="http://a:b:c">
`;
testDocument.getDocumentMetadata();
const metadata = testDocument.getDocumentMetadata();
// There should be one link with the document location and one for the
// valid `<link>` tag.
assert.deepEqual(testDocument.metadata.link.length, 2);
assert.deepEqual(testDocument.metadata.link[1], {
assert.deepEqual(metadata.link.length, 2);
assert.deepEqual(metadata.link[1], {
rel: 'alternate',
href: 'https://example.com/foo',
type: '',
......@@ -183,19 +182,19 @@ describe('HTMLMetadata', function () {
tempDocumentHead.innerHTML = `
<link rel="favicon" href="http://a:b:c">
`;
testDocument.getDocumentMetadata();
assert.isUndefined(testDocument.metadata.favicon);
const metadata = testDocument.getDocumentMetadata();
assert.isUndefined(metadata.favicon);
});
it('should ignore `<meta>` PDF links with invalid URIs', () => {
tempDocumentHead.innerHTML = `
<meta name="citation_pdf_url" content="http://a:b:c">
`;
testDocument.getDocumentMetadata();
const metadata = testDocument.getDocumentMetadata();
// There should only be one link for the document's location.
// The invalid PDF link should be ignored.
assert.equal(testDocument.metadata.link.length, 1);
assert.equal(metadata.link.length, 1);
});
});
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment