Unverified commit 72c87787, authored by Robert Knight, committed by GitHub

Merge pull request #734 from hypothesis/decaf-document-meta

Convert annotator/plugin/document.coffee to JS
parents 20d5471e 56f051cd
baseURI = require('document-base-uri')
Plugin = require('../plugin')
{ normalizeURI } = require('../util/url')
###
** Adapted from:
** https://github.com/openannotation/annotator/blob/v1.2.x/src/plugin/document.coffee
**
** Annotator v1.2.10
** https://github.com/openannotation/annotator
**
** Copyright 2015, the Annotator project contributors.
** Dual licensed under the MIT and GPLv3 licenses.
** https://github.com/openannotation/annotator/blob/master/LICENSE
###
# Annotator plugin that reads metadata and links from the current HTML
# document and attaches them to newly created annotations.
module.exports = class Document extends Plugin
# Event name -> handler method name.
# NOTE(review): presumably consumed by the Plugin base class to subscribe the
# handler -- confirm in ../plugin.
events:
'beforeAnnotationCreated': 'beforeAnnotationCreated'
# Picks the base URI and document to read from (both can be overridden via
# @options), then collects the document metadata.
pluginInit: ->
# Test seams.
@baseURI = @options.baseURI or baseURI
@document = @options.document or document
this.getDocumentMetadata()
# Returns the primary URI for the document being annotated.
# This is the href of the last rel="canonical" entry in @metadata.link when
# there is one, otherwise the percent-decoded document href.
uri: =>
uri = decodeURIComponent(this._getDocumentHref())
for link in @metadata.link
if link.rel == "canonical"
uri = link.href
return uri
# Returns all uris for the document being annotated, i.e. the distinct
# non-empty hrefs in @metadata.link. An object is used as a set to drop
# duplicates.
uris: =>
uniqueUrls = {}
for link in @metadata.link
uniqueUrls[link.href] = true if link.href
return (href for href of uniqueUrls)
# Event handler: stores the collected metadata object on the new annotation.
beforeAnnotationCreated: (annotation) =>
annotation.document = @metadata
# Rebuilds @metadata from scratch and returns it.
getDocumentMetadata: =>
@metadata = {}
# first look for some common metadata types
# TODO: look for microdata/rdfa?
this._getHighwire()
this._getDublinCore()
this._getFacebook()
this._getEprints()
this._getPrism()
this._getTwitter()
this._getFavicon()
# extract out/normalize some things
# (_getTitle and _getLinks read the tag maps stored by the calls above, so
# they have to run last)
this._getTitle()
this._getLinks()
return @metadata
# The helpers below each store one map of <meta> tags on @metadata; the
# arguments are the name prefix, the attribute holding the name, and the
# delimiter between prefix and key (see _getMetaTags).
_getHighwire: =>
return @metadata.highwire = this._getMetaTags("citation", "name", "_")
_getFacebook: =>
return @metadata.facebook = this._getMetaTags("og", "property", ":")
_getTwitter: =>
return @metadata.twitter = this._getMetaTags("twitter", "name", ":")
_getDublinCore: =>
return @metadata.dc = this._getMetaTags("dc", "name", ".")
_getPrism: =>
return @metadata.prism = this._getMetaTags("prism", "name", ".")
_getEprints: =>
return @metadata.eprints = this._getMetaTags("eprints", "name", ".")
# Collects the content of every <meta> element whose `attribute` value matches
# "<prefix><delimiter><key>" (case-insensitive) into a map of key -> list of
# contents, in document order.
# Note that `delimiter` is interpolated into the regular expression unescaped,
# so "." matches any single character.
_getMetaTags: (prefix, attribute, delimiter) =>
tags = {}
for meta in @document.querySelectorAll('meta')
name = meta.getAttribute(attribute)
content = meta.content
if name
match = name.match(RegExp("^#{prefix}#{delimiter}(.+)$", "i"))
if match
n = match[1]
if tags[n]
tags[n].push(content)
else
tags[n] = [content]
return tags
# Sets @metadata.title to the first title found in the tag maps, checked in
# the order highwire, eprints, prism, facebook, twitter, dc; falls back to the
# document's own title.
_getTitle: =>
if @metadata.highwire.title
@metadata.title = @metadata.highwire.title[0]
else if @metadata.eprints.title
@metadata.title = @metadata.eprints.title[0]
else if @metadata.prism.title
@metadata.title = @metadata.prism.title[0]
else if @metadata.facebook.title
@metadata.title = @metadata.facebook.title[0]
else if @metadata.twitter.title
@metadata.title = @metadata.twitter.title[0]
else if @metadata.dc.title
@metadata.title = @metadata.dc.title[0]
else
@metadata.title = @document.title
# Builds @metadata.link: the document href first, then relevant <link>
# elements, then links derived from highwire and dublincore metadata.
_getLinks: =>
# we know our current location is a link for the document
@metadata.link = [href: this._getDocumentHref()]
# look for some relevant link relations
for link in @document.querySelectorAll('link')
href = this._absoluteUrl(link.href) # get absolute url
rel = link.rel
type = link.type
lang = link.hreflang
if rel not in ["alternate", "canonical", "bookmark", "shortlink"] then continue
if rel is 'alternate'
# Ignore feeds resources
if type and type.match /^application\/(rss|atom)\+xml/ then continue
# Ignore alternate languages
if lang then continue
@metadata.link.push(href: href, rel: rel, type: type)
# look for links in scholar metadata
for name, values of @metadata.highwire
if name == "pdf_url"
for url in values
@metadata.link.push
href: this._absoluteUrl(url)
type: "application/pdf"
# kind of a hack to express DOI identifiers as links but it's a
# convenient place to look them up later, and somewhat sane since
# they don't have a type
if name == "doi"
for doi in values
if doi[0..3] != "doi:"
doi = "doi:" + doi
@metadata.link.push(href: doi)
# look for links in dublincore data
# (only identifiers that already start with "doi:" are added)
for name, values of @metadata.dc
if name == "identifier"
for id in values
if id[0..3] == "doi:"
@metadata.link.push(href: id)
# look for a link to identify the resource in dublincore metadata
# (built from the last relation.ispartof value and the last identifier value,
# each percent-encoded)
dcRelationValues = @metadata.dc['relation.ispartof']
dcIdentifierValues = @metadata.dc['identifier']
if dcRelationValues && dcIdentifierValues
dcUrnRelationComponent =
dcRelationValues[dcRelationValues.length - 1]
dcUrnIdentifierComponent =
dcIdentifierValues[dcIdentifierValues.length - 1]
dcUrn = 'urn:x-dc:' +
encodeURIComponent(dcUrnRelationComponent) + '/' +
encodeURIComponent(dcUrnIdentifierComponent)
@metadata.link.push(href: dcUrn)
# set this as the documentFingerprint as a hint to include this in search queries
@metadata.documentFingerprint = dcUrn
# Stores the absolute href of the last <link> whose rel is exactly
# "shortcut icon" or "icon" as @metadata.favicon.
_getFavicon: =>
for link in @document.querySelectorAll('link')
if link.rel in ["shortcut icon", "icon"]
@metadata["favicon"] = this._absoluteUrl(link.href)
# Hack to get an absolute url from a possibly relative one
# (delegates to normalizeURI, resolving against @baseURI)
_absoluteUrl: (url) ->
normalizeURI(url, @baseURI)
# Get the true URI record when it's masked via a different protocol.
# This happens when an href is set with a uri using the 'blob:' protocol
# but the document can set a different uri through a <base> tag.
_getDocumentHref: ->
href = @document.location.href
allowedSchemes = ['http:', 'https:', 'file:']
# Use the current document location if it has a recognized scheme.
if new URL(href).protocol in allowedSchemes
return href
# Otherwise, try using the location specified by the <base> element.
if @baseURI and (new URL(@baseURI).protocol in allowedSchemes)
return @baseURI
# Fall back to returning the document URI, even though the scheme is not
# in the allowed list.
return href
'use strict';
/*
** Adapted from:
** https://github.com/openannotation/annotator/blob/v1.2.x/src/plugin/document.coffee
**
** Annotator v1.2.10
** https://github.com/openannotation/annotator
**
** Copyright 2015, the Annotator project contributors.
** Dual licensed under the MIT and GPLv3 licenses.
** https://github.com/openannotation/annotator/blob/master/LICENSE
*/
const baseURI = require('document-base-uri');
const Plugin = require('../plugin');
const { normalizeURI } = require('../util/url');
/**
 * DocumentMeta reads metadata/links from the current HTML document and
 * populates the `document` property of new annotations.
 *
 * The collected data is kept in `this.metadata`, which holds one map of
 * `<meta>` tags per vocabulary (`highwire`, `dc`, `facebook`, `eprints`,
 * `prism`, `twitter`) plus the derived `title`, `link`, `favicon` and
 * `documentFingerprint` fields.
 */
class DocumentMeta extends Plugin {
  constructor(element, options) {
    super(element, options);
    this.events = {
      'beforeAnnotationCreated': 'beforeAnnotationCreated',
    };
  }

  /**
   * Pick the document and base URI to read from (both can be overridden via
   * `options`) and collect the document metadata.
   */
  pluginInit() {
    // Test seams.
    this.baseURI = this.options.baseURI || baseURI;
    this.document = this.options.document || document;
    this.getDocumentMetadata();
  }

  /**
   * Returns the primary URI for the document being annotated.
   *
   * This is the href of the last `rel="canonical"` link if there is one,
   * otherwise the percent-decoded document href.
   *
   * @return {string}
   */
  uri() {
    const href = this._getDocumentHref();
    let uri;
    try {
      uri = decodeURIComponent(href);
    } catch (err) {
      if (!(err instanceof URIError)) {
        throw err;
      }
      // The location contains a malformed escape sequence (e.g. a lone `%`).
      // Use it undecoded rather than failing to report any URI at all.
      uri = href;
    }
    for (const link of this.metadata.link) {
      if (link.rel === 'canonical') {
        uri = link.href;
      }
    }
    return uri;
  }

  /**
   * Returns all uris for the document being annotated, without duplicates and
   * in the order in which they were first collected.
   *
   * @return {string[]}
   */
  uris() {
    const uniqueUrls = new Set();
    for (const link of this.metadata.link) {
      if (link.href) {
        uniqueUrls.add(link.href);
      }
    }
    return Array.from(uniqueUrls);
  }

  /**
   * Hook that augments new annotations with metadata about the document they
   * came from.
   *
   * @param {Object} annotation - Receives the metadata as its `document`
   *   property.
   */
  beforeAnnotationCreated(annotation) {
    annotation.document = this.metadata;
  }

  /**
   * Return metadata for the current page.
   *
   * Rebuilds `this.metadata` from scratch on every call.
   *
   * @return {Object}
   */
  getDocumentMetadata() {
    this.metadata = {};

    // first look for some common metadata types
    // TODO: look for microdata/rdfa?
    this._getHighwire();
    this._getDublinCore();
    this._getFacebook();
    this._getEprints();
    this._getPrism();
    this._getTwitter();
    this._getFavicon();

    // extract out/normalize some things
    // (these read the tag maps stored above, so they have to run last)
    this._getTitle();
    this._getLinks();

    return this.metadata;
  }

  _getHighwire() {
    this.metadata.highwire = this._getMetaTags('citation', 'name', '_');
  }

  _getFacebook() {
    this.metadata.facebook = this._getMetaTags('og', 'property', ':');
  }

  _getTwitter() {
    this.metadata.twitter = this._getMetaTags('twitter', 'name', ':');
  }

  _getDublinCore() {
    this.metadata.dc = this._getMetaTags('dc', 'name', '.');
  }

  _getPrism() {
    this.metadata.prism = this._getMetaTags('prism', 'name', '.');
  }

  _getEprints() {
    this.metadata.eprints = this._getMetaTags('eprints', 'name', '.');
  }

  /**
   * Collect the contents of all `<meta>` elements whose `attribute` value has
   * the form `<prefix><delimiter><key>` (matched case-insensitively).
   *
   * @param {string} prefix - Vocabulary prefix, e.g. `citation`.
   * @param {string} attribute - Attribute holding the tag name (`name` or
   *   `property`).
   * @param {string} delimiter - Separator between prefix and key.
   * @return {Object<string, string[]>} Map of key to contents, in document
   *   order.
   */
  _getMetaTags(prefix, attribute, delimiter) {
    const tags = {};
    // `delimiter` is interpolated unescaped, so `.` matches any single
    // character (a `dc:identifier` tag is therefore collected under `dc`).
    // Existing behaviour that callers rely on, so it is kept.
    const pattern = RegExp(`^${prefix}${delimiter}(.+)$`, 'i');

    for (const meta of Array.from(this.document.querySelectorAll('meta'))) {
      const name = meta.getAttribute(attribute);
      if (!name) {
        continue;
      }
      const match = name.match(pattern);
      if (!match) {
        continue;
      }
      const key = match[1];
      // Keys come from the page and may collide with `Object.prototype`
      // members (`constructor`, `__proto__`, ...). Only look at own properties
      // and define new ones explicitly, so such keys are stored like any
      // other instead of throwing or replacing the prototype.
      if (Object.prototype.hasOwnProperty.call(tags, key)) {
        tags[key].push(meta.content);
      } else {
        Object.defineProperty(tags, key, {
          value: [meta.content],
          writable: true,
          enumerable: true,
          configurable: true,
        });
      }
    }
    return tags;
  }

  /**
   * Set `metadata.title` to the first title found in the tag maps, falling
   * back to the document's own title.
   */
  _getTitle() {
    // Listed in priority order.
    const sources = ['highwire', 'eprints', 'prism', 'facebook', 'twitter', 'dc'];
    for (const source of sources) {
      const titles = this.metadata[source].title;
      if (titles) {
        this.metadata.title = titles[0];
        return;
      }
    }
    this.metadata.title = this.document.title;
  }

  /**
   * Build `metadata.link`: the document href first, then relevant `<link>`
   * elements, then links derived from highwire and dublincore metadata.
   * Also sets `metadata.documentFingerprint` when a dublincore URN can be
   * built.
   */
  _getLinks() {
    // we know our current location is a link for the document
    const links = [{href: this._getDocumentHref()}];
    this.metadata.link = links;

    // look for some relevant link relations
    const relevantRels = ['alternate', 'canonical', 'bookmark', 'shortlink'];
    for (const link of Array.from(this.document.querySelectorAll('link'))) {
      const { rel, type } = link;
      if (!relevantRels.includes(rel)) {
        continue;
      }
      if (rel === 'alternate') {
        // Ignore feeds resources
        if (type && type.match(/^application\/(rss|atom)\+xml/)) {
          continue;
        }
        // Ignore alternate languages
        if (link.hreflang) {
          continue;
        }
      }
      links.push({href: this._absoluteUrl(link.href), rel, type});
    }

    // look for links in scholar metadata
    // (keys are visited in document order, which determines the link order)
    const { highwire, dc } = this.metadata;
    for (const name of Object.keys(highwire)) {
      const values = highwire[name];
      if (name === 'pdf_url') {
        for (const url of values) {
          links.push({
            href: this._absoluteUrl(url),
            type: 'application/pdf',
          });
        }
      } else if (name === 'doi') {
        // kind of a hack to express DOI identifiers as links but it's a
        // convenient place to look them up later, and somewhat sane since
        // they don't have a type
        for (const doi of values) {
          links.push({href: doi.startsWith('doi:') ? doi : `doi:${doi}`});
        }
      }
    }

    // look for links in dublincore data
    const dcIdentifierValues = dc.identifier;
    if (dcIdentifierValues) {
      for (const id of dcIdentifierValues) {
        if (id.startsWith('doi:')) {
          links.push({href: id});
        }
      }
    }

    // look for a link to identify the resource in dublincore metadata
    const dcRelationValues = dc['relation.ispartof'];
    if (dcRelationValues && dcIdentifierValues) {
      const dcUrnRelationComponent =
        dcRelationValues[dcRelationValues.length - 1];
      const dcUrnIdentifierComponent =
        dcIdentifierValues[dcIdentifierValues.length - 1];
      const dcUrn = 'urn:x-dc:' +
        encodeURIComponent(dcUrnRelationComponent) + '/' +
        encodeURIComponent(dcUrnIdentifierComponent);
      links.push({href: dcUrn});
      // set this as the documentFingerprint as a hint to include this in search queries
      this.metadata.documentFingerprint = dcUrn;
    }
  }

  /**
   * Store the absolute href of the last `<link>` whose `rel` is exactly
   * `shortcut icon` or `icon` as `metadata.favicon`.
   */
  _getFavicon() {
    for (const link of Array.from(this.document.querySelectorAll('link'))) {
      if (['shortcut icon', 'icon'].includes(link.rel)) {
        this.metadata.favicon = this._absoluteUrl(link.href);
      }
    }
  }

  // Hack to get an absolute url from a possibly relative one
  _absoluteUrl(url) {
    return normalizeURI(url, this.baseURI);
  }

  // Get the true URI record when it's masked via a different protocol.
  // This happens when an href is set with a uri using the 'blob:' protocol
  // but the document can set a different uri through a <base> tag.
  _getDocumentHref() {
    const { href } = this.document.location;
    const allowedSchemes = ['http:', 'https:', 'file:'];

    // Use the current document location if it has a recognized scheme.
    const scheme = new URL(href).protocol;
    if (allowedSchemes.includes(scheme)) {
      return href;
    }

    // Otherwise, try using the location specified by the <base> element.
    if (this.baseURI && allowedSchemes.includes(new URL(this.baseURI).protocol)) {
      return this.baseURI;
    }

    // Fall back to returning the document URI, even though the scheme is not
    // in the allowed list.
    return href;
  }
}
module.exports = DocumentMeta;
$ = require('jquery')
Document = require('../document')
###
** Adapted from:
** https://github.com/openannotation/annotator/blob/v1.2.x/test/spec/plugin/document_spec.coffee
**
** Annotator v1.2.10
** https://github.com/openannotation/annotator
**
** Copyright 2015, the Annotator project contributors.
** Dual licensed under the MIT and GPLv3 licenses.
** https://github.com/openannotation/annotator/blob/master/LICENSE
###
# Spec for the Document plugin. The first group reads metadata from the real
# test page <head>; the '#uri' group uses a faked document object instead.
describe 'Document', ->
testDocument = null
# A fresh plugin instance is created (and its metadata collected) per test.
beforeEach ->
testDocument = new Document($('<div></div>')[0], {})
testDocument.pluginInit()
afterEach ->
$(document).unbind()
describe 'annotation should have some metadata', ->
# Add some metadata to the page
# (appended once when the suite is defined, not per test, so it stays in the
# page <head> for the tests that follow)
head = $("head")
head.append('<link rel="alternate" href="foo.pdf" type="application/pdf"></link>')
head.append('<link rel="alternate" href="foo.doc" type="application/msword"></link>')
head.append('<link rel="bookmark" href="http://example.com/bookmark"></link>')
head.append('<link rel="shortlink" href="http://example.com/bookmark/short"></link>')
head.append('<link rel="alternate" href="es/foo.html" hreflang="es" type="text/html"></link>')
head.append('<meta name="citation_doi" content="10.1175/JCLI-D-11-00015.1">')
head.append('<meta name="citation_title" content="Foo">')
head.append('<meta name="citation_pdf_url" content="foo.pdf">')
head.append('<meta name="dc.identifier" content="doi:10.1175/JCLI-D-11-00015.1">')
head.append('<meta name="dc:identifier" content="foobar-abcxyz">')
head.append('<meta name="dc.relation.ispartof" content="isbn:123456789">')
head.append('<meta name="DC.type" content="Article">')
head.append('<meta property="og:url" content="http://example.com">')
head.append('<meta name="twitter:site" content="@okfn">')
head.append('<link rel="icon" href="http://example.com/images/icon.ico"></link>')
head.append('<meta name="eprints.title" content="Computer Lib / Dream Machines">')
head.append('<meta name="prism.title" content="Literary Machines">')
head.append('<link rel="alternate" href="feed" type="application/rss+xml"></link>')
head.append('<link rel="canonical" href="http://example.com/canonical"></link>')
metadata = null
beforeEach ->
metadata = testDocument.metadata
it 'should have metadata', ->
assert.ok(metadata)
it 'should have a title, derived from highwire metadata if possible', ->
assert.equal(metadata.title, 'Foo')
# Expected order: [0] document href, [1-5] <link> elements in page order,
# [6-7] highwire doi and pdf_url, [8] dublincore doi, [9] dublincore URN.
it 'should have links with absolute hrefs and types', ->
assert.ok(metadata.link)
assert.equal(metadata.link.length, 10)
assert.equal(metadata.link[1].rel, "alternate")
assert.match(metadata.link[1].href, /^.+foo\.pdf$/)
assert.equal(metadata.link[1].type, "application/pdf")
assert.equal(metadata.link[2].rel, "alternate")
assert.match(metadata.link[2].href, /^.+foo\.doc$/)
assert.equal(metadata.link[2].type, "application/msword")
assert.equal(metadata.link[3].rel, "bookmark")
assert.equal(metadata.link[3].href, "http://example.com/bookmark")
assert.equal(metadata.link[4].rel, "shortlink")
assert.equal(metadata.link[4].href, "http://example.com/bookmark/short")
assert.equal(metadata.link[5].rel, "canonical")
assert.equal(metadata.link[5].href, "http://example.com/canonical")
assert.equal(metadata.link[6].href, "doi:10.1175/JCLI-D-11-00015.1")
assert.match(metadata.link[7].href, /.+foo\.pdf$/)
assert.equal(metadata.link[7].type, "application/pdf")
assert.equal(metadata.link[8].href, "doi:10.1175/JCLI-D-11-00015.1")
# Link derived from dc resource identifiers in the form of urn:x-dc:<container>/<identifier>
# Where <container> is the percent-encoded value of the last dc.relation.ispartof meta element
# and <identifier> is the percent-encoded value of the last dc.identifier meta element.
assert.equal(
metadata.link[9].href
"urn:x-dc:isbn%3A123456789/foobar-abcxyz"
)
# The RSS feed link and the hreflang="es" alternate added above must not
# contribute to the link count.
it 'should ignore atom and RSS feeds and alternate languages', ->
assert.equal(metadata.link.length, 10)
it 'should have highwire metadata', ->
assert.ok(metadata.highwire)
assert.deepEqual(metadata.highwire.pdf_url, ['foo.pdf'])
assert.deepEqual(metadata.highwire.doi, ['10.1175/JCLI-D-11-00015.1'])
assert.deepEqual(metadata.highwire.title, ['Foo'])
# Note that the "dc:identifier" tag is expected under dc.identifier as well,
# and that the "DC.type" name is matched regardless of case.
it 'should have dublincore metadata', ->
assert.ok(metadata.dc)
assert.deepEqual(metadata.dc.identifier, ["doi:10.1175/JCLI-D-11-00015.1", "foobar-abcxyz"])
assert.deepEqual(metadata.dc['relation.ispartof'], ["isbn:123456789"])
assert.deepEqual(metadata.dc.type, ["Article"])
it 'should have facebook metadata', ->
assert.ok(metadata.facebook)
assert.deepEqual(metadata.facebook.url, ["http://example.com"])
it 'should have eprints metadata', ->
assert.ok(metadata.eprints)
assert.deepEqual(metadata.eprints.title, ['Computer Lib / Dream Machines'])
it 'should have prism metadata', ->
assert.ok(metadata.prism)
assert.deepEqual(metadata.prism.title, ['Literary Machines'])
it 'should have twitter card metadata', ->
assert.ok(metadata.twitter)
assert.deepEqual(metadata.twitter.site, ['@okfn'])
# NOTE(review): 8 rather than 10, presumably because the DOI href and the
# foo.pdf href each occur twice in metadata.link -- confirm.
it 'should have unique uris', ->
uris = testDocument.uris()
assert.equal(uris.length, 8)
it 'uri() returns the canonical uri', ->
uri = testDocument.uri()
assert.equal(uri, metadata.link[5].href)
it 'should have a favicon', ->
assert.equal(
metadata.favicon
'http://example.com/images/icon.ico'
)
it 'should have a documentFingerprint as the dc resource identifiers URN href', ->
assert.equal(metadata.documentFingerprint, metadata.link[9].href)
# URL resolution against the location of the page running the tests.
describe '#_absoluteUrl', ->
it 'should add the protocol when the url starts with two slashes', ->
result = testDocument._absoluteUrl('//example.com/')
expected = "#{document.location.protocol}//example.com/"
assert.equal(result, expected)
it 'should add a trailing slash when given an empty path', ->
result = testDocument._absoluteUrl('http://example.com')
assert.equal(result, 'http://example.com/')
it 'should make a relative path into an absolute url', ->
result = testDocument._absoluteUrl('path')
expected = (
document.location.protocol + '//' +
document.location.host +
document.location.pathname.replace(/[^\/]+$/, '') +
'path'
)
assert.equal(result, expected)
it 'should make an absolute path into an absolute url', ->
result = testDocument._absoluteUrl('/path')
expected = (
document.location.protocol + '//' +
document.location.host +
'/path'
)
assert.equal(result, expected)
describe '#uri', ->
beforeEach ->
# Remove any existing canonical links which would otherwise override the
# document's own location.
canonicalLink = document.querySelector('link[rel="canonical"]')
if canonicalLink
canonicalLink.remove()
# Create a blank HTML document with a faked `href` and `baseURI` and
# return a `Document` instance which reads metadata from it.
# An existing `htmlDoc` may be passed in to provide pre-populated content.
createDoc = (href, baseURI, htmlDoc) ->
if !htmlDoc
# Create a blank DOM Document
htmlDoc = document.implementation.createHTMLDocument()
# `Document.location` is not overridable. In order to fake the
# location in tests, create a proxy object in front of our blank HTML
# document.
fakeDocument =
createElement: htmlDoc.createElement.bind(htmlDoc),
querySelectorAll: htmlDoc.querySelectorAll.bind(htmlDoc),
location:
href: href
doc = new Document($('<div></div>')[0], {
document: fakeDocument,
baseURI: baseURI,
})
doc.pluginInit()
doc
# One test per allowed scheme (http, https, file).
[
'http://publisher.org/book',
'https://publisher.org/book',
'file:///Users/jim/book',
].forEach (href) ->
it "should return the document's URL if it has an allowed scheme", ->
baseURI = 'https://publisher.org/'
doc = createDoc(href, baseURI)
assert.equal(doc.uri(), href)
it "should return the baseURI if the document's URL does not have an allowed scheme", ->
href = 'blob:1234-5678'
baseURI = 'https://publisher.org/book'
doc = createDoc(href, baseURI)
assert.equal(doc.uri(), baseURI)
# Each entry is an [href, baseURI] pair for which the document href is the
# expected result.
[
# The base URI is not available in IE if the document has no `<base>`
# tags. This is a limitation of `document-base-uri`.
['https://publisher.org/article', undefined],
# Ignore base URIs with non-HTTP/HTTPS/file protocols, which can be
# created by a `<base>` tag.
['blob:1234', 'doi:foo'],
['chrome://foo', 'chrome://blah'],
].forEach ([href, baseURI]) ->
it "should return the document's URL if it and the baseURI do not have an allowed scheme", ->
doc = createDoc(href, baseURI)
assert.equal(doc.uri(), href)
it 'returns the canonical URI if present', ->
htmlDoc = document.implementation.createHTMLDocument()
canonicalLink = htmlDoc.createElement('link')
canonicalLink.rel = 'canonical'
canonicalLink.href = 'https://publisher.org/canonical'
htmlDoc.head.appendChild(canonicalLink)
doc = createDoc('https://publisher.org/not-canonical', null, htmlDoc)
assert.equal doc.uri(), canonicalLink.href
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment