baseURI = require('document-base-uri')
Plugin = require('../plugin')
{ normalizeURI } = require('../util/url')
###
** Adapted from:
** Annotator v1.2.10
** Copyright 2015, the Annotator project contributors.
** Dual licensed under the MIT and GPLv3 licenses.
###
# Plugin that collects metadata about the document being annotated: its URIs,
# title, favicon and the citation / social metadata published through <meta>
# and <link> elements. The collected record is stored in @metadata and copied
# onto every annotation via beforeAnnotationCreated().
module.exports = class Document extends Plugin

  # NOTE(review): assumes the Plugin base class subscribes the handlers listed
  # under `events` (Annotator convention) -- confirm against ../plugin.
  events:
    'beforeAnnotationCreated': 'beforeAnnotationCreated'

  # Resolves the injectable dependencies and gathers the metadata once, so that
  # uri(), uris() and beforeAnnotationCreated() always find @metadata populated.
  pluginInit: ->
    # Test seams.
    @baseURI = @options.baseURI or baseURI
    @document = @options.document or document

    this.getDocumentMetadata()

  # Returns the primary URI for the document being annotated: the href of the
  # last rel="canonical" link if one was collected, otherwise the decoded
  # document location.
  uri: =>
    uri = decodeURIComponent(this._getDocumentHref())
    for link in @metadata.link
      if link.rel == "canonical"
        uri = link.href
    return uri

  # Returns all uris for the document being annotated, without duplicates.
  # Links that have no href are skipped.
  uris: =>
    uniqueUrls = {}
    for link in @metadata.link
      uniqueUrls[link.href] = true if link.href
    return (href for href of uniqueUrls)

  # Attaches the collected document metadata to a newly created annotation.
  beforeAnnotationCreated: (annotation) =>
    annotation.document = @metadata

  # Rebuilds @metadata from scratch by reading the current state of @document,
  # and returns it.
  getDocumentMetadata: =>
    @metadata = {}

    # first look for some common metadata types
    # TODO: look for microdata/rdfa?
    this._getHighwire()
    this._getDublinCore()
    this._getFacebook()
    this._getEprints()
    this._getPrism()
    this._getTwitter()
    this._getFavicon()

    # extract out/normalize some things
    # (these read the records collected above, so they have to run last)
    this._getTitle()
    this._getLinks()

    return @metadata

  # Each collector below stores the <meta> tags of one vocabulary on @metadata
  # and returns the stored record.
  _getHighwire: =>
    return @metadata.highwire = this._getMetaTags("citation", "name", "_")

  _getFacebook: =>
    return @metadata.facebook = this._getMetaTags("og", "property", ":")

  _getTwitter: =>
    return @metadata.twitter = this._getMetaTags("twitter", "name", ":")

  _getDublinCore: =>
    return @metadata.dc = this._getMetaTags("dc", "name", ".")

  _getPrism: =>
    return @metadata.prism = this._getMetaTags("prism", "name", ".")

  _getEprints: =>
    return @metadata.eprints = this._getMetaTags("eprints", "name", ".")

  # Collects the <meta> elements whose `attribute` value starts with
  # `prefix` + `delimiter` (case-insensitive).
  #
  # Returns an object that maps the remainder of the attribute value to the
  # list of `content` values found for it, in document order.
  #
  # `prefix` and `delimiter` are interpolated into the pattern unescaped, so a
  # "." delimiter matches any single character.
  _getMetaTags: (prefix, attribute, delimiter) =>
    tags = {}
    for meta in @document.querySelectorAll('meta')
      name = meta.getAttribute(attribute)
      content = meta.content
      if name
        match = name.match(RegExp("^#{prefix}#{delimiter}(.+)$", "i"))
        if match
          n = match[1]
          # Repeated tags accumulate instead of replacing earlier values.
          if tags[n]
            tags[n].push(content)
          else
            tags[n] = [content]
    return tags

  # Picks the document title from the first vocabulary that provides one,
  # falling back to the title of the document itself.
  _getTitle: =>
    if @metadata.highwire.title
      @metadata.title = @metadata.highwire.title[0]
    else if @metadata.eprints.title
      @metadata.title = @metadata.eprints.title[0]
    else if @metadata.prism.title
      @metadata.title = @metadata.prism.title[0]
    else if @metadata.facebook.title
      @metadata.title = @metadata.facebook.title[0]
    else if @metadata.twitter.title
      @metadata.title = @metadata.twitter.title[0]
    else if @metadata.dc.title
      @metadata.title = @metadata.dc.title[0]
    else
      @metadata.title = @document.title

  # Builds @metadata.link, the list of links that identify the document. The
  # entries are added in this order: the document location, relevant <link>
  # elements, highwire PDF urls and DOIs, dublincore DOIs, and finally a URN
  # derived from the dublincore relation and identifier.
  _getLinks: =>
    # we know our current location is a link for the document
    @metadata.link = [href: this._getDocumentHref()]

    # look for some relevant link relations
    for link in @document.querySelectorAll('link')
      href = this._absoluteUrl(link.href) # get absolute url
      rel = link.rel
      type = link.type
      lang = link.hreflang

      if rel not in ["alternate", "canonical", "bookmark", "shortlink"] then continue

      if rel is 'alternate'
        # Ignore feeds resources
        if type and type.match /^application\/(rss|atom)\+xml/ then continue
        # Ignore alternate languages
        if lang then continue

      @metadata.link.push(href: href, rel: rel, type: type)

    # look for links in scholar metadata
    for name, values of @metadata.highwire

      if name == "pdf_url"
        for url in values
          @metadata.link.push
            href: this._absoluteUrl(url)
            type: "application/pdf"

      # kind of a hack to express DOI identifiers as links but it's a
      # convenient place to look them up later, and somewhat sane since
      # they don't have a type
      if name == "doi"
        for doi in values
          if doi[0..3] != "doi:"
            doi = "doi:" + doi
          @metadata.link.push(href: doi)

    # look for links in dublincore data
    for name, values of @metadata.dc
      if name == "identifier"
        for id in values
          if id[0..3] == "doi:"
            @metadata.link.push(href: id)

    # look for a link to identify the resource in dublincore metadata
    dcRelationValues = @metadata.dc['relation.ispartof']
    dcIdentifierValues = @metadata.dc['identifier']
    if dcRelationValues && dcIdentifierValues
      # Only the last value of each field takes part in the URN.
      dcUrnRelationComponent =
        dcRelationValues[dcRelationValues.length - 1]
      dcUrnIdentifierComponent =
        dcIdentifierValues[dcIdentifierValues.length - 1]
      dcUrn = 'urn:x-dc:' +
        encodeURIComponent(dcUrnRelationComponent) + '/' +
        encodeURIComponent(dcUrnIdentifierComponent)
      @metadata.link.push(href: dcUrn)
      # set this as the documentFingerprint as a hint to include this in search queries
      @metadata.documentFingerprint = dcUrn

  # Stores the absolute url of the page icon on @metadata. When several
  # matching <link> elements exist, the last one wins.
  _getFavicon: =>
    for link in @document.querySelectorAll('link')
      if link.rel in ["shortcut icon", "icon"]
        @metadata["favicon"] = this._absoluteUrl(link.href)

  # Hack to get an absolute url from a possibly relative one. Delegates to
  # normalizeURI with @baseURI as the base.
  _absoluteUrl: (url) ->
    normalizeURI(url, @baseURI)

  # Get the true URI record when it's masked via a different protocol.
  # This happens when an href is set with a uri using the 'blob:' protocol
  # but the document can set a different uri through a <base> tag.
  _getDocumentHref: ->
    href = @document.location.href
    allowedSchemes = ['http:', 'https:', 'file:']

    # Use the current document location if it has a recognized scheme.
    if new URL(href).protocol in allowedSchemes
      return href

    # Otherwise, try using the location specified by the <base> element.
    if @baseURI and (new URL(@baseURI).protocol in allowedSchemes)
      return @baseURI

    # Fall back to returning the document URI, even though the scheme is not
    # in the allowed list.
    return href
$ = require('jquery')
Document = require('../document')
###
** Adapted from:
** Annotator v1.2.10
** Copyright 2015, the Annotator project contributors.
** Dual licensed under the MIT and GPLv3 licenses.
###
# Specs for the Document plugin.
describe 'Document', ->
# Plugin instance under test; reassigned before every spec.
testDocument = null
beforeEach ->
# The plugin is constructed with a freshly created <div> element and an empty
# options object.
testDocument = new Document($('<div></div>')[0], {})
# NOTE(review): this afterEach callback has no body in this view -- confirm
# whether teardown is expected here.
afterEach ->
# Appends <link> and <meta> fixtures to the page <head>, then checks the
# metadata record exposed by the plugin instance.
describe 'annotation should have some metadata', ->
# Add some metadata to the page
head = $("head")
head.append('<link rel="alternate" href="foo.pdf" type="application/pdf"></link>')
head.append('<link rel="alternate" href="foo.doc" type="application/msword"></link>')
# NOTE(review): several fixtures below have an empty href/content value --
# presumably these carried URLs; confirm against the upstream spec.
head.append('<link rel="bookmark" href=""></link>')
head.append('<link rel="shortlink" href=""></link>')
head.append('<link rel="alternate" href="es/foo.html" hreflang="es" type="text/html"></link>')
head.append('<meta name="citation_doi" content="10.1175/JCLI-D-11-00015.1">')
head.append('<meta name="citation_title" content="Foo">')
head.append('<meta name="citation_pdf_url" content="foo.pdf">')
head.append('<meta name="dc.identifier" content="doi:10.1175/JCLI-D-11-00015.1">')
# Uses ':' rather than '.' after the prefix; the dublincore spec below still
# expects this value under dc.identifier.
head.append('<meta name="dc:identifier" content="foobar-abcxyz">')
head.append('<meta name="dc.relation.ispartof" content="isbn:123456789">')
# Upper-case prefix; the dublincore spec below expects it under dc.type.
head.append('<meta name="DC.type" content="Article">')
head.append('<meta property="og:url" content="">')
head.append('<meta name="twitter:site" content="@okfn">')
head.append('<link rel="icon" href=""></link>')
head.append('<meta name="eprints.title" content="Computer Lib / Dream Machines">')
head.append('<meta name="prism.title" content="Literary Machines">')
head.append('<link rel="alternate" href="feed" type="application/rss+xml"></link>')
head.append('<link rel="canonical" href=""></link>')
# Metadata record read from the plugin instance before each spec.
metadata = null
beforeEach ->
metadata = testDocument.metadata
# NOTE(review): this spec has no body in this view.
it 'should have metadata', ->
it 'should have a title, derived from highwire metadata if possible', ->
assert.equal(metadata.title, 'Foo')
it 'should have links with absolute hrefs and types', ->
# NOTE(review): the first argument of this assertion and the receiver of the
# `[n]` lookups below are missing -- presumably the plugin's list of collected
# links; confirm.
assert.equal(, 10)
assert.equal([1].rel, "alternate")
assert.match([1].href, /^.+foo\.pdf$/)
assert.equal([1].type, "application/pdf")
assert.equal([2].rel, "alternate")
assert.match([2].href, /^.+foo\.doc$/)
assert.equal([2].type, "application/msword")
assert.equal([3].rel, "bookmark")
assert.equal([3].href, "")
assert.equal([4].rel, "shortlink")
assert.equal([4].href, "")
assert.equal([5].rel, "canonical")
assert.equal([5].href, "")
assert.equal([6].href, "doi:10.1175/JCLI-D-11-00015.1")
assert.match([7].href, /.+foo\.pdf$/)
assert.equal([7].type, "application/pdf")
assert.equal([8].href, "doi:10.1175/JCLI-D-11-00015.1")
# Link derived from dc resource identifiers in the form of urn:x-dc:<container>/<identifier>
# Where <container> is the percent-encoded value of the last dc.relation.ispartof meta element
# and <identifier> is the percent-encoded value of the last dc.identifier meta element.
# NOTE(review): no assertion follows the comment above in this view.
it 'should ignore atom and RSS feeds and alternate languages', ->
# NOTE(review): first argument missing here as well.
assert.equal(, 10)
it 'should have highwire metadata', ->
assert.deepEqual(metadata.highwire.pdf_url, ['foo.pdf'])
assert.deepEqual(metadata.highwire.doi, ['10.1175/JCLI-D-11-00015.1'])
assert.deepEqual(metadata.highwire.title, ['Foo'])
it 'should have dublincore metadata', ->
assert.deepEqual(metadata.dc.identifier, ["doi:10.1175/JCLI-D-11-00015.1", "foobar-abcxyz"])
assert.deepEqual(metadata.dc['relation.ispartof'], ["isbn:123456789"])
assert.deepEqual(metadata.dc.type, ["Article"])
it 'should have facebook metadata', ->
assert.deepEqual(metadata.facebook.url, [""])
it 'should have eprints metadata', ->
assert.deepEqual(metadata.eprints.title, ['Computer Lib / Dream Machines'])
it 'should have prism metadata', ->
assert.deepEqual(metadata.prism.title, ['Literary Machines'])
it 'should have twitter card metadata', ->
# NOTE(review): first argument missing -- presumably the twitter `site` values;
# confirm.
assert.deepEqual(, ['@okfn'])
it 'should have unique uris', ->
uris = testDocument.uris()
assert.equal(uris.length, 8)
# NOTE(review): the remaining specs compute nothing or have no body in this
# view; their assertions appear to be missing.
it 'uri() returns the canonical uri', ->
uri = testDocument.uri()
it 'should have a favicon', ->
it 'should have a documentFingerprint as the dc resource identifiers URN href', ->
# Checks how _absoluteUrl resolves protocol-relative, empty, relative and
# absolute paths.
describe '#_absoluteUrl', ->
it 'should add the protocol when the url starts with two slashes', ->
# NOTE(review): the input and expected value look truncated after '//' --
# presumably a host name followed; confirm.
result = testDocument._absoluteUrl('//')
expected = "#{document.location.protocol}//"
assert.equal(result, expected)
it 'should add a trailing slash when given an empty path', ->
# NOTE(review): input and expected value are both empty strings here, which
# does not match the spec title -- confirm the intended URLs.
result = testDocument._absoluteUrl('')
assert.equal(result, '')
it 'should make a relative path into an absolute url', ->
result = testDocument._absoluteUrl('path')
# NOTE(review): the expression below is incomplete (an operand is missing
# between the two `+` operators and the closing parenthesis is absent).
expected = (
document.location.protocol + '//' + +
document.location.pathname.replace(/[^\/]+$/, '') +
assert.equal(result, expected)
it 'should make an absolute path into an absolute url', ->
result = testDocument._absoluteUrl('/path')
# NOTE(review): incomplete expression, same as in the previous spec.
expected = (
document.location.protocol + '//' + +
assert.equal(result, expected)
# Checks which URI the plugin reports for different combinations of document
# location, base URI and canonical link.
describe '#uri', ->
beforeEach ->
# Remove any existing canonical links which would otherwise override the
# document's own location.
canonicalLink = document.querySelector('link[rel="canonical"]')
# NOTE(review): the body of this conditional is missing in this view --
# presumably it removes the link; confirm.
if canonicalLink
# Create a blank HTML document with a faked `href` and `baseURI` and
# return a `Document` instance which reads metadata from it.
createDoc = (href, baseURI, htmlDoc) ->
if !htmlDoc
# Create a blank DOM Document
htmlDoc = document.implementation.createHTMLDocument()
# `Document.location` is not overridable. In order to fake the
# location in tests, create a proxy object in front of our blank HTML
# document.
fakeDocument =
createElement: htmlDoc.createElement.bind(htmlDoc),
querySelectorAll: htmlDoc.querySelectorAll.bind(htmlDoc),
# NOTE(review): the plugin reads `document.location.href`, so this `href`
# presumably belongs to a nested `location` object -- confirm.
href: href
# NOTE(review): the options object and call below are not closed, and nothing
# is returned from createDoc in this view.
doc = new Document($('<div></div>')[0], {
document: fakeDocument,
baseURI: baseURI,
# NOTE(review): the opening of the array iterated here is missing.
].forEach (href) ->
it "should return the document's URL if it has an allowed scheme", ->
baseURI = ''
doc = createDoc(href, baseURI)
assert.equal(doc.uri(), href)
it "should return the baseURI if the document's URL does not have an allowed scheme", ->
href = 'blob:1234-5678'
baseURI = ''
doc = createDoc(href, baseURI)
assert.equal(doc.uri(), baseURI)
# NOTE(review): the opening `[` of the list of [href, baseURI] pairs below is
# missing.
# The base URI is not available in IE if the document has no `<base>`
# tags. This is a limitation of `document-base-uri`.
['', undefined],
# Ignore base URIs with non-HTTP/HTTPS/file protocols, which can be
# created by a `<base>` tag.
['blob:1234', 'doi:foo'],
['chrome://foo', 'chrome://blah'],
].forEach ([href, baseURI]) ->
it "should return the document's URL if it and the baseURI do not have an allowed scheme", ->
doc = createDoc(href, baseURI)
assert.equal(doc.uri(), href)
it 'returns the canonical URI if present', ->
htmlDoc = document.implementation.createHTMLDocument()
canonicalLink = htmlDoc.createElement('link')
canonicalLink.rel = 'canonical'
# NOTE(review): the canonical href and the document href passed to createDoc
# are empty strings here -- presumably URLs; confirm. The link is also never
# appended to htmlDoc in this view.
canonicalLink.href = ''
doc = createDoc('', null, htmlDoc)
assert.equal doc.uri(), canonicalLink.href
