Improved document metadata extraction, which now replaces the previous getHref jschannel RPC call

77038342 · Ed Summers · eb9f9a7d · 77038342 · 77038342 · 77038342
Commit 77038342 authored May 03, 2013 by Ed Summers
Show whitespace changes
Inline Side-by-side

Showing with 139 additions and 19 deletions

host.coffee h/js/host.coffee +5 -2

document.coffee h/js/plugin/document.coffee +125 -6

services.coffee h/js/services.coffee +9 -11

No files found.
--- a/h/js/host.coffee
+++ b/h/js/host.coffee
@@ -160,8 +160,11 @@ class Annotator.Host extends Annotator
          @drag.last = null
        )
-        .bind('getDocumentMetadata', =>
+        .bind('getDocumentInfo', =>
-          return @plugins.Document.getDocumentMetadata()
+          return {
+            uri: @plugins.Document.uri()
+            metadata: @plugins.Document.metadata
+          }
        )
  scanDocument: (reason = "something happened") =>

--- a/h/js/plugin/document.coffee
+++ b/h/js/plugin/document.coffee
 class Annotator.Plugin.Document extends Annotator.Plugin
+  $ = Annotator.$
  events:
    'beforeAnnotationCreated': 'beforeAnnotationCreated'
  pluginInit: ->
-    @metadata = null
+    this.getDocumentMetadata()
+  # Returns the primary URI for the document being annotated: the href of the
+  # last rel="canonical" entry in @metadata.link when there is one, otherwise
+  # the decoded current location.
+  uri: =>
+    uri = decodeURIComponent document.location.href
+    # @metadata is an object; the link entries live in its `link` list
+    for link in (@metadata?.link or [])
+      if link.rel == "canonical"
+        uri = link.href
+    return uri
+  # Lists each distinct, non-empty href recorded in @metadata.link.
+  uris: =>
+    seen = {}
+    # object keys de-duplicate repeated hrefs
+    for entry in @metadata.link when entry.href
+      seen[entry.href] = true
+    return (key for key of seen)
  beforeAnnotationCreated: (annotation) =>
-    if not @metadata
-      @metadata = this.getDocumentMetadata()
    annotation.document = @metadata
  getDocumentMetadata: =>
-    $ = jQuery
+    @metadata = {}
-    @metadata =
-      title: $("head title").text()
+    # first look for some common metadata types
+    # TODO: look for microdata/rdfa?
+    this._getScholar()
+    this._getDublinCore()
+    this._getOpenGraph()
+    # extract out/normalize some things
+    this._getTitle()
+    this._getLinks()
    return @metadata
+  # Stores every <meta> tag whose name starts with "citation_" in
+  # @metadata.scholar, keyed by the full name; each key maps to a list of
+  # content values in document order.
+  _getScholar: =>
+    scholar = @metadata.scholar = {}
+    for tag in $("meta")
+      el = $(tag)
+      key = el.prop("name")
+      value = el.prop("content")
+      if key.match(/^citation_/)
+        if scholar[key] then scholar[key].push(value) else scholar[key] = [value]
+  # Collects Dublin Core <meta> tags (names of the form "dc.<term>") into
+  # @metadata.dc, keyed by <term>; each key maps to a list of content values.
+  _getDublinCore: =>
+    @metadata.dc = {}
+    for meta in $("meta")
+      name = $(meta).prop("name")
+      content = $(meta).prop("content")
+      nameParts = name.split(".")
+      # pages commonly write the prefix as "DC." as well as "dc.", so compare
+      # it case-insensitively
+      if nameParts.length == 2 and nameParts[0].toLowerCase() == "dc"
+        n = nameParts[1]
+        if @metadata.dc[n]
+          @metadata.dc[n].push(content)
+        else
+          @metadata.dc[n] = [content]
+  # Stores <meta property="og:*"> values in @metadata.og, keyed by the part of
+  # the property after "og:"; each key maps to a list of content values.
+  _getOpenGraph: =>
+    og = @metadata.og = {}
+    for tag in $("meta")
+      el = $(tag)
+      property = el.attr("property")
+      content = el.prop("content")
+      # tags without a property attribute are skipped
+      found = property and property.match(/^og:(.+)$/)
+      if found
+        key = found[1]
+        if og[key] then og[key].push(content) else og[key] = [content]
+  # Chooses the document title for @metadata.title, preferring the scholar
+  # citation_title, then the Dublin Core title, then the page's <title> text.
+  _getTitle: =>
+    if @metadata.scholar.citation_title
+      @metadata.title = @metadata.scholar.citation_title[0]
+    else if @metadata.dc.title
+      # dc values are stored as lists; take the first entry so the title is a
+      # string here just as it is in the other two branches
+      @metadata.title = @metadata.dc.title[0]
+    else
+      @metadata.title = $("head title").text()
+  # Builds @metadata.link: the current location first, then <link> elements
+  # whose rel is "alternate" or "canonical", then PDF urls and DOIs found in
+  # the scholar metadata, and finally "doi:" identifiers from the dc metadata.
+  _getLinks: =>
+    links = @metadata.link = [href: document.location.href]
+    # relevant <link> relations declared by the page
+    for node in $("link")
+      el = $(node)
+      href = this._absoluteUrl(el.prop('href')) # resolve to an absolute url
+      rel = el.prop('rel')
+      type = el.prop('type')
+      links.push(href: href, rel: rel, type: type) if rel in ["alternate", "canonical"]
+    # links carried by the scholar metadata
+    for name, values of @metadata.scholar
+      switch name
+        when "citation_pdf_url"
+          for url in values
+            links.push(href: this._absoluteUrl(url), type: "application/pdf")
+        when "citation_doi"
+          # DOIs are kept as untyped links (a bit of a hack) so they are easy
+          # to look up later
+          for doi in values
+            doiHref = if doi[0..3] == "doi:" then doi else "doi:" + doi
+            links.push(href: doiHref)
+    # "doi:" identifiers from the dublin core metadata
+    for name, values of @metadata.dc
+      if name == "identifier"
+        for id in values
+          if id[0..3] == "doi:"
+            links.push(href: id)
+  # Resolves a possibly relative url against the current document and returns
+  # its absolute form.
+  _absoluteUrl: (url) ->
+    # An anchor element resolves its href without fetching anything, and
+    # setting the property keeps page-supplied text out of HTML markup
+    # (building an <img> from an interpolated string would do neither).
+    anchor = document.createElement('a')
+    anchor.href = url
+    return anchor.href
--- a/h/js/services.coffee
+++ b/h/js/services.coffee
@@ -343,14 +343,18 @@ class Hypothesis extends Annotator
    # Get the location of the annotated document
    @provider.call
-      method: 'getHref'
+      method: 'getDocumentInfo'
-      success: (href) =>
+      success: (info) =>
+        # info carries the document's primary uri and its extracted metadata
+        href = info.uri
+        @plugins.Document.metadata = info.metadata
        options = angular.extend {}, (@options.Store or {}),
          annotationData:
            uri: href
          loadFromSearch:
            limit: 1000
            uri: href
        this.addStore(options)
  addStore: (options) ->
@@ -361,15 +365,9 @@ class Hypothesis extends Annotator
    return unless href?
    console.log "Loaded annotions for '" + href + "'."
-    for href in this.getSynonymURLs href
+    for uri in @plugins.Document.uris()
-      console.log "Also loading annotations for: " + href
+      console.log "Also loading annotations for: " + uri
-      this.plugins.Store.loadAnnotationsFromSearch uri: href
+      this.plugins.Store.loadAnnotationsFromSearch uri: uri
-    # get metadata for the annotated document
-    @provider.call
-      method: 'getDocumentMetadata'
-      success: (metadata) =>
-        this.plugins.Document.metadata = metadata
 class DraftProvider
  drafts: []