Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
coopwire-hypothesis
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
孙灵跃 Leon Sun
coopwire-hypothesis
Commits
9d7ff2ea
Unverified
Commit
9d7ff2ea
authored
Dec 11, 2018
by
Robert Knight
Committed by
GitHub
Dec 11, 2018
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #826 from hypothesis/handle-invalid-urls-in-links
Ignore invalid URLs in `<link>` and `<meta>` tags
parents
1e307a55
2bc090cf
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
130 additions
and
52 deletions
+130
-52
document.js
src/annotator/plugin/document.js
+44
-28
document-test.js
src/annotator/plugin/test/document-test.js
+86
-24
No files found.
src/annotator/plugin/document.js
View file @
9d7ff2ea
...
...
@@ -34,6 +34,7 @@ class DocumentMeta extends Plugin {
// Test seams.
this
.
baseURI
=
this
.
options
.
baseURI
||
baseURI
;
this
.
document
=
this
.
options
.
document
||
document
;
this
.
normalizeURI
=
this
.
options
.
normalizeURI
||
normalizeURI
;
this
.
getDocumentMetadata
();
}
...
...
@@ -160,40 +161,48 @@ class DocumentMeta extends Plugin {
}
_getLinks
()
{
// we know our current location is a link for the document
let
href
;
let
type
;
let
values
;
// We know our current location is a link for the document.
this
.
metadata
.
link
=
[{
href
:
this
.
_getDocumentHref
()}];
// look for some relevant link relations
for
(
let
link
of
Array
.
from
(
this
.
document
.
querySelectorAll
(
'link'
)))
{
href
=
this
.
_absoluteUrl
(
link
.
href
);
// get absolute url
const
{
rel
}
=
link
;
({
type
}
=
link
);
const
lang
=
link
.
hreflang
;
if
(
!
[
'alternate'
,
'canonical'
,
'bookmark'
,
'shortlink'
].
includes
(
rel
))
{
continue
;
}
if
(
rel
===
'alternate'
)
{
// Ignore feeds resources
if
(
type
&&
type
.
match
(
/^application
\/(
rss|atom
)\+
xml/
))
{
continue
;
}
// Ignore alternate languages
if
(
lang
)
{
continue
;
}
// Extract links from certain `<link>` tags.
const
linkElements
=
Array
.
from
(
this
.
document
.
querySelectorAll
(
'link'
));
for
(
let
link
of
linkElements
)
{
if
(
!
[
'alternate'
,
'canonical'
,
'bookmark'
,
'shortlink'
].
includes
(
link
.
rel
))
{
continue
;
}
if
(
link
.
rel
===
'alternate'
)
{
// Ignore RSS feed links.
if
(
link
.
type
&&
link
.
type
.
match
(
/^application
\/(
rss|atom
)\+
xml/
))
{
continue
;
}
// Ignore alternate languages.
if
(
link
.
hreflang
)
{
continue
;
}
}
this
.
metadata
.
link
.
push
({
href
,
rel
,
type
});
try
{
const
href
=
this
.
_absoluteUrl
(
link
.
href
);
this
.
metadata
.
link
.
push
({
href
,
rel
:
link
.
rel
,
type
:
link
.
type
});
}
catch
(
e
)
{
// Ignore URIs which cannot be parsed.
}
}
// look for links in scholar metadata
for
(
let
name
of
Object
.
keys
(
this
.
metadata
.
highwire
))
{
values
=
this
.
metadata
.
highwire
[
name
];
const
values
=
this
.
metadata
.
highwire
[
name
];
if
(
name
===
'pdf_url'
)
{
for
(
let
url
of
values
)
{
this
.
metadata
.
link
.
push
({
href
:
this
.
_absoluteUrl
(
url
),
type
:
'application/pdf'
,
});
try
{
this
.
metadata
.
link
.
push
({
href
:
this
.
_absoluteUrl
(
url
),
type
:
'application/pdf'
,
});
}
catch
(
e
)
{
// Ignore URIs which cannot be parsed.
}
}
}
...
...
@@ -212,7 +221,7 @@ class DocumentMeta extends Plugin {
// look for links in dublincore data
for
(
let
name
of
Object
.
keys
(
this
.
metadata
.
dc
))
{
values
=
this
.
metadata
.
dc
[
name
];
const
values
=
this
.
metadata
.
dc
[
name
];
if
(
name
===
'identifier'
)
{
for
(
let
id
of
values
)
{
if
(
id
.
slice
(
0
,
4
)
===
'doi:'
)
{
...
...
@@ -242,14 +251,21 @@ class DocumentMeta extends Plugin {
_getFavicon
()
{
for
(
let
link
of
Array
.
from
(
this
.
document
.
querySelectorAll
(
'link'
)))
{
if
([
'shortcut icon'
,
'icon'
].
includes
(
link
.
rel
))
{
this
.
metadata
.
favicon
=
this
.
_absoluteUrl
(
link
.
href
);
try
{
this
.
metadata
.
favicon
=
this
.
_absoluteUrl
(
link
.
href
);
}
catch
(
e
)
{
// Ignore URIs which cannot be parsed.
}
}
}
}
// Hack to get an absolute URL from a possibly relative one
/**
* Convert a possibly relative URI to an absolute one. This will throw an
* exception if the URL cannot be parsed.
*/
_absoluteUrl
(
url
)
{
return
normalizeURI
(
url
,
this
.
baseURI
);
return
this
.
normalizeURI
(
url
,
this
.
baseURI
);
}
// Get the true URI record when it's masked via a different protocol.
...
...
src/annotator/plugin/test/document-test.js
View file @
9d7ff2ea
...
...
@@ -15,43 +15,68 @@
const
$
=
require
(
'jquery'
);
const
DocumentMeta
=
require
(
'../document'
);
const
{
normalizeURI
}
=
require
(
'../../util/url'
);
describe
(
'DocumentMeta'
,
function
()
{
let
fakeNormalizeURI
;
let
tempDocument
;
let
tempDocumentHead
;
let
testDocument
=
null
;
// Build a fresh, isolated document and a `DocumentMeta` instance bound to it
// before every test, so tests never read from or write to the real page.
beforeEach(function() {
  // A detached fragment with its own <head> stands in for the page document.
  tempDocument = document.createDocumentFragment();
  tempDocument.location = { href: 'https://example.com' };
  tempDocumentHead = document.createElement('head');
  tempDocument.appendChild(tempDocumentHead);

  // Wrap the real `normalizeURI` so that one known-bad URL always throws,
  // regardless of how lenient the host browser's URL parser is.
  fakeNormalizeURI = sinon.stub().callsFake((url, base) => {
    if (url === 'http://a:b:c') {
      // A modern browser would reject this URL, but PhantomJS's URL parser is
      // more lenient.
      throw new Error('Invalid URL');
    }
    return normalizeURI(url, base);
  });

  // Only one instance is created here. A second one built without the
  // `document` option would be discarded immediately and would not use the
  // isolated fragment.
  testDocument = new DocumentMeta(tempDocument, {
    document: tempDocument,
    normalizeURI: fakeNormalizeURI,
  });
  testDocument.pluginInit();
});
afterEach
(()
=>
$
(
document
).
unbind
());
describe
(
'annotation should have some metadata'
,
function
()
{
// Add some metadata to the page
const
head
=
$
(
'head'
);
head
.
append
(
'<link rel="alternate" href="foo.pdf" type="application/pdf"></link>'
);
head
.
append
(
'<link rel="alternate" href="foo.doc" type="application/msword"></link>'
);
head
.
append
(
'<link rel="bookmark" href="http://example.com/bookmark"></link>'
);
head
.
append
(
'<link rel="shortlink" href="http://example.com/bookmark/short"></link>'
);
head
.
append
(
'<link rel="alternate" href="es/foo.html" hreflang="es" type="text/html"></link>'
);
head
.
append
(
'<meta name="citation_doi" content="10.1175/JCLI-D-11-00015.1">'
);
head
.
append
(
'<meta name="citation_title" content="Foo">'
);
head
.
append
(
'<meta name="citation_pdf_url" content="foo.pdf">'
);
head
.
append
(
'<meta name="dc.identifier" content="doi:10.1175/JCLI-D-11-00015.1">'
);
head
.
append
(
'<meta name="dc:identifier" content="foobar-abcxyz">'
);
head
.
append
(
'<meta name="dc.relation.ispartof" content="isbn:123456789">'
);
head
.
append
(
'<meta name="DC.type" content="Article">'
);
head
.
append
(
'<meta property="og:url" content="http://example.com">'
);
head
.
append
(
'<meta name="twitter:site" content="@okfn">'
);
head
.
append
(
'<link rel="icon" href="http://example.com/images/icon.ico"></link>'
);
head
.
append
(
'<meta name="eprints.title" content="Computer Lib / Dream Machines">'
);
head
.
append
(
'<meta name="prism.title" content="Literary Machines">'
);
head
.
append
(
'<link rel="alternate" href="feed" type="application/rss+xml"></link>'
);
head
.
append
(
'<link rel="canonical" href="http://example.com/canonical"></link>'
);
let
metadata
=
null
;
beforeEach
(()
=>
metadata
=
testDocument
.
metadata
);
beforeEach
(()
=>
{
// Add some metadata to the page
tempDocumentHead
.
innerHTML
=
`
<link rel="alternate" href="foo.pdf" type="application/pdf"></link>
<link rel="alternate" href="foo.doc" type="application/msword"></link>
<link rel="bookmark" href="http://example.com/bookmark"></link>
<link rel="shortlink" href="http://example.com/bookmark/short"></link>
<link rel="alternate" href="es/foo.html" hreflang="es" type="text/html"></link>
<meta name="citation_doi" content="10.1175/JCLI-D-11-00015.1">
<meta name="citation_title" content="Foo">
<meta name="citation_pdf_url" content="foo.pdf">
<meta name="dc.identifier" content="doi:10.1175/JCLI-D-11-00015.1">
<meta name="dc:identifier" content="foobar-abcxyz">
<meta name="dc.relation.ispartof" content="isbn:123456789">
<meta name="DC.type" content="Article">
<meta property="og:url" content="http://example.com">
<meta name="twitter:site" content="@okfn">
<link rel="icon" href="http://example.com/images/icon.ico"></link>
<meta name="eprints.title" content="Computer Lib / Dream Machines">
<meta name="prism.title" content="Literary Machines">
<link rel="alternate" href="feed" type="application/rss+xml"></link>
<link rel="canonical" href="http://example.com/canonical"></link>
`
;
testDocument
.
getDocumentMetadata
();
metadata
=
testDocument
.
metadata
;
});
it
(
'should have metadata'
,
()
=>
assert
.
ok
(
metadata
));
...
...
@@ -146,6 +171,43 @@ describe('DocumentMeta', function() {
it
(
'should have a documentFingerprint as the dc resource identifiers URN href'
,
()
=>
{
assert
.
equal
(
metadata
.
documentFingerprint
,
metadata
.
link
[
9
].
href
);
});
// A `<link>` whose href cannot be normalized must be dropped without
// affecting the valid `<link>` next to it.
it('should ignore `<link>` tags with invalid URIs', () => {
  tempDocumentHead.innerHTML = `
<link rel="alternate" href="https://example.com/foo">
<link rel="alternate" href="http://a:b:c">
`;

  testDocument.getDocumentMetadata();
  const links = testDocument.metadata.link;

  // Two entries are expected: the document's own location first, then the
  // single valid `<link>` tag.
  assert.deepEqual(links.length, 2);

  const expectedLink = {
    rel: 'alternate',
    href: 'https://example.com/foo',
    type: '',
  };
  assert.deepEqual(links[1], expectedLink);
});
// A favicon `<link>` whose href cannot be normalized must not set
// `metadata.favicon`.
it('should ignore favicons with invalid URIs', () => {
  // `rel` must be a value that `_getFavicon` actually inspects ("icon" or
  // "shortcut icon"). With any other value the link is skipped before its URL
  // is examined, and the test would pass without exercising the
  // invalid-URL handling at all.
  tempDocumentHead.innerHTML = `
<link rel="icon" href="http://a:b:c">
`;

  testDocument.getDocumentMetadata();

  assert.isUndefined(testDocument.metadata.favicon);
});
// A `citation_pdf_url` meta tag whose content cannot be normalized must not
// contribute an entry to `metadata.link`.
it('should ignore `<meta>` PDF links with invalid URIs', () => {
  tempDocumentHead.innerHTML = `
<meta name="citation_pdf_url" content="http://a:b:c">
`;

  testDocument.getDocumentMetadata();
  const { link: links } = testDocument.metadata;

  // Only the entry for the document's own location should be present; the
  // unparseable PDF URL is expected to have been skipped.
  assert.equal(links.length, 1);
});
});
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment