Commit 40eaf5c9 authored by Robert Knight

Replace `document.title` as a fallback for title in PDFs

Replace the usage of `document.title` as a way to get the document title
if the PDF has no embedded title in either its _document info
dictionary_ or _metadata stream_.

In top-level frames, using `document.title` (where `document` is the
global HTML document, not the PDF) works because PDF.js sets the title
based on the first non-empty value from:

 1. The embedded title
 2. The filename from the `Content-Disposition` header
 3. The last segment of the URL's path (e.g. "test.pdf" in
    "https://example.com/test.pdf")

When PDF.js is embedded in an iframe, however, it does not set
`document.title` by default. As a result, documents were ending up in
Hypothesis with a generic "PDF.js viewer" title.

This commit implements (roughly) the same logic that PDF.js uses to
determine the value used to set `document.title`, in the case where the
PDF has no embedded title. This means implementing steps (2) and (3)
from the above list. The `Content-Disposition` filename is not exposed
as a public property on `PDFViewerApplication`, so
`PDFMetadata#getMetadata` was refactored to call
`pdfDocument.getMetadata` instead.

Fixes https://github.com/hypothesis/client/issues/3372
parent 96723e83
......@@ -131,33 +131,48 @@ export class PDFMetadata {
*
* @return {Promise<Metadata>}
*/
getMetadata() {
return this._loaded.then(app => {
let title = document.title;
if (
app.metadata &&
app.metadata.has('dc:title') &&
app.metadata.get('dc:title') !== 'Untitled'
) {
title = /** @type {string} */ (app.metadata.get('dc:title'));
} else if (app.documentInfo && app.documentInfo.Title) {
title = app.documentInfo.Title;
}
const link = [{ href: fingerprintToURN(app.pdfDocument.fingerprint) }];
async getMetadata() {
const app = await this._loaded;
const {
info: documentInfo,
contentDispositionFilename,
metadata,
} = await app.pdfDocument.getMetadata();
const documentFingerprint = app.pdfDocument.fingerprint;
const url = getPDFURL(app);
// Return the title metadata embedded in the PDF if available, otherwise
// fall back to values from the `Content-Disposition` header or URL.
//
// PDFs contain two embedded metadata sources, the metadata stream and
// the document info dictionary. Per the specification, the metadata stream
// is preferred if available.
//
// This logic is similar to how PDF.js sets `document.title`.
let title;
if (metadata?.has('dc:title') && metadata.get('dc:title') !== 'Untitled') {
title = /** @type {string} */ (metadata.get('dc:title'));
} else if (documentInfo?.Title) {
title = documentInfo.Title;
} else if (contentDispositionFilename) {
title = contentDispositionFilename;
} else if (url) {
title = filenameFromURL(url);
} else {
title = '';
}
const link = [{ href: fingerprintToURN(documentFingerprint) }];
if (url) {
link.push({ href: url });
}
return {
title: title,
link: link,
documentFingerprint: app.pdfDocument.fingerprint,
title,
link,
documentFingerprint,
};
});
}
}
......@@ -165,7 +180,15 @@ function fingerprintToURN(fingerprint) {
return 'urn:x-pdf:' + String(fingerprint);
}
/**
* @param {PDFViewerApplication} app
* @return {string|null} - Valid URL string or `null`
*/
function getPDFURL(app) {
if (!app.url) {
return null;
}
const url = normalizeURI(app.url);
// Local file:// URLs should not be saved in document metadata.
......@@ -177,3 +200,15 @@ function getPDFURL(app) {
return null;
}
/**
 * Return the last component of the path part of a URL, with percent-encoded
 * characters decoded (eg. "my%20paper.pdf" becomes "my paper.pdf").
 *
 * Returns an empty string if the path is empty or ends with a `/`.
 *
 * @param {string} url - A valid URL string
 * @return {string}
 * @throws {TypeError} If `url` cannot be parsed by the `URL` constructor
 */
function filenameFromURL(url) {
  const parsed = new URL(url);
  const pathSegments = parsed.pathname.split('/');
  const filename = pathSegments[pathSegments.length - 1];
  try {
    // `URL#pathname` is percent-encoded. Decode it so the result is readable
    // when it ends up being used as a document title.
    return decodeURIComponent(filename);
  } catch {
    // The segment contains a malformed escape sequence (eg. a stray "%").
    // Fall back to the raw path segment rather than failing.
    return filename;
  }
}
......@@ -28,8 +28,25 @@ class FakeMetadata {
* Fake implementation of PDF.js `window.PDFViewerApplication.pdfDocument`.
*/
class FakePDFDocumentProxy {
constructor({ fingerprint }) {
constructor({
contentDispositionFilename = null,
fingerprint,
info,
metadata = null,
}) {
this.fingerprint = fingerprint;
this._contentDispositionFilename = contentDispositionFilename;
this._info = info;
this._metadata = metadata;
}
/**
 * Fake of the `getMetadata` method that `PDFMetadata` calls on
 * `pdfDocument`.
 *
 * @return {Promise<object>} Resolves to an object holding the
 *   `contentDispositionFilename`, `info` and `metadata` values stored on
 *   this fake.
 */
async getMetadata() {
return {
contentDispositionFilename: this._contentDispositionFilename,
info: this._info,
metadata: this._metadata,
};
}
}
......@@ -84,6 +101,7 @@ class FakePDFViewerApplication {
* Simulate completion of PDF document loading.
*/
finishLoading({
contentDispositionFilename,
url,
fingerprint,
metadata,
......@@ -92,17 +110,18 @@ class FakePDFViewerApplication {
}) {
this.url = url;
this.downloadComplete = true;
this.documentInfo = {};
if (typeof title !== undefined) {
this.documentInfo.Title = title;
const info = {};
if (title) {
info.Title = title;
}
if (metadata) {
this.metadata = new FakeMetadata(metadata);
}
this.pdfDocument = new FakePDFDocumentProxy({ fingerprint });
this.pdfDocument = new FakePDFDocumentProxy({
contentDispositionFilename,
info,
metadata: metadata ? new FakeMetadata(metadata) : null,
fingerprint,
});
if (this.dispatchDOMEvents) {
const event = document.createEvent('Event');
......@@ -320,5 +339,44 @@ describe('PDFMetadata', function () {
assert.equal(metadata.title, 'Some title');
});
it('gets the title from the `Content-Disposition` header', async () => {
const { pdfMetadata } = createPDFMetadata({
contentDispositionFilename: 'some-file.pdf',
url: 'http://fake.com/test.pdf',
});
const metadata = await pdfMetadata.getMetadata();
assert.equal(metadata.title, 'some-file.pdf');
});
it('gets the title from the URL', async () => {
const { pdfMetadata } = createPDFMetadata({
url: 'http://fake.com/a-file.pdf',
});
const metadata = await pdfMetadata.getMetadata();
assert.equal(metadata.title, 'a-file.pdf');
});
[
null, // Missing URL
'', // Invalid URL
'https://example.com', // Missing path
'https://example.com/', // Empty string after last `/` in path
].forEach(url => {
it('returns an empty string if there is no title metadata or filename in URL', async () => {
const { pdfMetadata } = createPDFMetadata({ url });
// Earlier versions of the client used `document.title` as a fallback,
// but we changed this. See https://github.com/hypothesis/client/issues/3372.
document.title = 'Ignore me';
const metadata = await pdfMetadata.getMetadata();
assert.equal(metadata.title, '');
});
});
});
});
......@@ -13,19 +13,46 @@
*/
/**
* Document metadata parsed from the PDF's _metadata stream_.
*
* See `Metadata` class from `display/metadata.js` in PDF.js.
*
* @typedef Metadata
* @prop {(name: string) => string} get
* @prop {(name: string) => boolean} has
*/
/**
* @typedef PDFDocument
* @prop {string} fingerprint
* Document metadata parsed from the PDF's _document info dictionary_.
*
* See `PDFDocument#documentInfo` in PDF.js.
*
* @typedef PDFDocumentInfo
* @prop {string} [Title]
*/
/**
* @typedef PDFDocumentInfo
* @prop {string} [Title]
* An object containing metadata about the PDF. This includes information from:
*
* - The PDF's document info dictionary
* - The PDF's metadata stream
* - The HTTP headers (eg. `Content-Disposition`) sent when the PDF file was
* served
*
* See the "Metadata" section (14.3) in the PDF 1.7 reference for details of
* the _metadata stream_ and _document info dictionary_.
*
* @typedef PDFDocumentMetadata
* @prop {Metadata|null} metadata
* @prop {PDFDocumentInfo} [info]
* @prop {string|null} contentDispositionFilename - The `filename` directive from
* the `Content-Disposition` header
*/
/**
* @typedef PDFDocument
* @prop {string} fingerprint
* @prop {() => Promise<PDFDocumentMetadata>} getMetadata
*/
/**
......@@ -93,6 +120,7 @@
* @prop {Promise<void>} [initializedPromise] -
* Promise that resolves when PDF.js is initialized. Since v2.4.456.
* See https://github.com/mozilla/pdf.js/wiki/Third-party-viewer-usage#initialization-promise.
* @prop {string} url - The URL of the loaded PDF file
*/
/**
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment