Replace `document.title` as a fallback for title in PDFs

Replace the usage of `document.title` as a way to get the document title if the PDF has no embedded title in either its _document info dictionary_ or _metadata stream_.

In top-level frames, using `document.title` (where `document` is the global HTML document, not the PDF) works because PDF.js sets the title based on the first non-empty value from:

1. The embedded title
2. The filename from the `Content-Disposition` header
3. The last segment of the URL's path (e.g. "test.pdf" in "https://example.com/test.pdf")

When PDF.js is embedded in an iframe, however, it does not set `document.title` by default. As a result, documents were ending up in Hypothesis with a generic "PDF.js viewer" title.

This commit implements (roughly) the same logic that PDF.js uses to determine the value used to set `document.title`, in the case where the PDF has no embedded title. This means implementing steps (2) and (3) from the above list.

The `Content-Disposition` filename is not exposed as a public property on `PDFViewerApplication`, so `PDFMetadata#getMetadata` was refactored to call `pdfDocument.getMetadata` instead.

Fixes https://github.com/hypothesis/client/issues/3372

Replace `document.title` as a fallback for title in PDFs
Replace the usage of `document.title` as a way to get the document title if the PDF has no embedded title in either its _document info dictionary_ or _metadata stream_.

In top-level frames, using `document.title` (where `document` is the global HTML document, not the PDF) works because PDF.js sets the title based on the first non-empty value from:

1. The embedded title
2. The filename from the `Content-Disposition` header
3. The last segment of the URL's path (e.g. "test.pdf" in "https://example.com/test.pdf")

When PDF.js is embedded in an iframe, however, it does not set `document.title` by default. As a result, documents were ending up in Hypothesis with a generic "PDF.js viewer" title.

This commit implements (roughly) the same logic that PDF.js uses to determine the value used to set `document.title`, in the case where the PDF has no embedded title. This means implementing steps (2) and (3) from the above list.

The `Content-Disposition` filename is not exposed as a public property on `PDFViewerApplication`, so `PDFMetadata#getMetadata` was refactored to call `pdfDocument.getMetadata` instead.

Fixes https://github.com/hypothesis/client/issues/3372
40eaf5c9 · Robert Knight · 96723e83 · 40eaf5c9 · 40eaf5c9 · 40eaf5c9
Commit 40eaf5c9 authored May 05, 2021 by Robert Knight
Showing with 161 additions and 40 deletions

pdf-metadata.js src/annotator/integrations/pdf-metadata.js +62 -27

pdf-metadata-test.js src/annotator/integrations/test/pdf-metadata-test.js +67 -9

pdfjs.js src/types/pdfjs.js +32 -4

No files found.
--- a/src/annotator/integrations/pdf-metadata.js
+++ b/src/annotator/integrations/pdf-metadata.js
@@ -131,33 +131,48 @@ export class PDFMetadata {
   *
   * @return {Promise<Metadata>}
   */
-  getMetadata() {
-    return this._loaded.then(app => {
-      let title = document.title;
-
-      if (
-        app.metadata &&
-        app.metadata.has('dc:title') &&
-        app.metadata.get('dc:title') !== 'Untitled'
-      ) {
-        title = /** @type {string} */ (app.metadata.get('dc:title'));
-      } else if (app.documentInfo && app.documentInfo.Title) {
-        title = app.documentInfo.Title;
-      }
-
-      const link = [{ href: fingerprintToURN(app.pdfDocument.fingerprint) }];
+  async getMetadata() {
+    const app = await this._loaded;
+    const {
+      info: documentInfo,
+      contentDispositionFilename,
+      metadata,
+    } = await app.pdfDocument.getMetadata();
+    const documentFingerprint = app.pdfDocument.fingerprint;

    const url = getPDFURL(app);
+
+    // Return the title metadata embedded in the PDF if available, otherwise
+    // fall back to values from the `Content-Disposition` header or URL.
+    //
+    // PDFs contain two embedded metadata sources, the metadata stream and
+    // the document info dictionary. Per the specification, the metadata stream
+    // is preferred if available.
+    //
+    // This logic is similar to how PDF.js sets `document.title`.
+    let title;
+    if (metadata?.has('dc:title') && metadata.get('dc:title') !== 'Untitled') {
+      title = /** @type {string} */ (metadata.get('dc:title'));
+    } else if (documentInfo?.Title) {
+      title = documentInfo.Title;
+    } else if (contentDispositionFilename) {
+      title = contentDispositionFilename;
+    } else if (url) {
+      title = filenameFromURL(url);
+    } else {
+      title = '';
+    }
+
+    const link = [{ href: fingerprintToURN(documentFingerprint) }];
    if (url) {
      link.push({ href: url });
    }

    return {
-        title: title,
-        link: link,
-        documentFingerprint: app.pdfDocument.fingerprint,
+      title,
+      link,
+      documentFingerprint,
    };
-    });
  }
 }

@@ -165,7 +180,15 @@ function fingerprintToURN(fingerprint) {
  return 'urn:x-pdf:' + String(fingerprint);
 }

+/**
+ * @param {PDFViewerApplication} app
+ * @return {string|null} - Valid URL string or `null`
+ */
 function getPDFURL(app) {
+  if (!app.url) {
+    return null;
+  }
+
  const url = normalizeURI(app.url);

  // Local file:// URLs should not be saved in document metadata.
@@ -177,3 +200,15 @@ function getPDFURL(app) {

  return null;
 }
+
+/**
+ * Return the last segment of a URL's path, or `''` if the path ends in `/`.
+ *
+ * @param {string} url - Absolute URL string; `new URL(url)` throws if it is not
+ * @return {string} - Last `/`-separated segment of the URL's `pathname`
+ */
+function filenameFromURL(url) {
+  const parsed = new URL(url);
+  const pathSegments = parsed.pathname.split('/');
+  return pathSegments[pathSegments.length - 1];
+}
--- a/src/annotator/integrations/test/pdf-metadata-test.js
+++ b/src/annotator/integrations/test/pdf-metadata-test.js
@@ -28,8 +28,25 @@ class FakeMetadata {
 * Fake implementation of PDF.js `window.PDFViewerApplication.pdfDocument`.
 */
 class FakePDFDocumentProxy {
-  constructor({ fingerprint }) {
+  constructor({
+    contentDispositionFilename = null,
+    fingerprint,
+    info,
+    metadata = null,
+  }) {
    this.fingerprint = fingerprint;
+
+    this._contentDispositionFilename = contentDispositionFilename;
+    this._info = info;
+    this._metadata = metadata;
+  }
+
+  async getMetadata() {
+    return {
+      contentDispositionFilename: this._contentDispositionFilename,
+      info: this._info,
+      metadata: this._metadata,
+    };
  }
 }

@@ -84,6 +101,7 @@ class FakePDFViewerApplication {
   * Simulate completion of PDF document loading.
   */
  finishLoading({
+    contentDispositionFilename,
    url,
    fingerprint,
    metadata,
@@ -92,17 +110,18 @@ class FakePDFViewerApplication {
  }) {
    this.url = url;
    this.downloadComplete = true;
-    this.documentInfo = {};

-    if (typeof title !== undefined) {
-      this.documentInfo.Title = title;
+    const info = {};
+    if (title) {
+      info.Title = title;
    }

-    if (metadata) {
-      this.metadata = new FakeMetadata(metadata);
-    }
-
-    this.pdfDocument = new FakePDFDocumentProxy({ fingerprint });
+    this.pdfDocument = new FakePDFDocumentProxy({
+      contentDispositionFilename,
+      info,
+      metadata: metadata ? new FakeMetadata(metadata) : null,
+      fingerprint,
+    });

    if (this.dispatchDOMEvents) {
      const event = document.createEvent('Event');
@@ -320,5 +339,44 @@ describe('PDFMetadata', function () {

      assert.equal(metadata.title, 'Some title');
    });
+
+    it('gets the title from the `Content-Disposition` header', async () => {
+      const { pdfMetadata } = createPDFMetadata({
+        contentDispositionFilename: 'some-file.pdf',
+        url: 'http://fake.com/test.pdf',
+      });
+
+      const metadata = await pdfMetadata.getMetadata();
+
+      assert.equal(metadata.title, 'some-file.pdf');
+    });
+
+    it('gets the title from the URL', async () => {
+      const { pdfMetadata } = createPDFMetadata({
+        url: 'http://fake.com/a-file.pdf',
+      });
+
+      const metadata = await pdfMetadata.getMetadata();
+
+      assert.equal(metadata.title, 'a-file.pdf');
+    });
+
+    [
+      null, // Missing URL
+      '', // Invalid URL
+      'https://example.com', // Missing path
+      'https://example.com/', // Empty string after last `/` in path
+    ].forEach(url => {
+      it('returns an empty string if there is no title metadata or filename in URL', async () => {
+        const { pdfMetadata } = createPDFMetadata({ url });
+
+        // Earlier versions of the client used `document.title` as a fallback,
+        // but we changed this. See https://github.com/hypothesis/client/issues/3372.
+        document.title = 'Ignore me';
+        const metadata = await pdfMetadata.getMetadata();
+
+        assert.equal(metadata.title, '');
+      });
+    });
  });
 });
--- a/src/types/pdfjs.js
+++ b/src/types/pdfjs.js
@@ -13,19 +13,46 @@
 */

 /**
+ * Document metadata parsed from the PDF's _metadata stream_.
+ *
+ * See `Metadata` class from `display/metadata.js` in PDF.js.
+ *
 * @typedef Metadata
 * @prop {(name: string) => string} get
 * @prop {(name: string) => boolean} has
 */

 /**
- * @typedef PDFDocument
- * @prop {string} fingerprint
+ * Document metadata parsed from the PDF's _document info dictionary_.
+ *
+ * See `PDFDocument#documentInfo` in PDF.js.
+ *
+ * @typedef PDFDocumentInfo
+ * @prop {string} [Title]
 */

 /**
- * @typedef PDFDocumentInfo
- * @prop {string} [Title]
+ * An object containing metadata about the PDF. This includes information from:
+ *
+ * - The PDF's document info dictionary
+ * - The PDF's metadata stream
+ * - The HTTP headers (eg. `Content-Disposition`) sent when the PDF file was
+ *   served
+ *
+ * See the "Metadata" section (14.3) in the PDF 1.7 reference for details of
+ * the _metadata stream_ and _document info dictionary_.
+ *
+ * @typedef PDFDocumentMetadata
+ * @prop {Metadata|null} metadata
+ * @prop {PDFDocumentInfo} [info]
+ * @prop {string|null} contentDispositionFilename - The `filename` directive from
+ *   the `Content-Disposition` header
+ */
+
+/**
+ * @typedef PDFDocument
+ * @prop {string} fingerprint
+ * @prop {() => Promise<PDFDocumentMetadata>} getMetadata
 */

 /**
@@ -93,6 +120,7 @@
 * @prop {Promise<void>} [initializedPromise] -
 *   Promise that resolves when PDF.js is initialized. Since v2.4.456.
 *   See https://github.com/mozilla/pdf.js/wiki/Third-party-viewer-usage#initialization-promise.
+ * @prop {string} url - The URL of the loaded PDF file
 */

 /**