Merge pull request #889 from hypothesis/update-pdfjs-text-api

Update PDF anchoring for changes in PDF.js API

Merge pull request #889 from hypothesis/update-pdfjs-text-api
Update PDF anchoring for changes in PDF.js API
2b5da551 · Robert Knight · GitHub · 7f0a2338 · 37e41ea8 · 2b5da551
Unverified Commit 2b5da551 authored Feb 01, 2019 by Robert Knight Committed by GitHub Feb 01, 2019
Show whitespace changes
Inline Side-by-side

Showing with 34 additions and 16 deletions

pdf.js src/annotator/anchoring/pdf.js +4 -3

fake-pdf-viewer-application.js src/annotator/anchoring/test/fake-pdf-viewer-application.js +30 -13

No files found.
--- a/src/annotator/anchoring/pdf.js
+++ b/src/annotator/anchoring/pdf.js
@@ -72,9 +72,10 @@ function getPageTextContent(pageIndex) {
    return textContent;
  };
-  // FIXME - `pdfViewer.getPageTextContent` was removed in recent versions of PDF.js.
+  pageTextCache[pageIndex] = getPage(pageIndex).pdfPage
-  pageTextCache[pageIndex] = PDFViewerApplication.pdfViewer
+    .getTextContent({
-    .getPageTextContent(pageIndex)
+      normalizeWhitespace: true,
+    })
    .then(joinItems);
  return pageTextCache[pageIndex];

--- a/src/annotator/anchoring/test/fake-pdf-viewer-application.js
+++ b/src/annotator/anchoring/test/fake-pdf-viewer-application.js
@@ -43,6 +43,35 @@ function createPage(content, rendered) {
  return pageEl;
 }
+/**
+ * Fake implementation of `PDFPageProxy` class.
+ *
+ * The original is defined at https://github.com/mozilla/pdf.js/blob/master/src/display/api.js
+ *
+ * Only the `getTextContent` method is faked here.
+ *
+ * The constructor takes the page's text as a single string (`pageText`) and
+ * stores it on the instance.
+ *
+ * `getTextContent(params)` returns a Promise:
+ *
+ *  - If `params.normalizeWhitespace` is falsy (including when `params` is
+ *    omitted), the Promise is rejected with an `Error`, so a caller that
+ *    forgets to request whitespace normalization fails rather than silently
+ *    succeeding.
+ *  - Otherwise it resolves to an object with an `items` array containing one
+ *    `{ str }` entry per newline-separated line of `pageText`.
+ */
+class FakePDFPageProxy {
+  constructor(pageText) {
+    this.pageText = pageText;
+  }
+  getTextContent(params = {}) {
+    if (!params.normalizeWhitespace) {
+      return Promise.reject(new Error('Expected `normalizeWhitespace` to be true'));
+    }
+    const textContent = {
+      // The way that the page text is split into items will depend on
+      // the PDF and the version of PDF.js - individual text items might be
+      // just symbols, words, phrases or whole lines.
+      //
+      // Here we split items by line which matches the typical output for a
+      // born-digital PDF.
+      items: this.pageText.split(/\n/).map(line => ({ str: line })),
+    };
+    return Promise.resolve(textContent);
+  }
+}
 /**
 * @typedef FakePDFPageViewOptions
 * @prop [boolean] rendered - Whether this page is "rendered", as if it were
@@ -70,6 +99,7 @@ class FakePDFPageView {
    this.renderingState = textLayerEl
      ? RenderingStates.FINISHED
      : RenderingStates.INITIAL;
+    this.pdfPage = new FakePDFPageProxy(text);
  }
  dispose() {
@@ -103,19 +133,6 @@ class FakePDFViewer {
    return this._pages[index];
  }
-  getPageTextContent(index) {
-    this._checkBounds(index);
-    return Promise.resolve({
-      // The way that the page text is split into items will depend on
-      // the PDF and the version of PDF.js - individual text items might be
-      // just symbols, words, phrases or whole lines.
-      //
-      // Here we split items by line which matches the typical output for a
-      // born-digital PDF.
-      items: this._content[index].split(/\n/).map(line => ({ str: line })),
-    });
-  }
  /**
   * Set the index of the page which is currently visible in the viewport.
   *