Commit f7346842 authored by Robert Knight's avatar Robert Knight

Add test to cover changes in text rendering between PDF.js versions

Add tests that simulate the different handling of whitespace-only text
items between older (< v2.9.359) and newer PDF.js releases and check
that describing and anchoring selectors works in both cases.
parent 8e288390
...@@ -21,9 +21,10 @@ import { RenderingStates } from '../pdf'; ...@@ -21,9 +21,10 @@ import { RenderingStates } from '../pdf';
* @param {string} content - The text content for the page * @param {string} content - The text content for the page
* @param {boolean} rendered - True if the page should be "rendered" or false if * @param {boolean} rendered - True if the page should be "rendered" or false if
* it should be an empty placeholder for a not-yet-rendered page * it should be an empty placeholder for a not-yet-rendered page
* @param {PDFJSConfig} config
* @return {Element} - The root Element for the page * @return {Element} - The root Element for the page
*/ */
function createPage(content, rendered) { function createPage(content, rendered, config) {
const pageEl = document.createElement('div'); const pageEl = document.createElement('div');
pageEl.classList.add('page'); pageEl.classList.add('page');
...@@ -35,6 +36,11 @@ function createPage(content, rendered) { ...@@ -35,6 +36,11 @@ function createPage(content, rendered) {
textLayer.classList.add('textLayer'); textLayer.classList.add('textLayer');
content.split(/\n/).forEach(item => { content.split(/\n/).forEach(item => {
if (!config.newTextRendering && /^\s*$/.test(item)) {
// PDF.js releases before v2.9.359 do not create elements in the text
// layer for whitespace-only text items.
return;
}
const itemEl = document.createElement('div'); const itemEl = document.createElement('div');
itemEl.textContent = item; itemEl.textContent = item;
textLayer.appendChild(itemEl); textLayer.appendChild(itemEl);
...@@ -50,8 +56,13 @@ function createPage(content, rendered) { ...@@ -50,8 +56,13 @@ function createPage(content, rendered) {
* The original is defined at https://github.com/mozilla/pdf.js/blob/master/src/display/api.js * The original is defined at https://github.com/mozilla/pdf.js/blob/master/src/display/api.js
*/ */
class FakePDFPageProxy { class FakePDFPageProxy {
constructor(pageText) { /**
* @param {string} pageText
* @param {PDFJSConfig} config
*/
constructor(pageText, config) {
this.pageText = pageText; this.pageText = pageText;
this._config = config;
} }
getTextContent(params = {}) { getTextContent(params = {}) {
...@@ -61,6 +72,17 @@ class FakePDFPageProxy { ...@@ -61,6 +72,17 @@ class FakePDFPageProxy {
); );
} }
const makeTextItem = str => {
if (this._config.newTextRendering) {
// The `hasEOL` property was added in https://github.com/mozilla/pdf.js/pull/13257
// and is used to feature-detect whether whitespace-only items need
// to ignored in the `items` array. The value is unimportant.
return { str, hasEOL: false };
} else {
return { str };
}
};
const textContent = { const textContent = {
// The way that the page text is split into items will depend on // The way that the page text is split into items will depend on
// the PDF and the version of PDF.js - individual text items might be // the PDF and the version of PDF.js - individual text items might be
...@@ -68,7 +90,7 @@ class FakePDFPageProxy { ...@@ -68,7 +90,7 @@ class FakePDFPageProxy {
// //
// Here we split items by line which matches the typical output for a // Here we split items by line which matches the typical output for a
// born-digital PDF. // born-digital PDF.
items: this.pageText.split(/\n/).map(line => ({ str: line })), items: this.pageText.split(/\n/).map(makeTextItem),
}; };
return Promise.resolve(textContent); return Promise.resolve(textContent);
...@@ -77,8 +99,9 @@ class FakePDFPageProxy { ...@@ -77,8 +99,9 @@ class FakePDFPageProxy {
/** /**
* @typedef FakePDFPageViewOptions * @typedef FakePDFPageViewOptions
* @prop [boolean] rendered - Whether this page is "rendered", as if it were * @prop {boolean} rendered - Whether this page is "rendered", as if it were
* near the viewport, or not. * near the viewport, or not.
* @prop {PDFJSConfig} config
*/ */
/** /**
...@@ -91,8 +114,8 @@ class FakePDFPageView { ...@@ -91,8 +114,8 @@ class FakePDFPageView {
* @param {string} text - Text of the page * @param {string} text - Text of the page
* @param {FakePDFPageViewOptions} options * @param {FakePDFPageViewOptions} options
*/ */
constructor(text, options) { constructor(text, { rendered, config }) {
const pageEl = createPage(text, options.rendered); const pageEl = createPage(text, rendered, config);
const textLayerEl = pageEl.querySelector('.textLayer'); const textLayerEl = pageEl.querySelector('.textLayer');
this.div = pageEl; this.div = pageEl;
...@@ -102,7 +125,7 @@ class FakePDFPageView { ...@@ -102,7 +125,7 @@ class FakePDFPageView {
this.renderingState = textLayerEl this.renderingState = textLayerEl
? RenderingStates.FINISHED ? RenderingStates.FINISHED
: RenderingStates.INITIAL; : RenderingStates.INITIAL;
this.pdfPage = new FakePDFPageProxy(text); this.pdfPage = new FakePDFPageProxy(text, config);
} }
dispose() { dispose() {
...@@ -120,6 +143,7 @@ class FakePDFViewer { ...@@ -120,6 +143,7 @@ class FakePDFViewer {
* @param {Options} options * @param {Options} options
*/ */
constructor(options) { constructor(options) {
this._config = options.config;
this._container = options.container; this._container = options.container;
this._content = options.content; this._content = options.content;
...@@ -167,6 +191,7 @@ class FakePDFViewer { ...@@ -167,6 +191,7 @@ class FakePDFViewer {
(text, idx) => (text, idx) =>
new FakePDFPageView(text, { new FakePDFPageView(text, {
rendered: idx >= index && idx <= lastRenderedPage, rendered: idx >= index && idx <= lastRenderedPage,
config: this._config,
}) })
); );
...@@ -201,11 +226,21 @@ class FakePDFViewer { ...@@ -201,11 +226,21 @@ class FakePDFViewer {
} }
/** /**
* @typedef {Object} Options * Options that control global aspects of the PDF.js fake, such as which
* @property {Element} container - The container into which the fake PDF viewer * version of PDF.js is being emulated.
* should render the content *
* @property {string[]} content - Array of strings containing the text for each * @typedef PDFJSConfig
* page * @prop {boolean} newTextRendering - Whether to emulate the PDF.js text rendering
* changes added in v2.9.359.
*/
/**
* @typedef Options
* @prop {Element} container - The container into which the fake PDF viewer
* should render the content
* @prop {string[]} content - Array of strings containing the text for each
* page
* @prop {PDFJSConfig} [config]
*/ */
/** /**
...@@ -220,15 +255,16 @@ export default class FakePDFViewerApplication { ...@@ -220,15 +255,16 @@ export default class FakePDFViewerApplication {
* @param {Options} options * @param {Options} options
*/ */
constructor(options) { constructor(options) {
if (!options.config) {
options.config = { newTextRendering: true };
}
this.appConfig = { this.appConfig = {
// The root element which contains all of the PDF.js UI. In the real PDF.js // The root element which contains all of the PDF.js UI. In the real PDF.js
// viewer this is generally `document.body`. // viewer this is generally `document.body`.
appContainer: document.createElement('div'), appContainer: document.createElement('div'),
}; };
this.pdfViewer = new FakePDFViewer({ this.pdfViewer = new FakePDFViewer(options);
content: options.content,
container: options.container,
});
} }
/** /**
......
...@@ -23,10 +23,14 @@ function delay(ms) { ...@@ -23,10 +23,14 @@ function delay(ms) {
} }
const fixtures = { const fixtures = {
// Each item in this list contains the text for one page of the "PDF" // Each item in this list contains the text for one page of the "PDF".
//
// Each line within an item is converted to a single text item, as returned by
// PDF.js' text APIs, and rendered as a separate element in the text layer.
pdfPages: [ pdfPages: [
'Pride And Prejudice And Zombies\n' + 'Pride And Prejudice And Zombies\n' +
'By Jane Austin and Seth Grahame-Smith ', ' \n' + // nb. Blank text item handling differs between PDF.js versions
'By Jane Austen and Seth Grahame-Smith ',
'IT IS A TRUTH universally acknowledged that a zombie in possession of\n' + 'IT IS A TRUTH universally acknowledged that a zombie in possession of\n' +
'brains must be in want of more brains. Never was this truth more plain\n' + 'brains must be in want of more brains. Never was this truth more plain\n' +
...@@ -49,8 +53,9 @@ describe('annotator/anchoring/pdf', () => { ...@@ -49,8 +53,9 @@ describe('annotator/anchoring/pdf', () => {
* *
* @param {string[]} content - * @param {string[]} content -
* Array containing the text content of each page of the loaded PDF document * Array containing the text content of each page of the loaded PDF document
* @param {import('./fake-pdf-viewer-application').PDFJSConfig} [config]
*/ */
function initViewer(content) { function initViewer(content, config) {
cleanupViewer(); cleanupViewer();
// The rendered text for each page is cached during anchoring. // The rendered text for each page is cached during anchoring.
...@@ -60,6 +65,7 @@ describe('annotator/anchoring/pdf', () => { ...@@ -60,6 +65,7 @@ describe('annotator/anchoring/pdf', () => {
viewer = new FakePDFViewerApplication({ viewer = new FakePDFViewerApplication({
container, container,
content, content,
config,
}); });
window.PDFViewerApplication = viewer; window.PDFViewerApplication = viewer;
...@@ -97,18 +103,41 @@ describe('annotator/anchoring/pdf', () => { ...@@ -97,18 +103,41 @@ describe('annotator/anchoring/pdf', () => {
}); });
}); });
it('returns a position selector with correct start/end offsets', () => { it('returns a position selector with correct start/end offsets', async () => {
viewer.pdfViewer.setCurrentPage(2); viewer.pdfViewer.setCurrentPage(2);
const quote = 'Netherfield Park'; const quote = 'Netherfield Park';
const range = findText(container, quote); const range = findText(container, quote);
const contentStr = fixtures.pdfPages.join(''); const contentStr = fixtures.pdfPages.join('');
const expectedPos = contentStr.replace(/\n/g, '').lastIndexOf(quote); const expectedPos = contentStr.replace(/\n/g, '').lastIndexOf(quote);
return pdfAnchoring.describe(container, range).then(selectors => { const [positionSelector] = await pdfAnchoring.describe(container, range);
const position = selectors[0];
assert.equal(position.start, expectedPos); assert.equal(positionSelector.start, expectedPos);
assert.equal(position.end, expectedPos + quote.length); assert.equal(positionSelector.end, expectedPos + quote.length);
}); });
// This test is similar to the above, but simulates older PDF.js releases
// which do not create elements in the text layer for whitespace-only text items.
it('returns a position selector with correct start/end offsets (old text rendering)', async () => {
initViewer(fixtures.pdfPages, { newTextRendering: false });
viewer.pdfViewer.setCurrentPage(2);
const quote = 'Netherfield Park';
const range = findText(container, quote);
const contentStr = fixtures.pdfPages
.map(pageText =>
pageText
.split('\n')
.filter(line => !line.match(/^\s*$/)) // Strip whitespace-only text items
.join('\n')
)
.join('');
const expectedPos = contentStr.replace(/\n/g, '').lastIndexOf(quote);
const [positionSelector] = await pdfAnchoring.describe(container, range);
assert.equal(positionSelector.start, expectedPos);
assert.equal(positionSelector.end, expectedPos + quote.length);
}); });
it('returns a quote selector with the correct quote', () => { it('returns a quote selector with the correct quote', () => {
...@@ -241,6 +270,16 @@ describe('annotator/anchoring/pdf', () => { ...@@ -241,6 +270,16 @@ describe('annotator/anchoring/pdf', () => {
}); });
}); });
it('anchors text in older PDF.js versions', async () => {
initViewer(fixtures.pdfPages, { newTextRendering: false });
// Choose a quote in the first page, which has blank text items in it.
const quote = { type: 'TextQuoteSelector', exact: 'Jane Austen' };
const range = await pdfAnchoring.anchor(container, [quote]);
assert.equal(range.toString(), 'Jane Austen');
});
// See https://github.com/hypothesis/client/issues/1329 // See https://github.com/hypothesis/client/issues/1329
it('anchors selectors that match the last text on the page', async () => { it('anchors selectors that match the last text on the page', async () => {
viewer.pdfViewer.setCurrentPage(1); viewer.pdfViewer.setCurrentPage(1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment