Commit f7346842 authored by Robert Knight's avatar Robert Knight

Add test to cover changes in text rendering between PDF.js versions

Add tests that simulate the different handling of whitespace-only text
items between older (< v2.9.359) and newer PDF.js releases and check
that describing and anchoring selectors works in both cases.
parent 8e288390
......@@ -21,9 +21,10 @@ import { RenderingStates } from '../pdf';
* @param {string} content - The text content for the page
* @param {boolean} rendered - True if the page should be "rendered" or false if
* it should be an empty placeholder for a not-yet-rendered page
* @param {PDFJSConfig} config
* @return {Element} - The root Element for the page
*/
function createPage(content, rendered) {
function createPage(content, rendered, config) {
const pageEl = document.createElement('div');
pageEl.classList.add('page');
......@@ -35,6 +36,11 @@ function createPage(content, rendered) {
textLayer.classList.add('textLayer');
content.split(/\n/).forEach(item => {
if (!config.newTextRendering && /^\s*$/.test(item)) {
// PDF.js releases before v2.9.359 do not create elements in the text
// layer for whitespace-only text items.
return;
}
const itemEl = document.createElement('div');
itemEl.textContent = item;
textLayer.appendChild(itemEl);
......@@ -50,8 +56,13 @@ function createPage(content, rendered) {
* The original is defined at https://github.com/mozilla/pdf.js/blob/master/src/display/api.js
*/
class FakePDFPageProxy {
constructor(pageText) {
/**
* @param {string} pageText
* @param {PDFJSConfig} config
*/
constructor(pageText, config) {
this.pageText = pageText;
this._config = config;
}
getTextContent(params = {}) {
......@@ -61,6 +72,17 @@ class FakePDFPageProxy {
);
}
const makeTextItem = str => {
if (this._config.newTextRendering) {
// The `hasEOL` property was added in https://github.com/mozilla/pdf.js/pull/13257
// and is used to feature-detect whether whitespace-only items need
// to ignored in the `items` array. The value is unimportant.
return { str, hasEOL: false };
} else {
return { str };
}
};
const textContent = {
// The way that the page text is split into items will depend on
// the PDF and the version of PDF.js - individual text items might be
......@@ -68,7 +90,7 @@ class FakePDFPageProxy {
//
// Here we split items by line which matches the typical output for a
// born-digital PDF.
items: this.pageText.split(/\n/).map(line => ({ str: line })),
items: this.pageText.split(/\n/).map(makeTextItem),
};
return Promise.resolve(textContent);
......@@ -77,8 +99,9 @@ class FakePDFPageProxy {
/**
* @typedef FakePDFPageViewOptions
* @prop [boolean] rendered - Whether this page is "rendered", as if it were
* @prop {boolean} rendered - Whether this page is "rendered", as if it were
* near the viewport, or not.
* @prop {PDFJSConfig} config
*/
/**
......@@ -91,8 +114,8 @@ class FakePDFPageView {
* @param {string} text - Text of the page
* @param {FakePDFPageViewOptions} options
*/
constructor(text, options) {
const pageEl = createPage(text, options.rendered);
constructor(text, { rendered, config }) {
const pageEl = createPage(text, rendered, config);
const textLayerEl = pageEl.querySelector('.textLayer');
this.div = pageEl;
......@@ -102,7 +125,7 @@ class FakePDFPageView {
this.renderingState = textLayerEl
? RenderingStates.FINISHED
: RenderingStates.INITIAL;
this.pdfPage = new FakePDFPageProxy(text);
this.pdfPage = new FakePDFPageProxy(text, config);
}
dispose() {
......@@ -120,6 +143,7 @@ class FakePDFViewer {
* @param {Options} options
*/
constructor(options) {
this._config = options.config;
this._container = options.container;
this._content = options.content;
......@@ -167,6 +191,7 @@ class FakePDFViewer {
(text, idx) =>
new FakePDFPageView(text, {
rendered: idx >= index && idx <= lastRenderedPage,
config: this._config,
})
);
......@@ -201,11 +226,21 @@ class FakePDFViewer {
}
/**
* @typedef {Object} Options
* @property {Element} container - The container into which the fake PDF viewer
* should render the content
* @property {string[]} content - Array of strings containing the text for each
* page
* Options that control global aspects of the PDF.js fake, such as which
* version of PDF.js is being emulated.
*
* @typedef PDFJSConfig
* @prop {boolean} newTextRendering - Whether to emulate the PDF.js text rendering
* changes added in v2.9.359.
*/
/**
* @typedef Options
* @prop {Element} container - The container into which the fake PDF viewer
* should render the content
* @prop {string[]} content - Array of strings containing the text for each
* page
* @prop {PDFJSConfig} [config]
*/
/**
......@@ -220,15 +255,16 @@ export default class FakePDFViewerApplication {
* @param {Options} options
*/
constructor(options) {
if (!options.config) {
options.config = { newTextRendering: true };
}
this.appConfig = {
// The root element which contains all of the PDF.js UI. In the real PDF.js
// viewer this is generally `document.body`.
appContainer: document.createElement('div'),
};
this.pdfViewer = new FakePDFViewer({
content: options.content,
container: options.container,
});
this.pdfViewer = new FakePDFViewer(options);
}
/**
......
......@@ -23,10 +23,14 @@ function delay(ms) {
}
const fixtures = {
// Each item in this list contains the text for one page of the "PDF"
// Each item in this list contains the text for one page of the "PDF".
//
// Each line within an item is converted to a single text item, as returned by
// PDF.js' text APIs, and rendered as a separate element in the text layer.
pdfPages: [
'Pride And Prejudice And Zombies\n' +
'By Jane Austin and Seth Grahame-Smith ',
' \n' + // nb. Blank text item handling differs between PDF.js versions
'By Jane Austen and Seth Grahame-Smith ',
'IT IS A TRUTH universally acknowledged that a zombie in possession of\n' +
'brains must be in want of more brains. Never was this truth more plain\n' +
......@@ -49,8 +53,9 @@ describe('annotator/anchoring/pdf', () => {
*
* @param {string[]} content -
* Array containing the text content of each page of the loaded PDF document
* @param {import('./fake-pdf-viewer-application').PDFJSConfig} [config]
*/
function initViewer(content) {
function initViewer(content, config) {
cleanupViewer();
// The rendered text for each page is cached during anchoring.
......@@ -60,6 +65,7 @@ describe('annotator/anchoring/pdf', () => {
viewer = new FakePDFViewerApplication({
container,
content,
config,
});
window.PDFViewerApplication = viewer;
......@@ -97,18 +103,41 @@ describe('annotator/anchoring/pdf', () => {
});
});
it('returns a position selector with correct start/end offsets', () => {
it('returns a position selector with correct start/end offsets', async () => {
viewer.pdfViewer.setCurrentPage(2);
const quote = 'Netherfield Park';
const range = findText(container, quote);
const contentStr = fixtures.pdfPages.join('');
const expectedPos = contentStr.replace(/\n/g, '').lastIndexOf(quote);
return pdfAnchoring.describe(container, range).then(selectors => {
const position = selectors[0];
assert.equal(position.start, expectedPos);
assert.equal(position.end, expectedPos + quote.length);
});
const [positionSelector] = await pdfAnchoring.describe(container, range);
assert.equal(positionSelector.start, expectedPos);
assert.equal(positionSelector.end, expectedPos + quote.length);
});
// This test is similar to the above, but simulates older PDF.js releases
// which do not create elements in the text layer for whitespace-only text items.
it('returns a position selector with correct start/end offsets (old text rendering)', async () => {
initViewer(fixtures.pdfPages, { newTextRendering: false });
viewer.pdfViewer.setCurrentPage(2);
const quote = 'Netherfield Park';
const range = findText(container, quote);
const contentStr = fixtures.pdfPages
.map(pageText =>
pageText
.split('\n')
.filter(line => !line.match(/^\s*$/)) // Strip whitespace-only text items
.join('\n')
)
.join('');
const expectedPos = contentStr.replace(/\n/g, '').lastIndexOf(quote);
const [positionSelector] = await pdfAnchoring.describe(container, range);
assert.equal(positionSelector.start, expectedPos);
assert.equal(positionSelector.end, expectedPos + quote.length);
});
it('returns a quote selector with the correct quote', () => {
......@@ -241,6 +270,16 @@ describe('annotator/anchoring/pdf', () => {
});
});
it('anchors text in older PDF.js versions', async () => {
initViewer(fixtures.pdfPages, { newTextRendering: false });
// Choose a quote in the first page, which has blank text items in it.
const quote = { type: 'TextQuoteSelector', exact: 'Jane Austen' };
const range = await pdfAnchoring.anchor(container, [quote]);
assert.equal(range.toString(), 'Jane Austen');
});
// See https://github.com/hypothesis/client/issues/1329
it('anchors selectors that match the last text on the page', async () => {
viewer.pdfViewer.setCurrentPage(1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment