Commit 265e59fc authored by Robert Knight

Break words in text layer when adjacent chars do not overlap vertically

For text selection to work well it is important that `<hypothesis-text-word>`
elements do not span multiple lines. Previously the layout analysis only created
new words after a space. Since most lines end with a whole word, this was
sufficient in most cases. However, this doesn't work if a word is broken across
multiple lines (e.g. with a hyphen). To handle this case, also create a word
break if the current character does not vertically overlap the previous
character. This may create some false positives (i.e. unnecessary word breaks),
but that doesn't harm text selection.

Fixes #5066
parent becbfd0e
...@@ -57,6 +57,13 @@ function analyzeLayout(charBoxes: DOMRect[], text: string): ColumnBox[] { ...@@ -57,6 +57,13 @@ function analyzeLayout(charBoxes: DOMRect[], text: string): ColumnBox[] {
const char = text[i]; const char = text[i];
const isSpace = /\s/.test(char); const isSpace = /\s/.test(char);
if (
currentWord.text.length > 0 &&
!rectsOverlapVertically(currentWord.rect, rect)
) {
addWord();
}
currentWord.rect = unionRects(currentWord.rect, rect); currentWord.rect = unionRects(currentWord.rect, rect);
// To simplify downstream logic, normalize whitespace. // To simplify downstream logic, normalize whitespace.
......
...@@ -10,13 +10,22 @@ const lineSpacing = 0.1; ...@@ -10,13 +10,22 @@ const lineSpacing = 0.1;
/** /**
* Create character bounding box data for text in an image. * Create character bounding box data for text in an image.
*
* Lines are broken after new-line chars and also before any indices in
* `breakPositions`.
*/ */
function createCharBoxes(text) { function createCharBoxes(text, breakPositions = []) {
const charBoxes = []; const charBoxes = [];
let lineIndex = 0; let lineIndex = 0;
let charIndex = 0; let charIndex = 0;
for (let char of text) { for (let i = 0; i < text.length; i++) {
if (breakPositions.includes(i)) {
charIndex = 0;
++lineIndex;
}
const char = text[i];
charBoxes.push({ charBoxes.push({
left: charIndex * charSpacing, left: charIndex * charSpacing,
right: charIndex * charSpacing + charWidth, right: charIndex * charSpacing + charWidth,
...@@ -200,6 +209,23 @@ describe('ImageTextLayer', () => { ...@@ -200,6 +209,23 @@ describe('ImageTextLayer', () => {
assert.deepEqual(wordBoxPositions, expectedPositions); assert.deepEqual(wordBoxPositions, expectedPositions);
}); });
// Checks that a word break is created between adjacent characters that sit on
// different lines, even when no whitespace separates them.
it('breaks words when characters do not overlap vertically', () => {
const { image } = createPageImage();
// "line" and "second" are deliberately joined without a space, so whitespace
// alone cannot produce a word break between them.
const imageText = 'first linesecond line';
const textLayer = createTextLayer(
image,
// Request a line break in the generated character boxes just before "second".
createCharBoxes(imageText, [imageText.indexOf('second')]),
imageText
);
// The text content of the layer must be unchanged by the extra word break.
assert.equal(textLayer.container.textContent, 'first linesecond line');
const wordBoxes = getWordBoxes(textLayer);
// Expect "line" and "second " to end up in separate word boxes.
assert.deepEqual(
wordBoxes.map(ws => ws.textContent),
['first ', 'line', 'second ', 'line']
);
});
it('creates lines and columns in the text layer', () => { it('creates lines and columns in the text layer', () => {
const { image } = createPageImage(); const { image } = createPageImage();
const textLayer = createTextLayer( const textLayer = createTextLayer(
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment