Commit f42c3656 authored by csillag's avatar csillag

Simplify PDF text extraction

Earlier, on some versions of PDF.js, we used to
use PDF.js's FindController as a data source
for the text extraction.

However, at some point, we stopped using the
routines shipped with it, since it didn't always
provide us with adequate spacing between the various
pieces of text. So we ended up just concatenating
the various pieces of text ourselves.

Then, for new versions of PDF.js, we introduced
other means of accessing the same information,
completely bypassing the PDFFindController.

This change simply unifies the access; now we can
do the same on all PDF.js versions.
parent 31b83c97
......@@ -50,20 +50,8 @@ class window.PDFTextMapper extends PageTextMapperCore
if PDFViewerApplication?
@_app = PDFViewerApplication
@_viewer = @_app.pdfViewer
@_tryExtractPage = (index) => @_viewer.getPageTextContent(index)
else
@_app = @_viewer = PDFView
@_finder = @_app.findController ? # PDF.js v1.0.712
PDFFindController # up to PDF.js v1.0.437
@_tryExtractPage = (index) =>
new Promise (resolve, reject) =>
tryIt = =>
page = @_finder.pdfPageSource.pages[index]
if page?.pdfPage?
page.getTextContent().then(resolve)
else
setTimeout tryIt, 100
tryIt()
@setEvents()
......@@ -147,9 +135,10 @@ class window.PDFTextMapper extends PageTextMapperCore
@_pendingScanResolve = resolve
@waitForInit().then =>
# Wait for the document to load
@_app.pdfDocument.getPage(1).then =>
# Initialize our main page data array
@pageInfo = []
# Start the text extraction
@_extractPageText 0
# Manually extract the text from the PDF document.
......@@ -157,7 +146,11 @@ class window.PDFTextMapper extends PageTextMapperCore
# own text extraction routines, which sometimes fail to add
# adequate spacing.
_extractPageText: (pageIndex) ->
@_tryExtractPage(pageIndex).then (data) =>
# Wait for the page to load
@_app.pdfDocument.getPage(pageIndex + 1).then (page) =>
# Wait for the data to be extracted
page.getTextContent().then (data) =>
# There is some variation about what I might find here,
# depending on PDF.js version, so we need to do some guesswork.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment