|
@@ -20,8 +20,8 @@ if (typeof PDFJS === 'undefined') {
|
|
|
(typeof window !== 'undefined' ? window : this).PDFJS = {};
|
|
|
}
|
|
|
|
|
|
-PDFJS.version = '1.3.32';
|
|
|
-PDFJS.build = 'c2dfe9e';
|
|
|
+PDFJS.version = '1.3.34';
|
|
|
+PDFJS.build = '0819d71';
|
|
|
|
|
|
(function pdfjsWrapper() {
|
|
|
// Use strict in our context only - users might not want it
|
|
@@ -2354,6 +2354,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
|
|
|
return PDFDocumentProxy;
|
|
|
})();
|
|
|
|
|
|
+/**
|
|
|
+ * Page getTextContent parameters.
|
|
|
+ *
|
|
|
+ * @typedef {Object} getTextContentParameters
|
|
|
+ * @param {boolean} normalizeWhitespace - replaces all occurrences of
|
|
|
+ * whitespace with standard spaces (0x20). The default value is `false`.
|
|
|
+ */
|
|
|
+
|
|
|
/**
|
|
|
* Page text content.
|
|
|
*
|
|
@@ -2632,12 +2640,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
|
|
|
},
|
|
|
|
|
|
/**
|
|
|
+ * @param {getTextContentParameters} params - getTextContent parameters.
|
|
|
* @return {Promise} That is resolved a {@link TextContent}
|
|
|
* object that represent the page text content.
|
|
|
*/
|
|
|
- getTextContent: function PDFPageProxy_getTextContent() {
|
|
|
+ getTextContent: function PDFPageProxy_getTextContent(params) {
|
|
|
+ var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
|
|
|
+
|
|
|
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
|
|
|
- pageIndex: this.pageNumber - 1
|
|
|
+ pageIndex: this.pageNumber - 1,
|
|
|
+ normalizeWhitespace: normalizeWhitespace,
|
|
|
});
|
|
|
},
|
|
|
|
|
@@ -9909,7 +9921,8 @@ var Page = (function PageClosure() {
|
|
|
});
|
|
|
},
|
|
|
|
|
|
- extractTextContent: function Page_extractTextContent(task) {
|
|
|
+ extractTextContent: function Page_extractTextContent(task,
|
|
|
+ normalizeWhitespace) {
|
|
|
var handler = {
|
|
|
on: function nullHandlerOn() {},
|
|
|
send: function nullHandlerSend() {}
|
|
@@ -9939,7 +9952,9 @@ var Page = (function PageClosure() {
|
|
|
|
|
|
return partialEvaluator.getTextContent(contentStream,
|
|
|
task,
|
|
|
- self.resources);
|
|
|
+ self.resources,
|
|
|
+ /* stateManager = */ null,
|
|
|
+ normalizeWhitespace);
|
|
|
});
|
|
|
},
|
|
|
|
|
@@ -18842,12 +18857,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|
|
});
|
|
|
},
|
|
|
|
|
|
- getTextContent: function PartialEvaluator_getTextContent(stream, task,
|
|
|
- resources,
|
|
|
- stateManager) {
|
|
|
+ getTextContent:
|
|
|
+ function PartialEvaluator_getTextContent(stream, task, resources,
|
|
|
+ stateManager,
|
|
|
+ normalizeWhitespace) {
|
|
|
|
|
|
stateManager = (stateManager || new StateManager(new TextState()));
|
|
|
|
|
|
+ var WhitespaceRegexp = /\s/g;
|
|
|
+
|
|
|
var textContent = {
|
|
|
items: [],
|
|
|
styles: Object.create(null)
|
|
@@ -18961,11 +18979,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|
|
return textContentItem;
|
|
|
}
|
|
|
|
|
|
+ function replaceWhitespace(str) {
|
|
|
+ // Replaces all whitespaces with standard spaces (0x20), to avoid
|
|
|
+ // alignment issues between the textLayer and the canvas if the text
|
|
|
+ // contains e.g. tabs (fixes issue6612.pdf).
|
|
|
+ var i = 0, ii = str.length, code;
|
|
|
+ while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) {
|
|
|
+ i++;
|
|
|
+ }
|
|
|
+ return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str);
|
|
|
+ }
|
|
|
+
|
|
|
function runBidiTransform(textChunk) {
|
|
|
var str = textChunk.str.join('');
|
|
|
var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
|
|
|
return {
|
|
|
- str: bidiResult.str,
|
|
|
+ str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) :
|
|
|
+ bidiResult.str),
|
|
|
dir: bidiResult.dir,
|
|
|
width: textChunk.width,
|
|
|
height: textChunk.height,
|
|
@@ -19286,8 +19316,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|
|
}
|
|
|
|
|
|
return self.getTextContent(xobj, task,
|
|
|
- xobj.dict.get('Resources') || resources, stateManager).
|
|
|
- then(function (formTextContent) {
|
|
|
+ xobj.dict.get('Resources') || resources, stateManager,
|
|
|
+ normalizeWhitespace).then(function (formTextContent) {
|
|
|
Util.appendToArray(textContent.items, formTextContent.items);
|
|
|
Util.extendObj(textContent.styles, formTextContent.styles);
|
|
|
stateManager.restore();
|
|
@@ -41996,12 +42026,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
|
|
|
|
|
|
handler.on('GetTextContent', function wphExtractText(data) {
|
|
|
var pageIndex = data.pageIndex;
|
|
|
+ var normalizeWhitespace = data.normalizeWhitespace;
|
|
|
return pdfManager.getPage(pageIndex).then(function(page) {
|
|
|
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
|
|
|
startWorkerTask(task);
|
|
|
var pageNum = pageIndex + 1;
|
|
|
var start = Date.now();
|
|
|
- return page.extractTextContent(task).then(function(textContent) {
|
|
|
+ return page.extractTextContent(task, normalizeWhitespace).then(
|
|
|
+ function(textContent) {
|
|
|
finishWorkerTask(task);
|
|
|
info('text indexing: page=' + pageNum + ' - time=' +
|
|
|
(Date.now() - start) + 'ms');
|