Published
Edited
Dec 27, 2021
2 stars
Insert cell
Insert cell
PDFJS = {
const PDFJS = await require("pdfjs-dist@2");
PDFJS.GlobalWorkerOptions.workerSrc = await require.resolve(
"pdfjs-dist@2/build/pdf.worker.js"
);
return PDFJS;
}
Insert cell
/**
 * Extracts the main body text of a PDF as one string.
 *
 * Pages are read in order. On each page the most frequent x offset among the
 * text items is treated as the left margin of the body text, and any item
 * that starts to the left of it is dropped. The remaining items are
 * concatenated in the order pdf.js reports them:
 *   - when an item sits lower on the page than the current line, a line break
 *     is assumed and a single space is inserted, unless the text so far ends
 *     with "-" (the hyphen itself is kept);
 *   - otherwise a space is inserted only when the item starts more than 2
 *     units to the right of where the previous item ended.
 * Because the first item of every page counts as a new line, the result
 * normally begins with a space.
 *
 * @param {*} pdf - Document source, passed unchanged to `PDFJS.getDocument`.
 * @returns {Promise<string>} The concatenated text of all pages.
 */
extractText = async (pdf) => {
  const doc = await PDFJS.getDocument(pdf).promise;

  let text = "";

  for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
    const page = await doc.getPage(pageNumber);

    const textContent = await page.getTextContent({
      normalizeWhitespace: true
    });

    // Blank or image-only pages have no text items and no margin to detect.
    if (textContent.items.length === 0) {
      continue;
    }

    // Count how many items start at each x offset. pdf.js item transforms are
    // [a, b, c, d, e, f]; the position is the translation part (e, f), i.e.
    // indices 4 and 5.
    const xCounts = new Map();
    for (const item of textContent.items) {
      const x = item.transform[4];
      xCounts.set(x, (xCounts.get(x) || 0) + 1);
    }

    // The most frequent x offset is taken as the left margin of the body
    // text. Kept numeric so the comparison below needs no string coercion.
    let commonX = 0;
    let commonCount = 0;
    for (const [x, count] of xCounts) {
      if (count > commonCount) {
        commonX = x;
        commonCount = count;
      }
    }

    // TODO: exclude header/footer?

    // Right edge of the previously emitted item on the current line.
    let previousX = 0;
    // Baseline of the current line; Infinity makes the first item of a page
    // always register as the start of a new line.
    let previousY = Infinity;

    for (const item of textContent.items) {
      const x = item.transform[4];
      const y = item.transform[5];

      // Skip anything left of the body margin (e.g. margin annotations).
      if (x < commonX) {
        continue;
      }

      if (y < previousY) {
        // PDF y coordinates grow upwards, so a smaller y means a new line.
        if (!text.endsWith("-")) {
          text += " ";
        }
        previousY = y;
        // TODO: split into paragraphs?
      } else if (x > previousX + 2) {
        // Visible horizontal gap on the same line: treat it as a word break.
        text += " ";
      }

      // Track the real right edge of this item; accumulating widths alone
      // would ignore gaps and make every later item look separated.
      previousX = x + item.width;
      text += item.str;
    }
  }

  return text;
}
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more