Platform
Resources
Pricing
Sign in
Get started
Alf Eaton
Workspace
Fork
Published
By
Alf Eaton
Edited
Dec 27, 2021
2 stars
Insert cell
Insert cell
PDFJS
=
{
const
PDFJS
=
await
require
(
"pdfjs-dist@2"
)
;
PDFJS
.
GlobalWorkerOptions
.
workerSrc
=
await
require
.
resolve
(
"pdfjs-dist@2/build/pdf.worker.js"
)
;
return
PDFJS
;
}
Insert cell
/**
 * Extract the text of a PDF document as one string.
 *
 * For every page, the most frequent x position among the text items is
 * taken as the page's main left margin; items positioned to the left of it
 * are skipped. The remaining items are concatenated in the order pdf.js
 * returns them. A space is inserted when an item starts a new line (unless
 * the text so far ends with "-") or when there is a horizontal gap between
 * an item and the tracked right edge of the preceding text.
 *
 * @param {*} pdf - Passed unchanged to `PDFJS.getDocument`.
 * @returns {Promise<string>} The concatenated text of all pages.
 */
extractText = async (pdf) => {
  const doc = await PDFJS.getDocument(pdf).promise;

  let text = "";

  for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
    const page = await doc.getPage(pageNumber);
    const textContent = await page.getTextContent({
      normalizeWhitespace: true,
    });

    // A page without text items has no "most common x"; skip it rather than
    // failing on the lookup below.
    if (textContent.items.length === 0) {
      continue;
    }

    // Tally how many items start at each x position.
    const xCounts = {};
    for (const item of textContent.items) {
      // transform is [a, b, c, d, e, f]; the translation (position) is e, f.
      const x = item.transform[4];
      xCounts[x] = (xCounts[x] || 0) + 1;
    }

    // The most frequent x position, converted back from its object-key string.
    const ranked = Object.entries(xCounts).sort((a, b) => b[1] - a[1]);
    const commonX = Number(ranked[0][0]);

    // TODO: exclude header/footer?

    let previousX = 0;
    // Larger than any expected y, so the first kept item counts as a new line.
    let previousY = 5000;

    for (const item of textContent.items) {
      const x = item.transform[4];
      const y = item.transform[5];

      // Skip anything left of the main margin.
      if (x < commonX) {
        continue;
      }

      if (y < previousY) {
        // Lower y than the previous item: treat as the start of a new line.
        // Join lines with a space unless the previous line ended in a hyphen.
        if (!text.endsWith("-")) {
          text += " ";
        }

        previousY = y;
        previousX = x + item.width;
        // TODO: split into paragraphs?
      } else {
        // Same line: add a space only when there is a visible gap.
        if (x > previousX + 2) {
          text += " ";
        }

        // NOTE(review): this accumulates widths rather than tracking
        // x + item.width, so it may lag behind after a gap — confirm intent.
        previousX += item.width;
      }

      text += item.str;
    }
  }

  return text;
}
Insert cell
Purpose-built for displays of data
Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Try it for free
Learn more
Fork
View
Export
Edit
Add comment
Select
Duplicate
Copy link
Embed
Delete
JavaScript
Markdown
HTML
PDFJS
Add comment
Copy import
Select
Duplicate
Copy link
Embed
Delete
JavaScript
Markdown
HTML
extractText
Add comment
Copy import
Select
Duplicate
Copy link
Embed
Delete
JavaScript
Markdown
HTML