Public
Edited
Aug 10, 2023
Insert cell
Insert cell
{
let txt = `I am not quite sure of the exact place or exact date of my birth, but at any rate I suspect I must have been born somewhere and at some time.`
return getPhrases(txt)
}
Insert cell
getPhrases = {
// phrase-size bounds, counted in terms:
// a part shorter than MIN is a candidate for joining to its neighbour,
// a part with more than MAX terms is handed to the split methods
const MIN = 2
const MAX = 5
// options passed to .json() when producing the final output
const opts = { offset: true, terms: false }

// Ordered table of split rules, tried first-to-last.
// Each row is [view method name, match pattern, ...extra arguments];
// the rows are turned into functions of the form (part) => part[method](...args).
const methods = [
  // prepositions: 'in, by, for'
  ['splitBefore', '#Preposition'],
  // conjunctions: 'and, or, but'
  ['splitBefore', '#Conjunction'],
  // 'if ...'
  ['splitBefore', '(if|which|so|then)'],
  // 'talk quickly'
  ['splitAfter', '#Verb #Adverb+'],
  // 'walks to'
  ['splitBefore', '#PresentTense to'],
  // '99 reasons'
  ['splitAfter', '#Value #Noun+'],
  // 'spencer kelly'
  ['splitAfter', '#ProperNoun+'],
  // 'Canada's'
  ['splitBefore', '#Possessive+'],
  // 'aug 10th 2023'
  ['splitAfter', '#Date+'],
  // 'i suspect'
  ['splitBefore', '#Pronoun'],
  // 'is very nice'
  ['splitAfter', '#Copula #Adverb+? #Adjective'],
  // 'spencer walks' (second argument 0 is passed through with the pattern)
  ['splitAfter', '[#Noun] #Verb', 0],
  // any comma
  ['splitAfter', '@hasComma'],
  // last resort: blind split, the 5 is hard-coded in the pattern
  ['splitAfter', '.{5}']
].map(([method, ...args]) => (part) => part[method](...args))

// Run the split methods over one part, in order, stopping as soon as
// part.terms().length is no larger than MAX. Whatever the last applied
// method produced is returned, even if it is still over the limit.
// NOTE(review): the check measures `.terms().length` of the whole value;
// after a split this is presumably still the combined term count of all
// pieces, so the early exit may only ever fire on entry - confirm.
const splitOne = function (part) {
  let current = part
  for (const split of methods) {
    const fitsAlready = current.terms().length <= MAX
    if (fitsAlready) {
      break
    }
    current = split(current)
  }
  return current
}

// Apply splitOne to every part of `list` and gather the results, in order,
// into a new view that starts out empty (list.none()).
const splitList = (list) => {
  let gathered = list.none()
  list.forEach((chunk) => {
    const pieces = splitOne(chunk)
    gathered = gathered.concat(pieces)
  })
  return gathered
}

// Walk the parts in order; a part with fewer than MIN terms is joined to the
// part that follows it, provided that follower exists and has at least one
// term before it (beside.lookBefore('.') finds something). A joined follower
// is consumed and not visited again. Everything else is copied through as-is.
// NOTE(review): the join uses `.append()` - confirm it merges the two views
// rather than inserting a copy of the neighbour's words into the document.
const joinSmalls = function (list) {
  let joined = list.none()
  let idx = 0
  while (idx < list.length) {
    const current = list.eq(idx)
    const neighbour = list.eq(idx + 1)
    const shouldJoin =
      current.terms().length < MIN && neighbour.found && neighbour.lookBefore('.').found
    if (!shouldJoin) {
      joined = joined.concat(current)
      idx += 1
      continue
    }
    joined = joined.concat(current.append(neighbour))
    // skip past the neighbour that was just absorbed
    idx += 2
  }
  return joined
}

// Break a string into loosely phrase-sized chunks and return them as JSON.
// Steps, in order: parse with nlp, take the clauses, split those on
// parentheses and then on quotations, run the split methods over every
// part (splitList), merge too-small parts (joinSmalls), then serialise
// with .json(opts).
const getPhrases = function (str) {
  const doc = nlp(str)
  // clause boundaries first (commas, semicolons)
  const clauses = doc.clauses()
  // then the other natural chunks: bracketed and quoted spans
  const splitAtParens = clauses.splitOn(doc.parentheses())
  const splitAtQuotes = splitAtParens.splitOn(doc.quotations())
  // size the parts: split the long ones, then join the short ones
  const sized = joinSmalls(splitList(splitAtQuotes))
  return sized.json(opts)
}

return getPhrases
}
Insert cell
nlp=require('compromise@latest')
Insert cell
// display the version string of the compromise build loaded as `nlp`
nlp.version
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more