Public
Edited
Aug 10, 2023
Insert cell
Insert cell
{
let txt = `I am not quite sure of the exact place or exact date of my birth, but at any rate I suspect I must have been born somewhere and at some time.`
return getPhrases(txt)
}
Insert cell
getPhrases = {
// token sizes
const MAX = 5
const MIN = 2
// output properties
const opts = { offset: true, terms: false }

// tokenizers to run in descending order, until satisfied
let methods = [
// split prepositions 'in, by, for'
(part) => part.splitBefore('#Preposition'),
// 'and, or, but'
(part) => part.splitBefore('#Conjunction'),
// 'if ...'
(part) => part.splitBefore('(if|which|so|then)'),
// 'talk quickly'
(part) => part.splitAfter('#Verb #Adverb+'),
// 'walks to'
(part) => part.splitBefore('#PresentTense to'),
// '99 reasons'
(part) => part.splitAfter('#Value #Noun+'),
// 'spencer kelly'
(part) => part.splitAfter('#ProperNoun+'),
// 'Canada's'
(part) => part.splitBefore('#Possessive+'),
// 'aug 10th 2023'
(part) => part.splitAfter('#Date+'),
// 'i suspect'
(part) => part.splitBefore('#Pronoun'),
// 'is very nice'
(part) => part.splitAfter('#Copula #Adverb+? #Adjective'),
// spencer walks
(part) => part.splitAfter('[#Noun] #Verb', 0),
// split on any comma?
(part) => part.splitAfter('@hasComma'),
// fallback, split blindly after 5 words
(part) => part.splitAfter('.{5}')
]

const splitOne = function (part) {
for (let i = 0; i < methods.length; i += 1) {
if (part.terms().length <= MAX) {
return part
}
part = methods[i](part)
}
return part
}

const splitList = function (list) {
let out = list.none()
list.forEach((part) => {
out = out.concat(splitOne(part))
})
return out
}

// find anything < MIN and join to neighbour
const joinSmalls = function (list) {
let out = list.none()
for (let i = 0; i < list.length; i += 1) {
let part = list.eq(i)
let beside = list.eq(i + 1)
if (part.terms().length < MIN && beside.found && beside.lookBefore('.').found) {
out = out.concat(part.append(beside))
i += 1
} else {
out = out.concat(part)
}
}
return out
}

// loosely tokenize text by phrases of a given size
const getPhrases = function (str) {
let doc = nlp(str)
// first, split commas, semicolons
let list = doc.clauses()
// other natural sentence chunks
list = list.splitOn(doc.parentheses())
list = list.splitOn(doc.quotations())
// run each of our split methods, in sequence
list = splitList(list)
// join any too-small
list = joinSmalls(list)
return list.json(opts)
}

return getPhrases
}
Insert cell
// load the compromise library; '@latest' is unpinned, so results may change between releases
nlp = require('compromise@latest')
Insert cell
// read the version property of the loaded compromise build
nlp.version
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more