Compromise-tokenization / spencer kelly

spencer kelly

freelance javascripter

Workspace

Public

nlp-compromise

Edited

Jan 5, 2024

1 fork

2 stars

{

let doc=nlp("Hi Dr. Miller the price is 4.59 for the U.C.L.A. Ph.Ds.")

// sentence parsing happens automatically...

// return an array of sentence texts:

let arr = doc.json().map(o=> o.text)

return arr

}

{

let abbrevs = nlp().model.one.abbreviations

let mine = {

abbr: true, //add known abbreviation

co: false //remove known abbreviation

}

// add them in

Object.assign(abbrevs, mine)

return nlp('the new abbr. is working. whoo hoo').out('array')

}

md`you can always see the term-splitting, in a number of ways:`

// JSON output for a short two-word input.
nlp('i wanna').json()

// JSON output of the terms() view, for text containing commas, periods, an apostrophe and hyphens.
nlp('new york, new york. it\'s a hell-of-a town.').terms().json()

{

// add 'semi' as a non-word prefix:

nlp.world().model.one.prefixes.semi = true

return nlp('semi-detatched but slightly-ajar').terms().json({normal:true}).map(t=>t.normal)

}

{

nlp('spencer\'s cool').debug(); // see dev console

return null

}

// For each term in the first entry of `docs`, list [pre, text, post].
nlp(`"Oh! say, do you see?!"`)
  .docs[0]
  .map(({ pre, text, post }) => [pre, text, post])

{

let world = nlp.world()

// support "=foo=" as a word

world.model.one.prePunctuation['='] = true

world.model.one.postPunctuation['='] = true

return nlp.tokenize('=cool=').json()[0].terms[0]

}

{

let world = nlp.world()

// re-interpret "=foo=" as "foo" (default)

world.model.one.prePunctuation['='] = false

world.model.one.postPunctuation['='] = false

return nlp.tokenize('=cool=').json()[0].terms[0]

}

{

let doc = nlp.tokenize('spencer kelly is working', {working:'NotCool'})

return JSON.stringify(doc.out('tags')[0], null, 2)

}

{

const methods = nlp.world().methods

//change the sentence-splitting tokenizer

methods.one.tokenize.splitSentences = function (str) {

return str.split(/[.?!]/) //(demonstration purposes!)

}

//change the term-splitting tokenizer

methods.one.tokenize.splitTerms = function (str) {

return str.split(/ /) //works for me!

}

//now the hyphenated term is combined:

return nlp('one two-three four five').terms().out('array')

}

// Display the object found at methods.one.tokenize on the world.
nlp.world().methods.one.tokenize

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.

Learn more