Distinguishing words with PMI Bigrams / Benjamin Schmidt

Pricing

Get started

Benjamin Schmidt

Digital Humanist. Manhattan-based.

Workspace

Public

Edited

Jan 30, 2023

Fork of Trends in History dissertations / statistics of distinguishing words as SQL

•

1 fork

[...stats[0].terms]

client

WITH

titles AS

(SELECT Dissertation as id, Dissertation as text, round(year/10) * 10 AS category FROM data WHERE year >= 1920 AND year < 2025

-- There are a lot of French and Spanish dissertations in the 80s/90s, this makes them go away

AND "Dissertation" NOT SIMILAR TO '(^(L''|La |Le |El ).*)|(.*de [lL].*)')

tokenized AS (SELECT id, category, string_split_regex(LOWER(text), '[ ,\-:\?".''\|«»]') words FROM titles),

bigrams_raw AS

(SELECT category, UNNEST(array_pop_back(words)) w1, UNNEST(array_pop_front(words)) w2 FROM tokenized),

bigrams AS

(SELECT * FROM bigrams_raw WHERE w1 != '' and w2 != ''),

bigram_counts_by_category AS

(SELECT category, w1, w2, COUNT(*)::FLOAT AS count FROM bigrams GROUP BY category, w1, w2),

bigram_counts AS

(SELECT w1, w2, SUM(count) bigramcount FROM bigram_counts_by_category GROUP BY w1, w2 HAVING bigramcount > 10),

words AS

(SELECT category, UNNEST(words) as word FROM tokenized),

totalcounts AS

(SELECT COUNT(*)::FLOAT total FROM words),

word_counts AS (

SELECT category, word, COUNT(*)::FLOAT AS count FROM words WHERE word != '' GROUP BY category, word

word_totals AS (

SELECT word, sum("count")::FLOAT AS word_total, COUNT(*)::FLOAT/(SELECT total FROM totalcounts) wordrate FROM word_counts GROUP BY word

bigramrates AS

(SELECT *, (SELECT total FROM totalcounts) AS totalcounts,

-- The expected number of occurrences. Only divide by the total words once

-- because the second one cancels out.

w1.word_total * w2.word_total / (SELECT total FROM totalcounts) AS expected,

FROM bigram_counts

INNER JOIN word_totals w1 ON (bigram_counts.w1 = w1.word)

INNER JOIN word_totals w2 ON (bigram_counts.w2 = w2.word)

topbigrams AS

(SELECT w1, w2, w1 || ' ' || w2 AS word, bigramcount FROM bigramrates WHERE bigramcount / expected > 100),

topbigrams_by_category AS (

SELECT category, word, count FROM bigram_counts_by_category NATURAL JOIN topbigrams

ngrams AS

(SELECT * FROM topbigrams_by_category UNION

SELECT * FROM word_counts),

ngram_totals AS

(SELECT word, word_total FROM word_totals UNION SELECT word, bigramcount AS word_total FROM topbigrams),

word_counts_by_category AS

(SELECT word, category, COUNT(*)::FLOAT count FROM words GROUP BY category, word),

doctokens AS

(

SELECT w1 || ' ' || w2 word, category, "count"

FROM bigram_counts_by_category

-- UNION

-- SELECT * FROM word_counts_by_category

)

-- I created the counts as separate subqueries because I was unable to make PARTITION BY work correctly.

category_totals AS (

SELECT category, sum("count")::FLOAT AS category_size FROM word_counts GROUP BY category

pmi_part1 AS (

SELECT

(SELECT sum("category_size")::FLOAT FROM category_totals) AS corpus_size,

FROM doctokens NATURAL JOIN ngram_totals NATURAL JOIN category_totals

pmi_part2 AS (

SELECT *,

greatest(word_total - "count", 0.2) AS count_excluding, -- 0.2 is a fudge here--you could call it a 'prior,', I guess. If a word doesn't appear, we pretend that it appeared *just a little* to make the logarithms work.

corpus_size - category_size AS size_excluding_category

FROM pmi_part1

pmi_part3 AS (

SELECT *,

category_size * (word_total / corpus_size) AS expected_here,

size_excluding_category * (word_total / corpus_size) AS expected_elsewhere,

FROM pmi_part2

pmi_part4 AS (

SELECT *,

2 * (

"count" * log("count" / expected_here) +

"count_excluding" * log(count_excluding / expected_elsewhere)

) AS llr,

log("count" / expected_here) AS pmi,

FROM pmi_part3),

ranked_ordered AS (

SELECT category,

word,

count,

word_total,

PMI,

llr,

RANK() OVER (PARTITION BY category ORDER BY llr DESC) ordering,

FROM pmi_part4 WHERE PMI > 0

)

SELECT category, list(word) terms FROM ranked_ordered

WHERE ordering < 25 GROUP BY category ORDER BY category

client

WITH

titles AS

(SELECT Dissertation, round(year/10) * 10 AS category FROM data WHERE year >= 1920 AND year < 2025),

word_counts AS

(SELECT category, word, COUNT(*) count FROM

-- This should be more unicode safe

(SELECT category, UNLIST(string_split_regex(LOWER(Dissertation), '[ ,\-:\?".]')) word FROM titles)

GROUP BY category, word

-- I created the counts as separate subqueries because I was unable to make PARTITION BY work correctly.

word_totals AS (

SELECT word, sum("count")::FLOAT AS word_total FROM word_counts GROUP BY word

category_totals AS (

SELECT category, sum("count")::FLOAT AS category_size FROM word_counts GROUP BY category

pmi_part1 AS (

SELECT

SUM("count") OVER ()::FLOAT as corpus_size,

FROM word_counts NATURAL JOIN word_totals NATURAL JOIN category_totals

pmi_part2 AS (

SELECT *,

corpus_size - category_size AS size_excluding_category

FROM pmi_part1

pmi_part3 AS (

SELECT *,

category_size * (word_total / corpus_size) AS expected_here,

size_excluding_category * (word_total / corpus_size) AS expected_elsewhere,

FROM pmi_part2

pmi_part4 AS (

SELECT *,

2 * (

"count" * log("count" / expected_here) +

"count_excluding" * log(count_excluding / expected_elsewhere)

) AS llr,

log("count" / expected_here) AS pmi,

FROM pmi_part3),

ranked_ordered AS (

SELECT category,

word,

count,

word_total,

PMI,

llr,

RANK() OVER (PARTITION BY category ORDER BY llr DESC) ordering,

FROM pmi_part4 WHERE PMI > 0

)

SELECT

category, list(word) terms FROM ranked_ordered

WHERE ordering < 16 GROUP BY category ORDER BY category

client

SELECT year, Dissertation, rown FROM (SELECT *, (row_number() OVER ())::float AS rown FROM data) WHERE Dissertation SIMILAR TO '.* de l.*' ORDER BY RANDOM() ASC

client = DuckDBClient.of({ data: FileAttachment("all-dissertations@2.parquet") })

client.execute('CREATE MACRO dynamic_table(tab, ) AS TABLE

WITH

tokenized AS SELECT

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.

Learn more