Public
Edited
Mar 9, 2023
5 forks
5 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
client
WITH
titles AS
(SELECT Dissertation, round(year/4) * 4 AS category FROM data WHERE year >= 1923 AND year < 2022),
word_counts AS
(SELECT category, word, COUNT(*) count FROM
-- This should be more unicode safe
(SELECT category, UNLIST(string_split_regex(LOWER(Dissertation), '[ ,\-:\?".]')) word FROM titles)
GROUP BY category, word
),
-- I created the counts as separate subqueries because I was unable to make PARTITION BY work correctly.
word_totals AS (
SELECT word, sum("count")::FLOAT AS word_total FROM word_counts GROUP BY word
),
category_totals AS (
SELECT category, sum("count")::FLOAT AS category_size FROM word_counts GROUP BY category
),
pmi_part1 AS (
SELECT
SUM("count") OVER ()::FLOAT as corpus_size,
*
FROM word_counts NATURAL JOIN word_totals NATURAL JOIN category_totals
),
pmi_part2 AS (
SELECT *,
greatest(word_total - "count", 0.2) AS count_excluding, -- 0.2 is a fudge here--you could call it a 'prior,', I guess. If a word doesn't appear, we pretend that it appeared *just a little* to make the logarithms work.
corpus_size - category_size AS size_excluding_category
FROM pmi_part1
),
pmi_part3 AS (
SELECT *,
category_size * (word_total / corpus_size) AS expected_here,
size_excluding_category * (word_total / corpus_size) AS expected_elsewhere,
FROM pmi_part2
),
pmi_part4 AS (
SELECT *,
2 * (
"count" * log("count" / expected_here) +
"count_excluding" * log(count_excluding / expected_elsewhere)
) AS llr,
log("count" / expected_here) AS pmi,
FROM pmi_part3),
ranked_ordered AS (
SELECT category,
*,
word,
count,
word_total,
PMI,
llr,
RANK() OVER (PARTITION BY category ORDER BY llr DESC) ordering,
FROM pmi_part4 WHERE PMI > 0
)
SELECT
category, list(word) terms FROM ranked_ordered
WHERE ordering < 16 GROUP BY category ORDER BY category
Insert cell
client = DuckDBClient.of({ data: FileAttachment("all-dissertations@2.parquet") })
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more