Public
Edited
Feb 27, 2023
Insert cell
Insert cell
// Presumably previews the `terms` list of the first row of `stats`
// (defined in another cell) — TODO confirm against that cell.
[...stats[0].terms]
Insert cell
client
-- DuckDB query: find the bigrams (two-word phrases) most characteristic of
-- each decade of dissertation titles, ranked by a log-likelihood-ratio (llr)
-- score. Output: one row per decade ("category") with a list of top terms.
WITH
-- One row per title; the title doubles as id and text, bucketed by decade.
titles AS
(SELECT Dissertation as id, Dissertation as text, round(year/10) * 10 AS category FROM data WHERE year >= 1920 AND year < 2025
-- There are a lot of French and Spanish dissertations in the 80s/90s, this makes them go away
AND "Dissertation" NOT SIMILAR TO '(^(L''|La |Le |El ).*)|(.*de [lL].*)')
,
-- Lowercase each title and split it into a list of word tokens on punctuation/space.
tokenized AS (SELECT id, category, string_split_regex(LOWER(text), '[ ,\-:\?".''\|«»]') words FROM titles),
-- Pair each word with its successor: dropping the last element (w1) and the
-- first element (w2) and UNNESTing both zips consecutive (w1, w2) bigrams.
bigrams_raw AS
(SELECT category, UNNEST(array_pop_back(words)) w1, UNNEST(array_pop_front(words)) w2 FROM tokenized),
-- Drop bigrams containing an empty token (artifacts of the regex split).
bigrams AS
(SELECT * FROM bigrams_raw WHERE w1 != '' and w2 != ''),
bigram_counts_by_category AS
(SELECT category, w1, w2, COUNT(*)::FLOAT AS count FROM bigrams GROUP BY category, w1, w2),
-- Corpus-wide bigram counts; rare bigrams (<= 10 occurrences) are discarded.
bigram_counts AS
(SELECT w1, w2, SUM(count) bigramcount FROM bigram_counts_by_category GROUP BY w1, w2 HAVING bigramcount > 10),
-- Flatten the token lists back to one row per word occurrence.
words AS
(SELECT category, UNNEST(words) as word FROM tokenized),
-- Total token count over the whole corpus. NOTE(review): unlike word_counts
-- below, this includes empty tokens — confirm that is intended.
totalcounts AS
(SELECT COUNT(*)::FLOAT total FROM words),
word_counts AS (
SELECT category, word, COUNT(*)::FLOAT AS count FROM words WHERE word != '' GROUP BY category, word
),
word_totals AS (
-- NOTE(review): wordrate divides COUNT(*) — which, after GROUP BY word, is
-- the number of *categories* a word appears in — by the total token count;
-- presumably sum("count")/total was intended. wordrate is never referenced
-- downstream, so this does not affect the result.
SELECT word, sum("count")::FLOAT AS word_total, COUNT(*)::FLOAT/(SELECT total FROM totalcounts) wordrate FROM word_counts GROUP BY word
),
-- Attach to every surviving bigram its expected count under independence of
-- the two member words.
bigramrates AS
(SELECT *, (SELECT total FROM totalcounts) AS totalcounts,
-- The expected number of occurrences. Only divide by the total words once
-- because the second one cancels out.
w1.word_total * w2.word_total / (SELECT total FROM totalcounts) AS expected,
FROM bigram_counts
INNER JOIN word_totals w1 ON (bigram_counts.w1 = w1.word)
INNER JOIN word_totals w2 ON (bigram_counts.w2 = w2.word)
),
-- Keep only strongly collocated bigrams: observed > 100x the expected count.
topbigrams AS
(SELECT w1, w2, w1 || ' ' || w2 AS word, bigramcount FROM bigramrates WHERE bigramcount / expected > 100),
-- Per-category counts for the selected bigrams. NOTE(review): NATURAL JOIN
-- matches on the shared columns (w1, w2) here, but would silently change
-- behavior if column names shift; explicit USING (w1, w2) would be safer.
topbigrams_by_category AS (
SELECT category, word, count FROM bigram_counts_by_category NATURAL JOIN topbigrams
),
-- NOTE(review): ngrams is never referenced downstream (dead code); also the
-- plain UNION deduplicates — UNION ALL would be the correct, cheaper choice
-- if this is revived.
ngrams AS
(SELECT * FROM topbigrams_by_category UNION
SELECT * FROM word_counts),
-- Corpus-wide totals for both unigrams and the selected bigrams.
ngram_totals AS
(SELECT word, word_total FROM word_totals UNION SELECT word, bigramcount AS word_total FROM topbigrams),
word_counts_by_category AS
(SELECT word, category, COUNT(*)::FLOAT count FROM words GROUP BY category, word),
-- The terms scored below: per-category bigram counts. The commented-out
-- UNION would also score single words.
doctokens AS
(
SELECT w1 || ' ' || w2 word, category, "count"
FROM bigram_counts_by_category
-- UNION
-- SELECT * FROM word_counts_by_category
)
,
-- I created the counts as separate subqueries because I was unable to make PARTITION BY work correctly.
category_totals AS (
SELECT category, sum("count")::FLOAT AS category_size FROM word_counts GROUP BY category
),
-- Joining on ngram_totals (which only contains unigrams plus the filtered
-- topbigrams, matched on `word`) implicitly restricts scoring to those terms.
pmi_part1 AS (
SELECT
(SELECT sum("category_size")::FLOAT FROM category_totals) AS corpus_size,
*
FROM doctokens NATURAL JOIN ngram_totals NATURAL JOIN category_totals
),
pmi_part2 AS (
SELECT *,
greatest(word_total - "count", 0.2) AS count_excluding, -- 0.2 is a fudge here--you could call it a 'prior,', I guess. If a word doesn't appear, we pretend that it appeared *just a little* to make the logarithms work.
corpus_size - category_size AS size_excluding_category
FROM pmi_part1
),
-- Expected counts for the term inside vs. outside the current category,
-- under the null hypothesis that its usage rate is uniform everywhere.
pmi_part3 AS (
SELECT *,
category_size * (word_total / corpus_size) AS expected_here,
size_excluding_category * (word_total / corpus_size) AS expected_elsewhere,
FROM pmi_part2
),
-- llr: a log-likelihood-ratio-style association score (only two of the four
-- Dunning terms, so an approximation); pmi: pointwise mutual information of
-- the term and the category.
pmi_part4 AS (
SELECT *,
2 * (
"count" * log("count" / expected_here) +
"count_excluding" * log(count_excluding / expected_elsewhere)
) AS llr,
log("count" / expected_here) AS pmi,
FROM pmi_part3),
-- Rank terms within each category by llr; keep positively associated terms.
-- NOTE(review): the select list repeats several columns already covered by *
-- (category, word, count, ...) — harmless in DuckDB, but noisy.
ranked_ordered AS (
SELECT category,
*,
word,
count,
word_total,
PMI,
llr,
RANK() OVER (PARTITION BY category ORDER BY llr DESC) ordering,
FROM pmi_part4 WHERE PMI > 0
)
-- Top 24 terms (RANK < 25; ties may add more) per decade, as a list.
SELECT category, list(word) terms FROM ranked_ordered
WHERE ordering < 25 GROUP BY category ORDER BY category
Insert cell
Insert cell
Insert cell
Insert cell
client
-- DuckDB query: find the single words most characteristic of each decade of
-- dissertation titles, ranked by a log-likelihood-ratio (llr) score.
-- Output: one row per decade ("category") with a list of its top terms.
WITH
-- One row per title, bucketed into decades via round(year/10) * 10.
titles AS (
    SELECT Dissertation, round(year/10) * 10 AS category
    FROM data
    WHERE year >= 1920 AND year < 2025
),
-- Tokenize each title on space/punctuation and count each word per decade.
-- UNNEST replaces the original UNLIST (a DuckDB alias for the same function)
-- for consistency with the bigram query above.
word_counts AS (
    SELECT category, word, COUNT(*) AS count
    FROM (
        -- This should be more unicode safe
        SELECT category, UNNEST(string_split_regex(LOWER(Dissertation), '[ ,\-:\?".]')) AS word
        FROM titles
    )
    GROUP BY category, word
),
-- The totals live in separate CTEs (rather than PARTITION BY) for clarity.
word_totals AS (
    SELECT word, sum("count")::FLOAT AS word_total
    FROM word_counts
    GROUP BY word
),
category_totals AS (
    SELECT category, sum("count")::FLOAT AS category_size
    FROM word_counts
    GROUP BY category
),
-- Attach corpus-wide, per-word, and per-decade totals to every (decade, word)
-- row. Explicit USING joins replace the original NATURAL JOINs: the keys are
-- identical, but no longer depend silently on column-name overlap.
pmi_part1 AS (
    SELECT
        SUM("count") OVER ()::FLOAT AS corpus_size,
        *
    FROM word_counts
    INNER JOIN word_totals USING (word)
    INNER JOIN category_totals USING (category)
),
pmi_part2 AS (
    SELECT *,
        -- 0.2 is a fudge — call it a 'prior'. If a word never appears outside
        -- this decade, pretend it appeared *just a little* so the logarithms
        -- below stay finite.
        greatest(word_total - "count", 0.2) AS count_excluding,
        corpus_size - category_size AS size_excluding_category
    FROM pmi_part1
),
-- Expected counts inside vs. outside the decade, under the null hypothesis
-- that the word is used at the same rate everywhere.
pmi_part3 AS (
    SELECT *,
        category_size * (word_total / corpus_size) AS expected_here,
        size_excluding_category * (word_total / corpus_size) AS expected_elsewhere
    FROM pmi_part2
),
-- llr: a log-likelihood-ratio-style association score (two of the four
-- Dunning terms, so an approximation); pmi: pointwise mutual information of
-- the word and the decade.
pmi_part4 AS (
    SELECT *,
        2 * (
            "count" * log("count" / expected_here) +
            count_excluding * log(count_excluding / expected_elsewhere)
        ) AS llr,
        log("count" / expected_here) AS pmi
    FROM pmi_part3
),
-- Rank words within each decade by llr, keeping only positively associated
-- words (pmi > 0). The original select list repeated several *-covered
-- columns; the duplicates were harmless in DuckDB but are dropped here.
ranked_ordered AS (
    SELECT *,
        RANK() OVER (PARTITION BY category ORDER BY llr DESC) AS ordering
    FROM pmi_part4
    WHERE pmi > 0
)
-- Top 15 words (RANK < 16; ties can add more) per decade, as a list.
SELECT category, list(word) AS terms
FROM ranked_ordered
WHERE ordering < 16
GROUP BY category
ORDER BY category
Insert cell
client
-- Randomly ordered sample of titles matching '.* de l.*', with their original
-- row numbers, to eyeball the French-language titles the other query filters.
WITH numbered AS (
    SELECT *, (row_number() OVER ())::float AS rown FROM data
)
SELECT year, Dissertation, rown
FROM numbered
WHERE Dissertation SIMILAR TO '.* de l.*'
ORDER BY RANDOM() ASC
Insert cell
// Create the in-browser DuckDB client, exposing the attached parquet of all
// dissertations as a table named `data` for the SQL cells above.
client = DuckDBClient.of({ data: FileAttachment("all-dissertations@2.parquet") })
Insert cell
// NOTE(review): unfinished draft cell — the macro signature has a dangling
// comma (`dynamic_table(tab, )`), the SQL body stops mid-statement, and a
// single-quoted JS string cannot span multiple lines, so this cell cannot
// run as written. Either complete it or delete it.
client.execute('CREATE MACRO dynamic_table(tab, ) AS TABLE
WITH
tokenized AS SELECT
')
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more