WITH
titles AS
(SELECT Dissertation as id, Dissertation as text, round(year/10) * 10 AS category FROM data WHERE year >= 1920 AND year < 2025
-- There are a lot of French and Spanish dissertations in the 80s/90s, this makes them go away
AND "Dissertation" NOT SIMILAR TO '(^(L''|La |Le |El ).*)|(.*de [lL].*)')
,
tokenized AS (SELECT id, category, string_split_regex(LOWER(text), '[ ,\-:\?".''\|«»]') words FROM titles),
bigrams_raw AS
(SELECT category, UNNEST(array_pop_back(words)) w1, UNNEST(array_pop_front(words)) w2 FROM tokenized),
bigrams AS
(SELECT * FROM bigrams_raw WHERE w1 != '' and w2 != ''),
bigram_counts_by_category AS
(SELECT category, w1, w2, COUNT(*)::FLOAT AS count FROM bigrams GROUP BY category, w1, w2),
bigram_counts AS
(SELECT w1, w2, SUM(count) bigramcount FROM bigram_counts_by_category GROUP BY w1, w2 HAVING bigramcount > 10),
words AS
(SELECT category, UNNEST(words) as word FROM tokenized),
totalcounts AS
(SELECT COUNT(*)::FLOAT total FROM words),
word_counts AS (
SELECT category, word, COUNT(*)::FLOAT AS count FROM words WHERE word != '' GROUP BY category, word
),
word_totals AS (
SELECT word, sum("count")::FLOAT AS word_total, COUNT(*)::FLOAT/(SELECT total FROM totalcounts) wordrate FROM word_counts GROUP BY word
),
bigramrates AS
(SELECT *, (SELECT total FROM totalcounts) AS totalcounts,
-- The expected number of occurrences. Only divide by the total words once
-- because the second one cancels out.
w1.word_total * w2.word_total / (SELECT total FROM totalcounts) AS expected,
FROM bigram_counts
INNER JOIN word_totals w1 ON (bigram_counts.w1 = w1.word)
INNER JOIN word_totals w2 ON (bigram_counts.w2 = w2.word)
),
topbigrams AS
(SELECT w1, w2, w1 || ' ' || w2 AS word, bigramcount FROM bigramrates WHERE bigramcount / expected > 100),
topbigrams_by_category AS (
SELECT category, word, count FROM bigram_counts_by_category NATURAL JOIN topbigrams
),
ngrams AS
(SELECT * FROM topbigrams_by_category UNION
SELECT * FROM word_counts),
ngram_totals AS
(SELECT word, word_total FROM word_totals UNION SELECT word, bigramcount AS word_total FROM topbigrams),
word_counts_by_category AS
(SELECT word, category, COUNT(*)::FLOAT count FROM words GROUP BY category, word),
doctokens AS
(
SELECT w1 || ' ' || w2 word, category, "count"
FROM bigram_counts_by_category
-- UNION
-- SELECT * FROM word_counts_by_category
)
,
-- I created the counts as separate subqueries because I was unable to make PARTITION BY work correctly.
category_totals AS (
SELECT category, sum("count")::FLOAT AS category_size FROM word_counts GROUP BY category
),
pmi_part1 AS (
SELECT
(SELECT sum("category_size")::FLOAT FROM category_totals) AS corpus_size,
*
FROM doctokens NATURAL JOIN ngram_totals NATURAL JOIN category_totals
),
pmi_part2 AS (
SELECT *,
greatest(word_total - "count", 0.2) AS count_excluding, -- 0.2 is a fudge here--you could call it a 'prior,', I guess. If a word doesn't appear, we pretend that it appeared *just a little* to make the logarithms work.
corpus_size - category_size AS size_excluding_category
FROM pmi_part1
),
pmi_part3 AS (
SELECT *,
category_size * (word_total / corpus_size) AS expected_here,
size_excluding_category * (word_total / corpus_size) AS expected_elsewhere,
FROM pmi_part2
),
pmi_part4 AS (
SELECT *,
2 * (
"count" * log("count" / expected_here) +
"count_excluding" * log(count_excluding / expected_elsewhere)
) AS llr,
log("count" / expected_here) AS pmi,
FROM pmi_part3),
ranked_ordered AS (
SELECT category,
*,
word,
count,
word_total,
PMI,
llr,
RANK() OVER (PARTITION BY category ORDER BY llr DESC) ordering,
FROM pmi_part4 WHERE PMI > 0
)
SELECT category, list(word) terms FROM ranked_ordered
WHERE ordering < 25 GROUP BY category ORDER BY category