q = `WITH
word_counts AS
(SELECT boroname || ': ' || nta_name category, spc_common as word, COUNT(*) count FROM trees
GROUP BY category, word
),
-- I created the counts as separate subqueries because I was unable to make PARTITION BY work correctly.
word_totals AS (
SELECT word, sum("count")::FLOAT AS word_total FROM word_counts GROUP BY word
),
category_totals AS (
SELECT category, sum("count")::FLOAT AS category_size FROM word_counts GROUP BY category
),
pmi_part1 AS (
SELECT
SUM("count") OVER ()::FLOAT as corpus_size,
*
FROM word_counts NATURAL JOIN word_totals NATURAL JOIN category_totals
),
pmi_part2 AS (
SELECT *,
greatest(word_total - "count", 0.2) AS count_excluding, -- 0.2 is a fudge here--you could call it a 'prior,', I guess. If a word doesn't appear, we pretend that it appeared *just a little* to make the logarithms work.
corpus_size - category_size AS size_excluding_category
FROM pmi_part1
),
pmi_part3 AS (
SELECT *,
category_size * (word_total / corpus_size) AS expected_here,
size_excluding_category * (word_total / corpus_size) AS expected_elsewhere,
FROM pmi_part2
),
pmi_part4 AS (
SELECT *,
2 * (
"count" * log("count" / expected_here) +
"count_excluding" * log(count_excluding / expected_elsewhere)
) AS llr,
log("count" / expected_here) AS pmi,
FROM pmi_part3),
ranked_ordered AS (
SELECT category,
*,
word || '(' || count::INT || ')' AS label,
count,
word_total,
PMI,
llr,
CASE WHEN PMI < 0 THEN 'under-represented' ELSE 'over-represented' END AS direction,
RANK() OVER (PARTITION BY category, PMI < 0 ORDER BY llr DESC) ordering,
FROM pmi_part4
)
SELECT
category, direction, list(label) terms FROM ranked_ordered
WHERE ordering < 6 GROUP BY direction, category ORDER BY direction, category`