Public
Edited
Dec 29, 2022
Insert cell
Insert cell
client
Type SQL, then Shift-Enter. Ctrl-space for more options.

Insert cell
Insert cell
Insert cell
Insert cell
client
Type SQL, then Shift-Enter. Ctrl-space for more options.

Insert cell
client
SELECT * FROM data WHERE occ IS NOT NULL;
Insert cell
client
WITH
titles AS
(SELECT Dissertation, "Employer Category" AS category FROM data WHERE carnegie IS NOT NULL),
word_counts AS
(SELECT category, word, COUNT(*) count FROM
-- This should be more unicode safe
(SELECT category, UNLIST(string_split_regex(LOWER(Dissertation), '[^a-zé]+')) word FROM titles)
GROUP BY category, word
),
-- I created the counts as separate subqueries because I was unable to make PARTITION BY work correctly.
word_totals AS (
SELECT word, sum("count")::FLOAT AS word_total FROM word_counts GROUP BY word
),
category_totals AS (
SELECT category, sum("count")::FLOAT AS category_size FROM word_counts GROUP BY category
),
pmi_part1 AS
( SELECT
SUM("count") OVER ()::FLOAT as corpus_size,
*
FROM word_counts NATURAL JOIN word_totals NATURAL JOIN category_totals
),
-- A few steps to define all the variables we need.
pmi_part2 AS (
SELECT *,
greatest(word_total - "count", 0.2) AS count_excluding,
corpus_size - category_size AS size_excluding_category
FROM pmi_part1
),
pmi_part3 AS (
SELECT *,
category_size * (word_total / corpus_size) AS expected_here,
size_excluding_category * (word_total / corpus_size) AS expected_elsewhere,
FROM pmi_part2
),
pmi_part4 AS (
SELECT *,
2 * (
"count" * log("count" / expected_here) +
"count_excluding" * log(count_excluding / expected_elsewhere)
) AS llr,
log("count" / expected_here) AS pmi,
FROM pmi_part3),
ranked_ordered AS (
SELECT category,
*,
word,
count,
word_total,
PMI,
llr,
RANK() OVER (PARTITION BY category ORDER BY llr DESC) ordering,
FROM pmi_part4 WHERE PMI > 0 AND llr > 1 AND category_size > 100 AND count > 2
)
SELECT
category, list(word) terms FROM ranked_ordered
WHERE ordering < 16 GROUP BY category ORDER BY category
Insert cell
client.query("SELECT * FROM data WHERE Dissertation LIKE '% et %'")
Insert cell
c
Insert cell
DuckDBClient
Insert cell
client
WITH initial AS
(SELECT occ, "department.x", COUNT(*) count FROM data WHERE occ IS NOT NULL GROUP BY ALL),
grouped AS (SELECT "Department.x", count, occ, (SUM(count) OVER (PARTITION BY "Department.x"))::FLOAT tot FROM initial)

SELECT count/tot,occ, "department.x" FROM grouped ORDER BY count DESC
Insert cell
client = DuckDBClient.of({ data: FileAttachment("all-dissertations@2.parquet") })
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more