Public
Edited
Jan 2, 2023
Insert cell
Insert cell
client
Type SQL, then Shift-Enter. Ctrl-space for more options.

Insert cell
Insert cell
Insert cell
Insert cell
chart = {
// vals; // Don't do this until the better query is done.
// Get the counts by year for the word we're interested in.
let v = await client.query(
`
WITH
word_counts AS
(SELECT year AS year, word, COUNT(*)::FLOAT count FROM
(SELECT year, UNLIST(string_split_regex(LOWER(Dissertation), '[ ,\-]')) word FROM data) t1
GROUP BY ALL
),
year_totals AS (SELECT year, SUM(count) as total FROM word_counts WHERE year > 1920 GROUP BY year),
word_count AS (SELECT * FROM word_counts WHERE word = '${word}')

SELECT year, word, COALESCE(count, 0) AS count, total, 100 * COALESCE(count, 0)::FLOAT/total::FLOAT AS rate FROM year_totals LEFT JOIN word_count USING ("year")
ORDER BY year`
);
v = v.map((v) => ({ ...v }));
return Plot.plot({
marks: [Plot.line(v, { x: "year", y: "rate" })]
});
}
Insert cell
client.query(
"SELECT Dissertation, year FROM data WHERE Dissertation LIKE '%hair%' AND year > 2010 LIMIT 10"
)
Insert cell
client
Type SQL, then Shift-Enter. Ctrl-space for more options.

Insert cell
client
WITH
titles AS
(SELECT Dissertation, round((year) / 5) * 5 AS category FROM data WHERE year >= 1923 AND year < 2023),
word_counts AS
(SELECT category, word, COUNT(*) count FROM
-- This should be more unicode safe
(SELECT category, UNLIST(string_split_regex(LOWER(Dissertation), '[ ,\-:]')) word FROM titles)
GROUP BY category, word
),
-- I created the counts as separate subqueries because I was unable to make PARTITION BY work correctly.
word_totals AS (
SELECT word, sum("count")::FLOAT AS word_total FROM word_counts GROUP BY word
),
category_totals AS (
SELECT category, sum("count")::FLOAT AS category_size FROM word_counts GROUP BY category
),
pmi_part1 AS (
SELECT
SUM("count") OVER ()::FLOAT as corpus_size,
*
FROM word_counts NATURAL JOIN word_totals NATURAL JOIN category_totals
),
pmi_part2 AS (
SELECT *,
greatest(word_total - "count", 0.2) AS count_excluding,
corpus_size - category_size AS size_excluding_category
FROM pmi_part1
),
pmi_part3 AS (
SELECT *,
category_size * (word_total / corpus_size) AS expected_here,
size_excluding_category * (word_total / corpus_size) AS expected_elsewhere,
FROM pmi_part2
),
pmi_part4 AS (
SELECT *,
2 * (
"count" * log("count" / expected_here) +
"count_excluding" * log(count_excluding / expected_elsewhere)
) AS llr,
log("count" / expected_here) AS pmi,
FROM pmi_part3),
ranked_ordered AS (
SELECT category,
*,
word,
count,
word_total,
PMI,
llr,
RANK() OVER (PARTITION BY category ORDER BY llr DESC) ordering,
FROM pmi_part4 WHERE PMI > 0
)
SELECT
category, list(word) terms FROM ranked_ordered
WHERE ordering < 16 GROUP BY category ORDER BY category
Insert cell
Type JavaScript, then Shift-Enter. Ctrl-space for more options. Arrow ↑/↓ to switch modes.

Insert cell
DuckDBClient
Insert cell
client
WITH initial AS
(SELECT occ, "department.x", COUNT(*) count FROM data WHERE occ IS NOT NULL GROUP BY ALL),
grouped AS (SELECT "Department.x", count, occ, (SUM(count) OVER (PARTITION BY "Department.x"))::FLOAT tot FROM initial)

SELECT count/tot,occ, "department.x" FROM grouped ORDER BY count DESC
Insert cell
all-dissertations@2.parquet
SELECT md5(Dissertation) FROM 'all-dissertations@2.parquet' USING SAMPLE 10;
Insert cell
all-dissertations@2.parquet
SELECT str_split(Dissertator, ', ')[2] || ' ' || str_split(Dissertator, ', ')[1] AS name FROM 'all-dissertations@2.parquet' USING SAMPLE 10;

Insert cell
client = DuckDBClient.of({ data: FileAttachment("all-dissertations@2.parquet") })
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more