Published
Edited
Mar 4, 2022
Importers
4 stars
Insert cell
Insert cell
Insert cell
// Every parsed record from every episode, in script order. Each record is
// tagged with a running index `i` (across all episodes) and the 1-based
// `episode` number taken from its position in `episodes`.
data = episodes
  .flatMap((script, index) =>
    Array.from(parse(script), (record) => [record, index + 1])
  )
  .map(([record, episode], i) => ({ ...record, i, episode }))
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
/**
 * Parses the plain text of one screenplay into a stream of records.
 *
 * The text is read line by line and grouped into sections. A section ends at
 * a blank line, or at a line indented exactly one column less than the
 * previous line (only when the section already holds more than one line and
 * the previous raw line does not start with a tab). Each finished section is
 * classified by its first line and yielded as one or more records:
 *
 * - `{type: "scene", location, detail, interior, raw}` when the first line
 *   matches the INT/EXT scene-header pattern; any further lines in the same
 *   section are yielded as an additional action record.
 * - `{type: "dialogue", character, line, nouns, modifier, scene, raw}` when
 *   the first line looks like a character cue on its own line (only once a
 *   scene has been seen).
 * - `{type: "dialogue", character, line, nouns, scene, raw}` for each
 *   "NAME: text" / "NAME<tab>text" line (only once a scene has been seen).
 * - `{type: "action", description, nouns, character, scene, raw}` otherwise.
 *
 * The final section is emitted even when the text does not end with a blank
 * line.
 *
 * @param {string} text - Screenplay text with "\n" line breaks.
 * @yields {object} Scene, dialogue and action records in script order.
 */
function* parse(text) {
  const notCharacter = /^(WIDE|CUT TO|A |ON FINN|SERIES OF SHOTS)/; // J.J. Abrams… 😐
  const sceneHeader = /^(?:\d+\s+)?(INT|EXT)\.? (.*)/;
  // NOTE(review): both character classes below use 1-9, so a cue containing
  // the digit 0 does not match — confirm whether that is intended.
  const dialogueNewline = /^([-A-ZÉ1-9#/. ]+)(\(O\.S\.?\)|\(V\.O\.\)|'S VOICE)?$/;
  const dialogueColon = /^([-A-ZÉ1-9/#. ]+)[:\t](.*)/;

  let lastLine;
  let lastIndent;
  let lastScene;
  let lastSection = [];

  // Builds an action record from the given lines of the current section. The
  // action is attributed to its first proper noun only when that noun is a
  // known character.
  const action = (lines) => {
    const description = paragraph(lines);
    const nouns = Array.from(properNouns(description));
    const character =
      nouns[0] && characterNouns.has(nouns[0].normalized)
        ? nouns[0].normalized
        : undefined;
    return {
      type: "action",
      description,
      nouns,
      character,
      scene: lastScene,
      raw: lastSection
    };
  };

  // A section is only emitted when a later line closes it, so a trailing
  // blank sentinel guarantees the last section is flushed. When the text
  // already ends with a blank line the sentinel is a no-op.
  for (const line of [...text.split("\n"), ""]) {
    const indent = line.match(/^\s*/)[0].length;
    const trimmed = line
      .trim()
      .replace(/^\(?CONTINUED\)?:?\s*/, "")
      .replace(" (CONT'D)", "")
      .trim();
    let match;

    if (
      (!trimmed ||
        (indent === lastIndent - 1 &&
          lastSection.length > 1 &&
          !lastLine.startsWith("\t"))) &&
      lastSection.length
    ) {
      if ((match = lastSection[0].match(sceneHeader))) {
        // ---------------------------------------- scene header
        const [, interior, heading] = match;
        const [location, ...detail] = heading.split(" - ");
        yield (lastScene = {
          type: "scene",
          location: location.trim(),
          // `detail` is a rest element, so it is always an array.
          detail: detail.map((s) => s.trim()).filter((s) => s),
          interior,
          raw: lastSection
        });
        if (lastSection.length > 1) {
          // ---------------------------------------- action (with scene header)
          yield action(lastSection.slice(1));
        }
      } else if (
        lastScene &&
        lastSection.length > 1 &&
        (match = lastSection[0].match(dialogueNewline)) &&
        !lastSection[0].endsWith(".") &&
        !lastSection[0].match(notCharacter)
      ) {
        // ---------------------------------------- dialogue (newline separated)
        const [, character, modifier] = match;
        const speech = paragraph(lastSection.slice(1));
        yield {
          type: "dialogue",
          character: normalize(character),
          line: speech,
          nouns: Array.from(properNouns(speech)),
          modifier,
          scene: lastScene,
          raw: lastSection
        };
      } else if (
        lastScene &&
        lastSection[0].match(dialogueColon) &&
        !lastSection[0].match(notCharacter)
      ) {
        // ---------------------------------------- dialogue (colon or tab separated)
        // Fold continuation lines into the preceding "NAME: text" line. The
        // first line always matches, so there is always a previous entry.
        const cues = lastSection.reduce((merged, sectionLine) => {
          if (sectionLine.match(dialogueColon)) merged.push(sectionLine);
          else
            merged[merged.length - 1] = paragraph([
              merged[merged.length - 1],
              sectionLine
            ]);
          return merged;
        }, []);
        for (const cue of cues) {
          const [, character, speech] = cue.match(dialogueColon);
          // Skip cues with nothing after the separator.
          if (speech) {
            yield {
              type: "dialogue",
              character: normalize(character),
              line: speech.trim(),
              nouns: Array.from(properNouns(speech)),
              scene: lastScene,
              raw: lastSection
            };
          }
        }
      } else {
        // ---------------------------------------- action
        yield action(lastSection);
      }
      lastSection = [];
    }

    if (trimmed) lastSection.push(trimmed);

    lastLine = line;
    lastIndent = indent;
  }
}
Insert cell
/**
 * Yields the candidate proper nouns found in a piece of text.
 *
 * Each n-gram produced by `ngrams` is kept when it starts with an ASCII
 * capital letter and its normalized form is not a stopword.
 *
 * @param {string} text - Text to scan.
 * @yields {{ngram: string, normalized: string}} The n-gram as written and
 *   its normalized form.
 */
function* properNouns(text) {
  for (const candidate of ngrams(text)) {
    const key = normalize(candidate);
    const isCapitalized = /^[A-Z]/.test(candidate);
    if (!isCapitalized || stopwords.has(key)) continue;
    yield { ngram: candidate, normalized: key };
  }
}
Insert cell
// Demo: the proper nouns extracted from a sample sentence.
[
  ...properNouns(
    "Loki Pitiyanuvath gingerly descended the stairs. He was careful to not trip on Coco."
  )
]
Insert cell
/**
 * Splits text into tokens, merging each run of adjacent capitalized words
 * into a single token (e.g. "Loki Pitiyanuvath").
 *
 * The punctuation marks . ? ! and , become tokens of their own, so a
 * capitalized run never continues across them. Every "Capt." abbreviation
 * loses its period first so that the title stays attached to the name that
 * follows. Empty tokens from leading or trailing whitespace are dropped.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance.
 */
ngrams = (text) =>
  text
    // Global + word boundary: handle every "Capt." in the text, not just the first.
    .replace(/\b(capt)\./gi, "$1")
    .replace(/([.?!,])/g, " $1")
    .split(/\s+/)
    .filter((word) => word !== "")
    .reduce((out, word, i, words) => {
      // recombine strings of capitalized words
      if (i > 0 && /^[A-Z]/.test(word) && /^[A-Z]/.test(words[i - 1]))
        out[out.length - 1] += " " + word;
      else out.push(word);
      return out;
    }, [])
Insert cell
// Demo: tokenize a sample sentence; adjacent capitalized words are merged.
ngrams("Loki Pitiyanuvath gingerly descended the stairs. He was careful to not trip on Coco.")
Insert cell
/**
 * Reduces a name as written in a script to its canonical upper-case form.
 *
 * Steps, in order: trim, upper-case, drop a trailing possessive 'S, drop a
 * leading "A " or "THE ", remove closing parentheses and quote characters,
 * then substitute the canonical name if `renames` has an entry for it.
 *
 * @param {string} noun - Name as it appears in the text.
 * @returns {string} Canonical name.
 */
normalize = (noun) => {
  const upper = noun.trim().toUpperCase();
  const withoutPossessive = upper.replace(/'S$/, "");
  const withoutArticle = withoutPossessive.replace(/^(A|THE) /, "");
  const cleaned = withoutArticle.replace(/[)"']/g, "");
  return renames.has(cleaned) ? renames.get(cleaned) : cleaned;
}
Insert cell
characterNouns = new Set([
"LUKE",
"ANAKIN",
"OBI-WAN",
"HAN",
"ARTOO",
"CHEWBACCA",
"PADMÉ",
"LEIA",
"VADER",
"THREEPIO",
"REY",
"QUI-GON",
"PALPATINE",
"FINN",
"JAR JAR",
"YODA",
"LANDO",
"DOOKU",
"BB-8",
"KYLO",
"JABBA",
"POE",
"JANGO",
"GENERAL GRIEVOUS",
"QUEEN",
"SENATOR",
"CAPTAIN PANAKA",
"COUNCIL",
"SEBULBA",
"MACE",
"WATTO",
"BOBA",
"BAIL",
"SIDIOUS",
"DROID",
"SHMI",
"BIGGS",
"NUTE",
"ZAM",
"RED LEADER",
"SITH LORD",
"WEDGE",
"MAUL",
"VICEROY",
"EIRTAE",
"CAPTAIN",
"WICKET",
"OWEN",
"GOLD LEADER",
"KITSTER",
"MAZ",
"SUPREME CHANCELLOR",
"COMMANDER",
"SENATOR AMIDALA",
"CAPTAIN TYPHO",
"RUNE",
"MAS AMEDDA",
"TAUN WE",
"BIB",
"FN-2187",
"RABE",
"BIBBLE",
"CHANCELLOR PALPATINE",
"DORME",
"STORMTROOPER",
"KI-ADI",
"BERU"
])
Insert cell
// Alias -> canonical name. normalize() looks up the cleaned, upper-cased
// name here and returns the mapped value when an entry exists. Several keys
// are misspellings of the name they map to (e.g. "ANKAIN", "OBI-WAM").
renames = new Map([
["ADMIRAL ACKBAR", "ACKBAR"],
["AMIDALA", "PADMÉ"],
["ANKAIN", "ANAKIN"],
["ANAKN", "ANAKIN"],
["ANAKNI", "ANAKIN"],
["ANAKINN", "ANAKIN"],
["ANNIE", "ANAKIN"],
["ARTOO-DETOO", "ARTOO"],
["ARTOO BEEPS", "ARTOO"],
["AUNT BERU", "BERU"],
["BAIL ORGANA", "BAIL"],
["BEN", "OBI-WAN"],
["BEN KENOBI", "OBI-WAN"],
["BOBA FETT", "BOBA"],
["C-3PO", "THREEPIO"],
["CAPT. PANAKA", "CAPTAIN PANAKA"],
["CHEWIE", "CHEWBACCA"],
["COUNT DOOKU", "DOOKU"],
["CREATURE", "YODA"],
["DARTH MAUL", "MAUL"],
["DABTH SIDIOUS", "SIDIOUS"],
["DARTH SIDIOUS", "SIDIOUS"],
["DARTH VADER", "VADER"],
["ELAN SLEAZEBAGGANO", "ELAN"],
["EMPEROR", "PALPATINE"],
["FODE BEED", "FODE/BEED"],
["FOOD SERVER", "SERVER"],
["G-3PO", "THREEPIO"],
["GUI-GON", "QUI-GON"],
["HAN SOLO", "HAN"],
["HERMIONE BAGWA", "HERMIONE"],
["INTERCOM VOICE", "INTERCOM"],
["JANGO FETT", "JANGO"],
["KI-ADI-MUNDI", "KI-ADI"],
["KYLO REN", "KYLO"],
["LORD VADER", "VADER"],
["LUKE SKYWALKER", "LUKE"],
["MACE WINDU", "MACE"],
["MACE-WINDU", "MACE"],
["MASTER KENOBI", "OBI-WAN"],
["MASTER LUKE", "LUKE"],
["MASTER YODA", "YODA"],
["MILLENNIUM FALCON", "FALCON"],
["NUTE GUNRAY", "NUTE"],
["OBI-WAM", "OBI-WAN"],
["OBI-WAN KENOBI", "OBI-WAN"],
["PALAPATINE", "PALPATINE"],
["PADME", "PADMÉ"],
["PAMDE", "PADMÉ"],
["PANAKA", "CAPTAIN PANAKA"],
["PRINCESS LEIA", "LEIA"],
["QUEEN AMIDALA", "PADMÉ"],
["QUI -GON", "QUI-GON"],
["R2", "ARTOO"],
["R2-D2", "ARTOO"],
["READ LEADER", "RED LEADER"],
["REN", "KYLO"],
["SEE-THREEPIO", "THREEPIO"],
["SIO BIBBLE", "BIBBLE"],
["SOLO", "HAN"],
["TROOPER VOICE", "TROOPER"],
["UNKAR PLUTT", "UNKAR"],
["VOICE", "INTERCOM"],
["WINDU", "MACE"],
["ZAM WESSEL", "ZAM"]
])
Insert cell
/**
 * Yields `n` elements picked uniformly at random from `array`, with
 * replacement (the same element may be yielded more than once).
 *
 * @param {Array} array - Source elements.
 * @param {number} [n=100] - How many elements to yield.
 * @yields {*} A randomly chosen element; `undefined` when `array` is empty.
 */
function* sample(array, n = 100) {
  for (let drawn = 0; drawn < n; drawn += 1) {
    const index = Math.floor(Math.random() * array.length);
    yield array[index];
  }
}
Insert cell
// Upper-cased words that never count as proper nouns: the English list from
// the "stopwords" package plus script-specific additions (interjections,
// imperatives and similar words that appear capitalized in the scripts).
stopwords = new Set(
  [
    ...(await import("https://cdn.skypack.dev/stopwords")).english,
    "beeps",
    "blast",
    "but i",
    "excuse",
    "finally",
    "good",
    "hey",
    "hold",
    "hurry",
    "listen",
    "mesa",
    "move",
    "remember",
    "send",
    "sir",
    "stay",
    "stop",
    "suddenly",
    "uh",
    "wait",
    "watch",
    "wesa",
    "whoa",
    "yeah",
    "yousa"
  ].map((word) => word.toUpperCase())
)
Insert cell
// Concatenate lines into one string. A single space goes between
// consecutive lines, unless the text so far is empty or ends in a hyphen,
// in which case the next line is appended directly.
function paragraph(lines) {
  let joined = "";
  for (const line of lines) {
    const separator = joined === "" || joined.endsWith("-") ? "" : " ";
    joined = joined + separator + line;
  }
  return joined;
}
Insert cell
// Plain text of each screenplay, one string per attached file. Each
// attachment is read as text, parsed as an HTML document, and reduced to the
// innerText of its `td.scrtext` element. The array order is what the `data`
// cell uses to number episodes (index + 1).
episodes = Promise.all(
[
FileAttachment("Star-Wars-The-Phantom-Menace@2.html"), // https://imsdb.com/scripts/Star-Wars-The-Phantom-Menace.html
FileAttachment("Star-Wars-Attack-of-the-Clones@1.html"), // https://imsdb.com/scripts/Star-Wars-Attack-of-the-Clones.html
FileAttachment("Star-Wars-Revenge-of-the-Sith@3.html"), // https://imsdb.com/scripts/Star-Wars-Revenge-of-the-Sith.html
FileAttachment("Star-Wars-A-New-Hope@1.html"), // https://imsdb.com/scripts/Star-Wars-A-New-Hope.html
FileAttachment("Star-Wars-The-Empire-Strikes-Back.html"), // https://imsdb.com/scripts/Star-Wars-The-Empire-Strikes-Back.html
FileAttachment("Star-Wars-Return-of-the-Jedi@2.html"), // https://imsdb.com/scripts/Star-Wars-Return-of-the-Jedi.html
FileAttachment("Star-Wars-The-Force-Awakens@2.html") // https://imsdb.com/scripts/Star-Wars-The-Force-Awakens.html
].map(
async (f) =>
new DOMParser()
.parseFromString(await f.text(), "text/html")
// NOTE(review): querySelector returns null when no td.scrtext exists, which
// would make this throw — presumably every attachment has one; confirm.
.querySelector("td.scrtext").innerText
)
)
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more