Public
Edited
Jul 13, 2024
Importers
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Flattens every response into one list of per-sentence records. Each record
// keeps the sentence's bag of words and text, the `index` of the response it
// came from, and its position within that response's `sentences` array.
cleanedJSONLLMResponses = jsonLLMResponses.flatMap((response) =>
  response.sentences.map((sentence, position) => ({
    bow: sentence.bow,
    respIndex: response.index,
    sentIndex: position,
    sentence: sentence.text
  }))
)
Insert cell
Insert cell
// Loads the notebook attachment "sentences_in_groups.json" and parses it as JSON.
// The comparison cells in this notebook use it as the expected grouping.
jsonPDCofLLMResponses = FileAttachment("sentences_in_groups.json").json()
Insert cell
Insert cell
// Regression check: compares the stored grouping with the grouping recomputed
// from the flattened segments. The numeric arguments are, in order:
// absolute_sim_dist_threshold, percentage_same_response_threshold,
// group_making_threshold, a, b.
// deepEqualsListOfLists is defined outside this view — presumably true on an exact match.
deepEqualsListOfLists(
jsonPDCofLLMResponses,
pdc_with_cleaner_input_spec(cleanedJSONLLMResponses, 0.55, 0.1, 1.2, 1.5, 1)
)
Insert cell
// Regression check: compares the stored grouping with the grouping that pdc
// recomputes from the raw (un-flattened) responses. The numeric arguments are,
// in order: absolute_sim_dist_threshold, percentage_same_response_threshold,
// group_making_threshold, a, b.
// deepEqualsListOfLists is defined outside this view — presumably true on an exact match.
deepEqualsListOfLists(jsonPDCofLLMResponses, pdc(jsonLLMResponses,0.55,
0.1,
1.2,
1.5,
1))
Insert cell
Insert cell
// Displays the grouping computed from the flattened segments. Arguments after the
// data are: absolute_sim_dist_threshold, percentage_same_response_threshold,
// group_making_threshold, a, b.
pdc_with_cleaner_input_spec(cleanedJSONLLMResponses, 0.55, 0.1, 1.2, 1.5, 1)
Insert cell
// Displays the grouping computed from the raw responses. Arguments after the
// data are: absolute_sim_dist_threshold, percentage_same_response_threshold,
// group_making_threshold, a, b.
pdc(jsonLLMResponses,0.55,
0.1,
1.2,
1.5,
1)
Insert cell
Insert cell
// Cut-off on the bag-of-words mismatch score between two segments
// (0 = same tokens, 1 = no shared tokens). Pairs are collected closest-first and
// collection stops once the closest remaining pair is no longer below this value.
// Within this view the calls pass the same value as a literal.
absolute_sim_dist_threshold = 0.55
Insert cell
// Upper bound on repeated responses inside a group. A segment or group is not
// merged in when 1 - (distinct responses / segments) of the combined group would
// exceed this value.
percentage_same_response_threshold = 0.1
Insert cell
// A pair is only considered for the same group when
// a * (content mismatch) + b * |difference in normalized segment location|
// is strictly below this value.
group_making_threshold = 1.2;
Insert cell
// Relative weight on content similarity (multiplies the mismatch score of a pair).
a = 1.5;
Insert cell
// Relative weight on location coherence (multiplies the absolute difference in
// normalized location of a pair). The author considers it less important than content.
b = 1;
Insert cell
Insert cell
/**
 * Groups similar sentences ("segments") taken from several responses and returns
 * the groups ordered by where their members sit inside their responses.
 *
 * Steps:
 *  1. Score every pair of segments with a bag-of-words mismatch score
 *     (0 = same tokens, 1 = no shared tokens).
 *  2. Walk the pairs from closest to farthest, stopping after the first pair
 *     whose score is not below `absolute_sim_dist_threshold`.
 *  3. For each such pair, if `a * score + b * |location difference|` is below
 *     `group_making_threshold`, start a group with it, attach the ungrouped
 *     member to the other member's group, or merge the two groups - unless
 *     the combined group would repeat responses too often.
 *  4. Add every still-ungrouped segment as a singleton, then sort the groups by
 *     the mean normalized location of their members.
 *
 * The input list and its objects are not modified.
 *
 * @param {Array<Object>} all_segments - flat list of segments. Each entry is read
 *   for `bow` (token -> count), `respIndex`, `sentence`, and either a numeric
 *   `normalized_location` or, when that is missing, `sentIndex` (the location is
 *   then derived as sentIndex / largest sentIndex of the same respIndex).
 * @param {number} absolute_sim_dist_threshold - mismatch score at which pair
 *   collection stops.
 * @param {number} percentage_same_response_threshold - a merge is refused when
 *   1 - (distinct responses / segments) of the combined group exceeds this.
 * @param {number} group_making_threshold - upper bound (exclusive) on
 *   `a * score + b * |location difference|` for a pair to be grouped.
 * @param {number} a - weight on the mismatch score.
 * @param {number} b - weight on the location difference.
 * @returns {Array<Array<*>>} groups of `sentence` values, ordered by mean
 *   location; `[]` for empty input.
 */
function pdc_with_cleaner_input_spec(
  all_segments,
  absolute_sim_dist_threshold,
  percentage_same_response_threshold,
  group_making_threshold,
  a,
  b
) {
  const segmentCount = all_segments.length;

  // Three-way comparison that never yields NaN, so sorting stays well defined.
  const ascending = (x, y) => (x < y ? -1 : x > y ? 1 : 0);

  // Reads a token count without picking up inherited properties: a token such
  // as "constructor" must not match Object.prototype.constructor.
  const ownCount = (bow, token) =>
    Object.prototype.hasOwnProperty.call(bow, token) ? bow[token] : undefined;

  // Share of token occurrences that appear in only one of the two segments.
  const mismatchScore = (first, second) => {
    let mismatchTotal = 0;
    let totalTokens = 0;
    for (const [token, count] of Object.entries(first.bow)) {
      const otherCount = ownCount(second.bow, token);
      if (otherCount === undefined) {
        mismatchTotal += count;
        totalTokens += count;
      } else {
        totalTokens += count + otherCount;
      }
    }
    for (const [token, count] of Object.entries(second.bow)) {
      if (ownCount(first.bow, token) === undefined) {
        mismatchTotal += count;
        totalTokens += count;
      }
    }
    return mismatchTotal / totalTokens; // NaN when both bags are empty
  };

  // Normalized location of each segment, kept in a side array so that the
  // caller's objects stay untouched. A numeric `normalized_location` on the
  // segment wins; otherwise it is derived from `sentIndex`.
  const lastSentIndexByResponse = new Map();
  for (const { respIndex, sentIndex } of all_segments) {
    const known = lastSentIndexByResponse.get(respIndex);
    if (known === undefined || sentIndex > known) {
      lastSentIndexByResponse.set(respIndex, sentIndex);
    }
  }
  const locations = all_segments.map((segment) => {
    const given = segment.normalized_location;
    if (typeof given === "number" && !Number.isNaN(given)) {
      return given;
    }
    if (segment.sentIndex === 0) {
      return 0; // avoids 0 / 0 for one-sentence responses
    }
    return segment.sentIndex / lastSentIndexByResponse.get(segment.respIndex);
  });

  // Full pairwise matrix, computed once; simMatrix[i][j] scores (segment i, segment j).
  const simMatrix = all_segments.map((rowSegment) =>
    all_segments.map((colSegment) => mismatchScore(rowSegment, colSegment))
  );

  // Every unordered pair [i, j] with j < i, closest first. Array.prototype.sort
  // is stable, so equally distant pairs keep their row-by-row scan order.
  const candidates = [];
  for (let i = 0; i < segmentCount; i++) {
    for (let j = 0; j < i; j++) {
      const dist = simMatrix[i][j];
      if (dist < Infinity) {
        // skips NaN scores, which can never be "closest"
        candidates.push({ pair: [i, j], dist });
      }
    }
  }
  candidates.sort((p, q) => ascending(p.dist, q.dist));

  // NOTE(review): the first pair that is NOT below the threshold is still kept
  // (it only has to pass group_making_threshold afterwards). This reproduces the
  // long-standing behaviour; confirm whether it is intended before changing it.
  const ordered_pairs = [];
  for (const { pair, dist } of candidates) {
    ordered_pairs.push(pair);
    if (!(dist < absolute_sim_dist_threshold)) {
      break;
    }
  }

  const groups = [];
  const groupOf = new Map(); // segment id -> the group array currently holding it

  // True when the given members would repeat responses more than allowed.
  const repeatsResponsesTooOften = (memberIds) => {
    const responses = memberIds.map((id) => all_segments[id].respIndex);
    const sharedFraction = 1 - new Set(responses).size / responses.length;
    return sharedFraction > percentage_same_response_threshold;
  };

  // Appends every member of `absorbed` to `target` and drops `absorbed`.
  const absorb = (target, absorbed) => {
    for (const id of absorbed) {
      target.push(id);
      groupOf.set(id, target);
    }
    groups.splice(groups.indexOf(absorbed), 1);
  };

  // Adds one ungrouped segment at whichever end of `group` it is closer to.
  const attach = (group, id) => {
    if (repeatsResponsesTooOften([...group, id])) {
      return;
    }
    const distAtStart = simMatrix[id][group[0]];
    const distAtEnd = simMatrix[group[group.length - 1]][id];
    if (distAtStart <= distAtEnd) {
      group.unshift(id);
    } else {
      group.push(id);
    }
    groupOf.set(id, group);
  };

  const addToGroups = ([left, right]) => {
    const leftGroup = groupOf.get(left);
    const rightGroup = groupOf.get(right);
    if (leftGroup === undefined && rightGroup === undefined) {
      const fresh = [left, right];
      groups.push(fresh);
      groupOf.set(left, fresh);
      groupOf.set(right, fresh);
    } else if (leftGroup === rightGroup) {
      return; // already together
    } else if (leftGroup !== undefined && rightGroup !== undefined) {
      if (repeatsResponsesTooOften([...leftGroup, ...rightGroup])) {
        return;
      }
      // Join in the order that puts the closer pair of members at the seam.
      const leftThenRight =
        simMatrix[leftGroup[leftGroup.length - 1]][rightGroup[0]];
      const rightThenLeft =
        simMatrix[rightGroup[rightGroup.length - 1]][leftGroup[0]];
      if (leftThenRight <= rightThenLeft) {
        absorb(leftGroup, rightGroup);
      } else {
        absorb(rightGroup, leftGroup);
      }
    } else if (leftGroup !== undefined) {
      attach(leftGroup, right);
    } else {
      attach(rightGroup, left);
    }
  };

  for (const pair of ordered_pairs) {
    const [i, j] = pair;
    const pair_loc_abs_diff = Math.abs(locations[i] - locations[j]);
    const pair_sim_dist = simMatrix[i][j];
    if (a * pair_sim_dist + b * pair_loc_abs_diff < group_making_threshold) {
      addToGroups(pair);
    }
  }

  // Every segment that ended up in no group becomes its own group.
  for (let id = 0; id < segmentCount; id++) {
    if (!groupOf.has(id)) {
      groups.push([id]);
    }
  }

  // Mean location of a group; members without a usable location are ignored and
  // a group with none sorts last.
  const meanLocation = (group) => {
    let sum = 0;
    let count = 0;
    for (const id of group) {
      if (!Number.isNaN(locations[id])) {
        sum += locations[id];
        count += 1;
      }
    }
    return count > 0 ? sum / count : Infinity;
  };

  // Stable sort: groups with equal mean location keep their creation order.
  const ordered_groups = groups
    .map((group) => ({ group, key: meanLocation(group) }))
    .sort((p, q) => ascending(p.key, q.key))
    .map(({ group }) => group);

  return ordered_groups.map((group) =>
    group.map((id) => all_segments[id].sentence)
  );
}
Insert cell
Insert cell
d3 = require("d3")
Insert cell
/**
 * Groups similar sentences ("segments") across several segmented documents and
 * returns the groups ordered by where their members sit inside their documents.
 *
 * Steps:
 *  1. Flatten the documents into one list of segments and give each segment a
 *     normalized location: its sentence index divided by the largest sentence
 *     index of its document (0 for the first sentence).
 *  2. Score every pair of segments with a bag-of-words mismatch score
 *     (0 = same tokens, 1 = no shared tokens).
 *  3. Walk the pairs from closest to farthest, stopping after the first pair
 *     whose score is not below `absolute_sim_dist_threshold`.
 *  4. For each such pair, if `a * score + b * |location difference|` is below
 *     `group_making_threshold`, start a group with it, attach the ungrouped
 *     member to the other member's group, or merge the two groups - unless
 *     the combined group would repeat documents too often.
 *  5. Add every still-ungrouped segment as a singleton, then sort the groups by
 *     the mean normalized location of their members.
 *
 * The input documents are not modified.
 *
 * @param {Array<Object>} segmented_documents - each entry is read for `index`
 *   and `sentences`; each sentence is read for `bow` (token -> count) and `text`.
 * @param {number} absolute_sim_dist_threshold - mismatch score at which pair
 *   collection stops.
 * @param {number} percentage_same_response_threshold - a merge is refused when
 *   1 - (distinct documents / segments) of the combined group exceeds this.
 * @param {number} group_making_threshold - upper bound (exclusive) on
 *   `a * score + b * |location difference|` for a pair to be grouped.
 * @param {number} a - weight on the mismatch score.
 * @param {number} b - weight on the location difference.
 * @returns {Array<Array<*>>} groups of sentence `text` values, ordered by mean
 *   location; `[]` when there are no sentences.
 */
function pdc(
  segmented_documents,
  absolute_sim_dist_threshold,
  percentage_same_response_threshold,
  group_making_threshold,
  a,
  b
) {
  // Three-way comparison that never yields NaN, so sorting stays well defined.
  const ascending = (x, y) => (x < y ? -1 : x > y ? 1 : 0);

  // Reads a token count without picking up inherited properties: a token such
  // as "constructor" must not match Object.prototype.constructor.
  const ownCount = (bow, token) =>
    Object.prototype.hasOwnProperty.call(bow, token) ? bow[token] : undefined;

  // Share of token occurrences that appear in only one of the two segments.
  const mismatchScore = (first, second) => {
    let mismatchTotal = 0;
    let totalTokens = 0;
    for (const [token, count] of Object.entries(first.bow)) {
      const otherCount = ownCount(second.bow, token);
      if (otherCount === undefined) {
        mismatchTotal += count;
        totalTokens += count;
      } else {
        totalTokens += count + otherCount;
      }
    }
    for (const [token, count] of Object.entries(second.bow)) {
      if (ownCount(first.bow, token) === undefined) {
        mismatchTotal += count;
        totalTokens += count;
      }
    }
    return mismatchTotal / totalTokens; // NaN when both bags are empty
  };

  // Extract and format the relevant data from segmented_documents.
  const all_segments = segmented_documents.flatMap((respObj) =>
    respObj.sentences.map((sentObj, idx) => ({
      bow: sentObj.bow,
      respIndex: respObj.index,
      sentIndex: idx,
      sentence: sentObj.text
    }))
  );
  const segmentCount = all_segments.length;

  // Largest sentence index per document. A Map keeps numeric and string
  // `respIndex` values distinct and avoids comparing numbers against the
  // string keys of a plain object.
  const lastSentIndexByResponse = new Map();
  for (const { respIndex, sentIndex } of all_segments) {
    const known = lastSentIndexByResponse.get(respIndex);
    if (known === undefined || sentIndex > known) {
      lastSentIndexByResponse.set(respIndex, sentIndex);
    }
  }
  for (const segment of all_segments) {
    segment.total_resp_length = lastSentIndexByResponse.get(segment.respIndex);
    segment.normalized_location =
      segment.sentIndex === 0
        ? 0 // avoids 0 / 0 for one-sentence documents
        : segment.sentIndex / segment.total_resp_length;
  }

  // Full pairwise matrix, computed once; simMatrix[i][j] scores (segment i, segment j).
  const simMatrix = all_segments.map((rowSegment) =>
    all_segments.map((colSegment) => mismatchScore(rowSegment, colSegment))
  );

  // Every unordered pair [i, j] with j < i, closest first. Array.prototype.sort
  // is stable, so equally distant pairs keep their row-by-row scan order.
  const candidates = [];
  for (let i = 0; i < segmentCount; i++) {
    for (let j = 0; j < i; j++) {
      const dist = simMatrix[i][j];
      if (dist < Infinity) {
        // skips NaN scores, which can never be "closest"
        candidates.push({ pair: [i, j], dist });
      }
    }
  }
  candidates.sort((p, q) => ascending(p.dist, q.dist));

  // NOTE(review): the first pair that is NOT below the threshold is still kept
  // (it only has to pass group_making_threshold afterwards). This reproduces the
  // long-standing behaviour; confirm whether it is intended before changing it.
  const ordered_pairs = [];
  for (const { pair, dist } of candidates) {
    ordered_pairs.push(pair);
    if (!(dist < absolute_sim_dist_threshold)) {
      break;
    }
  }

  const groups = [];
  const groupOf = new Map(); // segment id -> the group array currently holding it

  // True when the given members would repeat documents more than allowed.
  const repeatsResponsesTooOften = (memberIds) => {
    const responses = memberIds.map((id) => all_segments[id].respIndex);
    const sharedFraction = 1 - new Set(responses).size / responses.length;
    return sharedFraction > percentage_same_response_threshold;
  };

  // Appends every member of `absorbed` to `target` and drops `absorbed`.
  const absorb = (target, absorbed) => {
    for (const id of absorbed) {
      target.push(id);
      groupOf.set(id, target);
    }
    groups.splice(groups.indexOf(absorbed), 1);
  };

  // Adds one ungrouped segment at whichever end of `group` it is closer to.
  const attach = (group, id) => {
    if (repeatsResponsesTooOften([...group, id])) {
      return;
    }
    const distAtStart = simMatrix[id][group[0]];
    const distAtEnd = simMatrix[group[group.length - 1]][id];
    if (distAtStart <= distAtEnd) {
      group.unshift(id);
    } else {
      group.push(id);
    }
    groupOf.set(id, group);
  };

  const addToGroups = ([left, right]) => {
    const leftGroup = groupOf.get(left);
    const rightGroup = groupOf.get(right);
    if (leftGroup === undefined && rightGroup === undefined) {
      const fresh = [left, right];
      groups.push(fresh);
      groupOf.set(left, fresh);
      groupOf.set(right, fresh);
    } else if (leftGroup === rightGroup) {
      return; // already together
    } else if (leftGroup !== undefined && rightGroup !== undefined) {
      if (repeatsResponsesTooOften([...leftGroup, ...rightGroup])) {
        return;
      }
      // Join in the order that puts the closer pair of members at the seam.
      const leftThenRight =
        simMatrix[leftGroup[leftGroup.length - 1]][rightGroup[0]];
      const rightThenLeft =
        simMatrix[rightGroup[rightGroup.length - 1]][leftGroup[0]];
      if (leftThenRight <= rightThenLeft) {
        absorb(leftGroup, rightGroup);
      } else {
        absorb(rightGroup, leftGroup);
      }
    } else if (leftGroup !== undefined) {
      attach(leftGroup, right);
    } else {
      attach(rightGroup, left);
    }
  };

  for (const pair of ordered_pairs) {
    const [i, j] = pair;
    const pair_loc_abs_diff = Math.abs(
      all_segments[i].normalized_location - all_segments[j].normalized_location
    );
    const pair_sim_dist = simMatrix[i][j];
    if (a * pair_sim_dist + b * pair_loc_abs_diff < group_making_threshold) {
      addToGroups(pair);
    }
  }

  // Every segment that ended up in no group becomes its own group.
  for (let id = 0; id < segmentCount; id++) {
    if (!groupOf.has(id)) {
      groups.push([id]);
    }
  }

  // Mean normalized location of a group's members.
  const meanLocation = (group) => {
    let sum = 0;
    for (const id of group) {
      sum += all_segments[id].normalized_location;
    }
    return sum / group.length;
  };

  // Stable sort: groups with equal mean location keep their creation order.
  const ordered_groups = groups
    .map((group) => ({ group, key: meanLocation(group) }))
    .sort((p, q) => ascending(p.key, q.key))
    .map(({ group }) => group);

  return ordered_groups.map((group) =>
    group.map((id) => all_segments[id].sentence)
  );
}
Insert cell

Purpose-built for displays of data

Observable is your go-to platform for exploring data and creating expressive data visualizations. Use reactive JavaScript notebooks for prototyping and a collaborative canvas for visual data exploration and dashboard creation.
Learn more