function pdc(
segmented_documents,
absolute_sim_dist_threshold,
percentage_same_response_threshold,
group_making_threshold,
a,
b
) {
// Compute, per response, the highest sentence index seen (returned as
// response_lengths, keyed by respIndex) and annotate every sentence with
// total_resp_length and a normalized 0..1 location within its response.
// Mutates allSentences in place.
let compute_response_lengths_and_add_normalized_locations = (
  allSentences
) => {
  let response_lengths = {};
  for (let i = 0; i < allSentences.length; i++) {
    const s = allSentences[i];
    // FIX: the original tested Object.keys(...).includes(s.respIndex), but
    // Object.keys yields strings, so a numeric respIndex never matched and
    // every sentence simply overwrote the stored value — correct only when
    // sentences happened to arrive in ascending sentIndex order. The `in`
    // operator coerces consistently, so the running max is kept properly.
    if (s.respIndex in response_lengths) {
      if (response_lengths[s.respIndex] < s.sentIndex) {
        response_lengths[s.respIndex] = s.sentIndex;
      }
    } else {
      response_lengths[s.respIndex] = s.sentIndex;
    }
  }
  for (let i = 0; i < allSentences.length; i++) {
    const s = allSentences[i];
    // NOTE: this is the max sentence index, not a count of sentences.
    s.total_resp_length = response_lengths[s.respIndex];
    // First sentence pinned to 0 (also avoids 0/0 for single-sentence
    // responses); otherwise the fraction of the way through the response.
    s.normalized_location =
      s.sentIndex == 0
        ? 0
        : parseFloat(s.sentIndex) / parseFloat(s.total_resp_length);
  }
  return response_lengths;
};
//helper 1 -- definition of what's close; Defining a notion of distance with meta data (in the future, it may not be symmetric, especially if its specific to our visualization)
// Bag-of-words mismatch distance between two sentence objects. Tokens
// present in only one bow contribute to the mismatch; tokens present in
// both contribute (from both sides) to the denominator only. Returns the
// mismatch ratio plus per-token breakdowns for later inspection.
// (Could be replaced with a different measure, e.g. BoW cosine, or be
// driven by greying in the visualization.)
let distance = (sentence_obj1, sentence_obj2) => {
  const bowA = sentence_obj1.bow;
  const bowB = sentence_obj2.bow;
  let mismatchTotal = 0;
  let totalTokens = 0;
  const mismatchDict = {};
  const matchDict = {};
  for (const [token, countA] of Object.entries(bowA)) {
    const countB = bowB[token];
    if (countB === undefined) {
      // Token appears only in sentence 1.
      mismatchTotal += countA;
      mismatchDict[token] = countA;
      totalTokens += countA;
    } else {
      // Shared token: both counts feed the denominator, none the mismatch.
      matchDict[token] = { value: countA, value2: countB };
      totalTokens += countA + countB;
    }
  }
  for (const [token, countB] of Object.entries(bowB)) {
    if (bowA[token] === undefined) {
      // Token appears only in sentence 2.
      mismatchTotal += countB;
      mismatchDict[token] = countB;
      totalTokens += countB;
    }
  }
  return {
    mismatch_score: mismatchTotal / totalTokens,
    mismatch_total: mismatchTotal,
    total_tokens: totalTokens,
    mismatch_dict: mismatchDict,
    match_dict: matchDict
  };
};
//helper 2 --- compute pairwise distances and greedily extract closest pairs
// Returns { minSimScore, clusters, cluster_dists, simMatrix_unmodified }.
// clusters is the ordered list of [i, j] pairs pulled out cheapest-first;
// simMatrix_unmodified is the full bidirectional distance matrix.
let get_pairs = (minSimScoreThreshold, allSentences) => {
  const n = allSentences.length;
  // Full bidirectional matrix, returned untouched for downstream lookups.
  let simMatrix_unmodified = Array(n)
    .fill(0)
    .map(() => Array(n).fill(0)); // TODO: SAFEST THING IS TO FILL WITH INF NOT ZEROS
  for (let i = 0; i < n; i++) {
    for (let j = 0; j < n; j++) {
      simMatrix_unmodified[i][j] = distance(allSentences[i], allSentences[j])[
        "mismatch_score"
      ];
    }
  }
  // Working (lower-triangular) copy consumed by the clustering loop.
  // FIX: copy from simMatrix_unmodified instead of recomputing every
  // distance a second time as the original did — halves the O(n^2)
  // distance() work per call.
  let simMatrix = Array(n)
    .fill(0)
    .map(() => Array(n).fill(0)); // TODO: SAFEST THING IS TO FILL WITH INF NOT ZEROS
  for (let i = 0; i < n; i++) {
    for (let j = 0; j < i; j++) {
      simMatrix[i][j] = simMatrix_unmodified[i][j];
    }
  }
  // run clustering algo: repeatedly pull the smallest remaining distance.
  let clusters = [];
  let cluster_dists = [];
  // Scan the lower triangle for the minimum entry; returns [min, [i, j]]
  // or [Infinity, []] when no pair remains.
  const getMaxSimIndices = (arr) => {
    let min = Infinity;
    let indices = [];
    for (let i = 0; i < arr.length; i++) {
      for (let j = 0; j < i; j++) {
        //TODO: EXPAND TO ARR.LENGTH IF ASSYMETRIC FUNCTION
        if (arr[i][j] < min) {
          min = arr[i][j];
          indices = [i, j];
        }
      }
    }
    return [min, indices];
  };
  let [minSimScore, bestPair] = getMaxSimIndices(simMatrix);
  // FIX: guard n < 2 — the original indexed simMatrix with an empty
  // bestPair and threw a TypeError when there were no pairs at all.
  if (bestPair.length > 0) {
    clusters.push(bestPair);
    cluster_dists.push(minSimScore);
    simMatrix[bestPair[0]][bestPair[1]] = Infinity; // consume this pair
    // Keep pulling pairs while the previous one was under the threshold.
    // (As in the original, the first over-threshold pair is still pushed.)
    while (minSimScore < minSimScoreThreshold) {
      [minSimScore, bestPair] = getMaxSimIndices(simMatrix);
      if (bestPair.length === 0) break; // FIX: every pair already consumed
      clusters.push(bestPair);
      cluster_dists.push(minSimScore);
      simMatrix[bestPair[0]][bestPair[1]] = Infinity;
    }
  }
  // only output that is used is simMatrix_unmodified (and clusters in MAIN)
  return { minSimScore, clusters, cluster_dists, simMatrix_unmodified };
};
//helper: index of the group that already contains sentenceID, or -1 if none
let getCurrentGroupIdx = (sentenceID, groups) =>
  groups.findIndex((group) => group.includes(sentenceID));
//helper to check that there aren't too many segments in a cluster from the
//same document: returns the fraction of segments in the combined group
//whose response index duplicates another member's (0 = all distinct).
let getPercentageSharedResponses = (groupA, groupB, allSentences) => {
  const respIndices = [...groupA, ...groupB].map(
    (segmentID) => allSentences[segmentID].respIndex
  );
  const uniqueResponses = new Set(respIndices);
  return 1 - uniqueResponses.size / respIndices.length;
};
// Merge the two segment indices in `pair` into the running partition
// `groups` (an array of arrays of segment indices), mutating `groups` in
// place and returning it.
// NOTE(review): reads pdc closure variables `all_segments`, `simMatrix`,
// and `percentage_same_response_threshold`; `all_segments` and `simMatrix`
// are declared later in pdc but before the first call, so the closure
// resolves at call time — verify this ordering is preserved if refactoring.
let addToGroups = (groups,pair) => {
let group_idx_pair0 = getCurrentGroupIdx(pair[0], groups)
let group_idx_pair1 = getCurrentGroupIdx(pair[1], groups)
// Case 1: neither endpoint is grouped yet -> the pair founds a new group.
if (group_idx_pair0 == -1 && group_idx_pair1 == -1) {
groups.push(pair);
// Case 2: both endpoints already sit in the same group.
} else if (group_idx_pair0 == group_idx_pair1){
return groups //do nothing!
// Case 3: endpoints are in two different existing groups -> try to merge.
} else if (group_idx_pair0 > -1 && group_idx_pair1 > -1) {
//TODO: WHAT PERCENTAGE OF SENTENCES IN THIS CLUSTER SHARE A RESPONSE? IF 30% OR MORE SHARE THE SAME RESPONSE WHEN COMBINED, DON'T COMBINE
// Refuse the merge if the combined group would draw too many of its
// segments from the same response.
if (getPercentageSharedResponses(groups[group_idx_pair0],groups[group_idx_pair1],all_segments) > percentage_same_response_threshold) {
return groups
}
//WHICH IS BETTER FOR MINIMIZING DISTANCES OF ADJACENT GROUP MEMBERS---ADDING A TO B OR B TO A?
// NOTE(review): slice(-1)/slice(0,1) return ONE-ELEMENT ARRAYS, so the
// simMatrix lookups below index with an array; JS coerces e.g. [5] to
// "5", which resolves to the intended row/column. Works, but fragile.
let end_of_group_a = groups[group_idx_pair0].slice(-1);
let start_of_group_b = groups[group_idx_pair1].slice(0,1);
let end_of_group_b = groups[group_idx_pair1].slice(-1);
let start_of_group_a = groups[group_idx_pair0].slice(0,1);
// Compare the two concatenation orders and keep the one whose seam
// (last element of one group against first of the other) is closer.
let dist_a_push_b = simMatrix[end_of_group_a][start_of_group_b];
let dist_b_push_a = simMatrix[end_of_group_b][start_of_group_a];
if (dist_a_push_b <= dist_b_push_a) {
// Append group B's members onto group A, then delete group B.
let group_part_b = groups[group_idx_pair1];
for (let i = 0; i<group_part_b.length; i++){
groups[group_idx_pair0].push(group_part_b[i]);
}
groups.splice(group_idx_pair1,1); //removes the group was added elsewhere
} else {
// Append group A's members onto group B, then delete group A.
let group_part_a = groups[group_idx_pair0];
for (let i = 0; i<group_part_a.length; i++){
groups[group_idx_pair1].push(group_part_a[i]);
}
groups.splice(group_idx_pair0,1); //removes the group was added elsewhere
}
//TODO: CONSIDER TESTING FOR MERGED CLUSTER PROPERTIES/STATS
// Case 4: only pair[0] is grouped -> insert pair[1] at whichever end of
// that group is closer to it (subject to the shared-response cap).
} else if (group_idx_pair0 > -1){
if (getPercentageSharedResponses(groups[group_idx_pair0],[pair[1]],all_segments) > percentage_same_response_threshold) {
return groups
}
let start_of_group = groups[group_idx_pair0].slice(0,1);
let end_of_group = groups[group_idx_pair0].slice(-1);
let dist_at_end_of_group = simMatrix[end_of_group][pair[1]];
let dist_at_start_of_group = simMatrix[pair[1]][start_of_group];
if (dist_at_start_of_group <= dist_at_end_of_group){
groups[group_idx_pair0].splice(0,0,pair[1]); // prepend
} else{
groups[group_idx_pair0].splice(groups[group_idx_pair0].length,0,pair[1]); // append
}
//TODO: CONSIDER TESTING FOR MERGED CLUSTER PROPERTIES/STATS
//TODO: WHAT PERCENTAGE OF SENTENCES IN THIS CLUSTER SHARE A RESPONSE? IF 30% OR MORE SHARE THE SAME RESPONSE WHEN COMBINED, DON'T COMBINE
// Case 5: only pair[1] is grouped -> mirror image of case 4.
} else if (group_idx_pair1 > -1){
if (getPercentageSharedResponses(groups[group_idx_pair1],[pair[0]],all_segments) > percentage_same_response_threshold) {
return groups
}
let start_of_group = groups[group_idx_pair1].slice(0,1);
let end_of_group = groups[group_idx_pair1].slice(-1);
let dist_at_end_of_group = simMatrix[end_of_group][pair[0]];
let dist_at_start_of_group = simMatrix[pair[0]][start_of_group];
if (dist_at_start_of_group <= dist_at_end_of_group){
groups[group_idx_pair1].splice(0,0,pair[0]); // prepend
} else {
groups[group_idx_pair1].splice(groups[group_idx_pair1].length,0,pair[0]); // append
}
//TODO: CONSIDER TESTING FOR MERGED CLUSTER PROPERTIES/STATS
//TODO: WHAT PERCENTAGE OF SENTENCES IN THIS CLUSTER SHARE A RESPONSE? IF 30% OR MORE SHARE THE SAME RESPONSE WHEN COMBINED, DON'T COMBINE
}
return groups; //TODO: PUT IN LOGIC
}
//MAIN
//extract and format relevant data from segmented_documents
let all_segments = segmented_documents
.map((respObj) => {
return respObj.sentences.map((sentObj, idx) => {
const sentenceDetails = {
bow: sentObj.bow,
respIndex: respObj.index,
sentIndex: idx,
sentence: sentObj.text
};
return sentenceDetails;
});
})
.flat();
//add normalized locations of segments within documents; changes all_segments in place
let response_lengths =
compute_response_lengths_and_add_normalized_locations(all_segments);
let simMatrix = get_pairs(
absolute_sim_dist_threshold,
all_segments
).simMatrix_unmodified;
let pairs = get_pairs(absolute_sim_dist_threshold, all_segments).clusters;
//TODO: WHY IS GET_PAIRS RUN TWICE?
// this computes the ordered clusters
let groups = [];
let ordered_pairs = get_pairs(absolute_sim_dist_threshold, all_segments).clusters;
for (let pair_idx = 0; pair_idx < ordered_pairs.length; pair_idx++) {
let pair = ordered_pairs[pair_idx];
let pair_loc_abs_diff = Math.abs(
all_segments[pair[0]].normalized_location -
all_segments[pair[1]].normalized_location
);
let pair_sim_dist = simMatrix[pair[0]][pair[1]]; // TODO: SAFEST IS TO CONSIDER BOTH DIRECTIONS OR MIN OF BOTH DIRECTIONS IF DISTANCE IS ASSYMMETRIC
if (a * pair_sim_dist + b * pair_loc_abs_diff < group_making_threshold) {
groups = addToGroups(groups, pair);
//return groups
}
}
// add singleton clusters to groups
let flattened_groups = groups.flat();
for (let i=0; i<all_segments.length; i++){
if (!flattened_groups.includes(i)){
groups.push([i]); // adds a singleton
}
}
//return groups
//now order it by average or median normalized location
//compute aggregate normalized location for each group
let ordered_groups = [];
let median_groups_locations = [];
let num_groups = groups.length;
for (let group_idx = 0; group_idx< num_groups; group_idx++){
let group_locations = [];
for (let intra_group_idx = 0; intra_group_idx<groups[group_idx].length; intra_group_idx++){
group_locations.push(all_segments[groups[group_idx][intra_group_idx]].normalized_location);
}
median_groups_locations.push(d3.mean(group_locations));
}
//return median_groups_locations
for (let i=0; i<num_groups; i++){
let next_group_idx = median_groups_locations.indexOf(d3.min(median_groups_locations)); //TODO: TIE BREAK TO PUT SENTENCES FROM LONGER DOCS FIRST //update: using mean fixed this
ordered_groups.push(groups[next_group_idx]);
groups.splice(next_group_idx,1);
median_groups_locations.splice(next_group_idx,1);
//return median_groups_locations
}
//for pretty printing
let sentences_in_groups = [];
for (let i = 0; i < ordered_groups.length; i++) {
sentences_in_groups.push([]);
for (let j = 0; j < ordered_groups[i].length; j++) {
sentences_in_groups[i].push(all_segments[ordered_groups[i][j]].sentence);
}
}
return sentences_in_groups;
}