empiricalBreastCancerIncidenceRates = (
  await fetchAndParseCsv(
    "https://raw.githubusercontent.com/jeyabbalas/py-icare/master/data/iCARE-Lit/age_specific_breast_cancer_incidence_rates.csv",
    "\n"
  )
).map((d) => {
  return { age: parseInt(d.age), rate: parseFloat(d.rate) };
})
empiricalCdfs = empiricalCdf(empiricalBreastCancerIncidenceRates)
pgsData = loadPgsData(4, "GRCh37")
snpNames = getSnpNames(pgsData)
mafs = pgsData.data[pgsData.headers.indexOf("allelefrequency_effect")]
betas = pgsData.data[pgsData.headers.indexOf("effect_weight")]
nSamples = 100000 // Note to Rodrigo: Keep this fixed for all diseases/populations
prs = simulatePolygenicRiskScores(nSamples, mafs, betas)
weibullParams = fitWeibullParams(empiricalCdfs, prs, {
  errorTolerance: 1e-12,
  maxIterations: 200
})
maxAge = empiricalBreastCancerIncidenceRates.at(-1).age
modeledIncidenceRates = generateWeibullIncidenceCurve(
  weibullParams.k,
  weibullParams.b,
  prs,
  maxAge
)
nCohortSamples = 1000
entryAgeRange = ({ min: 50, max: 67 })
followupRange = ({ min: 5, max: 13 })
simulatedProspectiveCohort = simulateProspectiveCohort(
  snpNames,
  mafs,
  betas,
  weibullParams.k,
  weibullParams.b,
  entryAgeRange,
  followupRange,
  nCohortSamples
)
kmCurveData = generateKaplanMeierData(simulatedProspectiveCohort)
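One way to inspect `kmCurveData` is with step marks; a sketch, assuming the `Plot` (Observable Plot) library that Observable notebooks provide by default:

Plot.plot({
  x: { label: "Years since study entry" },
  y: { domain: [0, 1], label: "Survival probability" },
  marks: [
    // Shaded Greenwood 95% CI band, drawn as a step function like the estimate.
    Plot.areaY(kmCurveData, {
      x: "time",
      y1: "lower",
      y2: "upper",
      curve: "step-after",
      fillOpacity: 0.2
    }),
    Plot.lineY(kmCurveData, { x: "time", y: "survival", curve: "step-after" })
  ]
})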
async function fetchAndParseCsv(url, lineSeparator, nLines) {
  const response = await fetch(url);
  let data = (await response.text()).split(lineSeparator);
  const header = data[0].split(",");

  // Keep only the first nLines data rows when requested; otherwise keep all rows.
  if (nLines !== undefined) {
    data = data.slice(1, nLines + 1);
  } else {
    data = data.slice(1);
  }

  // Convert each row into an object keyed by the header; values remain strings.
  data = data.map((d) => {
    if (d.trim() === "") return null;
    const elements = d.split(",");
    return header.reduce((obj, k, i) => ({ ...obj, [k]: elements[i] }), {});
  });

  return data.filter((row) => row !== null);
}
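For reference, the optional `nLines` argument caps how many data rows are parsed; all values come back as strings, which is why the incidence-rate cell above applies parseInt/parseFloat. A usage sketch reusing the incidence-rate file from above (`first5Rows` is a new illustrative cell):

first5Rows = fetchAndParseCsv(
  "https://raw.githubusercontent.com/jeyabbalas/py-icare/master/data/iCARE-Lit/age_specific_breast_cancer_incidence_rates.csv",
  "\n",
  5
)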
function empiricalCdf(incidenceRates) {
  // Treat each age-specific incidence rate as a piecewise-constant annual hazard:
  // F(t) = 1 - exp(-sum of rates up to age t).
  let cumulativeHazard = 0;
  const cdfArray = incidenceRates.map((ageRate) => {
    cumulativeHazard += ageRate.rate;
    const cdf = 1 - Math.exp(-cumulativeHazard);
    return { age: ageRate.age, cdf };
  });

  return cdfArray;
}
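A toy sketch of that identity, using two made-up rate values (the cell name `empiricalCdfToyCheck` is new here):

empiricalCdfToyCheck = {
  const toyRates = [
    { age: 40, rate: 0.001 },
    { age: 41, rate: 0.002 }
  ]; // hypothetical hazards, for illustration only
  // Expected: cdf at age 41 = 1 - exp(-(0.001 + 0.002))
  return empiricalCdf(toyRates);
}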
async function loadPgsData(id, build = "GRCh37") {
const pgsCatalogId = "PGS" + id.toString().padStart(6, "0");

try {
const url = `https://ftp.ebi.ac.uk/pub/databases/spot/pgs/scores/${pgsCatalogId}/ScoringFiles/Harmonized/${pgsCatalogId}_hmPOS_${build}.txt.gz`;

const text = await fetchGzippedFile(url);
const parsed = parseTsvContent(text);

return parsed;
} catch (error) {
console.error("Failed to load or parse file:", error);
throw error;
}
}
async function fetchGzippedFile(url) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    // Decompress the gzipped payload in the browser via the Streams API.
    const compressed = await response.arrayBuffer();
    const ds = new DecompressionStream("gzip");
    const compressedStream = new Blob([compressed]).stream().pipeThrough(ds);

    const decompressedResponse = new Response(compressedStream);
    const text = await decompressedResponse.text();

    return text;
  } catch (error) {
    console.error("Error fetching or decompressing file:", error);
    throw error;
  }
}
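`DecompressionStream` is available in current browsers but missing from older ones, so a guard such as the sketch below (a new helper, not called anywhere above) could fail fast with a clearer message:

function assertDecompressionSupport() {
  // Minimal feature check; fetchGzippedFile above assumes gzip support exists.
  if (typeof DecompressionStream === "undefined") {
    throw new Error("This browser does not support DecompressionStream('gzip')");
  }
}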
function parseTsvContent(content) {
  const lines = content.split("\n");

  // PGS scoring files begin with "#"-prefixed metadata; the header is the
  // first non-comment, non-empty line.
  const headerIndex = lines.findIndex(
    (line) => !line.startsWith("#") && line.trim().length > 0
  );
  if (headerIndex === -1) {
    throw new Error("No header line found in file");
  }

  const headers = lines[headerIndex].split("\t").map((header) => header.trim());
  const numColumns = headers.length;

  // Columnar layout: data[i] holds the full column for headers[i].
  const data = Array(numColumns)
    .fill()
    .map(() => []);

  lines
    .slice(headerIndex + 1)
    .filter((line) => line.trim().length > 0)
    .forEach((line) => {
      const values = line.split("\t");
      values.forEach((value, colIndex) => {
        if (colIndex < numColumns) {
          const trimmedValue = value ? value.trim() : "";
          // Coerce numeric-looking fields to numbers; keep the rest as strings.
          data[colIndex].push(
            !isNaN(trimmedValue) && trimmedValue !== ""
              ? Number(trimmedValue)
              : trimmedValue
          );
        }
      });
    });

  return {
    headers,
    data
  };
}
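The columnar return shape is why later cells index columns via `pgsData.headers.indexOf(...)`. A tiny sketch on an inline TSV string (the cell name `parseTsvToyCheck` is new here):

parseTsvToyCheck = parseTsvContent("a\tb\n1\tx\n2\ty")
// => { headers: ["a", "b"], data: [[1, 2], ["x", "y"]] }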
function getSnpNames(pgsData) {
  const chrNames = pgsData.data[pgsData.headers.indexOf("chr_name")];
  const chrPositions = pgsData.data[pgsData.headers.indexOf("chr_position")];
  const effectAlleles = pgsData.data[pgsData.headers.indexOf("effect_allele")];
  const otherAlleles = pgsData.data[pgsData.headers.indexOf("other_allele")];

  // Identify each SNP as "chromosome:position:effect allele:other allele".
  const snpNames = chrNames?.map(
    (_, i) =>
      `${chrNames[i]}:${chrPositions[i]}:${effectAlleles[i]}:${otherAlleles[i]}`
  );

  return snpNames;
}
function simulatePolygenicRiskScores(nSamples, mafs, betas) {
  if (
    !Array.isArray(mafs) ||
    !Array.isArray(betas) ||
    mafs.length !== betas.length ||
    nSamples <= 0
  ) {
    throw new Error("Invalid input parameters");
  }

  const drawGenotypeHWE = (maf) => {
    // Hardy-Weinberg equilibrium (i.e. genotype ~ Binomial(2, maf))
    const r = Math.random();
    const probHomozygousMajor = (1 - maf) ** 2; // p^2
    const probHeterozygous = 2 * (1 - maf) * maf; // 2pq
    if (r < probHomozygousMajor) return 0;
    if (r < probHomozygousMajor + probHeterozygous) return 1;
    return 2;
  };

  const nSnps = betas.length;
  const prs = new Float64Array(nSamples);

  // PRS_i = sum_j beta_j * g_ij, with genotypes drawn independently per SNP.
  for (let i = 0; i < nSamples; i++) {
    let iPrs = 0;
    for (let j = 0; j < nSnps; j++) {
      const g = drawGenotypeHWE(mafs[j]);
      iPrs += betas[j] * g;
    }
    prs[i] = iPrs;
  }

  return prs;
}
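Under HWE the expected genotype at SNP j is 2·maf_j, so E[PRS] = Σ_j 2·maf_j·β_j; comparing that against the simulated mean is a cheap sanity check. A sketch (the cell name `prsMeanCheck` is new here):

prsMeanCheck = {
  const expectedMean = mafs.reduce((sum, maf, j) => sum + 2 * maf * betas[j], 0);
  const simulatedMean = prs.reduce((sum, x) => sum + x, 0) / prs.length;
  return { expectedMean, simulatedMean }; // should agree closely at nSamples = 100,000
}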
async function fitWeibullParams(
  empiricalCdfs,
  linearPredictors,
  optimizerParams = { errorTolerance: 1e-12, maxIterations: 200 }
) {
  await pyodide.loadPackage("scipy");

  const ages = empiricalCdfs.map((x) => x.age);
  const empCdfs = empiricalCdfs.map((x) => x.cdf);
  const expLinearPredictors = Array.from(linearPredictors.map(Math.exp));

  pyodide.globals.set("ages", JSON.stringify(ages));
  pyodide.globals.set("emp_cdfs", JSON.stringify(empCdfs));
  pyodide.globals.set(
    "exp_linear_predictors",
    JSON.stringify(expLinearPredictors)
  );

  pyodide.globals.set("maxiter", optimizerParams.maxIterations);
  pyodide.globals.set("ftol", optimizerParams.errorTolerance);

  const pyCode = `
import json
import numpy as np
from scipy.optimize import minimize

# Load data passed from JS
ages = np.array(json.loads(ages), dtype=float)
emp_cdfs = np.array(json.loads(emp_cdfs), dtype=float)
exp_linear_predictors = np.array(json.loads(exp_linear_predictors), dtype=float)

N = len(exp_linear_predictors)

def population_cdf(k, b, t_array, expLP):
    """
    F_pop(t) for each t in t_array,
    where F_pop(t) = 1 - (1/N) * sum_{i=1..N} exp(-b * t^k * exp(beta^T Z_i)).
    """
    results = np.zeros_like(t_array, dtype=float)
    for ix, t in enumerate(t_array):
        if t <= 0.0:
            results[ix] = 0.0
        else:
            sum_surv = np.exp(-b * (t ** k) * expLP).sum()
            avg_surv = sum_surv / N
            results[ix] = 1.0 - avg_surv
    return results

def rmse(pred, truth):
    return np.sqrt(np.mean((pred - truth) ** 2))

def rmse_weibull(params):
    """
    Objective function: RMSE(F_pop(ages), emp_cdfs).
    params = [k, log_b] with k > 0.
    """
    k, log_b = params
    b = np.exp(log_b)
    model_cdf = population_cdf(k, b, ages, exp_linear_predictors)
    return rmse(model_cdf, emp_cdfs)

def callback(p):
    k, log_b = p
    print(f" Current k={k:.4e}, b={np.exp(log_b):.4e} => RMSE={rmse_weibull(p):.4e}")

# Initial guess: k = 1.0, b = exp(0) = 1.0
initial_guess = [1.0, 0.0]
bounds = [(1e-8, None), (-20, 5)]  # k > 0; b is optimized on the log scale
options = {'maxiter': maxiter, 'ftol': ftol, 'gtol': 1e-12}

result = minimize(
    rmse_weibull,
    x0=initial_guess,
    method='L-BFGS-B',
    bounds=bounds,
    callback=callback,
    options=options
)

if not result.success:
    print("Optimization failed:", result.message)

opt_k = result.x[0]
opt_log_b = result.x[1]
opt_b = np.exp(opt_log_b)
final_rmse = rmse_weibull([opt_k, opt_log_b])

print("\\nFit done:")
print(f" k={opt_k:.4e}, b={opt_b:.4e}, RMSE={final_rmse:.4e}")

print(f"Optimization result details:\\n\\tSuccess: {result.success}\\n\\tStatus: {result.status}\\n\\tMessage: {result.message}\\n\\tNumber of iterations: {result.nit}\\n\\tNumber of function evaluations: {result.nfev}")

result_list = [opt_k, opt_b, final_rmse]
result_list
`;

  // runPythonAsync returns a PyProxy of the Python list; convert it to a JS array.
  const result = (await pyodide.runPythonAsync(pyCode)).toJs(); // [k, b, final_rmse]

  return {
    k: result[0],
    b: result[1],
    rmse: result[2]
  };
}
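One cheap spot check on the fit: evaluate the fitted population CDF, F_pop(t) = 1 - (1/N) Σ exp(-b·e^PRS_i·t^k), at the oldest empirical age and compare against the empirical CDF endpoint; the two should be close if the optimization converged. A sketch (the cell name `weibullFitEndpointCheck` is new here):

weibullFitEndpointCheck = {
  const { k, b } = weibullParams;
  const expLP = Array.from(prs, Math.exp);
  const t = maxAge;
  // Average survival across the simulated PRS distribution at age t.
  const avgSurv =
    expLP.reduce((s, e) => s + Math.exp(-b * e * Math.pow(t, k)), 0) /
    expLP.length;
  return {
    modeled: 1 - avgSurv,
    empirical: empiricalCdfs.at(-1).cdf
  };
}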
function generateWeibullIncidenceCurve(k, b, linearPredictors, maxAge) {
  // For each age = 0..maxAge, define the predicted incidence "rate" as the
  // probability of an event in [age, age+1) per person-year, approximated by
  // the difference of the population CDF:
  //
  //   incRate(age) = [ F_pop(age+1) - F_pop(age) ] / 1
  //   F_pop(t) = 1 - (1/N) * sum_i exp(-b * expLP_i * t^k)

  const expLinearPredictors = linearPredictors.map(Math.exp);

  function populationCdf(t) {
    if (t <= 0) return 0.0;
    let sumSurv = 0.0;
    for (let i = 0; i < expLinearPredictors.length; i++) {
      sumSurv += Math.exp(-b * expLinearPredictors[i] * Math.pow(t, k));
    }
    const avgSurv = sumSurv / expLinearPredictors.length;
    return 1 - avgSurv;
  }

  const results = [];
  for (let age = 0; age <= maxAge; age++) {
    const cdf1 = populationCdf(age);
    const cdf2 = populationCdf(age + 1);
    const inc = cdf2 - cdf1; // probability of event in [age, age+1) => "annual incidence rate"
    results.push({ age, rate: inc });
  }
  return results;
}
/**
 * Simulate a prospective cohort under a Cox-Weibull model with left-truncation
 * at study entry age 'a'. The output matches the format of the earlier
 * `simulateGenotypeProfile` but adds a column called "time_of_onset".
 *
 * @param {string[]} snpNames Array of SNP names
 * @param {number[]} mafs Minor allele frequencies (same length as snpNames)
 * @param {number[]} betas Coefficients for each SNP (same length as snpNames)
 * @param {number} k Learned Weibull shape parameter
 * @param {number} b Learned Weibull rate (or scale-like) parameter
 * @param {{min: number, max: number}} entryAgeRange
 * @param {{min: number, max: number}} followupRange
 * @param {number} nSamples
 * @returns {{ names: string[], data: number[][] }}
 */
function simulateProspectiveCohort(
  snpNames,
  mafs,
  betas,
  k,
  b,
  entryAgeRange,
  followupRange,
  nSamples
) {
  if (snpNames.length !== mafs.length || mafs.length !== betas.length) {
    throw new Error("Lengths of snpNames, mafs, and betas must all match");
  }
  if (
    typeof entryAgeRange.min !== "number" ||
    typeof entryAgeRange.max !== "number" ||
    typeof followupRange.min !== "number" ||
    typeof followupRange.max !== "number"
  ) {
    throw new Error("Invalid range parameters");
  }

  function generateGenotype(maf) {
    // Hardy-Weinberg equilibrium
    const r = Math.random();
    const p0 = (1 - maf) ** 2;
    const p1 = 2 * maf * (1 - maf);
    if (r < p0) return 0;
    if (r < p0 + p1) return 1;
    return 2;
  }

  function getRandomInt(min, max) {
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }

  const demographicNames = [
    "study_entry_age",
    "observed_followup",
    "study_exit_age",
    "time_of_onset"
  ];
  const allNames = [...demographicNames, ...snpNames];

  const data = new Array(nSamples);

  for (let i = 0; i < nSamples; i++) {
    // Sample entry age and follow-up
    const entryAge = getRandomInt(entryAgeRange.min, entryAgeRange.max);
    const followup = getRandomInt(followupRange.min, followupRange.max);
    const exitAge = entryAge + followup;

    // Generate genotype data
    const genotype = mafs.map(generateGenotype);

    // Compute PRS
    const prs = genotype.reduce((sum, g, idx) => sum + g * betas[idx], 0);

    // Sample time_of_onset from the conditional distribution T | T >= entryAge
    // by inverting S(t | T >= a) = U:
    //   T^k = a^k - ln(U) / [ b * exp(prs) ]
    // Since ln(U) <= 0, val >= a^k >= 0, so the loop normally exits on the
    // first draw; the guard only protects against degenerate numeric cases.
    const aPow = Math.pow(entryAge, k);
    const eLP = Math.exp(prs);

    let timeOfOnset;
    while (true) {
      const U = Math.random();
      const val = aPow - Math.log(U) / (b * eLP);
      if (val >= 0) {
        timeOfOnset = Math.pow(val, 1 / k);
        break;
      }
    }
    data[i] = [entryAge, followup, exitAge, timeOfOnset, ...genotype];
  }

  return {
    names: allNames,
    data
  };
}
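A quick way to eyeball the simulated cohort is the fraction of participants whose onset falls within follow-up. A sketch (the cell name `cohortEventFraction` is new here; what counts as a plausible fraction depends on the disease and entry ages):

cohortEventFraction = {
  const onsetCol = simulatedProspectiveCohort.names.indexOf("time_of_onset");
  const exitCol = simulatedProspectiveCohort.names.indexOf("study_exit_age");
  const nEvents = simulatedProspectiveCohort.data.filter(
    (row) => row[onsetCol] <= row[exitCol]
  ).length;
  return nEvents / simulatedProspectiveCohort.data.length;
}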
/**
 * Generate Kaplan-Meier curve data from a simulated cohort.
 *
 * The returned array is an array of objects:
 *   [ {time, survival, lower, upper}, ... ]
 *
 * where:
 *   - time is the time since study entry
 *   - survival is the Kaplan-Meier survival estimate at that time
 *   - lower, upper are approximate 95% CI (Greenwood’s formula)
 *
 * @param {{names: string[], data: number[][]}} cohort
 *   A dataset from `simulateProspectiveCohort`, where columns are:
 *     0: study_entry_age
 *     1: observed_followup
 *     2: study_exit_age
 *     3: time_of_onset
 *     4..: genotypes
 * @returns {Array<{time: number, survival: number, lower: number, upper: number}>}
 */
function generateKaplanMeierData(cohort) {
  if (!cohort || !cohort.names || !cohort.data) {
    throw new Error("Invalid cohort input");
  }

  const colStudyEntry = cohort.names.indexOf("study_entry_age");
  const colFollowup = cohort.names.indexOf("observed_followup");
  const colExitAge = cohort.names.indexOf("study_exit_age");
  const colOnset = cohort.names.indexOf("time_of_onset");

  if (
    colStudyEntry === -1 ||
    colFollowup === -1 ||
    colExitAge === -1 ||
    colOnset === -1
  ) {
    throw new Error("Missing required columns in cohort data");
  }

  const subjects = cohort.data.map((row) => {
    const entryAge = row[colStudyEntry];
    const followup = row[colFollowup];
    const exitAge = row[colExitAge];
    const onsetAge = row[colOnset];

    // The event occurs if time_of_onset <= study_exit_age
    // timeSinceEntryOfEvent = onsetAge - entryAge
    const eventTimeSinceEntry = onsetAge - entryAge;
    const censorTime = followup; // = exitAge - entryAge

    // Observed time is min(eventTimeSinceEntry, censorTime)
    const observedTime = Math.min(eventTimeSinceEntry, censorTime);

    // eventIndicator = 1 if onsetAge <= exitAge, 0 otherwise
    const isEvent = eventTimeSinceEntry <= censorTime ? 1 : 0;

    return {
      time: observedTime,
      event: isEvent
    };
  });

  // Exclude any negative times (shouldn't happen since T >= entryAge,
  // but guard against numeric rounding).
  const cleaned = subjects.filter((subj) => subj.time >= 0);

  // Sort by time ascending
  cleaned.sort((a, b) => a.time - b.time);

  // Kaplan-Meier calculation
  //   S(0) = 1
  //   For each unique event time t_j:
  //     r_j = number at risk just prior to t_j
  //     d_j = number of events at t_j
  //     S(t_j) = S(t_{j-1}) * (1 - d_j / r_j)
  //
  // Greenwood variance:
  //   var( S(t_j) ) = S(t_j)^2 * sum_{i=1..j} [ d_i / ( r_i * (r_i - d_i) ) ]
  //   95% CI => S(t_j) +/- 1.96 * sqrt( var( S(t_j) ) )

  let atRisk = cleaned.length; // at time=0
  let prevSurv = 1.0; // S(0)
  let prevVarTerm = 0.0; // sum_{i} [ d_i / (r_i*(r_i - d_i)) ]
  const kmData = [];
  kmData.push({
    time: 0,
    survival: 1.0,
    lower: 1.0,
    upper: 1.0
  });

  let idx = 0;

  while (idx < cleaned.length) {
    const thisTime = cleaned[idx].time;

    let nEventsAtThisTime = 0;
    let nTotalAtThisTime = 0;

    // Group all subjects sharing this exact time (events and censorings).
    while (idx < cleaned.length && cleaned[idx].time === thisTime) {
      nTotalAtThisTime++;
      if (cleaned[idx].event === 1) {
        nEventsAtThisTime++;
      }
      idx++;
    }

    // r_j = 'atRisk' just prior to t_j
    const rj = atRisk;
    const dj = nEventsAtThisTime;

    // If no events at this time, survival does not jump down
    if (dj > 0) {
      const newSurv = prevSurv * (1 - dj / rj);
      // Greenwood increment
      const increment = dj / (rj * (rj - dj));
      prevVarTerm += increment;

      // variance of S(t_j)
      const varSurv = newSurv * newSurv * prevVarTerm;
      const sdSurv = Math.sqrt(varSurv);
      const z = 1.96; // ~95% normal approximation
      const lower = Math.max(0, newSurv - z * sdSurv);
      const upper = Math.min(1, newSurv + z * sdSurv);

      prevSurv = newSurv;

      kmData.push({
        time: thisTime,
        survival: newSurv,
        lower,
        upper
      });
    } else {
      // push a point in case we want a step at an event-free time
      kmData.push({
        time: thisTime,
        survival: prevSurv,
        lower: Math.max(0, prevSurv - 1e-8),
        upper: Math.min(1, prevSurv + 1e-8)
      });
    }

    // Everyone who had time == thisTime is no longer at risk after
    // (whether event or censored).
    atRisk -= nTotalAtThisTime;
  }

  return kmData;
}
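The linear Greenwood interval above can spill outside [0, 1], hence the clamping. A common alternative computes the interval on the log(-log S) scale, which stays inside [0, 1] by construction. A sketch of that variant (a new helper, not wired into the function above; it would slot in where `lower`/`upper` are computed, reusing the cumulative Greenwood term `prevVarTerm`):

function loglogConfidenceInterval(survival, greenwoodSum, z = 1.96) {
  // The transform is undefined at S = 0 or S = 1; return a degenerate interval.
  if (survival <= 0 || survival >= 1) {
    return { lower: survival, upper: survival };
  }
  // Delta method: se of log(-log S) is sqrt(greenwoodSum) / |log S|.
  const se = Math.sqrt(greenwoodSum) / Math.abs(Math.log(survival));
  const theta = Math.exp(z * se);
  return {
    lower: Math.pow(survival, theta), // S^theta < S for S in (0, 1)
    upper: Math.pow(survival, 1 / theta)
  };
}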

pyodide = (
  await import("https://cdn.jsdelivr.net/pyodide/v0.27.2/full/pyodide.mjs")
).loadPyodide()
