empiricalBreastCancerIncidenceRates = (
  await fetchAndParseCsv(
    "https://raw.githubusercontent.com/jeyabbalas/py-icare/master/data/iCARE-Lit/age_specific_breast_cancer_incidence_rates.csv",
    "\n"
  )
).map((d) => {
  return { age: parseInt(d.age), rate: parseFloat(d.rate) };
})
empiricalCdfs = empiricalCdf(empiricalBreastCancerIncidenceRates)
pgsData = loadPgsData(4, "GRCh37")
snpNames = getSnpNames(pgsData)
mafs = pgsData.data[pgsData.headers.indexOf("allelefrequency_effect")]
betas = pgsData.data[pgsData.headers.indexOf("effect_weight")]
nSamples = 100000 // Note to Rodrigo: Keep this fixed for all diseases/populations
prs = simulatePolygenicRiskScores(nSamples, mafs, betas)
weibullParams = fitWeibullParams(empiricalCdfs, prs, {
  errorTolerance: 1e-12,
  maxIterations: 200
})
maxAge = empiricalBreastCancerIncidenceRates.at(-1).age
modeledIncidenceRates = generateWeibullIncidenceCurve(
  weibullParams.k,
  weibullParams.b,
  prs,
  maxAge
)
nCohortSamples = 1000
entryAgeRange = ({ min: 50, max: 67 })
followupRange = ({ min: 5, max: 13 })
simulatedProspectiveCohort = simulateProspectiveCohort(
  snpNames,
  mafs,
  betas,
  weibullParams.k,
  weibullParams.b,
  entryAgeRange,
  followupRange,
  nCohortSamples
)
kmCurveData = generateKaplanMeierData(simulatedProspectiveCohort)
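One way to inspect `kmCurveData` is with step marks; a sketch, assuming the `Plot` (Observable Plot) library that Observable notebooks provide by default:

Plot.plot({
  x: { label: "Years since study entry" },
  y: { domain: [0, 1], label: "Survival probability" },
  marks: [
    // Shaded Greenwood 95% CI band, drawn as a step function like the estimate.
    Plot.areaY(kmCurveData, {
      x: "time",
      y1: "lower",
      y2: "upper",
      curve: "step-after",
      fillOpacity: 0.2
    }),
    Plot.lineY(kmCurveData, { x: "time", y: "survival", curve: "step-after" })
  ]
})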
async function fetchAndParseCsv(url, lineSeparator, nLines) {
  const response = await fetch(url);
  let data = (await response.text()).split(lineSeparator);
  const header = data[0].split(",");

  // Keep only the first nLines data rows when requested; otherwise keep all rows.
  if (nLines !== undefined) {
    data = data.slice(1, nLines + 1);
  } else {
    data = data.slice(1);
  }

  // Convert each row into an object keyed by the header; values remain strings.
  data = data.map((d) => {
    if (d.trim() === "") return null;
    const elements = d.split(",");
    return header.reduce((obj, k, i) => ({ ...obj, [k]: elements[i] }), {});
  });

  return data.filter((row) => row !== null);
}
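For reference, the optional `nLines` argument caps how many data rows are parsed; all values come back as strings, which is why the incidence-rate cell above applies parseInt/parseFloat. A usage sketch reusing the incidence-rate file from above (`first5Rows` is a new illustrative cell):

first5Rows = fetchAndParseCsv(
  "https://raw.githubusercontent.com/jeyabbalas/py-icare/master/data/iCARE-Lit/age_specific_breast_cancer_incidence_rates.csv",
  "\n",
  5
)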
function empiricalCdf(incidenceRates) {
  // Treat each age-specific incidence rate as a piecewise-constant annual hazard:
  // F(t) = 1 - exp(-sum of rates up to age t).
  let cumulativeHazard = 0;
  const cdfArray = incidenceRates.map((ageRate) => {
    cumulativeHazard += ageRate.rate;
    const cdf = 1 - Math.exp(-cumulativeHazard);
    return { age: ageRate.age, cdf };
  });

  return cdfArray;
}
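A toy sketch of that identity, using two made-up rate values (the cell name `empiricalCdfToyCheck` is new here):

empiricalCdfToyCheck = {
  const toyRates = [
    { age: 40, rate: 0.001 },
    { age: 41, rate: 0.002 }
  ]; // hypothetical hazards, for illustration only
  // Expected: cdf at age 41 = 1 - exp(-(0.001 + 0.002))
  return empiricalCdf(toyRates);
}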
async function loadPgsData(id, build = "GRCh37") {
const pgsCatalogId = "PGS" + id.toString().padStart(6, "0");

try {
const url = `https://ftp.ebi.ac.uk/pub/databases/spot/pgs/scores/${pgsCatalogId}/ScoringFiles/Harmonized/${pgsCatalogId}_hmPOS_${build}.txt.gz`;

const text = await fetchGzippedFile(url);
const parsed = parseTsvContent(text);

return parsed;
} catch (error) {
console.error("Failed to load or parse file:", error);
throw error;
}
}
async function fetchGzippedFile(url) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    // Decompress the gzipped payload in the browser via the Streams API.
    const compressed = await response.arrayBuffer();
    const ds = new DecompressionStream("gzip");
    const compressedStream = new Blob([compressed]).stream().pipeThrough(ds);

    const decompressedResponse = new Response(compressedStream);
    const text = await decompressedResponse.text();

    return text;
  } catch (error) {
    console.error("Error fetching or decompressing file:", error);
    throw error;
  }
}
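`DecompressionStream` is available in current browsers but missing from older ones, so a guard such as the sketch below (a new helper, not called anywhere above) could fail fast with a clearer message:

function assertDecompressionSupport() {
  // Minimal feature check; fetchGzippedFile above assumes gzip support exists.
  if (typeof DecompressionStream === "undefined") {
    throw new Error("This browser does not support DecompressionStream('gzip')");
  }
}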
function parseTsvContent(content) {
  const lines = content.split("\n");

  // PGS scoring files begin with "#"-prefixed metadata; the header is the
  // first non-comment, non-empty line.
  const headerIndex = lines.findIndex(
    (line) => !line.startsWith("#") && line.trim().length > 0
  );
  if (headerIndex === -1) {
    throw new Error("No header line found in file");
  }

  const headers = lines[headerIndex].split("\t").map((header) => header.trim());
  const numColumns = headers.length;

  // Columnar layout: data[i] holds the full column for headers[i].
  const data = Array(numColumns)
    .fill()
    .map(() => []);

  lines
    .slice(headerIndex + 1)
    .filter((line) => line.trim().length > 0)
    .forEach((line) => {
      const values = line.split("\t");
      values.forEach((value, colIndex) => {
        if (colIndex < numColumns) {
          const trimmedValue = value ? value.trim() : "";
          // Coerce numeric-looking fields to numbers; keep the rest as strings.
          data[colIndex].push(
            !isNaN(trimmedValue) && trimmedValue !== ""
              ? Number(trimmedValue)
              : trimmedValue
          );
        }
      });
    });

  return {
    headers,
    data
  };
}
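The columnar return shape is why later cells index columns via `pgsData.headers.indexOf(...)`. A tiny sketch on an inline TSV string (the cell name `parseTsvToyCheck` is new here):

parseTsvToyCheck = parseTsvContent("a\tb\n1\tx\n2\ty")
// => { headers: ["a", "b"], data: [[1, 2], ["x", "y"]] }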
function getSnpNames(pgsData) {
  const chrNames = pgsData.data[pgsData.headers.indexOf("chr_name")];
  const chrPositions = pgsData.data[pgsData.headers.indexOf("chr_position")];
  const effectAlleles = pgsData.data[pgsData.headers.indexOf("effect_allele")];
  const otherAlleles = pgsData.data[pgsData.headers.indexOf("other_allele")];

  // Identify each SNP as "chromosome:position:effect allele:other allele".
  const snpNames = chrNames?.map(
    (_, i) =>
      `${chrNames[i]}:${chrPositions[i]}:${effectAlleles[i]}:${otherAlleles[i]}`
  );

  return snpNames;
}
function simulatePolygenicRiskScores(nSamples, mafs, betas) {
  if (
    !Array.isArray(mafs) ||
    !Array.isArray(betas) ||
    mafs.length !== betas.length ||
    nSamples <= 0
  ) {
    throw new Error("Invalid input parameters");
  }

  const drawGenotypeHWE = (maf) => {
    // Hardy-Weinberg equilibrium (i.e. genotype ~ Binomial(2, maf))
    const r = Math.random();
    const probHomozygousMajor = (1 - maf) ** 2; // p^2
    const probHeterozygous = 2 * (1 - maf) * maf; // 2pq
    if (r < probHomozygousMajor) return 0;
    if (r < probHomozygousMajor + probHeterozygous) return 1;
    return 2;
  };

  const nSnps = betas.length;
  const prs = new Float64Array(nSamples);

  // PRS_i = sum_j beta_j * g_ij, with genotypes drawn independently per SNP.
  for (let i = 0; i < nSamples; i++) {
    let iPrs = 0;
    for (let j = 0; j < nSnps; j++) {
      const g = drawGenotypeHWE(mafs[j]);
      iPrs += betas[j] * g;
    }
    prs[i] = iPrs;
  }

  return prs;
}
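Under HWE the expected genotype at SNP j is 2·maf_j, so E[PRS] = Σ_j 2·maf_j·β_j; comparing that against the simulated mean is a cheap sanity check. A sketch (the cell name `prsMeanCheck` is new here):

prsMeanCheck = {
  const expectedMean = mafs.reduce((sum, maf, j) => sum + 2 * maf * betas[j], 0);
  const simulatedMean = prs.reduce((sum, x) => sum + x, 0) / prs.length;
  return { expectedMean, simulatedMean }; // should agree closely at nSamples = 100,000
}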
async function fitWeibullParams(
  empiricalCdfs,
  linearPredictors,
  optimizerParams = { errorTolerance: 1e-12, maxIterations: 200 }
) {
  await pyodide.loadPackage("scipy");

  const ages = empiricalCdfs.map((x) => x.age);
  const empCdfs = empiricalCdfs.map((x) => x.cdf);
  const expLinearPredictors = Array.from(linearPredictors.map(Math.exp));

  pyodide.globals.set("ages", JSON.stringify(ages));
  pyodide.globals.set("emp_cdfs", JSON.stringify(empCdfs));
  pyodide.globals.set(
    "exp_linear_predictors",
    JSON.stringify(expLinearPredictors)
  );

  pyodide.globals.set("maxiter", optimizerParams.maxIterations);
  pyodide.globals.set("ftol", optimizerParams.errorTolerance);

  const pyCode = `
import json
import numpy as np
from scipy.optimize import minimize

# Load data passed from JS
ages = np.array(json.loads(ages), dtype=float)
emp_cdfs = np.array(json.loads(emp_cdfs), dtype=float)
exp_linear_predictors = np.array(json.loads(exp_linear_predictors), dtype=float)

N = len(exp_linear_predictors)

def population_cdf(k, b, t_array, expLP):
    """
    F_pop(t) for each t in t_array,
    where F_pop(t) = 1 - (1/N) * sum_{i=1..N} exp(-b * t^k * exp(beta^T Z_i)).
    """
    results = np.zeros_like(t_array, dtype=float)
    for ix, t in enumerate(t_array):
        if t <= 0.0:
            results[ix] = 0.0
        else:
            sum_surv = np.exp(-b * (t ** k) * expLP).sum()
            avg_surv = sum_surv / N
            results[ix] = 1.0 - avg_surv
    return results

def rmse(pred, truth):
    return np.sqrt(np.mean((pred - truth) ** 2))

def rmse_weibull(params):
    """
    Objective function: RMSE(F_pop(ages), emp_cdfs).
    params = [k, log_b] with k > 0.
    """
    k, log_b = params
    b = np.exp(log_b)
    model_cdf = population_cdf(k, b, ages, exp_linear_predictors)
    return rmse(model_cdf, emp_cdfs)

def callback(p):
    k, log_b = p
    print(f" Current k={k:.4e}, b={np.exp(log_b):.4e} => RMSE={rmse_weibull(p):.4e}")

# Initial guess: k = 1.0, b = exp(0) = 1.0
initial_guess = [1.0, 0.0]
bounds = [(1e-8, None), (-20, 5)]  # k > 0; b is optimized on the log scale
options = {'maxiter': maxiter, 'ftol': ftol, 'gtol': 1e-12}

result = minimize(
    rmse_weibull,
    x0=initial_guess,
    method='L-BFGS-B',
    bounds=bounds,
    callback=callback,
    options=options
)

if not result.success:
    print("Optimization failed:", result.message)

opt_k = result.x[0]
opt_log_b = result.x[1]
opt_b = np.exp(opt_log_b)
final_rmse = rmse_weibull([opt_k, opt_log_b])

print("\\nFit done:")
print(f" k={opt_k:.4e}, b={opt_b:.4e}, RMSE={final_rmse:.4e}")

print(f"Optimization result details:\\n\\tSuccess: {result.success}\\n\\tStatus: {result.status}\\n\\tMessage: {result.message}\\n\\tNumber of iterations: {result.nit}\\n\\tNumber of function evaluations: {result.nfev}")

result_list = [opt_k, opt_b, final_rmse]
result_list
`;

  // runPythonAsync returns a PyProxy of the Python list; convert it to a JS array.
  const result = (await pyodide.runPythonAsync(pyCode)).toJs(); // [k, b, final_rmse]

  return {
    k: result[0],
    b: result[1],
    rmse: result[2]
  };
}
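One cheap spot check on the fit: evaluate the fitted population CDF, F_pop(t) = 1 - (1/N) Σ exp(-b·e^PRS_i·t^k), at the oldest empirical age and compare against the empirical CDF endpoint; the two should be close if the optimization converged. A sketch (the cell name `weibullFitEndpointCheck` is new here):

weibullFitEndpointCheck = {
  const { k, b } = weibullParams;
  const expLP = Array.from(prs, Math.exp);
  const t = maxAge;
  // Average survival across the simulated PRS distribution at age t.
  const avgSurv =
    expLP.reduce((s, e) => s + Math.exp(-b * e * Math.pow(t, k)), 0) /
    expLP.length;
  return {
    modeled: 1 - avgSurv,
    empirical: empiricalCdfs.at(-1).cdf
  };
}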
function generateWeibullIncidenceCurve(k, b, linearPredictors, maxAge) {
  // For each age = 0..maxAge, define the predicted incidence "rate" as the
  // probability of an event in [age, age+1) per person-year, approximated by
  // the difference of the population CDF:
  //
  //   incRate(age) = [ F_pop(age+1) - F_pop(age) ] / 1
  //   F_pop(t) = 1 - (1/N) * sum_i exp(-b * expLP_i * t^k)

  const expLinearPredictors = linearPredictors.map(Math.exp);

  function populationCdf(t) {
    if (t <= 0) return 0.0;
    let sumSurv = 0.0;
    for (let i = 0; i < expLinearPredictors.length; i++) {
      sumSurv += Math.exp(-b * expLinearPredictors[i] * Math.pow(t, k));
    }
    const avgSurv = sumSurv / expLinearPredictors.length;
    return 1 - avgSurv;
  }

  const results = [];
  for (let age = 0; age <= maxAge; age++) {
    const cdf1 = populationCdf(age);
    const cdf2 = populationCdf(age + 1);
    const inc = cdf2 - cdf1; // probability of event in [age, age+1) => "annual incidence rate"
    results.push({ age, rate: inc });
  }
  return results;
}
/**
 * Simulate a prospective cohort under a Cox-Weibull model with left-truncation
 * at study entry age 'a'. The output matches the format of the earlier
 * `simulateGenotypeProfile` but adds a column called "time_of_onset".
 *
 * @param {string[]} snpNames Array of SNP names
 * @param {number[]} mafs Minor allele frequencies (same length as snpNames)
 * @param {number[]} betas Coefficients for each SNP (same length as snpNames)
 * @param {number} k Learned Weibull shape parameter
 * @param {number} b Learned Weibull rate (or scale-like) parameter
 * @param {{min: number, max: number}} entryAgeRange
 * @param {{min: number, max: number}} followupRange
 * @param {number} nSamples
 * @returns {{ names: string[], data: number[][] }}
 */
function simulateProspectiveCohort(
  snpNames,
  mafs,
  betas,
  k,
  b,
  entryAgeRange,
  followupRange,
  nSamples
) {
  if (snpNames.length !== mafs.length || mafs.length !== betas.length) {
    throw new Error("Lengths of snpNames, mafs, and betas must all match");
  }
  if (
    typeof entryAgeRange.min !== "number" ||
    typeof entryAgeRange.max !== "number" ||
    typeof followupRange.min !== "number" ||
    typeof followupRange.max !== "number"
  ) {
    throw new Error("Invalid range parameters");
  }

  function generateGenotype(maf) {
    // Hardy-Weinberg equilibrium
    const r = Math.random();
    const p0 = (1 - maf) ** 2;
    const p1 = 2 * maf * (1 - maf);
    if (r < p0) return 0;
    if (r < p0 + p1) return 1;
    return 2;
  }

  function getRandomInt(min, max) {
    return Math.floor(Math.random() * (max - min + 1)) + min;
  }

  const demographicNames = [
    "study_entry_age",
    "observed_followup",
    "study_exit_age",
    "time_of_onset"
  ];
  const allNames = [...demographicNames, ...snpNames];

  const data = new Array(nSamples);

  for (let i = 0; i < nSamples; i++) {
    // Sample entry age and follow-up
    const entryAge = getRandomInt(entryAgeRange.min, entryAgeRange.max);
    const followup = getRandomInt(followupRange.min, followupRange.max);
    const exitAge = entryAge + followup;

    // Generate genotype data
    const genotype = mafs.map(generateGenotype);

    // Compute PRS
    const prs = genotype.reduce((sum, g, idx) => sum + g * betas[idx], 0);

    // Sample time_of_onset from the conditional distribution T | T >= entryAge
    // by inverting S(t | T >= a) = U:
    //   T^k = a^k - ln(U) / [ b * exp(prs) ]
    // Since ln(U) <= 0, val >= a^k >= 0, so the loop normally exits on the
    // first draw; the guard only protects against degenerate numeric cases.
    const aPow = Math.pow(entryAge, k);
    const eLP = Math.exp(prs);

    let timeOfOnset;
    while (true) {
      const U = Math.random();
      const val = aPow - Math.log(U) / (b * eLP);
      if (val >= 0) {
        timeOfOnset = Math.pow(val, 1 / k);
        break;
      }
    }
    data[i] = [entryAge, followup, exitAge, timeOfOnset, ...genotype];
  }

  return {
    names: allNames,
    data
  };
}
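A quick way to eyeball the simulated cohort is the fraction of participants whose onset falls within follow-up. A sketch (the cell name `cohortEventFraction` is new here; what counts as a plausible fraction depends on the disease and entry ages):

cohortEventFraction = {
  const onsetCol = simulatedProspectiveCohort.names.indexOf("time_of_onset");
  const exitCol = simulatedProspectiveCohort.names.indexOf("study_exit_age");
  const nEvents = simulatedProspectiveCohort.data.filter(
    (row) => row[onsetCol] <= row[exitCol]
  ).length;
  return nEvents / simulatedProspectiveCohort.data.length;
}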
/**
 * Generate Kaplan-Meier curve data from a simulated cohort.
 *
 * The returned array is an array of objects:
 *   [ {time, survival, lower, upper}, ... ]
 *
 * where:
 *   - time is the time since study entry
 *   - survival is the Kaplan-Meier survival estimate at that time
 *   - lower, upper are approximate 95% CI (Greenwood’s formula)
 *
 * @param {{names: string[], data: number[][]}} cohort
 *   A dataset from `simulateProspectiveCohort`, where columns are:
 *     0: study_entry_age
 *     1: observed_followup
 *     2: study_exit_age
 *     3: time_of_onset
 *     4..: genotypes
 * @returns {Array<{time: number, survival: number, lower: number, upper: number}>}
 */
function generateKaplanMeierData(cohort) {
  if (!cohort || !cohort.names || !cohort.data) {
    throw new Error("Invalid cohort input");
  }

  const colStudyEntry = cohort.names.indexOf("study_entry_age");
  const colFollowup = cohort.names.indexOf("observed_followup");
  const colExitAge = cohort.names.indexOf("study_exit_age");
  const colOnset = cohort.names.indexOf("time_of_onset");

  if (
    colStudyEntry === -1 ||
    colFollowup === -1 ||
    colExitAge === -1 ||
    colOnset === -1
  ) {
    throw new Error("Missing required columns in cohort data");
  }

  const subjects = cohort.data.map((row) => {
    const entryAge = row[colStudyEntry];
    const followup = row[colFollowup];
    const exitAge = row[colExitAge];
    const onsetAge = row[colOnset];

    // The event occurs if time_of_onset <= study_exit_age
    // timeSinceEntryOfEvent = onsetAge - entryAge
    const eventTimeSinceEntry = onsetAge - entryAge;
    const censorTime = followup; // = exitAge - entryAge

    // Observed time is min(eventTimeSinceEntry, censorTime)
    const observedTime = Math.min(eventTimeSinceEntry, censorTime);

    // eventIndicator = 1 if onsetAge <= exitAge, 0 otherwise
    const isEvent = eventTimeSinceEntry <= censorTime ? 1 : 0;

    return {
      time: observedTime,
      event: isEvent
    };
  });

  // Exclude any negative times (shouldn't happen since T >= entryAge,
  // but guard against numeric rounding).
  const cleaned = subjects.filter((subj) => subj.time >= 0);

  // Sort by time ascending
  cleaned.sort((a, b) => a.time - b.time);

  // Kaplan-Meier calculation
  //   S(0) = 1
  //   For each unique event time t_j:
  //     r_j = number at risk just prior to t_j
  //     d_j = number of events at t_j
  //     S(t_j) = S(t_{j-1}) * (1 - d_j / r_j)
  //
  // Greenwood variance:
  //   var( S(t_j) ) = S(t_j)^2 * sum_{i=1..j} [ d_i / ( r_i * (r_i - d_i) ) ]
  //   95% CI => S(t_j) +/- 1.96 * sqrt( var( S(t_j) ) )

  let atRisk = cleaned.length; // at time=0
  let prevSurv = 1.0; // S(0)
  let prevVarTerm = 0.0; // sum_{i} [ d_i / (r_i*(r_i - d_i)) ]
  const kmData = [];
  kmData.push({
    time: 0,
    survival: 1.0,
    lower: 1.0,
    upper: 1.0
  });

  let idx = 0;

  while (idx < cleaned.length) {
    const thisTime = cleaned[idx].time;

    let nEventsAtThisTime = 0;
    let nTotalAtThisTime = 0;

    // Group all subjects sharing this exact time (events and censorings).
    while (idx < cleaned.length && cleaned[idx].time === thisTime) {
      nTotalAtThisTime++;
      if (cleaned[idx].event === 1) {
        nEventsAtThisTime++;
      }
      idx++;
    }

    // r_j = 'atRisk' just prior to t_j
    const rj = atRisk;
    const dj = nEventsAtThisTime;

    // If no events at this time, survival does not jump down
    if (dj > 0) {
      const newSurv = prevSurv * (1 - dj / rj);
      // Greenwood increment
      const increment = dj / (rj * (rj - dj));
      prevVarTerm += increment;

      // variance of S(t_j)
      const varSurv = newSurv * newSurv * prevVarTerm;
      const sdSurv = Math.sqrt(varSurv);
      const z = 1.96; // ~95% normal approximation
      const lower = Math.max(0, newSurv - z * sdSurv);
      const upper = Math.min(1, newSurv + z * sdSurv);

      prevSurv = newSurv;

      kmData.push({
        time: thisTime,
        survival: newSurv,
        lower,
        upper
      });
    } else {
      // push a point in case we want a step at an event-free time
      kmData.push({
        time: thisTime,
        survival: prevSurv,
        lower: Math.max(0, prevSurv - 1e-8),
        upper: Math.min(1, prevSurv + 1e-8)
      });
    }

    // Everyone who had time == thisTime is no longer at risk after
    // (whether event or censored).
    atRisk -= nTotalAtThisTime;
  }

  return kmData;
}
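The linear Greenwood interval above can spill outside [0, 1], hence the clamping. A common alternative computes the interval on the log(-log S) scale, which stays inside [0, 1] by construction. A sketch of that variant (a new helper, not wired into the function above; it would slot in where `lower`/`upper` are computed, reusing the cumulative Greenwood term `prevVarTerm`):

function loglogConfidenceInterval(survival, greenwoodSum, z = 1.96) {
  // The transform is undefined at S = 0 or S = 1; return a degenerate interval.
  if (survival <= 0 || survival >= 1) {
    return { lower: survival, upper: survival };
  }
  // Delta method: se of log(-log S) is sqrt(greenwoodSum) / |log S|.
  const se = Math.sqrt(greenwoodSum) / Math.abs(Math.log(survival));
  const theta = Math.exp(z * se);
  return {
    lower: Math.pow(survival, theta), // S^theta < S for S in (0, 1)
    upper: Math.pow(survival, 1 / theta)
  };
}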

pyodide = (
  await import("https://cdn.jsdelivr.net/pyodide/v0.27.2/full/pyodide.mjs")
).loadPyodide()
