/**
 * Pause asynchronous execution for a fixed amount of time.
 *
 * @param {number} ms Delay, in milliseconds, handed to `setTimeout`
 * @returns {Promise<undefined>} Promise that resolves (without a value) once the timer fires
 */
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
import { fetchURLAndCache, groupBy } from "./utils.js";
import { convertMatrix } from "./mutationalSpectrum.js";
import * as pako from "https://cdn.jsdelivr.net/npm/pako/+esm";
/**
 * Obtain projects by gene
 *
 * Calls the GDC `analysis/top_cases_counts_by_genes` endpoint once per gene
 * (through `fetchURLAndCache`, cache name "TCGA"), in batches of at most 15
 * concurrent requests, and collects the `key` of every bucket found under
 * `aggregations.projects.buckets` in each response.
 *
 * @async
 * @function getProjectsByGene
 * @memberof tcga
 *
 * @param {array} genes List with the genes of interest (Ensembl gene ids)
 *
 * @returns {Object} Object with two entries: `projects`, the de-duplicated list of project keys found for any of the genes, and `projects_by_gene`, the de-duplicated project keys organized by gene. A gene whose response carries no project buckets maps to an empty list.
 *
 * @example
 * let tcga = await import('https://raw.githubusercontent.com/YasCoMa/msig/main/mSigSDKScripts/tcga.js')
 * let genes = ['ENSG00000155657']
 * var result = await tcga.getProjectsByGene(genes)
 */
async function getProjectsByGene(genes) {
  const BATCH_SIZE = 15;
  const projectsByGene = {};
  // Sets keep insertion order and give O(1) de-duplication.
  const allProjects = new Set();

  for (let start = 0; start < genes.length; start += BATCH_SIZE) {
    const batch = genes.slice(start, start + BATCH_SIZE);
    await Promise.all(
      batch.map(async (gene) => {
        const url = `https://api.gdc.cancer.gov/analysis/top_cases_counts_by_genes?gene_ids=${gene}`;
        const response = await fetchURLAndCache("TCGA", url);
        const payload = await response.json();
        // A response without the aggregation is treated as "no projects"
        // instead of rejecting the whole batch with a TypeError.
        const buckets = payload?.aggregations?.projects?.buckets ?? [];
        const geneProjects = new Set();
        for (const bucket of buckets) {
          geneProjects.add(bucket["key"]);
          allProjects.add(bucket["key"]);
        }
        projectsByGene[gene] = [...geneProjects];
        // Each request pauses before settling, which spaces out consecutive batches.
        await sleep(300);
      })
    );
  }

  return { projects: [...allProjects], projects_by_gene: projectsByGene };
}
/**
 * Obtain the gene expression quantification (RNA-Seq) files of the selected projects
 *
 * For every project a POST query is sent to the GDC `files` endpoint
 * (through `fetchURLAndCache`, cache name "TCGA") asking for a TSV listing of
 * at most 1000 files. Projects are processed in batches of at most 15
 * concurrent requests. Blank lines of the TSV (such as the one produced by a
 * trailing newline) are ignored.
 *
 * @async
 * @function getTpmCountsByGenesOnProjects
 * @memberof tcga
 *
 * @param {array} genes List with the genes of interest (Ensembl gene ids). Currently not used by this function; kept for interface compatibility.
 * @param {array} projects List with the projects of interest
 *
 * @returns {Object} Object organized by project; each project holds `count_files` (the last column of every TSV row) and `files_description` (one record per TSV row)
 *
 * @example
 * let genes = ['ENSG00000155657']
 * let projects = ['TCGA-LUSC', 'TCGA-OV']
 * var result = await tcga.getTpmCountsByGenesOnProjects(genes, projects)
 */
async function getTpmCountsByGenesOnProjects(genes, projects) {
  const BATCH_SIZE = 15;
  const result = {};

  for (let start = 0; start < projects.length; start += BATCH_SIZE) {
    const batch = projects.slice(start, start + BATCH_SIZE);
    await Promise.all(
      batch.map(async (project) => {
        result[project] = {};
        const query = {
          filters: {
            op: "and",
            content: [
              {
                op: "in",
                content: {
                  field: "cases.project.project_id",
                  value: [project],
                },
              },
              {
                op: "=",
                content: {
                  field: "data_type",
                  value: "Gene Expression Quantification",
                },
              },
              {
                op: "=",
                content: {
                  field: "experimental_strategy",
                  value: "RNA-Seq",
                },
              },
            ],
          },
          format: "tsv",
          fields:
            "file_id,file_name,cases.submitter_id,cases.case_id,data_category,data_type,cases.samples.tumor_descriptor,cases.samples.tissue_type,cases.samples.sample_type,cases.samples.submitter_id,cases.samples.sample_id,analysis.workflow_type,cases.project.project_id,cases.samples.portions.analytes.aliquots.submitter_id",
          size: "1000",
        };
        const response = await fetchURLAndCache(
          "TCGA",
          "https://api.gdc.cancer.gov/files",
          {
            method: "POST",
            body: JSON.stringify(query),
            headers: { "Content-Type": "application/json" },
          }
        );
        const text = await response.text();
        const rows = text
          .replaceAll("\r", "")
          .split("\n")
          .slice(1) // drop the header line
          // Without this, the trailing newline yields a phantom row and an
          // empty-string entry in `count_files`.
          .filter((line) => line.trim() !== "")
          .map((line) => line.split("\t"));

        // NOTE(review): the fixed column positions below assume the column
        // order in which the API returns the requested fields — confirm
        // against a live response before changing the `fields` list.
        result[project]["count_files"] = rows.map(
          (cells) => cells[cells.length - 1]
        );
        result[project]["files_description"] = rows.map((cells) => ({
          workflow_type: cells[0],
          case_id: cells[1],
          sample_id: cells[4],
          sample_type: cells[5],
          cases_sample_submitter_id: cells[6],
          tissue_type: cells[7],
          tumor_descriptor: cells[8],
          cases_submitter_id: cells[9],
          data_category: cells[10],
          data_type: cells[11],
          file_name: cells[13],
          file_id: cells[14],
        }));
        // Each request pauses before settling, which spaces out consecutive batches.
        await sleep(300);
      })
    );
  }

  return result;
}
/**
 * Obtain TPM and FPKM counts of a list of genes given sample file identifiers
 *
 * Each file is downloaded from the GDC `data` endpoint (through
 * `fetchURLAndCache`, cache name "TCGA") in batches of at most 15 concurrent
 * requests and parsed as tab-separated text; lines without a tab are ignored.
 * The first remaining line is used as the header: the LAST column whose name
 * contains "tpm" and the LAST column whose name contains "fpkm"
 * (case-insensitive) are the ones read. Gene identifiers are compared after
 * dropping everything from the first "." on (the version suffix).
 *
 * A file with no tabular content is skipped with a warning, and a file whose
 * header lacks either column contributes nothing.
 *
 * @async
 * @function getTpmCountsByGenesFromFiles
 * @memberof tcga
 *
 * @param {array} genes List with the genes of interest (Ensembl gene ids, without version suffix)
 * @param {array} files List with the file ids
 *
 * @returns {Object} Object organized by gene; each gene holds `name` and `type` (second and third column of its row) plus `counts_fpkm` and `counts_tpm`, which map a file id to the raw (string) value read from that file
 *
 * @example
 * let tcga = await import('https://raw.githubusercontent.com/YasCoMa/msig/main/mSigSDKScripts/tcga.js')
 * let genes = ['ENSG00000155657']
 * let files = ['9e5f8edc-5074-43b7-a870-594aeb36e2aa', '8d5a94c8-b3d9-4991-8ce9-f7aa9189938c', 'dedf9f52-7ded-4cc5-bba2-da89a48b5176', '3aa53aa2-97cd-43a8-b7b1-09f0bf6381dd']
 * var result = await tcga.getTpmCountsByGenesFromFiles(genes, files)
 */
async function getTpmCountsByGenesFromFiles(genes, files) {
  const BATCH_SIZE = 15;
  // Set lookup keeps the per-row membership test O(1).
  const wantedGenes = new Set(genes);
  const result = {};
  for (const gene of genes) {
    result[gene] = { name: "", type: "", counts_fpkm: {}, counts_tpm: {} };
  }

  for (let start = 0; start < files.length; start += BATCH_SIZE) {
    const batch = files.slice(start, start + BATCH_SIZE);
    await Promise.all(
      batch.map(async (fileId) => {
        const response = await fetchURLAndCache(
          "TCGA",
          `https://api.gdc.cancer.gov/data/${fileId}`
        );
        const text = await response.text();
        const rows = text
          .replaceAll("\r", "")
          .split("\n")
          .map((line) => line.split("\t"))
          .filter((cells) => cells.length > 1);

        if (rows.length === 0) {
          // Previously this case threw while reading the header and rejected
          // the whole batch.
          console.warn(
            `getTpmCountsByGenesFromFiles: file ${fileId} has no tabular content, skipping`
          );
        } else {
          let tpmColumn = -1;
          let fpkmColumn = -1;
          // Later matches overwrite earlier ones on purpose: the last
          // matching column wins.
          rows[0].forEach((columnName, index) => {
            const lowered = columnName.toLowerCase();
            if (lowered.includes("tpm")) {
              tpmColumn = index;
            }
            if (lowered.includes("fpkm")) {
              fpkmColumn = index;
            }
          });

          if (tpmColumn !== -1 && fpkmColumn !== -1) {
            for (const cells of rows) {
              const gene = cells[0].split(".")[0];
              if (wantedGenes.has(gene)) {
                const entry = result[gene];
                entry["name"] = cells[1];
                entry["type"] = cells[2];
                entry["counts_fpkm"][fileId] = cells[fpkmColumn];
                entry["counts_tpm"][fileId] = cells[tpmColumn];
              }
            }
          }
        }
        // Each request pauses before settling, which spaces out consecutive batches.
        await sleep(300);
      })
    );
  }

  return result;
}
/**
 * Obtain MAF file ids and demographic info of a list of projects
 *
 * For every project a POST query is sent to the GDC `files` endpoint
 * (through `fetchURLAndCache`, cache name "TCGA") asking for a TSV listing of
 * at most 1000 "Masked Somatic Mutation" WXS files. Projects are processed in
 * batches of at most 15 concurrent requests. Blank lines of the TSV (such as
 * the one produced by a trailing newline) are ignored.
 *
 * @async
 * @function getMafInformationFromProjects
 * @memberof tcga
 *
 * @param {array} projects List with the projects of interest
 *
 * @returns {Object} Object organized by project; each project holds `maf_files` (the last column of every TSV row) and `samples_description` (one demographic record per TSV row; `age_at_diagnosis` is the sixth column divided by 365 and rounded to two significant digits)
 *
 * @example
 * let tcga = await import('https://raw.githubusercontent.com/YasCoMa/msig/main/mSigSDKScripts/tcga.js')
 * let projects = ['TCGA-LUSC', 'TCGA-OV']
 * var result = await tcga.getMafInformationFromProjects(projects)
 */
async function getMafInformationFromProjects(projects) {
  const BATCH_SIZE = 15;
  const result = {};

  for (let start = 0; start < projects.length; start += BATCH_SIZE) {
    const batch = projects.slice(start, start + BATCH_SIZE);
    await Promise.all(
      batch.map(async (project) => {
        result[project] = {};
        const query = {
          filters: {
            op: "and",
            content: [
              {
                op: "in",
                content: {
                  field: "cases.project.project_id",
                  value: [project],
                },
              },
              {
                op: "=",
                content: {
                  field: "data_category",
                  value: "Simple Nucleotide Variation",
                },
              },
              {
                op: "=",
                content: {
                  field: "data_type",
                  value: "Masked Somatic Mutation",
                },
              },
              {
                op: "=",
                content: {
                  field: "experimental_strategy",
                  value: "WXS",
                },
              },
            ],
          },
          format: "tsv",
          fields:
            "file_id,cases.project.project_id,cases.submitter_id,cases.case_id,cases.samples.tumor_descriptor,cases.samples.tissue_type,cases.demographic.ethnicity,cases.demographic.gender,cases.demographic.race,cases.demographic.year_of_birth,cases.diagnoses.age_at_diagnosis,cases.diagnoses.classification_of_tumor,cases.diagnoses.days_to_recurrence,cases.diagnoses.tumor_stage",
          size: "1000",
        };
        const response = await fetchURLAndCache(
          "TCGA",
          "https://api.gdc.cancer.gov/files",
          {
            method: "POST",
            body: JSON.stringify(query),
            headers: { "Content-Type": "application/json" },
          }
        );
        const text = await response.text();
        const rows = text
          .replaceAll("\r", "")
          .split("\n")
          .slice(1) // drop the header line
          // Without this, the trailing newline yields a phantom row: an
          // empty-string file id and a NaN-filled sample record.
          .filter((line) => line.trim() !== "")
          .map((line) => line.split("\t"));

        // NOTE(review): the fixed column positions below assume the column
        // order in which the API returns the requested fields — confirm
        // against a live response before changing the `fields` list.
        result[project]["maf_files"] = rows.map(
          (cells) => cells[cells.length - 1]
        );
        result[project]["samples_description"] = rows.map((cells) => ({
          case_id: cells[0],
          ethnicity: cells[1],
          gender: cells[2],
          race: cells[3],
          year_of_birth: Number(cells[4]),
          age_at_diagnosis: Number((Number(cells[5]) / 365).toPrecision(2)),
          classification_of_tumor: cells[6],
          case_submitter_id: cells[13],
          file_id: cells[14],
        }));
        // Each request pauses before settling, which spaces out consecutive batches.
        await sleep(300);
      })
    );
  }

  return result;
}
/**
 * Obtain mutations and variant information given MAF file identifiers
 *
 * For every project, each MAF file is downloaded from the GDC `data`
 * endpoint (through `fetchURLAndCache`, cache name "TCGA"), inflated with
 * pako and parsed as tab-separated text; lines starting with "#" and lines
 * without a tab are ignored, and the first remaining line is skipped as the
 * header. Files are processed in batches of at most 15 concurrent downloads.
 * A row is kept only when its fourth column contains "37" (reported as build
 * "hg19") or "38" (reported as build "hg38").
 *
 * A file that fails to download or parse is logged and contributes nothing.
 * `mutational_spectra` is computed with `convertMatrix` only when every file
 * of the project was processed successfully; otherwise it stays `null`.
 *
 * @async
 * @function getVariantInformationFromMafFiles
 * @memberof tcga
 * @param {Object} res Object organized by project, each project containing the list of MAF file ids under `maf_files` (as returned by getMafInformationFromProjects)
 * @returns {Object} Object organized by project; each project holds `variant_information` (flat list of variant records, in file order) and `mutational_spectra` (the value returned by `convertMatrix`, or `null`)
 * @example
 * let tcga = await import('https://raw.githubusercontent.com/YasCoMa/msig/main/mSigSDKScripts/tcga.js')
 * let res = { 'TCGA-LUSC': { 'maf_files': ['0b3d2db3-8ae3-4d39-bd9b-9d1e7a133b65', '9fed5902-6e95-4526-a119-ec4eade5576b' ] } }
 * var result = await tcga.getVariantInformationFromMafFiles(res)
 */
async function getVariantInformationFromMafFiles(res) {
  const BATCH_SIZE = 15;
  const result = {};

  for (const project of Object.keys(res)) {
    const variantInformation = [];
    result[project] = {
      variant_information: variantInformation,
      mutational_spectra: null,
    };
    const files = res[project]["maf_files"];
    // One array of variant records per successfully parsed file. Kept apart
    // from any other bookkeeping so that only variant arrays ever reach
    // convertMatrix (previously URL strings leaked in once a project had
    // more than one batch of files).
    const variantsPerFile = [];

    for (let start = 0; start < files.length; start += BATCH_SIZE) {
      const batch = files.slice(start, start + BATCH_SIZE);
      const parsedBatch = await Promise.all(
        batch.map(async (fileId) => {
          const url = `https://api.gdc.cancer.gov/data/${fileId}`;
          try {
            const response = await fetchURLAndCache("TCGA", url);
            const raw = await response.arrayBuffer();
            const text = await pako.default.inflate(raw, { to: "string" });
            const rows = text
              .split("\n")
              .filter((line) => line.indexOf("#") != 0)
              .map((line) => line.split("\t"))
              .filter((cells) => cells.length > 1);

            const variants = [];
            // rows[0] is the column header.
            for (const cells of rows.slice(1)) {
              let build = "";
              if (cells[3].includes("37")) {
                build = "hg19";
              } else if (cells[3].includes("38")) {
                build = "hg38";
              }
              if (build !== "") {
                variants.push({
                  project_code: project,
                  sample: fileId,
                  build: build,
                  chromosome: cells[4].toLowerCase().replace("chr", ""),
                  reference_genome_allele: cells[10],
                  mutated_to_allele: cells[12],
                  chromosome_start: cells[6],
                  mutation_type: cells[9],
                  mutation_classification: cells[8],
                });
              }
            }
            // Each download pauses before settling, which spaces out consecutive batches.
            await sleep(300);
            return variants;
          } catch (err) {
            console.error(err);
            return null;
          }
        })
      );

      for (const variants of parsedBatch) {
        if (variants !== null) {
          variantsPerFile.push(variants);
          // Plain loop instead of push(...variants): MAF files can be large
          // enough for a spread call to exceed the argument limit.
          for (const variant of variants) {
            variantInformation.push(variant);
          }
        }
      }
    }

    if (files.length > 0 && variantsPerFile.length === files.length) {
      try {
        // NOTE(review): the genome passed here is always "hg19" even though
        // the build is detected per row above — confirm against
        // convertMatrix's contract.
        result[project]["mutational_spectra"] = await convertMatrix(
          variantsPerFile,
          "sample",
          100,
          "hg19",
          true
        );
      } catch (err) {
        console.error(err);
      }
    }
  }

  return result;
}
/**
 * Flatten a dictionary of mutational spectra into per-sample lists of records
 *
 * @function convertTCGAProjectIntoJSON
 * @memberof tcga
 *
 * @param {*} MAFfiles Not read by this function; kept for interface compatibility
 * @param {Object} mutSpec Dictionary keyed by sample, each value being a dictionary that maps a mutation type to its count
 * @param {string} [dataType="WGS"] Value stored in the `strategy` field of every record
 *
 * @returns {array} One inner array per key of `mutSpec`; each inner array holds one record per mutation type, with the fields `sample`, `strategy`, `profile` (always "SBS"), `matrix` (always 96), `mutationType` and `mutations`
 */
function convertTCGAProjectIntoJSON(MAFfiles, mutSpec, dataType = "WGS") {
  // Builds a single record; field order is kept stable for serialization.
  const buildRecord = (sampleName, context, count) => ({
    sample: sampleName,
    strategy: dataType,
    profile: "SBS",
    matrix: 96,
    mutationType: context,
    mutations: count,
  });

  const perSample = [];
  for (const sampleName in mutSpec) {
    const spectrum = mutSpec[sampleName];
    const records = [];
    for (const context in spectrum) {
      records.push(buildRecord(sampleName, context, spectrum[context]));
    }
    perSample.push(records);
  }
  return perSample;
}
export {
getProjectsByGene,
getTpmCountsByGenesOnProjects,
getTpmCountsByGenesFromFiles,
getMafInformationFromProjects,
getVariantInformationFromMafFiles,
convertTCGAProjectIntoJSON,
};