Source: mSigSDKScripts/mutationalSpectrum.js

import { fetchURLAndCache } from "./utils.js";

function get_sbs_trinucleotide_contexts() {
  const nucleotide_bases = ["A", "C", "G", "T"];
  const substitution_types = ["C>A", "C>G", "C>T", "T>A", "T>C", "T>G"];
  let sbs_trinucleotide_contexts = [];

  for (let base_5 of nucleotide_bases) {
    for (let substitution of substitution_types) {
      for (let base_3 of nucleotide_bases) {
        sbs_trinucleotide_contexts.push(`${base_5}[${substitution}]${base_3}`);
      }
    }
  }

  return sbs_trinucleotide_contexts;
}

function standardize_substitution(ref_allele, mut_allele) {
  /*
COSMIC signatures define mutations from a pyrimidine allele (C, T) to any
other base (C>A, C>G, C>T, T>A, T>C, T>G). If a mutation in the MAF file
is defined from a reference purine allele (A, G), then we infer the substituted
base in the complementary sequence, which would be from a pyrimidine
allele due to purines and pyrimidines complementing each other in a
double-stranded DNA.
 :param ref_allele: base in the reference genome.
:param mut_allele: base in the mutated genome
:return: substitution string from pyrimidine to any other base.
*/
  var complement_seq, purines;
  complement_seq = {
    A: "T",
    C: "G",
    T: "A",
    G: "C",
  };
  purines = ["A", "G"];

  if (purines.some((v) => ref_allele.includes(v))) {
    return `${complement_seq[ref_allele]}>${complement_seq[mut_allele]}`;
  } else {
    return `${ref_allele}>${mut_allele}`;
  }
}

function init_sbs_mutational_spectra(n_records) {
  /*
Initilizes an ordered dictionary with SBS trinucleotide context as keys and
a list of counts, one for each sample.
 :param n_records: number of samples to record in the mutational spectra matrix.
:return: a dictionary of trinucleotide context and a list of counts
initialized to zeros.
*/

  let tri_nuc_context = get_sbs_trinucleotide_contexts();

  let sbs_mutational_spectra = {};

  for (var i = 0; i < tri_nuc_context.length; i++) {
    let context = tri_nuc_context[i];
    sbs_mutational_spectra[context] = 0;
  }

  return sbs_mutational_spectra;
}

function standardize_trinucleotide(trinucleotide_ref) {
  // COSMIC signatures define mutations from a pyrimidine allele (C, T) to any
  // other base (C>A, C>G, C>T, T>A, T>C, T>G). If a mutation in the MAF file
  // is defined from a purine allele (A, G), then we infer the trinucleotide
  // context in the complementary sequence, which would be from a pyrimidine
  // allele due to purines and pyrimidines complementing each other in a
  // double-stranded DNA.

  // :param trinucleotide_ref: trinucleotide sequence seen in the reference genome.
  // :return: a pyrimidine-centric trinucleotide sequence.

  // Ensure trinucleotide_ref is captialized
  trinucleotide_ref = trinucleotide_ref.toUpperCase();

  let complement_seq = {
    A: "T",
    C: "G",
    T: "A",
    G: "C",
  };
  let purines = "AG";
  if (purines.includes(trinucleotide_ref[1])) {
    return `${complement_seq[trinucleotide_ref[2]]}${complement_seq[trinucleotide_ref[1]]
      }${complement_seq[trinucleotide_ref[0]]}`;
  } else {
    return trinucleotide_ref;
  }
}


const extractFirstNumber = (str, extractSexChromosome = false) => {
  // Check if the string is just "X" or "Y" and extractSexChromosome is true
  if (extractSexChromosome && /^[XY]$/.test(str)) {
    return str; // Return "X" or "Y" directly
  }

  // Check if the string consists only of digits
  if (/^\d+$/.test(str)) {
    return parseInt(str, 10); // Convert the string to an integer
  }

  const match = str.match(/chr([XY\d]+)/);
  if (match) {
    const chrom = match[1];
    if (extractSexChromosome) {
      return chrom; // Returns "X", "Y", or "1", "2", etc., as strings
    } else {
      if (/^\d+$/.test(chrom)) {
        return parseInt(chrom, 10); // Returns numbers as integers
      } else {
        return null; // Returns null for "X" and "Y"
      }
    }
  } else {
    return null; // Returns null if no match is found
  }
};

/**
 * Converts input mutational data into a mutational spectrum matrix grouped by a specified field.
 *
 * This function processes raw mutational data, extracts trinucleotide contexts, and aggregates 
 * mutational spectra for each group. It supports TCGA and non-TCGA formats, allowing batch processing 
 * for large datasets. The resulting mutational spectrum matrix is formatted for downstream visualization 
 * and analysis.
 *
 * @async
 * @function convertMatrix
 * @memberof userData
 * @param {Array<Array<Object>>} data - Array of patient-level mutational data. Each patient's data is represented 
 * as an array of objects, where each object contains mutational details (e.g., chromosome, position, mutation type).
 * @param {string} [group_by="Center"] - Field to group data by (e.g., "Center" or "sample_id"). This field should exist in the input data.
 * @param {number} [batch_size=100] - Number of mutations to process in parallel batches. Adjust this for memory management.
 * @param {string} [genome="hg19"] - Reference genome build. Defaults to "hg19" unless specified in the data.
 * @param {boolean} [tcga=false] - Flag indicating whether the input data is in TCGA format. If true, expects TCGA-specific fields.
 * @returns {Promise<Object>} - A promise resolving to an object where each key is a group (e.g., "CNIC"),
 * and each value is a mutational spectrum object. The mutational spectrum object contains trinucleotide contexts as keys (e.g., "A[C>A]A") and counts as values.
 *
 * @example
 * // Example input data
 * const data = [
 *   [
 *     { chromosome: "1", start_position: "12345", reference_allele: "C", tumor_seq_allele2: "T", variant_type: "SNP", build: "hg19", Center: "CNIC" },
 *     { chromosome: "2", start_position: "67890", reference_allele: "G", tumor_seq_allele2: "A", variant_type: "SNP", build: "hg19", Center: "CNIC" }
 *   ],
 *   [
 *     { chromosome: "3", start_position: "101112", reference_allele: "T", tumor_seq_allele2: "C", variant_type: "SNP", build: "hg19", Center: "OtherCenter" }
 *   ]
 * ];
 *
 * // Convert data to mutational spectra grouped by Center
 * const mutationalSpectra = await convertMatrix(data, "Center", 50, "hg19", false);
 * console.log(mutationalSpectra);
 * // Output:
 * // {
 * //   "CNIC": {
 * //     "A[C>A]A": 9,
 * //     "A[C>A]C": 7,
 * //     "A[C>A]G": 6,
 * //     ...
 * //   },
 * //   "OtherCenter": {
 * //     "T[T>C]A": 15,
 * //     "T[T>C]T": 8,
 * //     ...
 * //   }
 * // }
 */

async function convertMatrix(
  data,
  group_by = "project_code",
  batch_size = 100,
  genome = "hg19",
  tcga = false
) {
  const mutationalSpectra = {};
  group_by = group_by.toLowerCase();

  for (let patient of data) {
    // Convert all keys to lowercase for consistency
    patient = patient.map((row) =>
      Object.fromEntries(
        Object.entries(row).map(([key, value]) => [key.toLowerCase(), value])
      )
    );

    // Initialize a mutational spectrum for each 'patient' group
    let mutationalSpectrum = init_sbs_mutational_spectra();
    let promises = [];

    for (let i = 0; i < patient.length; i++) {
      // If patient[i]['build'] exists, use it to determine the genome
      // Otherwise, use the 'genome' argument or default to "hg19"
      if (patient[i]["build"]) {
        genome = patient[i]["build"];
      } else if (!genome) {
        genome = "hg19";
      }

      let chromosomeNumber;
      let referenceAllele;
      let mutatedTo;
      let position;
      let variantType;

      // Use a proper comparison or a simple "if (!tcga)" check
      if (!tcga) {
        chromosomeNumber = patient[i]["chromosome"];
        chromosomeNumber = extractFirstNumber(chromosomeNumber);
        referenceAllele = patient[i]["reference_allele"];
        mutatedTo = patient[i]["tumor_seq_allele2"];
        position = patient[i]["start_position"];
        variantType = patient[i]["variant_type"];
      } else {
        chromosomeNumber = patient[i]["chromosome"];
        referenceAllele = patient[i]["reference_genome_allele"];
        mutatedTo = patient[i]["mutated_to_allele"];
        position = patient[i]["chromosome_start"];
        variantType = patient[i]["mutation_type"];
      }

      // Check if chromosomeNumber, genome, and position are defined
      if (chromosomeNumber === null ||chromosomeNumber === undefined || genome === undefined || position === undefined) {
        console.log(patient[i]["chromosome"]);
        console.log(`Missing values at index ${i}:`, {
          chromosomeNumber,
          genome,
          position
        });
        continue; // Skip this iteration if any value is missing. This is usually due to the chromosomeNumber being a sex chromosome
      }

      // Get the trinucleotide context for this mutation
      let promise = getMutationalContext(chromosomeNumber, genome, parseInt(position))
        .then((sequence) => {
          if (sequence === undefined) {
            console.log(`Undefined sequence for chromosome ${chromosomeNumber}, genome ${genome}, position ${position}`);
          }
          sequence = standardize_trinucleotide(sequence);

          const fivePrime = sequence[0];
          const threePrime = sequence[2];
          const standardizedSubstitution = standardize_substitution(referenceAllele, mutatedTo);
          const mutationType = `${fivePrime}[${standardizedSubstitution}]${threePrime}`.toUpperCase();

          // Only count valid single base substitutions
          if (
            (variantType === "SNP" || variantType === "single base substitution") &&
            !mutationType.includes("N") &&
            !mutationType.includes("U")
          ) {
            mutationalSpectrum[mutationType] = Number(mutationalSpectrum[mutationType]) + 1;
          }
        })
        .catch((error) => {
          console.error(`Error fetching sequence for chromosome ${chromosomeNumber}, genome ${genome}, position ${position}:`, error);
        });

      promises.push(promise);

      // Batch processing to avoid too many parallel requests
      if (i % batch_size === 0 || i === patient.length - 1) {
        await Promise.all(promises);
        promises = [];
      }
    }

    // Use the patient's group_by field (e.g., project_code) as the key
    mutationalSpectra[patient[0][group_by]] = mutationalSpectrum;
  }

  return mutationalSpectra;
}

async function getMutationalContext(chromosomeNumber, genome, startPosition) {
  const chrName = String(chromosomeNumber);
  const startByte = startPosition - 2;
  const endByte = startPosition;

  const alternative = await (
    await fetchURLAndCache("HG19",
      `https://api.genome.ucsc.edu/getData/sequence?genome=${genome};chrom=chr${chrName};start=${startByte};end=${endByte + 1
      }`
    )
  ).json();

  const sequence = alternative.dna;
  return sequence;
}


export {
  get_sbs_trinucleotide_contexts,
  standardize_substitution,
  init_sbs_mutational_spectra,
  standardize_trinucleotide,
  convertMatrix,
  getMutationalContext,
};