genenetwork2 - GeneNetwork (2nd generation)

import os
import math
import string
import random
import json
import logging

from gn2.base import webqtlConfig
from gn2.base.trait import create_trait
from gn2.base.data_set import create_dataset
from gn2.utility.redis_tools import get_redis_conn
from gn2.utility.tools import flat_files, assert_file
from gn2.utility.tools import GEMMA_WRAPPER_COMMAND
from gn2.utility.tools import TEMPDIR
from gn2.utility.tools import WEBSERVER_MODE
from gn2.utility.tools import get_setting
from gn2.wqflask.database import database_connection
from gn3.computations.gemma import generate_hash_of_string


GEMMAOPTS = "-debug"
if WEBSERVER_MODE == 'PROD':
    GEMMAOPTS = "-no-check"


def generate_random_n_string(n):
    return ''.join(random.choice(string.ascii_uppercase + string.digits)
                   for _ in range(n))


def run_gemma(this_trait, this_dataset, samples, vals, covariates, use_loco,
              maf=0.01, first_run=True, output_files=None):
    """Generates p-values for each marker using GEMMA"""

    if this_dataset.group.genofile is not None:
        genofile_name = this_dataset.group.genofile[:-5]
    else:
        genofile_name = this_dataset.group.name

    if first_run:
        pheno_filename = gen_pheno_txt_file(this_dataset, genofile_name, vals)

        if not os.path.isfile(f"{webqtlConfig.GENERATED_IMAGE_DIR}"
                              f"{genofile_name}_output.assoc.txt"):
            open((f"{webqtlConfig.GENERATED_IMAGE_DIR}"
                  f"{genofile_name}_output.assoc.txt"),
                 "w+")

        k_output_filename = (f"{this_dataset.group.name}_K_"
                             f"{generate_random_n_string(6)}")
        gwa_output_filename = (f"{this_dataset.group.name}_GWA_"
                               f"{generate_random_n_string(6)}")


        this_chromosomes_name = []
        with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as db_cursor:
            for this_chr in this_dataset.species.chromosomes.chromosomes(db_cursor):
                this_chromosomes_name.append(this_dataset.species.chromosomes.chromosomes(db_cursor)[this_chr].name)

        chr_list_string = ",".join(this_chromosomes_name)
        if covariates != "":
            covar_filename = gen_covariates_file(this_dataset, covariates, samples)
        if str(use_loco).lower() == "true":
            bimbam_dir = flat_files('genotype/bimbam')
            geno_filepath = assert_file(
                f"{bimbam_dir}/{genofile_name}_geno.txt")
            pheno_filepath = f"{TEMPDIR}/gn2/{pheno_filename}.txt"
            snps_filepath = assert_file(
                f"{bimbam_dir}/{genofile_name}_snps.txt")
            k_json_output_filepath = f"{TEMPDIR}/gn2/{k_output_filename}.json"
            generate_k_command = (f"{GEMMA_WRAPPER_COMMAND} --json --loco "
                                  f"{chr_list_string} -- {GEMMAOPTS} "
                                  f"-g {geno_filepath} -p "
                                  f"{pheno_filepath} -a "
                                  f"{snps_filepath} -gk > "
                                  f"{k_json_output_filepath}")
            logging.debug("generate_k_command: %s", generate_k_command)
            os.system(generate_k_command)

            gemma_command = (f"{GEMMA_WRAPPER_COMMAND} --json --loco "
                             f"--input {k_json_output_filepath} "
                             f"-- {GEMMAOPTS} "
                             f"-g {geno_filepath} "
                             f"-p {pheno_filepath} ")
            if covariates != "":
                gemma_command += (f"-c {flat_files('mapping')}/"
                                  f"{covar_filename}.txt "
                                  f"-a {flat_files('genotype/bimbam')}/"
                                  f"{genofile_name}_snps.txt "
                                  f"-lmm 9 -maf {maf} > {TEMPDIR}/gn2/"
                                  f"{gwa_output_filename}.json")
            else:
                gemma_command += (f"-a {flat_files('genotype/bimbam')}/"
                                  f"{genofile_name}_snps.txt -lmm 9 -maf "
                                  f"{maf} > "
                                  f"{TEMPDIR}/gn2/{gwa_output_filename}.json")

        else:
            generate_k_command = (f"{GEMMA_WRAPPER_COMMAND} --json -- "
                                  f"{GEMMAOPTS} "
                                  f" -g {flat_files('genotype/bimbam')}/"
                                  f"{genofile_name}_geno.txt -p "
                                  f"{TEMPDIR}/gn2/{pheno_filename}.txt -a "
                                  f"{flat_files('genotype/bimbam')}/"
                                  f"{genofile_name}_snps.txt -gk > "
                                  f"{TEMPDIR}/gn2/{k_output_filename}.json")
            logging.debug("generate_k_command: %s", generate_k_command)
            os.system(generate_k_command)

            gemma_command = (f"{GEMMA_WRAPPER_COMMAND} --json --input "
                             f"{TEMPDIR}/gn2/{k_output_filename}.json -- "
                             f"{GEMMAOPTS} "
                             f"-a {flat_files('genotype/bimbam')}/"
                             f"{genofile_name}_snps.txt "
                             f"-lmm 9 -g {flat_files('genotype/bimbam')}/"
                             f"{genofile_name}_geno.txt -p "
                             f"{TEMPDIR}/gn2/{pheno_filename}.txt ")

            if covariates != "":
                gemma_command += (f" -c {flat_files('mapping')}/"
                                  f"{covar_filename}.txt > "
                                  f"{TEMPDIR}/gn2/{gwa_output_filename}.json")
            else:
                gemma_command += f" > {TEMPDIR}/gn2/{gwa_output_filename}.json"

        logging.debug("gemma_command: %s", gemma_command)
        os.system(gemma_command)
    else:
        gwa_output_filename = output_files

    if use_loco == "True":
        marker_obs = parse_loco_output(this_dataset, gwa_output_filename)
        return marker_obs, gwa_output_filename
    else:
        marker_obs = parse_loco_output(
            this_dataset, gwa_output_filename, use_loco)
        return marker_obs, gwa_output_filename


def gen_pheno_txt_file(this_dataset, genofile_name, vals):
    """Generates phenotype file for GEMMA"""

    filename = "PHENO_" + generate_hash_of_string(this_dataset.name + str(vals)).replace("/", "_")

    with open(f"{TEMPDIR}/gn2/{filename}.txt", "w") as outfile:
        for value in vals:
            if value == "x":
                outfile.write("NA\n")
            else:
                outfile.write(value + "\n")

    return filename


def gen_covariates_file(this_dataset, covariates, samples):
    covariate_list = covariates.split(",")
    covariate_data_object = []
    for covariate in covariate_list:
        this_covariate_data = []
        trait_name = covariate.split(":")[0]
        dataset_name = covariate.split(":")[1]
        if dataset_name == "Temp":
            temp_group = trait_name.split("_")[2]
            dataset_ob = create_dataset(
                dataset_name="Temp", dataset_type="Temp", group_name=temp_group)
        else:
            dataset_ob = create_dataset(covariate.split(":")[1])
        trait_ob = create_trait(dataset=dataset_ob,
                                name=trait_name,
                                cellid=None)
        this_dataset.group.get_samplelist(redis_conn=get_redis_conn())
        trait_samples = this_dataset.group.samplelist
        trait_sample_data = trait_ob.data
        for index, sample in enumerate(trait_samples):
            if sample in samples:
                if sample in trait_sample_data:
                    sample_value = trait_sample_data[sample].value
                    this_covariate_data.append(sample_value)
                else:
                    this_covariate_data.append("-9")
        covariate_data_object.append(this_covariate_data)

    filename = "COVAR_" + generate_hash_of_string(this_dataset.name + str(covariate_data_object)).replace("/", "_")

    with open((f"{flat_files('mapping')}/"
               f"{filename}.txt"),
              "w") as outfile:
        for i in range(len(covariate_data_object[0])):
            for this_covariate in covariate_data_object:
                outfile.write(str(this_covariate[i]) + "\t")
            outfile.write("\n")

    return filename


def parse_loco_output(this_dataset, gwa_output_filename, loco="True"):

    output_filename = f"{TEMPDIR}/gn2/{gwa_output_filename}.json"
    if os.stat(output_filename).st_size == 0:
        return {}

    output_filelist = []
    with open(output_filename) as data_file:
        data = json.load(data_file)

    files = data['files']
    for file in files:
        output_filelist.append(file[2])

    included_markers = []
    p_values = []
    marker_obs = []
    previous_chr = 0

    for this_file in output_filelist:
        if not os.path.isfile(this_file):
            break
        with open(this_file) as output_file:
            for line in output_file:
                if line.startswith("chr\t"):
                    continue
                else:
                    marker = {}
                    marker['name'] = line.split("\t")[1]
                    if line.split("\t")[0] != "X" and line.split("\t")[0] != "X/Y" and line.split("\t")[0] != "Y" and line.split("\t")[0] != "M":
                        if "chr" in line.split("\t")[0]:
                            marker['chr'] = int(line.split("\t")[0][3:])
                        else:
                            marker['chr'] = int(line.split("\t")[0])
                        if marker['chr'] > int(previous_chr):
                            previous_chr = marker['chr']
                        elif marker['chr'] < int(previous_chr):
                            break
                    else:
                        marker['chr'] = line.split("\t")[0]
                    marker['Mb'] = float(line.split("\t")[2]) / 1000000
                    marker['p_value'] = float(line.split("\t")[10])
                    marker['additive'] = -(float(line.split("\t")[7])/2)
                    if math.isnan(marker['p_value']) or (marker['p_value'] <= 0):
                        marker['lod_score'] = marker['p_value'] = 0
                    else:
                        marker['lod_score'] = -math.log10(marker['p_value'])
                    marker_obs.append(marker)

                    included_markers.append(line.split("\t")[1])
                    p_values.append(float(line.split("\t")[9]))

    return marker_obs