diff options
-rw-r--r-- | wqflask/wqflask/marker_regression/gemma_mapping.py | 208 |
1 files changed, 115 insertions, 93 deletions
diff --git a/wqflask/wqflask/marker_regression/gemma_mapping.py b/wqflask/wqflask/marker_regression/gemma_mapping.py index 7777735d..83ebcdf9 100644 --- a/wqflask/wqflask/marker_regression/gemma_mapping.py +++ b/wqflask/wqflask/marker_regression/gemma_mapping.py @@ -1,120 +1,145 @@ -import os, math, string, random, json +import os +import math +import string +import random +import json from base import webqtlConfig from base.trait import create_trait from base.data_set import create_dataset -from utility.tools import flat_files, GEMMA_COMMAND, GEMMA_WRAPPER_COMMAND, TEMPDIR, WEBSERVER_MODE +from utility.tools import flat_files +from utility.tools import GEMMA_WRAPPER_COMMAND +from utility.tools import TEMPDIR +from utility.tools import WEBSERVER_MODE import utility.logger -logger = utility.logger.getLogger(__name__ ) +logger = utility.logger.getLogger(__name__) GEMMAOPTS = "-debug" if WEBSERVER_MODE == 'PROD': - GEMMAOPTS = "-no-check" + GEMMAOPTS = "-no-check" + + +def generate_random_n_string(n): + return ''.join(random.choice(string.ascii_uppercase + string.digits) + for _ in range(n)) -def run_gemma(this_trait, this_dataset, samples, vals, covariates, use_loco, maf=0.01, first_run=True, output_files=None): - """Generates p-values for each marker using GEMMA""" - if this_dataset.group.genofile != None: +def run_gemma(this_trait, this_dataset, samples, vals, covariates, use_loco, + maf=0.01, first_run=True, output_files=None): + """Generates p-values for each marker using GEMMA""" + if this_dataset.group.genofile is not None: genofile_name = this_dataset.group.genofile[:-5] else: genofile_name = this_dataset.group.name if first_run: - trait_filename = str(this_dataset.group.name) + "_" + str(this_trait.name) + "_" + ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6)) - gen_pheno_txt_file(this_dataset, genofile_name, vals, trait_filename) - - if not os.path.isfile("{}{}_output.assoc.txt".format(webqtlConfig.GENERATED_IMAGE_DIR, genofile_name)): - open("{}{}_output.assoc.txt".format(webqtlConfig.GENERATED_IMAGE_DIR, genofile_name), "w+") - - k_output_filename = this_dataset.group.name + "_K_" + ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6)) - gwa_output_filename = this_dataset.group.name + "_GWA_" + ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6)) - - this_chromosomes = this_dataset.species.chromosomes.chromosomes - this_chromosomes_name=[this_chromosomes[chromosome].name for chromosome in this_chromosomes] - - - chr_list_string=",".join(this_chromosomes_name) - if covariates != "": - gen_covariates_file(this_dataset, covariates, samples) - if use_loco == "True": - generate_k_command = GEMMA_WRAPPER_COMMAND + ' --json --loco ' + chr_list_string + ' -- ' + GEMMAOPTS + ' -g %s/%s_geno.txt -p %s/gn2/%s.txt -a %s/%s_snps.txt -gk > %s/gn2/%s.json' % (flat_files('genotype/bimbam'), - genofile_name, - TEMPDIR, - trait_filename, - flat_files('genotype/bimbam'), - genofile_name, - TEMPDIR, - k_output_filename) - - os.system(generate_k_command) - - gemma_command = GEMMA_WRAPPER_COMMAND + ' --json --loco --input %s/gn2/%s.json -- ' % (TEMPDIR, k_output_filename) + GEMMAOPTS + ' -g %s/%s_geno.txt -p %s/gn2/%s.txt' % (flat_files('genotype/bimbam'), - genofile_name, - TEMPDIR, - trait_filename) - if covariates != "": - gemma_command += ' -c %s/%s_covariates.txt -a %s/%s_snps.txt -lmm 9 -maf %s > %s/gn2/%s.json' % (flat_files('mapping'), - this_dataset.group.name, - flat_files('genotype/bimbam'), - genofile_name, - maf, - TEMPDIR, - gwa_output_filename) - else: - gemma_command += ' -a %s/%s_snps.txt -lmm 9 -maf %s > %s/gn2/%s.json' % (flat_files('genotype/bimbam'), - genofile_name, - maf, - TEMPDIR, - gwa_output_filename) - - else: - generate_k_command = GEMMA_WRAPPER_COMMAND + ' --json -- ' + GEMMAOPTS + ' -g %s/%s_geno.txt -p %s/gn2/%s.txt -a %s/%s_snps.txt -gk > %s/gn2/%s.json' % (flat_files('genotype/bimbam'), - genofile_name, - TEMPDIR, - trait_filename, - flat_files('genotype/bimbam'), - genofile_name, - TEMPDIR, - k_output_filename) - - os.system(generate_k_command) - - gemma_command = GEMMA_WRAPPER_COMMAND + ' --json --input %s/gn2/%s.json -- ' % (TEMPDIR, k_output_filename) + GEMMAOPTS + ' -a %s/%s_snps.txt -lmm 9 -g %s/%s_geno.txt -p %s/gn2/%s.txt' % (flat_files('genotype/bimbam'), - genofile_name, - flat_files('genotype/bimbam'), - genofile_name, - TEMPDIR, - trait_filename) - - - if covariates != "": - gemma_command += ' -c %s/%s_covariates.txt > %s/gn2/%s.json' % (flat_files('mapping'), this_dataset.group.name, TEMPDIR, gwa_output_filename) - else: - gemma_command += ' > %s/gn2/%s.json' % (TEMPDIR, gwa_output_filename) - - os.system(gemma_command) + trait_filename = (f"{str(this_dataset.group.name)}_" + f"{str(this_trait.name)}_" + f"{generate_random_n_string(6)}") + gen_pheno_txt_file(this_dataset, genofile_name, vals, trait_filename) + + if not os.path.isfile(f"{webqtlConfig.GENERATED_IMAGE_DIR}" + f"{genofile_name}_output.assoc.txt"): + open((f"{webqtlConfig.GENERATED_IMAGE_DIR}" + f"{genofile_name}_output.assoc.txt"), + "w+") + + k_output_filename = (f"{this_dataset.group.name}_K_" + f"{generate_random_n_string(6)}") + gwa_output_filename = (f"{this_dataset.group.name}_GWA_" + f"{generate_random_n_string(6)}") + + this_chromosomes = this_dataset.species.chromosomes.chromosomes + this_chromosomes_name = [this_chromosomes[chromosome].name + for chromosome in this_chromosomes] + + chr_list_string = ",".join(this_chromosomes_name) + if covariates != "": + gen_covariates_file(this_dataset, covariates, samples) + if use_loco == "True": + generate_k_command = (f"{GEMMA_WRAPPER_COMMAND} --json --loco " + f"{chr_list_string} -- {GEMMAOPTS} " + f"-g {flat_files('genotype/bimbam')}/" + f"{genofile_name}_geno.txt -p " + f"{TEMPDIR}/gn2/{trait_filename}.txt -a " + f"{flat_files('genotype/bimbam')}/" + f"{genofile_name}_snps.txt -gk > " + f"{TEMPDIR}/gn2/{k_output_filename}.json") + os.system(generate_k_command) + + gemma_command = (f"{GEMMA_WRAPPER_COMMAND} --json --loco " + f"--input {TEMPDIR}/gn2/{k_output_filename}.json " + f"-- {GEMMAOPTS} " + f"-g {flat_files('genotype/bimbam')}/" + f"{genofile_name}_geno.txt " + f"-p {TEMPDIR}/gn2/{trait_filename}.txt ") + if covariates != "": + gemma_command += (f"-c {flat_files('mapping')}/" + f"{this_dataset.group.name}_covariates.txt " + f"-a {flat_files('genotype/bimbam')}/" + f"{genofile_name}_snps.txt " + f"-lmm 9 -maf {maf} > {TEMPDIR}/gn2/" + f"{gwa_output_filename}.json") + else: + gemma_command += (f"-a {flat_files('genotype/bimbam')}/" + f"{genofile_name}_snps.txt -lmm 9 -maf " + f"{maf} > " + f"{TEMPDIR}/gn2/{gwa_output_filename}.json") + + else: + generate_k_command = (f"{GEMMA_WRAPPER_COMMAND} --json -- " + f"{GEMMAOPTS} " + f" -g {flat_files('genotype/bimbam')}/" + f"{genofile_name}_geno.txt -p " + f"{TEMPDIR}/gn2/{trait_filename}.txt -a " + f"{flat_files('genotype/bimbam')}/" + f"{genofile_name}_snps.txt -gk > " + f"{TEMPDIR}/gn2/{k_output_filename}.json") + + os.system(generate_k_command) + + gemma_command = (f"{GEMMA_WRAPPER_COMMAND} --json --input " + f"{TEMPDIR}/gn2/{k_output_filename}.json -- " + f"{GEMMAOPTS} " + f"-a {flat_files('genotype/bimbam')}/" + f"{genofile_name}_snps.txt " + f"-lmm 9 -g {flat_files('genotype/bimbam')}/" + f"{genofile_name}_geno.txt -p " + f"{TEMPDIR}/gn2/{trait_filename}.txt ") + + if covariates != "": + gemma_command += (f" -c {flat_files('mapping')}/" + f"{this_dataset.group.name}" + f"_covariates.txt > " + f"{TEMPDIR}/gn2/{gwa_output_filename}.json") + else: + gemma_command += f" > {TEMPDIR}/gn2/{gwa_output_filename}.json" + + os.system(gemma_command) else: - gwa_output_filename = output_files + gwa_output_filename = output_files if use_loco == "True": marker_obs = parse_loco_output(this_dataset, gwa_output_filename) return marker_obs, gwa_output_filename else: - marker_obs = parse_loco_output(this_dataset, gwa_output_filename, use_loco) + marker_obs = parse_loco_output( + this_dataset, gwa_output_filename, use_loco) return marker_obs, gwa_output_filename + def gen_pheno_txt_file(this_dataset, genofile_name, vals, trait_filename): """Generates phenotype file for GEMMA""" - current_file_data = [] - with open("{}/gn2/{}.txt".format(TEMPDIR, trait_filename), "w") as outfile: + with open(f"{TEMPDIR}/gn2/{trait_filename}.txt", "w") as outfile: for value in vals: if value == "x": outfile.write("NA\n") else: outfile.write(value + "\n") + def gen_covariates_file(this_dataset, covariates, samples): covariate_list = covariates.split(",") covariate_data_object = [] @@ -125,8 +150,6 @@ def gen_covariates_file(this_dataset, covariates, samples): trait_ob = create_trait(dataset=dataset_ob, name=trait_name, cellid=None) - - #trait_samples = this_dataset.group.all_samples_ordered() this_dataset.group.get_samplelist() trait_samples = this_dataset.group.samplelist trait_sample_data = trait_ob.data @@ -139,17 +162,20 @@ def gen_covariates_file(this_dataset, covariates, samples): this_covariate_data.append("-9") covariate_data_object.append(this_covariate_data) - with open("{}/{}_covariates.txt".format(flat_files('mapping'), this_dataset.group.name), "w") as outfile: + with open((f"{flat_files('mapping')}/" + f"{this_dataset.group.name}_covariates.txt"), + "w") as outfile: for i in range(len(covariate_data_object[0])): for this_covariate in covariate_data_object: outfile.write(str(this_covariate[i]) + "\t") outfile.write("\n") + def parse_loco_output(this_dataset, gwa_output_filename, loco="True"): output_filelist = [] - with open("{}/gn2/".format(TEMPDIR) + gwa_output_filename + ".json") as data_file: - data = json.load(data_file) + with open(f"{TEMPDIR}/gn2/{gwa_output_filename}.json") as data_file: + data = json.load(data_file) files = data['files'] for file in files: @@ -160,10 +186,8 @@ def parse_loco_output(this_dataset, gwa_output_filename, loco="True"): marker_obs = [] previous_chr = 0 - no_results = False for this_file in output_filelist: if not os.path.isfile(this_file): - no_results = True break with open(this_file) as output_file: for line in output_file: @@ -188,10 +212,8 @@ def parse_loco_output(this_dataset, gwa_output_filename, loco="True"): marker['additive'] = float(line.split("\t")[7]) if math.isnan(marker['p_value']) or (marker['p_value'] <= 0): marker['lod_score'] = 0 - #marker['lrs_value'] = 0 else: marker['lod_score'] = -math.log10(marker['p_value']) - #marker['lrs_value'] = -math.log10(marker['p_value']) * 4.61 marker_obs.append(marker) included_markers.append(line.split("\t")[1]) |