Diffstat (limited to 'wqflask/maintenance')
-rw-r--r--  wqflask/maintenance/gen_ind_genofiles.py            | 249
-rw-r--r--  wqflask/maintenance/gen_select_dataset.py           |  36
-rw-r--r--  wqflask/maintenance/generate_probesetfreeze_file.py |  26
-rw-r--r--  wqflask/maintenance/quantile_normalize.py           |  27
-rw-r--r--  wqflask/maintenance/set_resource_defaults.py        |  35
5 files changed, 302 insertions, 71 deletions
diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
new file mode 100644
index 00000000..8b958efa
--- /dev/null
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""A script that generates the genotype files for groups of individuals, using an existing strain genotype file as a basis
+
+Example commands:
+python3 gen_ind_genofiles.py
+    /home/zas1024/gn2-zach/genotype_files/genotype/
+    /home/zas1024/gn2-zach/new_geno/
+    BXD-Micturition.geno
+    BXD.json
+python3 gen_ind_genofiles.py
+    /home/zas1024/gn2-zach/genotype_files/genotype
+    /home/zas1024/gn2-zach/new_geno/
+    BXD-Micturition.geno
+    BXD.2.geno BXD.4.geno BXD.5.geno
+
+"""
+
+import json
+import os
+import sys
+from typing import List
+
+import MySQLdb
+
+def conn():
+    return MySQLdb.Connect(db=os.environ.get("DB_NAME"),
+                           user=os.environ.get("DB_USER"),
+                           passwd=os.environ.get("DB_PASS"),
+                           host=os.environ.get("DB_HOST"))
+
+def main(args):
+
+    # Directory in which .geno files are located
+    geno_dir = args[1]
+
+    # Directory in which to output new files
+    out_dir = args[2]
+
+    # The individuals group that we want to generate a .geno file for
+    target_file = geno_dir + args[3]
+
+    # The source group(s) we're generating the .geno files from
+    # This can be passed as either a specific .geno file (or set of files as multiple arguments),
+    # or as a JSON file containing a set of .geno files (and their corresponding file names and sample lists)
+    geno_json = {}
+    source_files = []
+    if ".json" in args[4]:
+        geno_json = json.load(open(geno_dir + args[4], "r"))
+        par_f1s = {
+            "mat": geno_json['mat'],
+            "pat": geno_json['pat'],
+            "f1s": geno_json['f1s']
+        }
+
+        # List of file titles and locations from JSON
+        source_files = [{'title': genofile['title'], 'location': geno_dir + genofile['location']} for genofile in geno_json['genofile']]
+    else:
+        par_f1s = {}
+        # List of files directly taken from command line arguments, with titles just set to the filename
+        for group in args[4:]:
+            file_name = geno_dir + group + ".geno" if ".geno" not in group else group
+            source_files.append({'title': file_name[:-5], 'location': file_name})
+
+    if len(source_files) > 1:
+        # Generate a JSON file pointing to the new target genotype files, in situations where there are multiple source .geno files
+        target_json_loc = out_dir + ".".join(args[3].split(".")[:-1]) + ".json"
+        target_json = {'genofile': []}
+
+    # Generate the output .geno files
+    for source_file in source_files:
+        filename, samples = generate_new_genofile(source_file['location'], target_file, par_f1s, out_dir)
+
+        target_json['genofile'].append({
+            'location': filename.split("/")[-1],
+            'title': source_file['title'],
+            'sample_list': samples
+        })
+
+    json.dump(target_json, open(target_json_loc, "w"))
+
+def get_strain_for_sample(sample):
+    query = (
+        "SELECT CaseAttributeXRefNew.Value "
+        "FROM CaseAttributeXRefNew, Strain "
+        "WHERE CaseAttributeXRefNew.CaseAttributeId=11 "
+        "AND CaseAttributeXRefNew.StrainId = Strain.Id "
+        "AND Strain.Name = %(name)s" )
+
+    with conn().cursor() as cursor:
+        cursor.execute(query, {"name": sample.strip()})
+        return cursor.fetchone()[0]
+
+def generate_new_genofile(source_genofile, target_genofile, par_f1s, out_dir):
+    source_samples = group_samples(source_genofile)
+    source_genotypes = strain_genotypes(source_genofile)
+    target_samples = group_samples(target_genofile)
+    strain_pos_map = map_strain_pos_to_target_group(source_samples, target_samples, par_f1s)
+
+    if len(source_genofile.split("/")[-1].split(".")) > 2:
+        # The number in the source genofile; for example 4 in BXD.4.geno
+        source_num = source_genofile.split("/")[-1].split(".")[-2]
+        target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + "." + source_num + ".geno"
+    else:
+        target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + ".geno"
+
+    file_location = out_dir + target_filename
+
+    with open(file_location, "w") as fh:
+        for metadata in ["name", "type", "mat", "pat", "het", "unk"]:
+            fh.write("@" + metadata + ":" + source_genotypes[metadata] + "\n")
+
+        header_line = ["Chr", "Locus", "cM", "Mb"] + target_samples
+        fh.write("\t".join(header_line))
+
+        for marker in source_genotypes['markers']:
+            line_items = [
+                marker['Chr'],
+                marker['Locus'],
+                marker['cM'],
+                marker['Mb']
+            ]
+
+            for pos in strain_pos_map:
+                if isinstance(pos, int):
+                    line_items.append(marker['genotypes'][pos])
+                else:
+                    if pos in ["mat", "pat"]:
+                        line_items.append(source_genotypes[pos])
+                    elif pos == "f1s":
+                        line_items.append("H")
+                    else:
+                        line_items.append("U")
+
+            fh.write("\t".join(line_items) + "\n")
+
+    return file_location, target_samples
+
+def map_strain_pos_to_target_group(source_samples, target_samples, par_f1s):
+    """
+    Retrieve corresponding strain position for each sample in the target group
+
+    This is so the genotypes from the base genofile can be mapped to the samples in the target group
+
+    For example:
+    Base strains: BXD1, BXD2, BXD3
+    Target samples: BXD1_1, BXD1_2, BXD2_1, BXD3_1, BXD3_2, BXD3_3
+    Returns: [0, 0, 1, 2, 2, 2]
+    """
+    pos_map = []
+    for sample in target_samples:
+        sample_strain = get_strain_for_sample(sample)
+        if sample_strain in source_samples:
+            pos_map.append(source_samples.index(sample_strain))
+        else:
+            val = "U"
+            for key in par_f1s.keys():
+                if sample_strain in par_f1s[key]:
+                    val = key
+            pos_map.append(val)
+
+    return pos_map
+
+def group_samples(target_file: str) -> List:
+    """
+    Get the group samples from its "dummy" .geno file (which still contains the sample list)
+    """
+
+    sample_list = []
+    with open(target_file, "r") as target_geno:
+        for i, line in enumerate(target_geno):
+            # Skip header lines
+            if line[0] in ["#", "@"] or not len(line):
+                continue
+
+            line_items = line.split("\t")
+            sample_list = [item for item in line_items if item not in ["Chr", "Locus", "Mb", "cM"]]
+            break
+
+    return sample_list
+
+def strain_genotypes(strain_genofile: str) -> List:
+    """
+    Read genotypes from source strain .geno file
+
+    :param strain_genofile: string of genofile filename
+    :return: a list of dictionaries representing each marker's genotypes
+
+    Example output: [
+        {
+            'Chr': '1',
+            'Locus': 'marker1',
+            'Mb': '10.0',
+            'cM': '8.0',
+            'genotypes': [('BXD1', 'B'), ('BXD2', 'D'), ('BXD3', 'H'), ...]
+        },
+        ...
+    ]
+    """
+
+    geno_dict = {}
+
+    geno_start_col = None
+    header_columns = []
+    sample_list = []
+    markers = []
+    with open(strain_genofile, "r") as source_geno:
+        for i, line in enumerate(source_geno):
+            if line[0] == "@":
+                metadata_type = line[1:].split(":")[0]
+                if metadata_type in ['name', 'type', 'mat', 'pat', 'het', 'unk']:
+                    geno_dict[metadata_type] = line.split(":")[1].strip()
+
+                continue
+
+            # Skip other header lines
+            if line[0] == "#" or not len(line):
+                continue
+
+            line_items = line.split("\t")
+            if "Chr" in line_items:  # Header row
+                # Get the first column index containing genotypes
+                header_columns = line_items
+                for j, item in enumerate(line_items):
+                    if item not in ["Chr", "Locus", "Mb", "cM"]:
+                        geno_start_col = j
+                        break
+
+                sample_list = line_items[geno_start_col:]
+                if not geno_start_col:
+                    print("Check .geno file - expected columns not found")
+                    sys.exit()
+            else:  # Marker rows
+                this_marker = {
+                    'Chr': line_items[header_columns.index("Chr")],
+                    'Locus': line_items[header_columns.index("Locus")],
+                    'Mb': line_items[header_columns.index("Mb")],
+                    'cM': line_items[header_columns.index("cM")],
+                    'genotypes': [item.strip() for item in line_items][geno_start_col:]
+                }
+
+                markers.append(this_marker)
+
+    geno_dict['markers'] = markers
+
+    return geno_dict
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index db65a11f..9f4b670d 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -39,21 +39,13 @@
 from wqflask import app
 
 from utility.tools import locate, locate_ignore_error, TEMPDIR, SQL_URI
 
-import MySQLdb
-
 import simplejson as json
 import urllib.parse
 
-#import sqlalchemy as sa
-
 from pprint import pformat as pf
 
-#Engine = sa.create_engine(zach_settings.SQL_URI)
-
-# build MySql database connection
-
-#conn = Engine.connect()
+from wqflask.database import database_connection
 
 
 def parse_db_uri():
@@ -71,19 +63,19 @@
     return db_conn_info
 
 
-def get_species():
+def get_species(cursor):
     """Build species list"""
-    #Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
-    Cursor.execute("select Name, MenuName from Species order by OrderId")
-    species = list(Cursor.fetchall())
+    #cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
+    cursor.execute("select Name, MenuName from Species order by OrderId")
+    species = list(cursor.fetchall())
     return species
 
 
-def get_groups(species):
+def get_groups(cursor, species):
     """Build groups list"""
     groups = {}
     for species_name, _species_full_name in species:
-        Cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
+        cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
                        Species, ProbeFreeze, GenoFreeze, PublishFreeze where Species.Name = '%s' and
                        InbredSet.SpeciesId = Species.Id and
@@ -92,7 +84,7 @@
                        or ProbeFreeze.InbredSetId = InbredSet.Id)
                        group by InbredSet.Name
                        order by InbredSet.FullName""" % species_name)
-        results = Cursor.fetchall()
+        results = cursor.fetchall()
         groups[species_name] = list(results)
 
     return groups
@@ -273,13 +265,13 @@
     return datasets
 
 
-def main():
+def main(cursor):
     """Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
 
     parse_db_uri()
 
-    species = get_species()
-    groups = get_groups(species)
+    species = get_species(cursor)
+    groups = get_groups(cursor, species)
     types = get_types(groups)
     datasets = get_datasets(types)
@@ -316,6 +308,6 @@ def _test_it():
 
 
 if __name__ == '__main__':
-    Conn = MySQLdb.Connect(**parse_db_uri())
-    Cursor = Conn.cursor()
-    main()
+    with database_connection() as conn:
+        with conn.cursor() as cursor:
+            main(cursor)
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py
index e964c8ed..f43f952b 100644
--- a/wqflask/maintenance/generate_probesetfreeze_file.py
+++ b/wqflask/maintenance/generate_probesetfreeze_file.py
@@ -8,20 +8,11 @@
 import os
 import collections
 import csv
 
-import MySQLdb
-
 from base import webqtlConfig
 
 from pprint import pformat as pf
-
-def get_cursor():
-    con = MySQLdb.Connect(db=webqtlConfig.DB_UPDNAME,
-                          host=webqtlConfig.MYSQL_UPDSERVER,
-                          user=webqtlConfig.DB_UPDUSER,
-                          passwd=webqtlConfig.DB_UPDPASSWD)
-    cursor = con.cursor()
-    return cursor
+from wqflask.database import database_connection
 
 
 def show_progress(process, counter):
@@ -116,13 +107,14 @@ def main():
                             "(Oct08)_RankInv_Beta.txt")
     dataset_name = "Eye_AXBXA_1008_RankInv"
 
-    cursor = get_cursor()
-    strains = get_strains(cursor)
-    print("Getting probset_vals")
-    probeset_vals = get_probeset_vals(cursor, dataset_name)
-    print("Finished getting probeset_vals")
-    trimmed_strains = trim_strains(strains, probeset_vals)
-    write_data_matrix_file(trimmed_strains, probeset_vals, filename)
+    with database_connection as conn:
+        with conn.cursor() as cursor:
+            strains = get_strains(cursor)
+            print("Getting probset_vals")
+            probeset_vals = get_probeset_vals(cursor, dataset_name)
+            print("Finished getting probeset_vals")
+            trimmed_strains = trim_strains(strains, probeset_vals)
+            write_data_matrix_file(trimmed_strains, probeset_vals, filename)
 
 
 if __name__ == '__main__':
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
index 32780ca6..90ec72de 100644
--- a/wqflask/maintenance/quantile_normalize.py
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -1,6 +1,5 @@
 import sys
 sys.path.insert(0, './')
-import MySQLdb
 import urllib.parse
 
 import numpy as np
@@ -9,6 +8,7 @@
 import pandas as pd
 from flask import Flask, g, request
 
 from wqflask import app
+from wqflask.database import database_connection
 
 def parse_db_uri():
@@ -52,7 +52,7 @@ def quantileNormalize(df_input):
     return df
 
 
-def set_data(dataset_name):
+def set_data(cursor, dataset_name):
     orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
 
     sample_list = []
@@ -80,8 +80,8 @@ def set_data(dataset_name):
                        ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
                        ProbeSetXRef.ProbeSetId = ProbeSet.Id and
                        ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0])
-            Cursor.execute(query)
-            result_info = Cursor.fetchone()
+            cursor.execute(query)
+            result_info = cursor.fetchone()
 
             yield {
                 "_index": "traits",
@@ -99,15 +99,14 @@
 
 
 if __name__ == '__main__':
-    Conn = MySQLdb.Connect(**parse_db_uri())
-    Cursor = Conn.cursor()
+    with database_connection as conn:
+        with conn.cursor() as cursor:
+            success, _ = bulk(es, set_data(cursor, sys.argv[1]))
 
-    success, _ = bulk(es, set_data(sys.argv[1]))
-
-    response = es.search(
-        index="traits", doc_type="trait", body={
-            "query": {"match": {"name": "ENSMUSG00000028982"}}
-        }
-    )
+            response = es.search(
+                index="traits", doc_type="trait", body={
+                    "query": {"match": {"name": "ENSMUSG00000028982"}}
+                }
+            )
 
-    print(response)
+            print(response)
diff --git a/wqflask/maintenance/set_resource_defaults.py b/wqflask/maintenance/set_resource_defaults.py
index 0f472494..22d73ba3 100644
--- a/wqflask/maintenance/set_resource_defaults.py
+++ b/wqflask/maintenance/set_resource_defaults.py
@@ -30,10 +30,9 @@
 from utility.tools import SQL_URI
 from utility.redis_tools import get_redis_conn, get_user_id, add_resource, get_resources, get_resource_info
 Redis = get_redis_conn()
 
-import MySQLdb
-
 import urllib.parse
+from wqflask.database import database_connection
 
 from utility.logger import getLogger
 logger = getLogger(__name__)
@@ -53,14 +52,14 @@ def parse_db_uri():
     return db_conn_info
 
 
-def insert_probeset_resources(default_owner_id):
+def insert_probeset_resources(cursor, default_owner_id):
     current_resources = Redis.hgetall("resources")
-    Cursor.execute(""" SELECT
+    cursor.execute(""" SELECT
                        ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.confidentiality, ProbeSetFreeze.public
                        FROM
                        ProbeSetFreeze""")
 
-    resource_results = Cursor.fetchall()
+    resource_results = cursor.fetchall()
     for i, resource in enumerate(resource_results):
         resource_ob = {}
         resource_ob['name'] = resource[1]
@@ -80,9 +79,9 @@
         add_resource(resource_ob, update=False)
 
 
-def insert_publish_resources(default_owner_id):
+def insert_publish_resources(cursor, default_owner_id):
     current_resources = Redis.hgetall("resources")
-    Cursor.execute(""" SELECT
+    cursor.execute(""" SELECT
                        PublishXRef.Id, PublishFreeze.Id, InbredSet.InbredSetCode
                        FROM
                        PublishXRef, PublishFreeze, InbredSet, Publication
@@ -91,7 +90,7 @@
                        InbredSet.Id = PublishXRef.InbredSetId AND
                        Publication.Id = PublishXRef.PublicationId""")
 
-    resource_results = Cursor.fetchall()
+    resource_results = cursor.fetchall()
     for resource in resource_results:
         if resource[2]:
             resource_ob = {}
@@ -114,14 +113,14 @@
             continue
 
 
-def insert_geno_resources(default_owner_id):
+def insert_geno_resources(cursor, default_owner_id):
     current_resources = Redis.hgetall("resources")
-    Cursor.execute(""" SELECT
+    cursor.execute(""" SELECT
                        GenoFreeze.Id, GenoFreeze.ShortName, GenoFreeze.confidentiality
                        FROM
                        GenoFreeze""")
 
-    resource_results = Cursor.fetchall()
+    resource_results = cursor.fetchall()
     for i, resource in enumerate(resource_results):
         resource_ob = {}
         resource_ob['name'] = resource[1]
@@ -147,15 +146,15 @@
 def insert_resources(default_owner_id):
     current_resources = get_resources()
     print("START")
-    insert_publish_resources(default_owner_id)
+    insert_publish_resources(cursor, default_owner_id)
     print("AFTER PUBLISH")
-    insert_geno_resources(default_owner_id)
+    insert_geno_resources(cursor, default_owner_id)
     print("AFTER GENO")
-    insert_probeset_resources(default_owner_id)
+    insert_probeset_resources(cursor, default_owner_id)
     print("AFTER PROBESET")
 
 
-def main():
+def main(cursor):
     """Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
 
     Redis.delete("resources")
@@ -166,6 +165,6 @@
 
 
 if __name__ == '__main__':
-    Conn = MySQLdb.Connect(**parse_db_uri())
-    Cursor = Conn.cursor()
-    main()
+    with database_connection() as conn:
+        with conn.cursor() as cursor:
+            main(cursor)
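
A note on the mapping logic in gen_ind_genofiles.py: the sketch below is not part of the commit. It illustrates the strain-to-position mapping that map_strain_pos_to_target_group builds, using the example from that function's docstring. The real script resolves a sample's strain through the CaseAttributeXRefNew table; here that database lookup is replaced with a hypothetical rule that strips a trailing "_<n>" suffix, purely so the example runs standalone.

# Standalone sketch of the strain-position mapping used by gen_ind_genofiles.py.
# Assumption: strain names are derived by stripping a "_<n>" suffix from the
# sample name; the committed code instead queries the database.

def strain_for_sample(sample: str) -> str:
    # Hypothetical stand-in for get_strain_for_sample()'s database lookup.
    return sample.rsplit("_", 1)[0]

def map_strain_pos(source_samples, target_samples, par_f1s):
    pos_map = []
    for sample in target_samples:
        strain = strain_for_sample(sample)
        if strain in source_samples:
            # Index into the source genofile's genotype columns
            pos_map.append(source_samples.index(strain))
        else:
            # Parents map to the mat/pat allele, F1s to "H", anything else to "U"
            val = "U"
            for key, strains in par_f1s.items():
                if strain in strains:
                    val = key
            pos_map.append(val)
    return pos_map

if __name__ == "__main__":
    source = ["BXD1", "BXD2", "BXD3"]
    target = ["BXD1_1", "BXD1_2", "BXD2_1", "BXD3_1", "BXD3_2", "BXD3_3"]
    print(map_strain_pos(source, target, {"mat": [], "pat": [], "f1s": []}))
    # Prints [0, 0, 1, 2, 2, 2], matching the docstring example.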