aboutsummaryrefslogtreecommitdiff
path: root/wqflask/maintenance
diff options
context:
space:
mode:
authorzsloan2022-03-22 19:02:15 +0000
committerzsloan2022-03-22 19:02:15 +0000
commita49da43ba00245cf23a2b72c314127986f567f28 (patch)
treece64e83370c52add94927bc050febf5d242722db /wqflask/maintenance
parent68ac19153b128f60b660e11365e5fd4304c95300 (diff)
parent32cb57b82db328bc84753af9d25e9aaa1bd31152 (diff)
downloadgenenetwork2-a49da43ba00245cf23a2b72c314127986f567f28.tar.gz
Merge remote-tracking branch 'origin/testing' into feature/add_rqtl_pairscan
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r--wqflask/maintenance/gen_ind_genofiles.py249
-rw-r--r--wqflask/maintenance/gen_select_dataset.py36
-rw-r--r--wqflask/maintenance/generate_probesetfreeze_file.py26
-rw-r--r--wqflask/maintenance/quantile_normalize.py27
-rw-r--r--wqflask/maintenance/set_resource_defaults.py35
5 files changed, 302 insertions, 71 deletions
diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
new file mode 100644
index 00000000..8b958efa
--- /dev/null
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""A script that generates the genotype files for groups of individuals, using an existing strain genotype file as a basis
+
+Example commands:
+python3 gen_ind_genofiles.py
+ /home/zas1024/gn2-zach/genotype_files/genotype/
+ /home/zas1024/gn2-zach/new_geno/
+ BXD-Micturition.geno
+ BXD.json
+python3 gen_ind_genofiles.py
+ /home/zas1024/gn2-zach/genotype_files/genotype
+ /home/zas1024/gn2-zach/new_geno/
+ BXD-Micturition.geno
+ BXD.2.geno BXD.4.geno BXD.5.geno
+
+"""
+
+import json
+import os
+import sys
+from typing import List
+
+import MySQLdb
+
+def conn():
+ return MySQLdb.Connect(db=os.environ.get("DB_NAME"),
+ user=os.environ.get("DB_USER"),
+ passwd=os.environ.get("DB_PASS"),
+ host=os.environ.get("DB_HOST"))
+
+def main(args):
+
+ # Directory in which .geno files are located
+ geno_dir = args[1]
+
+ # Directory in which to output new files
+ out_dir = args[2]
+
+ # The individuals group that we want to generate a .geno file for
+ target_file = geno_dir + args[3]
+
+ # The source group(s) we're generating the .geno files from
+ # This can be passed as either a specific .geno file (or set of files as multiple arguments),
+ # or as a JSON file containing a set of .geno files (and their corresponding file names and sample lists)
+ geno_json = {}
+ source_files = []
+ if ".json" in args[4]:
+ geno_json = json.load(open(geno_dir + args[4], "r"))
+ par_f1s = {
+ "mat": geno_json['mat'],
+ "pat": geno_json['pat'],
+ "f1s": geno_json['f1s']
+ }
+
+ # List of file titles and locations from JSON
+ source_files = [{'title': genofile['title'], 'location': geno_dir + genofile['location']} for genofile in geno_json['genofile']]
+ else:
+ par_f1s = {}
+ # List of files directly taken from command line arguments, with titles just set to the filename
+ for group in args[4:]:
+ file_name = geno_dir + group + ".geno" if ".geno" not in group else group
+ source_files.append({'title': file_name[:-5], 'location': file_name})
+
+ if len(source_files) > 1:
+ # Generate a JSON file pointing to the new target genotype files, in situations where there are multiple source .geno files
+ target_json_loc = out_dir + ".".join(args[3].split(".")[:-1]) + ".json"
+ target_json = {'genofile': []}
+
+ # Generate the output .geno files
+ for source_file in source_files:
+ filename, samples = generate_new_genofile(source_file['location'], target_file, par_f1s, out_dir)
+
+ target_json['genofile'].append({
+ 'location': filename.split("/")[-1],
+ 'title': source_file['title'],
+ 'sample_list': samples
+ })
+
+ json.dump(target_json, open(target_json_loc, "w"))
+
+def get_strain_for_sample(sample):
+ query = (
+ "SELECT CaseAttributeXRefNew.Value "
+ "FROM CaseAttributeXRefNew, Strain "
+ "WHERE CaseAttributeXRefNew.CaseAttributeId=11 "
+ "AND CaseAttributeXRefNew.StrainId = Strain.Id "
+ "AND Strain.Name = %(name)s" )
+
+ with conn().cursor() as cursor:
+ cursor.execute(query, {"name": sample.strip()})
+ return cursor.fetchone()[0]
+
+def generate_new_genofile(source_genofile, target_genofile, par_f1s, out_dir):
+ source_samples = group_samples(source_genofile)
+ source_genotypes = strain_genotypes(source_genofile)
+ target_samples = group_samples(target_genofile)
+ strain_pos_map = map_strain_pos_to_target_group(source_samples, target_samples, par_f1s)
+
+ if len(source_genofile.split("/")[-1].split(".")) > 2:
+ # The number in the source genofile; for example 4 in BXD.4.geno
+ source_num = source_genofile.split("/")[-1].split(".")[-2]
+ target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + "." + source_num + ".geno"
+ else:
+ target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + ".geno"
+
+ file_location = out_dir + target_filename
+
+ with open(file_location, "w") as fh:
+ for metadata in ["name", "type", "mat", "pat", "het", "unk"]:
+ fh.write("@" + metadata + ":" + source_genotypes[metadata] + "\n")
+
+ header_line = ["Chr", "Locus", "cM", "Mb"] + target_samples
+ fh.write("\t".join(header_line))
+
+ for marker in source_genotypes['markers']:
+ line_items = [
+ marker['Chr'],
+ marker['Locus'],
+ marker['cM'],
+ marker['Mb']
+ ]
+
+ for pos in strain_pos_map:
+ if isinstance(pos, int):
+ line_items.append(marker['genotypes'][pos])
+ else:
+ if pos in ["mat", "pat"]:
+ line_items.append(source_genotypes[pos])
+ elif pos == "f1s":
+ line_items.append("H")
+ else:
+ line_items.append("U")
+
+ fh.write("\t".join(line_items) + "\n")
+
+ return file_location, target_samples
+
+def map_strain_pos_to_target_group(source_samples, target_samples, par_f1s):
+ """
+ Retrieve corresponding strain position for each sample in the target group
+
+ This is so the genotypes from the base genofile can be mapped to the samples in the target group
+
+ For example:
+ Base strains: BXD1, BXD2, BXD3
+ Target samples: BXD1_1, BXD1_2, BXD2_1, BXD3_1, BXD3_2, BXD3_3
+ Returns: [0, 0, 1, 2, 2, 2]
+ """
+ pos_map = []
+ for sample in target_samples:
+ sample_strain = get_strain_for_sample(sample)
+ if sample_strain in source_samples:
+ pos_map.append(source_samples.index(sample_strain))
+ else:
+ val = "U"
+ for key in par_f1s.keys():
+ if sample_strain in par_f1s[key]:
+ val = key
+ pos_map.append(val)
+
+ return pos_map
+
+def group_samples(target_file: str) -> List:
+ """
+ Get the group samples from its "dummy" .geno file (which still contains the sample list)
+ """
+
+ sample_list = []
+ with open(target_file, "r") as target_geno:
+ for i, line in enumerate(target_geno):
+ # Skip header lines
+ if line[0] in ["#", "@"] or not len(line):
+ continue
+
+ line_items = line.split("\t")
+ sample_list = [item for item in line_items if item not in ["Chr", "Locus", "Mb", "cM"]]
+ break
+
+ return sample_list
+
+def strain_genotypes(strain_genofile: str) -> List:
+ """
+ Read genotypes from source strain .geno file
+
+ :param strain_genofile: string of genofile filename
+ :return: a list of dictionaries representing each marker's genotypes
+
+ Example output: [
+ {
+ 'Chr': '1',
+ 'Locus': 'marker1',
+ 'Mb': '10.0',
+ 'cM': '8.0',
+ 'genotypes': [('BXD1', 'B'), ('BXD2', 'D'), ('BXD3', 'H'), ...]
+ },
+ ...
+ ]
+ """
+
+ geno_dict = {}
+
+ geno_start_col = None
+ header_columns = []
+ sample_list = []
+ markers = []
+ with open(strain_genofile, "r") as source_geno:
+ for i, line in enumerate(source_geno):
+ if line[0] == "@":
+ metadata_type = line[1:].split(":")[0]
+ if metadata_type in ['name', 'type', 'mat', 'pat', 'het', 'unk']:
+ geno_dict[metadata_type] = line.split(":")[1].strip()
+
+ continue
+
+ # Skip other header lines
+ if line[0] == "#" or not len(line):
+ continue
+
+ line_items = line.split("\t")
+ if "Chr" in line_items: # Header row
+ # Get the first column index containing genotypes
+ header_columns = line_items
+ for j, item in enumerate(line_items):
+ if item not in ["Chr", "Locus", "Mb", "cM"]:
+ geno_start_col = j
+ break
+
+ sample_list = line_items[geno_start_col:]
+ if not geno_start_col:
+ print("Check .geno file - expected columns not found")
+ sys.exit()
+ else: # Marker rows
+ this_marker = {
+ 'Chr': line_items[header_columns.index("Chr")],
+ 'Locus': line_items[header_columns.index("Locus")],
+ 'Mb': line_items[header_columns.index("Mb")],
+ 'cM': line_items[header_columns.index("cM")],
+ 'genotypes': [item.strip() for item in line_items][geno_start_col:]
+ }
+
+ markers.append(this_marker)
+
+ geno_dict['markers'] = markers
+
+ return geno_dict
+
+if __name__ == "__main__":
+ main(sys.argv)
+
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index db65a11f..9f4b670d 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -39,21 +39,13 @@ from wqflask import app
from utility.tools import locate, locate_ignore_error, TEMPDIR, SQL_URI
-import MySQLdb
-
import simplejson as json
import urllib.parse
-#import sqlalchemy as sa
-
from pprint import pformat as pf
-#Engine = sa.create_engine(zach_settings.SQL_URI)
-
-# build MySql database connection
-
-#conn = Engine.connect()
+from wqflask.database import database_connection
def parse_db_uri():
@@ -71,19 +63,19 @@ def parse_db_uri():
return db_conn_info
-def get_species():
+def get_species(cursor):
"""Build species list"""
- #Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
- Cursor.execute("select Name, MenuName from Species order by OrderId")
- species = list(Cursor.fetchall())
+ #cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
+ cursor.execute("select Name, MenuName from Species order by OrderId")
+ species = list(cursor.fetchall())
return species
-def get_groups(species):
+def get_groups(cursor, species):
"""Build groups list"""
groups = {}
for species_name, _species_full_name in species:
- Cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
+ cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
Species,
ProbeFreeze, GenoFreeze, PublishFreeze where Species.Name = '%s'
and InbredSet.SpeciesId = Species.Id and
@@ -92,7 +84,7 @@ def get_groups(species):
or ProbeFreeze.InbredSetId = InbredSet.Id)
group by InbredSet.Name
order by InbredSet.FullName""" % species_name)
- results = Cursor.fetchall()
+ results = cursor.fetchall()
groups[species_name] = list(results)
return groups
@@ -273,13 +265,13 @@ def build_datasets(species, group, type_name):
return datasets
-def main():
+def main(cursor):
"""Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
parse_db_uri()
- species = get_species()
- groups = get_groups(species)
+ species = get_species(cursor)
+ groups = get_groups(cursor, species)
types = get_types(groups)
datasets = get_datasets(types)
@@ -316,6 +308,6 @@ def _test_it():
if __name__ == '__main__':
- Conn = MySQLdb.Connect(**parse_db_uri())
- Cursor = Conn.cursor()
- main()
+ with database_connection() as conn:
+ with conn.cursor() as cursor:
+ main(cursor)
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py
index e964c8ed..f43f952b 100644
--- a/wqflask/maintenance/generate_probesetfreeze_file.py
+++ b/wqflask/maintenance/generate_probesetfreeze_file.py
@@ -8,20 +8,11 @@ import os
import collections
import csv
-import MySQLdb
-
from base import webqtlConfig
from pprint import pformat as pf
-
-def get_cursor():
- con = MySQLdb.Connect(db=webqtlConfig.DB_UPDNAME,
- host=webqtlConfig.MYSQL_UPDSERVER,
- user=webqtlConfig.DB_UPDUSER,
- passwd=webqtlConfig.DB_UPDPASSWD)
- cursor = con.cursor()
- return cursor
+from wqflask.database import database_connection
def show_progress(process, counter):
@@ -116,13 +107,14 @@ def main():
"(Oct08)_RankInv_Beta.txt")
dataset_name = "Eye_AXBXA_1008_RankInv"
- cursor = get_cursor()
- strains = get_strains(cursor)
- print("Getting probset_vals")
- probeset_vals = get_probeset_vals(cursor, dataset_name)
- print("Finished getting probeset_vals")
- trimmed_strains = trim_strains(strains, probeset_vals)
- write_data_matrix_file(trimmed_strains, probeset_vals, filename)
+ with database_connection as conn:
+ with conn.cursor() as cursor:
+ strains = get_strains(cursor)
+ print("Getting probset_vals")
+ probeset_vals = get_probeset_vals(cursor, dataset_name)
+ print("Finished getting probeset_vals")
+ trimmed_strains = trim_strains(strains, probeset_vals)
+ write_data_matrix_file(trimmed_strains, probeset_vals, filename)
if __name__ == '__main__':
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
index 32780ca6..90ec72de 100644
--- a/wqflask/maintenance/quantile_normalize.py
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -1,6 +1,5 @@
import sys
sys.path.insert(0, './')
-import MySQLdb
import urllib.parse
import numpy as np
@@ -9,6 +8,7 @@ import pandas as pd
from flask import Flask, g, request
from wqflask import app
+from wqflask.database import database_connection
def parse_db_uri():
@@ -52,7 +52,7 @@ def quantileNormalize(df_input):
return df
-def set_data(dataset_name):
+def set_data(cursor, dataset_name):
orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
sample_list = []
@@ -80,8 +80,8 @@ def set_data(dataset_name):
ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
ProbeSetXRef.ProbeSetId = ProbeSet.Id and
ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0])
- Cursor.execute(query)
- result_info = Cursor.fetchone()
+ cursor.execute(query)
+ result_info = cursor.fetchone()
yield {
"_index": "traits",
@@ -99,15 +99,14 @@ def set_data(dataset_name):
if __name__ == '__main__':
- Conn = MySQLdb.Connect(**parse_db_uri())
- Cursor = Conn.cursor()
+ with database_connection as conn:
+ with conn.cursor() as cursor:
+ success, _ = bulk(es, set_data(cursor, sys.argv[1]))
- success, _ = bulk(es, set_data(sys.argv[1]))
-
- response = es.search(
- index="traits", doc_type="trait", body={
- "query": {"match": {"name": "ENSMUSG00000028982"}}
- }
- )
+ response = es.search(
+ index="traits", doc_type="trait", body={
+ "query": {"match": {"name": "ENSMUSG00000028982"}}
+ }
+ )
- print(response)
+ print(response)
diff --git a/wqflask/maintenance/set_resource_defaults.py b/wqflask/maintenance/set_resource_defaults.py
index 0f472494..22d73ba3 100644
--- a/wqflask/maintenance/set_resource_defaults.py
+++ b/wqflask/maintenance/set_resource_defaults.py
@@ -30,10 +30,9 @@ from utility.tools import SQL_URI
from utility.redis_tools import get_redis_conn, get_user_id, add_resource, get_resources, get_resource_info
Redis = get_redis_conn()
-import MySQLdb
-
import urllib.parse
+from wqflask.database import database_connection
from utility.logger import getLogger
logger = getLogger(__name__)
@@ -53,14 +52,14 @@ def parse_db_uri():
return db_conn_info
-def insert_probeset_resources(default_owner_id):
+def insert_probeset_resources(cursor, default_owner_id):
current_resources = Redis.hgetall("resources")
- Cursor.execute(""" SELECT
+ cursor.execute(""" SELECT
ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.confidentiality, ProbeSetFreeze.public
FROM
ProbeSetFreeze""")
- resource_results = Cursor.fetchall()
+ resource_results = cursor.fetchall()
for i, resource in enumerate(resource_results):
resource_ob = {}
resource_ob['name'] = resource[1]
@@ -80,9 +79,9 @@ def insert_probeset_resources(default_owner_id):
add_resource(resource_ob, update=False)
-def insert_publish_resources(default_owner_id):
+def insert_publish_resources(cursor, default_owner_id):
current_resources = Redis.hgetall("resources")
- Cursor.execute(""" SELECT
+ cursor.execute(""" SELECT
PublishXRef.Id, PublishFreeze.Id, InbredSet.InbredSetCode
FROM
PublishXRef, PublishFreeze, InbredSet, Publication
@@ -91,7 +90,7 @@ def insert_publish_resources(default_owner_id):
InbredSet.Id = PublishXRef.InbredSetId AND
Publication.Id = PublishXRef.PublicationId""")
- resource_results = Cursor.fetchall()
+ resource_results = cursor.fetchall()
for resource in resource_results:
if resource[2]:
resource_ob = {}
@@ -114,14 +113,14 @@ def insert_publish_resources(default_owner_id):
continue
-def insert_geno_resources(default_owner_id):
+def insert_geno_resources(cursor, default_owner_id):
current_resources = Redis.hgetall("resources")
- Cursor.execute(""" SELECT
+ cursor.execute(""" SELECT
GenoFreeze.Id, GenoFreeze.ShortName, GenoFreeze.confidentiality
FROM
GenoFreeze""")
- resource_results = Cursor.fetchall()
+ resource_results = cursor.fetchall()
for i, resource in enumerate(resource_results):
resource_ob = {}
resource_ob['name'] = resource[1]
@@ -147,15 +146,15 @@ def insert_geno_resources(default_owner_id):
def insert_resources(default_owner_id):
current_resources = get_resources()
print("START")
- insert_publish_resources(default_owner_id)
+ insert_publish_resources(cursor, default_owner_id)
print("AFTER PUBLISH")
- insert_geno_resources(default_owner_id)
+ insert_geno_resources(cursor, default_owner_id)
print("AFTER GENO")
- insert_probeset_resources(default_owner_id)
+ insert_probeset_resources(cursor, default_owner_id)
print("AFTER PROBESET")
-def main():
+def main(cursor):
"""Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
Redis.delete("resources")
@@ -166,6 +165,6 @@ def main():
if __name__ == '__main__':
- Conn = MySQLdb.Connect(**parse_db_uri())
- Cursor = Conn.cursor()
- main()
+ with database_connection() as conn:
+ with conn.cursor() as cursor:
+ main(cursor)