aboutsummaryrefslogtreecommitdiff
path: root/wqflask/maintenance
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r--wqflask/maintenance/convert_dryad_to_bimbam.py70
-rw-r--r--wqflask/maintenance/convert_geno_to_bimbam.py77
-rw-r--r--wqflask/maintenance/gen_select_dataset.py9
-rw-r--r--wqflask/maintenance/geno_to_json.py195
-rw-r--r--wqflask/maintenance/get_group_samplelists.py10
-rw-r--r--wqflask/maintenance/quantile_normalize.py129
6 files changed, 410 insertions, 80 deletions
diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py
new file mode 100644
index 00000000..e833b395
--- /dev/null
+++ b/wqflask/maintenance/convert_dryad_to_bimbam.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+
+"""
+Convert data dryad files to a BIMBAM _geno and _snps file
+
+
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+
+
+def read_dryad_file(filename):
+ exclude_count = 0
+ marker_list = []
+ sample_dict = {}
+ sample_list = []
+ geno_rows = []
+ with open(filename, 'r') as the_file:
+ for i, line in enumerate(the_file):
+ if i > 0:
+ if line.split(" ")[1] == "no":
+ sample_name = line.split(" ")[0]
+ sample_list.append(sample_name)
+ sample_dict[sample_name] = line.split(" ")[2:]
+ else:
+ exclude_count += 1
+ else:
+ marker_list = line.split(" ")[2:]
+
+ for i, marker in enumerate(marker_list):
+ this_row = []
+ this_row.append(marker)
+ this_row.append("X")
+ this_row.append("Y")
+ for sample in sample_list:
+ this_row.append(sample_dict[sample][i])
+ geno_rows.append(this_row)
+
+ print(exclude_count)
+
+ return geno_rows
+
+ #for i, marker in enumerate(marker_list):
+ # this_row = []
+ # this_row.append(marker)
+ # this_row.append("X")
+ # this_row.append("Y")
+ # with open(filename, 'r') as the_file:
+ # for j, line in enumerate(the_file):
+ # if j > 0:
+ # this_row.append(line.split(" ")[i+2])
+ # print("row: " + str(i))
+ # geno_rows.append(this_row)
+ #
+ #return geno_rows
+
+def write_bimbam_files(geno_rows):
+ with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh:
+ for row in geno_rows:
+ geno_fh.write(", ".join(row) + "\n")
+
+def convert_dryad_to_bimbam(filename):
+ geno_file_rows = read_dryad_file(filename)
+ write_bimbam_files(geno_file_rows)
+
+if __name__=="__main__":
+ input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+ convert_dryad_to_bimbam(input_filename) \ No newline at end of file
diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py
index 05006d5c..45522705 100644
--- a/wqflask/maintenance/convert_geno_to_bimbam.py
+++ b/wqflask/maintenance/convert_geno_to_bimbam.py
@@ -17,17 +17,12 @@ import glob
import traceback
import gzip
-#import numpy as np
-#from pyLMM import lmm
-
import simplejson as json
from pprint import pformat as pf
class EmptyConfigurations(Exception): pass
-
-
class Marker(object):
def __init__(self):
self.name = None
@@ -39,47 +34,34 @@ class Marker(object):
class ConvertGenoFile(object):
def __init__(self, input_file, output_files):
-
self.input_file = input_file
self.output_files = output_files
-
+
self.mb_exists = False
self.cm_exists = False
self.markers = []
-
+
self.latest_row_pos = None
self.latest_col_pos = None
-
+
self.latest_row_value = None
self.latest_col_value = None
-
- def convert(self):
+ def convert(self):
self.haplotype_notation = {
'@mat': "1",
'@pat': "0",
'@het': "0.5",
'@unk': "NA"
}
-
+
self.configurations = {}
- #self.skipped_cols = 3
-
- #if self.input_file.endswith(".geno.gz"):
- # print("self.input_file: ", self.input_file)
- # self.input_fh = gzip.open(self.input_file)
- #else:
self.input_fh = open(self.input_file)
-
- with open(self.output_files[0], "w") as self.geno_fh:
- #if self.file_type == "geno":
- self.process_csv()
- #elif self.file_type == "snps":
- # self.process_snps_file()
+ self.process_csv()
def process_csv(self):
- for row_count, row in enumerate(self.process_rows()):
+ for row in self.process_rows():
row_items = row.split("\t")
this_marker = Marker()
@@ -102,53 +84,30 @@ class ConvertGenoFile(object):
this_marker.genotypes.append(self.configurations[genotype.upper().strip()])
else:
this_marker.genotypes.append("NA")
-
- #print("this_marker is:", pf(this_marker.__dict__))
- #if this_marker.chr == "14":
+
self.markers.append(this_marker.__dict__)
self.write_to_bimbam()
-
- # with open(self.output_file, 'w') as fh:
- # json.dump(self.markers, fh, indent=" ", sort_keys=True)
-
- # print('configurations:', str(configurations))
- #self.latest_col_pos = item_count + self.skipped_cols
- #self.latest_col_value = item
-
- #if item_count != 0:
- # self.output_fh.write(" ")
- #self.output_fh.write(self.configurations[item.upper()])
-
- #self.output_fh.write("\n")
def write_to_bimbam(self):
with open(self.output_files[0], "w") as geno_fh:
- # geno_fh.write(str(len(self.sample_list)) + "\n")
- # geno_fh.write("2\n")
- # geno_fh.write("IND")
- # for sample in self.sample_list:
- # geno_fh.write(" " + sample)
- # geno_fh.write("\n")
for marker in self.markers:
geno_fh.write(marker['name'])
geno_fh.write(", X, Y")
geno_fh.write(", " + ", ".join(marker['genotypes']))
geno_fh.write("\n")
-
- #pheno_fh = open(self.output_files[1], 'w')
+
with open(self.output_files[1], "w") as pheno_fh:
for sample in self.sample_list:
pheno_fh.write("1\n")
-
+
with open(self.output_files[2], "w") as snp_fh:
for marker in self.markers:
if self.mb_exists:
snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n")
else:
snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n")
-
-
+
def get_sample_list(self, row_contents):
self.sample_list = []
if self.mb_exists:
@@ -164,8 +123,6 @@ class ConvertGenoFile(object):
def process_rows(self):
for self.latest_row_pos, row in enumerate(self.input_fh):
- #if self.input_file.endswith(".geno.gz"):
- # print("row: ", row)
self.latest_row_value = row
# Take care of headers
if not row.strip():
@@ -208,10 +165,8 @@ class ConvertGenoFile(object):
convertob.convert()
except EmptyConfigurations as why:
print(" No config info? Continuing...")
- #excepted = True
continue
except Exception as why:
-
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
@@ -219,12 +174,6 @@ class ConvertGenoFile(object):
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break
-
- #def process_snps_file(cls, snps_file, new_directory):
- # output_file = os.path.join(new_directory, "mouse_families.json")
- # print("%s -> %s" % (snps_file, output_file))
- # convertob = ConvertGenoFile(input_file, output_file)
-
if __name__=="__main__":
Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/"""
@@ -234,6 +183,4 @@ if __name__=="__main__":
#convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
#convertob.convert()
ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
- #ConvertGenoFiles(Geno_Directory)
-
- #process_csv(Input_File, Output_File) \ No newline at end of file
+ #ConvertGenoFiles(Geno_Directory) \ No newline at end of file
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index 79242661..18b2dac9 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -63,7 +63,7 @@ from pprint import pformat as pf
#conn = Engine.connect()
-def parse_db_uri(db_uri):
+def parse_db_uri():
"""Converts a database URI to the db name, host name, user name, and password"""
parsed_uri = urlparse.urlparse(SQL_URI)
@@ -78,7 +78,6 @@ def parse_db_uri(db_uri):
return db_conn_info
-
def get_species():
"""Build species list"""
Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
@@ -265,7 +264,7 @@ def build_datasets(species, group, type_name):
def main():
"""Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
- parse_db_uri(SQL_URI)
+ parse_db_uri()
species = get_species()
groups = get_groups(species)
@@ -304,6 +303,6 @@ def _test_it():
#print("build_datasets:", pf(datasets))
if __name__ == '__main__':
- Conn = MySQLdb.Connect(**parse_db_uri(SQL_URI))
+ Conn = MySQLdb.Connect(**parse_db_uri())
Cursor = Conn.cursor()
- main()
+ main() \ No newline at end of file
diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py
new file mode 100644
index 00000000..789a1691
--- /dev/null
+++ b/wqflask/maintenance/geno_to_json.py
@@ -0,0 +1,195 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to json
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to json files that are used when running the marker regression
+code
+
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+#import numpy as np
+#from pyLMM import lmm
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+class EmptyConfigurations(Exception): pass
+
+
+
+class Marker(object):
+ def __init__(self):
+ self.name = None
+ self.chr = None
+ self.cM = None
+ self.Mb = None
+ self.genotypes = []
+
+class ConvertGenoFile(object):
+
+ def __init__(self, input_file, output_file):
+
+ self.input_file = input_file
+ self.output_file = output_file
+
+ self.mb_exists = False
+ self.cm_exists = False
+ self.markers = []
+
+ self.latest_row_pos = None
+ self.latest_col_pos = None
+
+ self.latest_row_value = None
+ self.latest_col_value = None
+
+ def convert(self):
+
+ self.haplotype_notation = {
+ '@mat': "1",
+ '@pat': "0",
+ '@het': "0.5",
+ '@unk': "NA"
+ }
+
+ self.configurations = {}
+ #self.skipped_cols = 3
+
+ #if self.input_file.endswith(".geno.gz"):
+ # print("self.input_file: ", self.input_file)
+ # self.input_fh = gzip.open(self.input_file)
+ #else:
+ self.input_fh = open(self.input_file)
+
+ with open(self.output_file, "w") as self.output_fh:
+ #if self.file_type == "geno":
+ self.process_csv()
+ #elif self.file_type == "snps":
+ # self.process_snps_file()
+
+
+ def process_csv(self):
+ for row_count, row in enumerate(self.process_rows()):
+ row_items = row.split("\t")
+
+ this_marker = Marker()
+ this_marker.name = row_items[1]
+ this_marker.chr = row_items[0]
+ if self.cm_exists and self.mb_exists:
+ this_marker.cM = row_items[2]
+ this_marker.Mb = row_items[3]
+ genotypes = row_items[4:]
+ elif self.cm_exists:
+ this_marker.cM = row_items[2]
+ genotypes = row_items[3:]
+ elif self.mb_exists:
+ this_marker.Mb = row_items[2]
+ genotypes = row_items[3:]
+ else:
+ genotypes = row_items[2:]
+ for item_count, genotype in enumerate(genotypes):
+ if genotype.upper() in self.configurations:
+ this_marker.genotypes.append(self.configurations[genotype.upper()])
+ else:
+ this_marker.genotypes.append("NA")
+
+ #print("this_marker is:", pf(this_marker.__dict__))
+ #if this_marker.chr == "14":
+ self.markers.append(this_marker.__dict__)
+
+ with open(self.output_file, 'w') as fh:
+ json.dump(self.markers, fh, indent=" ", sort_keys=True)
+
+ # print('configurations:', str(configurations))
+ #self.latest_col_pos = item_count + self.skipped_cols
+ #self.latest_col_value = item
+
+ #if item_count != 0:
+ # self.output_fh.write(" ")
+ #self.output_fh.write(self.configurations[item.upper()])
+
+ #self.output_fh.write("\n")
+
+
+ def process_rows(self):
+ for self.latest_row_pos, row in enumerate(self.input_fh):
+ #if self.input_file.endswith(".geno.gz"):
+ # print("row: ", row)
+ self.latest_row_value = row
+ # Take care of headers
+ if not row.strip():
+ continue
+ if row.startswith('#'):
+ continue
+ if row.startswith('Chr'):
+ if 'Mb' in row.split():
+ self.mb_exists = True
+ if 'cM' in row.split():
+ self.cm_exists = True
+ continue
+ if row.startswith('@'):
+ key, _separater, value = row.partition(':')
+ key = key.strip()
+ value = value.strip()
+ if key in self.haplotype_notation:
+ self.configurations[value] = self.haplotype_notation[key]
+ continue
+ if not len(self.configurations):
+ raise EmptyConfigurations
+ yield row
+
+ @classmethod
+ def process_all(cls, old_directory, new_directory):
+ os.chdir(old_directory)
+ for input_file in glob.glob("*"):
+ if not input_file.endswith(('geno', '.geno.gz')):
+ continue
+ group_name = ".".join(input_file.split('.')[:-1])
+ output_file = os.path.join(new_directory, group_name + ".json")
+ print("%s -> %s" % (
+ os.path.join(old_directory, input_file), output_file))
+ convertob = ConvertGenoFile(input_file, output_file)
+ try:
+ convertob.convert()
+ except EmptyConfigurations as why:
+ print(" No config info? Continuing...")
+ #excepted = True
+ continue
+ except Exception as why:
+
+ print(" Exception:", why)
+ print(traceback.print_exc())
+ print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+ convertob.latest_col_pos))
+ print(" Column is:", convertob.latest_col_value)
+ print(" Row is:", convertob.latest_row_value)
+ break
+
+ #def process_snps_file(cls, snps_file, new_directory):
+ # output_file = os.path.join(new_directory, "mouse_families.json")
+ # print("%s -> %s" % (snps_file, output_file))
+ # convertob = ConvertGenoFile(input_file, output_file)
+
+
+
+if __name__=="__main__":
+ Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/"""
+ New_Geno_Directory = """/home/zas1024/genotype_files/genotype/json/"""
+ #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
+ #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
+ #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
+ #convertob.convert()
+ ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
+ #ConvertGenoFiles(Geno_Directory)
+
+ #process_csv(Input_File, Output_File) \ No newline at end of file
diff --git a/wqflask/maintenance/get_group_samplelists.py b/wqflask/maintenance/get_group_samplelists.py
index 04e94886..1dc6c46c 100644
--- a/wqflask/maintenance/get_group_samplelists.py
+++ b/wqflask/maintenance/get_group_samplelists.py
@@ -6,16 +6,6 @@ import gzip
from base import webqtlConfig
-def process_genofiles(geno_dir=webqtlConfig.GENODIR):
- print("Yabba")
- #sys.exit("Dabba")
- os.chdir(geno_dir)
- for geno_file in glob.glob("*"):
- if geno_file.lower().endswith(('.geno', '.geno.gz')):
- #group_name = genofilename.split('.')[0]
- sample_list = get_samplelist(geno_file)
-
-
def get_samplelist(file_type, geno_file):
if file_type == "geno":
return get_samplelist_from_geno(geno_file)
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
new file mode 100644
index 00000000..41a3aad8
--- /dev/null
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -0,0 +1,129 @@
+from __future__ import absolute_import, print_function, division
+
+import sys
+sys.path.insert(0,'./')
+
+from itertools import izip
+
+import MySQLdb
+import urlparse
+
+import numpy as np
+import pandas as pd
+from elasticsearch import Elasticsearch, TransportError
+from elasticsearch.helpers import bulk
+
+from flask import Flask, g, request
+
+from wqflask import app
+from utility.elasticsearch_tools import get_elasticsearch_connection
+from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI
+
+def parse_db_uri():
+ """Converts a database URI to the db name, host name, user name, and password"""
+
+ parsed_uri = urlparse.urlparse(SQL_URI)
+
+ db_conn_info = dict(
+ db = parsed_uri.path[1:],
+ host = parsed_uri.hostname,
+ user = parsed_uri.username,
+ passwd = parsed_uri.password)
+
+ print(db_conn_info)
+ return db_conn_info
+
+def create_dataframe(input_file):
+ with open(input_file) as f:
+ ncols = len(f.readline().split("\t"))
+
+ input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=range(1, ncols))
+ return pd.DataFrame(input_array)
+
+#This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+def quantileNormalize(df_input):
+ df = df_input.copy()
+ #compute rank
+ dic = {}
+ for col in df:
+ dic.update({col : sorted(df[col])})
+ sorted_df = pd.DataFrame(dic)
+ rank = sorted_df.mean(axis = 1).tolist()
+ #sort
+ for col in df:
+ t = np.searchsorted(np.sort(df[col]), df[col])
+ df[col] = [rank[i] for i in t]
+ return df
+
+def set_data(dataset_name):
+ orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
+
+ sample_list = []
+ with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh:
+ for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)):
+ trait_dict = {}
+ sample_list = []
+ if i == 0:
+ sample_names = line1.split('\t')[1:]
+ else:
+ trait_name = line1.split('\t')[0]
+ for i, sample in enumerate(sample_names):
+ this_sample = {
+ "name": sample,
+ "value": line1.split('\t')[i+1],
+ "qnorm": line2.split('\t')[i+1]
+ }
+ sample_list.append(this_sample)
+ query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
+ FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
+ WHERE Species.Id = InbredSet.SpeciesId and
+ InbredSet.Id = ProbeFreeze.InbredSetId and
+ ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and
+ ProbeSetFreeze.Name = '%s' and
+ ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
+ ProbeSetXRef.ProbeSetId = ProbeSet.Id and
+ ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0])
+ Cursor.execute(query)
+ result_info = Cursor.fetchone()
+
+ yield {
+ "_index": "traits",
+ "_type": "trait",
+ "_source": {
+ "name": trait_name,
+ "species": result_info[0],
+ "group": result_info[1],
+ "dataset": dataset_name,
+ "dataset_fullname": result_info[2],
+ "samples": sample_list,
+ "transform_types": "qnorm"
+ }
+ }
+
+if __name__ == '__main__':
+ Conn = MySQLdb.Connect(**parse_db_uri())
+ Cursor = Conn.cursor()
+
+ #es = Elasticsearch([{
+ # "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT
+ #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None
+
+ es = get_elasticsearch_connection(for_user=False)
+
+ #input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+ #input_df = create_dataframe(input_filename)
+ #output_df = quantileNormalize(input_df)
+
+ #output_df.to_csv('quant_norm.csv', sep='\t')
+
+ #out_filename = sys.argv[1][:-4] + '_quantnorm.txt'
+
+ success, _ = bulk(es, set_data(sys.argv[1]))
+
+ response = es.search(
+ index = "traits", doc_type = "trait", body = {
+ "query": { "match": { "name": "ENSMUSG00000028982" } }
+ }
+ )
+
+ print(response) \ No newline at end of file