Add code generating the new genotype files

Also made a large number of other fixes that proved necessary during testing
author: zsloan 2022-03-10 00:45:11 +0000
committer: zsloan 2022-03-16 14:41:09 -0500
commit: a51f95bea5fa9a3b767aaebf75adfa706cf7940f (patch)
tree: bafe10f09d4021d69de32c1b10198606c14ec0a2 /wqflask/maintenance
parent: 7e3b91d11ee59c34fc4d59c7ca94d6702ec7c5bd (diff)
download: genenetwork2-a51f95bea5fa9a3b767aaebf75adfa706cf7940f.tar.gz
1 files changed, 85 insertions, 29 deletions
diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index 9a97626d..e705119f 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -2,6 +2,7 @@
 # python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.json
 # python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.2.geno BXD.4.geno BXD.5.geno
 
+import json
 import os
 import sys
 from typing import List
@@ -28,23 +29,37 @@ def main(args):
     # The source group(s) we're generating the .geno files from
     # This can be passed as either a specific .geno file (or set of files as multiple arguments),
     # or as a JSON file containing a set of .geno files (and their corresponding file names and sample lists)
+    geno_json = {}
+    source_files = []
     if ".json" in args[4]:
-        source_files = [geno_dir + genofile['location'] for genofile in json.load(args[4])['genofile']]
+        geno_json = json.load(open(geno_dir + args[4], "r"))
+        par_f1s = {
+            "mat": geno_json['mat'],
+            "pat": geno_json['pat'],
+            "f1s": geno_json['f1s']
+        }
+
+        # List of file titles and locations from JSON
+        source_files = [{'title': genofile['title'], 'location': geno_dir + genofile['location']} for genofile in geno_json['genofile']]
     else:
-        source_files = [geno_dir + group + ".geno" if ".geno" not in group else group for group in args[4:]]
+        par_f1s = {}
+        # List of files directly taken from command line arguments, with titles just set to the filename
+        for group in args[4:]:
+            file_name = geno_dir + group + ".geno" if ".geno" not in group else group
+            source_files.append({'title': file_name[:-5], 'location': file_name})
 
     if len(source_files) > 1:
         # Generate a JSON file pointing to the new target genotype files, in situations where there are multiple source .geno files
-        target_json_loc = out_dir + args[3].split(".")[:-1] + ".json"
+        target_json_loc = out_dir + ".".join(args[3].split(".")[:-1]) + ".json"
         target_json = {'genofile': []}
 
     # Generate the output .geno files
     for source_file in source_files:
-        filename, samples = generate_new_genofile(source_file, target_file)
+        filename, samples = generate_new_genofile(source_file['location'], target_file, par_f1s, out_dir)
 
         target_json['genofile'].append({
             'location': filename.split("/")[-1],
-            'title': filename.split("/")[-1],
+            'title': source_file['title'],
             'sample_list': samples
         })
 
@@ -55,20 +70,59 @@ def get_strain_for_sample(sample):
         "SELECT CaseAttributeXRefNew.Value "
         "FROM CaseAttributeXRefNew, Strain "
         "WHERE CaseAttributeXRefNew.CaseAttributeId=11 "
-        "AND CaseAttributeXRef.New.StrainId = Strain.Id "
+        "AND CaseAttributeXRefNew.StrainId = Strain.Id "
         "AND Strain.Name = %(name)s" )
 
-    with conn.cursor() as cursor:
-        return cursor.execute(query, {"name": name}).fetchone()[0]
+    with conn().cursor() as cursor:
+        cursor.execute(query, {"name": sample.strip()})
+        return cursor.fetchone()[0]
 
-def generate_new_genofiles(source_genofile, target_genofile):
-    base_samples = group_samples(source_genofile)
-    base_genotypes = strain_genotypes(source_genofile)
+def generate_new_genofile(source_genofile, target_genofile, par_f1s, out_dir):
+    source_samples = group_samples(source_genofile)
+    source_genotypes = strain_genotypes(source_genofile)
     target_samples = group_samples(target_genofile)
-    strain_pos_map = map_strain_pos_to_target_group(base_samples, target_samples)
+    strain_pos_map = map_strain_pos_to_target_group(source_samples, target_samples, par_f1s)
 
+    if len(source_genofile.split("/")[-1].split(".")) > 2:
+        # The number in the source genofile; for example 4 in BXD.4.geno
+        source_num = source_genofile.split("/")[-1].split(".")[-2]
+        target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + "." + source_num + ".geno"
+    else:
+        target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + ".geno"
+
+    file_location = out_dir + target_filename
+
+    with open(file_location, "w") as fh:
+        for metadata in ["name", "type", "mat", "pat", "het", "unk"]:
+            fh.write("@" + metadata + ":" + source_genotypes[metadata] + "\n")
+
+        header_line = ["Chr", "Locus", "cM", "Mb"] + target_samples
+        fh.write("\t".join(header_line))
+
+        for marker in source_genotypes['markers']:
+            line_items = [
+                marker['Chr'],
+                marker['Locus'],
+                marker['cM'],
+                marker['Mb']
+            ]
+
+            for pos in strain_pos_map:
+                if isinstance(pos, int):
+                    line_items.append(marker['genotypes'][pos])
+                else:
+                    if pos in ["mat", "pat"]:
+                        line_items.append(source_genotypes[pos])
+                    elif pos == "f1s":
+                        line_items.append("H")
+                    else:
+                        line_items.append("U")
 
-def map_strain_pos_to_target_group(base_samples, target_samples):
+            fh.write("\t".join(line_items) + "\n")
+
+    return file_location, target_samples
+
+def map_strain_pos_to_target_group(source_samples, target_samples, par_f1s):
     """
     Retrieve corresponding strain position for each sample in the target group
 
@@ -82,7 +136,14 @@ def map_strain_pos_to_target_group(base_samples, target_samples):
     pos_map = []
     for sample in target_samples:
         sample_strain = get_strain_for_sample(sample)
-        pos_map.append(base_samples.index(sample_strain))
+        if sample_strain in source_samples:
+            pos_map.append(source_samples.index(sample_strain))
+        else:
+            val = "U"
+            for key in par_f1s.keys():
+                if sample_strain in par_f1s[key]:
+                    val = key
+            pos_map.append(val)
 
     return pos_map
 
@@ -128,27 +189,21 @@ def strain_genotypes(strain_genofile: str) -> List:
     geno_start_col = None
     header_columns = []
     sample_list = []
-    marker_genotypes = []
-    with open(file_location, "r") as source_geno:
+    markers = []
+    with open(strain_genofile, "r") as source_geno:
         for i, line in enumerate(source_geno):
             if line[0] == "@":
-                if "@type" in line:
-                    geno_dict['type'] = line.split(":")[1]
-                if "@mat" in line:
-                    geno_dict['mat'] = line.split(":")[1]
-                elif "@pat" in line:
-                    geno_dict['pat'] = line.split(":")[1]
-                elif "@het" in line:
-                    geno_dict['het'] = line.split(":")[1]
-                elif "@unk" in line:
-                    geno_dict['unk'] = line.split(":")[1]
+                metadata_type = line[1:].split(":")[0]
+                if metadata_type in ['name', 'type', 'mat', 'pat', 'het', 'unk']:
+                    geno_dict[metadata_type] = line.split(":")[1].strip()
+
+                continue
 
             # Skip other header lines
             if line[0] == "#" or not len(line):
                 continue
 
             line_items = line.split("\t")
-
             if "Chr" in line_items: # Header row
                 # Get the first column index containing genotypes
                 header_columns = line_items
@@ -169,9 +224,10 @@ def strain_genotypes(strain_genofile: str) -> List:
                     'cM': line_items[header_columns.index("cM")],
                     'genotypes': [item.strip() for item in line_items][geno_start_col:]
                 }
-                marker_genotypes.append(this_marker)
 
-    geno_dict['genotypes'] = marker_genotypes
+                markers.append(this_marker)
+
+    geno_dict['markers'] = markers
 
     return geno_dict
author	zsloan	2022-03-10 00:45:11 +0000
committer	zsloan	2022-03-16 14:41:09 -0500
commit	a51f95bea5fa9a3b767aaebf75adfa706cf7940f (patch)
tree	bafe10f09d4021d69de32c1b10198606c14ec0a2 /wqflask/maintenance
parent	7e3b91d11ee59c34fc4d59c7ca94d6702ec7c5bd (diff)
download	genenetwork2-a51f95bea5fa9a3b767aaebf75adfa706cf7940f.tar.gz