From 4a7e2c1602ed82aabd7d04953067ba49cb1cebff Mon Sep 17 00:00:00 2001
From: Frederick Muriuki Muriithi
Date: Thu, 10 Mar 2022 08:55:26 +0300
Subject: Use context manager with database connection

Use the `with` context manager with database connections and cursors
to ensure that they are closed once they are no longer needed.

Where it was not feasible to use the `with` context manager without a
huge refactor/rewrite, the cursors and connections are closed manually.
---
 wqflask/maintenance/gen_select_dataset.py          | 36 +++++++++-------------
 .../maintenance/generate_probesetfreeze_file.py    | 26 ++++++----------
 wqflask/maintenance/quantile_normalize.py          | 27 ++++++++--------
 wqflask/maintenance/set_resource_defaults.py       | 35 ++++++++++-----------
 4 files changed, 53 insertions(+), 71 deletions(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index db65a11f..9f4b670d 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -39,21 +39,13 @@ from wqflask import app
 
 from utility.tools import locate, locate_ignore_error, TEMPDIR, SQL_URI
 
-import MySQLdb
-
 import simplejson as json
 import urllib.parse
 
 
-#import sqlalchemy as sa
-
 from pprint import pformat as pf
 
-#Engine = sa.create_engine(zach_settings.SQL_URI)
-
-# build MySql database connection
-
-#conn = Engine.connect()
+from wqflask.database import database_connection
 
 
 def parse_db_uri():
@@ -71,19 +63,19 @@ def parse_db_uri():
     return db_conn_info
 
 
-def get_species():
+def get_species(cursor):
     """Build species list"""
-    #Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
-    Cursor.execute("select Name, MenuName from Species order by OrderId")
-    species = list(Cursor.fetchall())
+    #cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
+    cursor.execute("select Name, MenuName from Species order by OrderId")
+    species = list(cursor.fetchall())
     return species
 
 
-def get_groups(species):
+def get_groups(cursor, species):
     """Build groups list"""
     groups = {}
     for species_name, _species_full_name in species:
-        Cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
+        cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
                        Species,
                        ProbeFreeze, GenoFreeze, PublishFreeze where Species.Name = '%s'
                        and InbredSet.SpeciesId = Species.Id and
@@ -92,7 +84,7 @@ def get_groups(species):
                         or ProbeFreeze.InbredSetId = InbredSet.Id)
                         group by InbredSet.Name
                         order by InbredSet.FullName""" % species_name)
-        results = Cursor.fetchall()
+        results = cursor.fetchall()
         groups[species_name] = list(results)
     return groups
 
@@ -273,13 +265,13 @@ def build_datasets(species, group, type_name):
     return datasets
 
 
-def main():
+def main(cursor):
     """Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
 
     parse_db_uri()
 
-    species = get_species()
-    groups = get_groups(species)
+    species = get_species(cursor)
+    groups = get_groups(cursor, species)
     types = get_types(groups)
     datasets = get_datasets(types)
 
@@ -316,6 +308,6 @@ def _test_it():
 
 
 if __name__ == '__main__':
-    Conn = MySQLdb.Connect(**parse_db_uri())
-    Cursor = Conn.cursor()
-    main()
+    with database_connection() as conn:
+        with conn.cursor() as cursor:
+            main(cursor)
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py
index e964c8ed..f43f952b 100644
--- a/wqflask/maintenance/generate_probesetfreeze_file.py
+++ b/wqflask/maintenance/generate_probesetfreeze_file.py
@@ -8,20 +8,11 @@ import os
 import collections
 import csv
 
-import MySQLdb
-
 from base import webqtlConfig
 
 from pprint import pformat as pf
 
-
-def get_cursor():
-    con = MySQLdb.Connect(db=webqtlConfig.DB_UPDNAME,
-                          host=webqtlConfig.MYSQL_UPDSERVER,
-                          user=webqtlConfig.DB_UPDUSER,
-                          passwd=webqtlConfig.DB_UPDPASSWD)
-    cursor = con.cursor()
-    return cursor
+from wqflask.database import database_connection
 
 
 def show_progress(process, counter):
@@ -116,13 +107,14 @@ def main():
         "(Oct08)_RankInv_Beta.txt")
     dataset_name = "Eye_AXBXA_1008_RankInv"
 
-    cursor = get_cursor()
-    strains = get_strains(cursor)
-    print("Getting probset_vals")
-    probeset_vals = get_probeset_vals(cursor, dataset_name)
-    print("Finished getting probeset_vals")
-    trimmed_strains = trim_strains(strains, probeset_vals)
-    write_data_matrix_file(trimmed_strains, probeset_vals, filename)
+    with database_connection() as conn:  # call the factory to get the connection context manager
+        with conn.cursor() as cursor:
+            strains = get_strains(cursor)
+            print("Getting probset_vals")
+            probeset_vals = get_probeset_vals(cursor, dataset_name)
+            print("Finished getting probeset_vals")
+            trimmed_strains = trim_strains(strains, probeset_vals)
+            write_data_matrix_file(trimmed_strains, probeset_vals, filename)
 
 
 if __name__ == '__main__':
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
index 32780ca6..2e2b0ec3 100644
--- a/wqflask/maintenance/quantile_normalize.py
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -1,6 +1,5 @@
 import sys
 sys.path.insert(0, './')
-import MySQLdb
 import urllib.parse
 
 import numpy as np
@@ -9,6 +8,7 @@ import pandas as pd
 from flask import Flask, g, request
 
 from wqflask import app
+from wqflask.database import database_connection
 
 
 def parse_db_uri():
@@ -52,7 +52,7 @@ def quantileNormalize(df_input):
     return df
 
 
-def set_data(dataset_name):
+def set_data(cursor, dataset_name):
     orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
 
     sample_list = []
@@ -80,8 +80,8 @@ def set_data(dataset_name):
                                  ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
                                  ProbeSetXRef.ProbeSetId = ProbeSet.Id and
                                  ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0])
-                Cursor.execute(query)
-                result_info = Cursor.fetchone()
+                cursor.execute(query)
+                result_info = cursor.fetchone()
 
                 yield {
                     "_index": "traits",
@@ -99,15 +99,14 @@ def set_data(dataset_name):
 
 
 if __name__ == '__main__':
-    Conn = MySQLdb.Connect(**parse_db_uri())
-    Cursor = Conn.cursor()
+    with database_connection() as conn:  # call the factory to get the connection context manager
+        with conn.cursor() as cursor:  # cursor() must be called; the bound method is not a context manager
+            success, _ = bulk(es, set_data(cursor, sys.argv[1]))
 
-    success, _ = bulk(es, set_data(sys.argv[1]))
-
-    response = es.search(
-        index="traits", doc_type="trait", body={
-            "query": {"match": {"name": "ENSMUSG00000028982"}}
-        }
-    )
+            response = es.search(
+                index="traits", doc_type="trait", body={
+                    "query": {"match": {"name": "ENSMUSG00000028982"}}
+                }
+            )
 
-    print(response)
+            print(response)
diff --git a/wqflask/maintenance/set_resource_defaults.py b/wqflask/maintenance/set_resource_defaults.py
index 0f472494..22d73ba3 100644
--- a/wqflask/maintenance/set_resource_defaults.py
+++ b/wqflask/maintenance/set_resource_defaults.py
@@ -30,10 +30,9 @@ from utility.tools import SQL_URI
 from utility.redis_tools import get_redis_conn, get_user_id, add_resource, get_resources, get_resource_info
 Redis = get_redis_conn()
 
-import MySQLdb
-
 import urllib.parse
 
+from wqflask.database import database_connection
 from utility.logger import getLogger
 logger = getLogger(__name__)
 
@@ -53,14 +52,14 @@ def parse_db_uri():
     return db_conn_info
 
 
-def insert_probeset_resources(default_owner_id):
+def insert_probeset_resources(cursor, default_owner_id):
     current_resources = Redis.hgetall("resources")
-    Cursor.execute("""  SELECT
+    cursor.execute("""  SELECT
                             ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.confidentiality, ProbeSetFreeze.public
                         FROM
                             ProbeSetFreeze""")
 
-    resource_results = Cursor.fetchall()
+    resource_results = cursor.fetchall()
     for i, resource in enumerate(resource_results):
         resource_ob = {}
         resource_ob['name'] = resource[1]
@@ -80,9 +79,9 @@ def insert_probeset_resources(default_owner_id):
         add_resource(resource_ob, update=False)
 
 
-def insert_publish_resources(default_owner_id):
+def insert_publish_resources(cursor, default_owner_id):
     current_resources = Redis.hgetall("resources")
-    Cursor.execute("""  SELECT 
+    cursor.execute("""  SELECT 
                             PublishXRef.Id, PublishFreeze.Id, InbredSet.InbredSetCode
                         FROM
                             PublishXRef, PublishFreeze, InbredSet, Publication
@@ -91,7 +90,7 @@ def insert_publish_resources(default_owner_id):
                             InbredSet.Id = PublishXRef.InbredSetId AND
                             Publication.Id = PublishXRef.PublicationId""")
 
-    resource_results = Cursor.fetchall()
+    resource_results = cursor.fetchall()
     for resource in resource_results:
         if resource[2]:
             resource_ob = {}
@@ -114,14 +113,14 @@ def insert_publish_resources(default_owner_id):
             continue
 
 
-def insert_geno_resources(default_owner_id):
+def insert_geno_resources(cursor, default_owner_id):
     current_resources = Redis.hgetall("resources")
-    Cursor.execute("""  SELECT
+    cursor.execute("""  SELECT
                             GenoFreeze.Id, GenoFreeze.ShortName, GenoFreeze.confidentiality
                         FROM
                             GenoFreeze""")
 
-    resource_results = Cursor.fetchall()
+    resource_results = cursor.fetchall()
     for i, resource in enumerate(resource_results):
         resource_ob = {}
         resource_ob['name'] = resource[1]
@@ -147,15 +146,15 @@ def insert_geno_resources(default_owner_id):
 def insert_resources(default_owner_id):
     current_resources = get_resources()
     print("START")
-    insert_publish_resources(default_owner_id)
+    insert_publish_resources(cursor, default_owner_id)
     print("AFTER PUBLISH")
-    insert_geno_resources(default_owner_id)
+    insert_geno_resources(cursor, default_owner_id)
     print("AFTER GENO")
-    insert_probeset_resources(default_owner_id)
+    insert_probeset_resources(cursor, default_owner_id)
     print("AFTER PROBESET")
 
 
-def main():
+def main(cursor):
     """Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
 
     Redis.delete("resources")
@@ -166,6 +165,6 @@ def main():
 
 
 if __name__ == '__main__':
-    Conn = MySQLdb.Connect(**parse_db_uri())
-    Cursor = Conn.cursor()
-    main()
+    with database_connection() as conn:
+        with conn.cursor() as cursor:
+            main(cursor)
-- 
cgit v1.2.3


From e841dd524ee33386a47abb694dea90363de144b8 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Fri, 28 Jan 2022 22:31:01 +0000
Subject: Add in-progress gen_ind_genofiles.py

gen_ind_genofiles.py is a command line script to generate genotype files for groups of
individuals/samples, taking a source .geno or .json file and a target 'dummy' .geno file as input
---
 wqflask/maintenance/gen_ind_genofiles.py | 120 +++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 wqflask/maintenance/gen_ind_genofiles.py

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
new file mode 100644
index 00000000..546bc60d
--- /dev/null
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -0,0 +1,120 @@
+# Example command: env GN2_PROFILE=/usr/local/guix-profiles/gn-latest-20220122 TMPDIR=/export/local/home/zas1024/gn2-zach/tmp WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG SERVER_PORT=5002 GENENETWORK_FILES=/export/local/home/zas1024/gn2-zach/genotype_files SQL_URI=mysql://webqtlout:webqtlout@localhost/db_webqtl ./bin/genenetwork2 ./etc/default_settings.py -c ./maintenance/gen_ind_genofiles.py
+
+import sys
+
+import MySQLdb
+
+#from flask import Blueprint
+
+from wqflask import app
+
+from gn3.db.datasets import retrieve_group_samples
+
+#gen_geno = Blueprint('gen_geno', __name__)
+
+def db_conn():
+    return MySQLdb.Connect(db=app.config.get("DB_NAME"),
+                           user=app.config.get("DB_USER"),
+                           passwd=app.config.get("DB_PASS"),
+                           host=app.config.get("DB_HOST"))
+
+def main(args):
+
+    # The file of the "main" .geno file for the group in question
+    # For example: BXD.geno or BXD.6.geno if converting to BXD individual genofiles
+    strain_genofile = args[1] 
+
+    # Get genotypes from the source strain genofile
+    strain_genotypes(strain_genofile)
+
+    # The target individuals/samples group(s) we're generating the .geno files for
+    # This can be passed as either a specific .geno file, or as a JSON file
+    # containing a set of .geno files (and their corresponding file names and sample lists)
+    if ".json" in args[2]:
+        target_groups = json.load(args[2])['genofile']
+    else:
+        target_groups = [args[2]]
+
+def group_samples(target_group):
+    """
+    Get the group samples from its "dummy" .geno file (which still contains the sample list)
+    """
+
+    # Allow for inputting the target group as either the group name or .geno file
+    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + target_group\
+    if ".geno" not in target_group:
+        file_location += ".geno"
+
+    sample_list = []
+    with open(file_location, "r") as target_geno:
+        for i, line in enumerate(target_geno):
+            # Skip header lines
+            if line[0] in ["#", "@"] or not len(line):
+                continue
+    
+            line_items = line.split("\t")
+            sample_list = [item for item in line_items if item not in ["Chr", "Locus", "Mb", "cM"]]
+            break
+
+    return sample_list
+
+def strain_genotypes(strain_genofile: str) -> List:
+    """
+    Read genotypes from source strain .geno file
+
+    :param strain_genofile: string of genofile filename
+    :return: a list of dictionaries representing each marker's genotypes
+
+    Example output: [
+        {
+            'Chr': '1',
+            'Locus': 'marker1',
+            'Mb': '10.0',
+            'cM': '8.0',
+            'genotypes': [('BXD1', 'B'), ('BXD2', 'D'), ('BXD3', 'H'), ...]
+        },
+        ...
+    ]
+    """
+
+    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + strain_genofile
+
+    geno_start_col = None
+    header_columns = []
+    sample_list = []
+    marker_genotypes = []
+    with open(file_location, "r") as source_geno:
+        for i, line in enumerate(source_geno):
+            # Skip header lines
+            if line[0] in ["#", "@"] or not len(line):
+                continue
+
+            line_items = line.split("\t")
+
+            if "Chr" in line_items: # Header row
+                # Get the first column index containing genotypes
+                header_columns = line_items
+                for j, item in enumerate(line_items):
+                    if item not in ["Chr", "Locus", "Mb", "cM"]:
+                        geno_start_col = j
+                        break
+
+                sample_list = line_items[geno_start_col:]
+                if not geno_start_col:
+                    print("Check .geno file - expected columns not found")
+                    sys.exit()
+            else: # Marker rows
+                this_marker = {
+                    'Chr': line_items[header_columns.index("Chr")],
+                    'Locus': line_items[header_columns.index("Locus")],
+                    'Mb': line_items[header_columns.index("Mb")],
+                    'cM': line_items[header_columns.index("cM")],
+                    'genotypes': zip(sample_list, line_items[geno_start_col:])
+                }
+                marker_genotypes.append(this_marker)
+
+    return marker_genotypes
+            
+if __name__ == "__main__":
+    print("command line arguments:\n\t%s" % sys.argv)
+    main(sys.argv)
-- 
cgit v1.2.3


From bdf0653adda955b93127a1ddb7e70f9ba490e8b8 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Fri, 28 Jan 2022 22:52:49 +0000
Subject: Minor changes/bug fixes

- Removed some unused code
- Strip marker genotype to avoid newline character at end
- Convert zip to list for marker genotypes
- Add typing to group_samples
- Rename strain_genofile to source_genofile
---
 wqflask/maintenance/gen_ind_genofiles.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index 546bc60d..ec0fcd55 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -1,17 +1,14 @@
 # Example command: env GN2_PROFILE=/usr/local/guix-profiles/gn-latest-20220122 TMPDIR=/export/local/home/zas1024/gn2-zach/tmp WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG SERVER_PORT=5002 GENENETWORK_FILES=/export/local/home/zas1024/gn2-zach/genotype_files SQL_URI=mysql://webqtlout:webqtlout@localhost/db_webqtl ./bin/genenetwork2 ./etc/default_settings.py -c ./maintenance/gen_ind_genofiles.py
 
 import sys
+from typing import List
 
 import MySQLdb
 
-#from flask import Blueprint
-
 from wqflask import app
 
 from gn3.db.datasets import retrieve_group_samples
 
-#gen_geno = Blueprint('gen_geno', __name__)
-
 def db_conn():
     return MySQLdb.Connect(db=app.config.get("DB_NAME"),
                            user=app.config.get("DB_USER"),
@@ -22,10 +19,7 @@ def main(args):
 
     # The file of the "main" .geno file for the group in question
     # For example: BXD.geno or BXD.6.geno if converting to BXD individual genofiles
-    strain_genofile = args[1] 
-
-    # Get genotypes from the source strain genofile
-    strain_genotypes(strain_genofile)
+    source_genofile = args[1] 
 
     # The target individuals/samples group(s) we're generating the .geno files for
     # This can be passed as either a specific .geno file, or as a JSON file
@@ -35,13 +29,16 @@ def main(args):
     else:
         target_groups = [args[2]]
 
-def group_samples(target_group):
+    # Generate the output .geno files
+    generate_new_genofiles(strain_genotypes(source_genofile), target_groups)
+
+def group_samples(target_group: str) -> List:
     """
     Get the group samples from its "dummy" .geno file (which still contains the sample list)
     """
 
     # Allow for inputting the target group as either the group name or .geno file
-    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + target_group\
+    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + target_group
     if ".geno" not in target_group:
         file_location += ".geno"
 
@@ -109,7 +106,7 @@ def strain_genotypes(strain_genofile: str) -> List:
                     'Locus': line_items[header_columns.index("Locus")],
                     'Mb': line_items[header_columns.index("Mb")],
                     'cM': line_items[header_columns.index("cM")],
-                    'genotypes': zip(sample_list, line_items[geno_start_col:])
+                    'genotypes': list(zip(sample_list, [item.strip() for item in line_items][geno_start_col:]))
                 }
                 marker_genotypes.append(this_marker)
 
-- 
cgit v1.2.3


From 45694eb46fa0c337d8b2f1be945bdb96c4a2af44 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Fri, 28 Jan 2022 22:54:29 +0000
Subject: Change EOL from CRLF to LF

---
 wqflask/maintenance/gen_ind_genofiles.py | 234 +++++++++++++++----------------
 1 file changed, 117 insertions(+), 117 deletions(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index ec0fcd55..abca4a4a 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -1,117 +1,117 @@
-# Example command: env GN2_PROFILE=/usr/local/guix-profiles/gn-latest-20220122 TMPDIR=/export/local/home/zas1024/gn2-zach/tmp WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG SERVER_PORT=5002 GENENETWORK_FILES=/export/local/home/zas1024/gn2-zach/genotype_files SQL_URI=mysql://webqtlout:webqtlout@localhost/db_webqtl ./bin/genenetwork2 ./etc/default_settings.py -c ./maintenance/gen_ind_genofiles.py
-
-import sys
-from typing import List
-
-import MySQLdb
-
-from wqflask import app
-
-from gn3.db.datasets import retrieve_group_samples
-
-def db_conn():
-    return MySQLdb.Connect(db=app.config.get("DB_NAME"),
-                           user=app.config.get("DB_USER"),
-                           passwd=app.config.get("DB_PASS"),
-                           host=app.config.get("DB_HOST"))
-
-def main(args):
-
-    # The file of the "main" .geno file for the group in question
-    # For example: BXD.geno or BXD.6.geno if converting to BXD individual genofiles
-    source_genofile = args[1] 
-
-    # The target individuals/samples group(s) we're generating the .geno files for
-    # This can be passed as either a specific .geno file, or as a JSON file
-    # containing a set of .geno files (and their corresponding file names and sample lists)
-    if ".json" in args[2]:
-        target_groups = json.load(args[2])['genofile']
-    else:
-        target_groups = [args[2]]
-
-    # Generate the output .geno files
-    generate_new_genofiles(strain_genotypes(source_genofile), target_groups)
-
-def group_samples(target_group: str) -> List:
-    """
-    Get the group samples from its "dummy" .geno file (which still contains the sample list)
-    """
-
-    # Allow for inputting the target group as either the group name or .geno file
-    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + target_group
-    if ".geno" not in target_group:
-        file_location += ".geno"
-
-    sample_list = []
-    with open(file_location, "r") as target_geno:
-        for i, line in enumerate(target_geno):
-            # Skip header lines
-            if line[0] in ["#", "@"] or not len(line):
-                continue
-    
-            line_items = line.split("\t")
-            sample_list = [item for item in line_items if item not in ["Chr", "Locus", "Mb", "cM"]]
-            break
-
-    return sample_list
-
-def strain_genotypes(strain_genofile: str) -> List:
-    """
-    Read genotypes from source strain .geno file
-
-    :param strain_genofile: string of genofile filename
-    :return: a list of dictionaries representing each marker's genotypes
-
-    Example output: [
-        {
-            'Chr': '1',
-            'Locus': 'marker1',
-            'Mb': '10.0',
-            'cM': '8.0',
-            'genotypes': [('BXD1', 'B'), ('BXD2', 'D'), ('BXD3', 'H'), ...]
-        },
-        ...
-    ]
-    """
-
-    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + strain_genofile
-
-    geno_start_col = None
-    header_columns = []
-    sample_list = []
-    marker_genotypes = []
-    with open(file_location, "r") as source_geno:
-        for i, line in enumerate(source_geno):
-            # Skip header lines
-            if line[0] in ["#", "@"] or not len(line):
-                continue
-
-            line_items = line.split("\t")
-
-            if "Chr" in line_items: # Header row
-                # Get the first column index containing genotypes
-                header_columns = line_items
-                for j, item in enumerate(line_items):
-                    if item not in ["Chr", "Locus", "Mb", "cM"]:
-                        geno_start_col = j
-                        break
-
-                sample_list = line_items[geno_start_col:]
-                if not geno_start_col:
-                    print("Check .geno file - expected columns not found")
-                    sys.exit()
-            else: # Marker rows
-                this_marker = {
-                    'Chr': line_items[header_columns.index("Chr")],
-                    'Locus': line_items[header_columns.index("Locus")],
-                    'Mb': line_items[header_columns.index("Mb")],
-                    'cM': line_items[header_columns.index("cM")],
-                    'genotypes': list(zip(sample_list, [item.strip() for item in line_items][geno_start_col:]))
-                }
-                marker_genotypes.append(this_marker)
-
-    return marker_genotypes
-            
-if __name__ == "__main__":
-    print("command line arguments:\n\t%s" % sys.argv)
-    main(sys.argv)
+# Example command: env GN2_PROFILE=/usr/local/guix-profiles/gn-latest-20220122 TMPDIR=/export/local/home/zas1024/gn2-zach/tmp WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG SERVER_PORT=5002 GENENETWORK_FILES=/export/local/home/zas1024/gn2-zach/genotype_files SQL_URI=mysql://webqtlout:webqtlout@localhost/db_webqtl ./bin/genenetwork2 ./etc/default_settings.py -c ./maintenance/gen_ind_genofiles.py
+
+import sys
+from typing import List
+
+import MySQLdb
+
+from wqflask import app
+
+from gn3.db.datasets import retrieve_group_samples
+
+def db_conn():
+    return MySQLdb.Connect(db=app.config.get("DB_NAME"),
+                           user=app.config.get("DB_USER"),
+                           passwd=app.config.get("DB_PASS"),
+                           host=app.config.get("DB_HOST"))
+
+def main(args):
+
+    # The file of the "main" .geno file for the group in question
+    # For example: BXD.geno or BXD.6.geno if converting to BXD individual genofiles
+    source_genofile = args[1] 
+
+    # The target individuals/samples group(s) we're generating the .geno files for
+    # This can be passed as either a specific .geno file, or as a JSON file
+    # containing a set of .geno files (and their corresponding file names and sample lists)
+    if ".json" in args[2]:
+        target_groups = json.load(args[2])['genofile']
+    else:
+        target_groups = [args[2]]
+
+    # Generate the output .geno files
+    generate_new_genofiles(strain_genotypes(source_genofile), target_groups)
+
+def group_samples(target_group: str) -> List:
+    """
+    Get the group samples from its "dummy" .geno file (which still contains the sample list)
+    """
+
+    # Allow for inputting the target group as either the group name or .geno file
+    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + target_group
+    if ".geno" not in target_group:
+        file_location += ".geno"
+
+    sample_list = []
+    with open(file_location, "r") as target_geno:
+        for i, line in enumerate(target_geno):
+            # Skip header lines
+            if line[0] in ["#", "@"] or not len(line):
+                continue
+    
+            line_items = line.split("\t")
+            sample_list = [item for item in line_items if item not in ["Chr", "Locus", "Mb", "cM"]]
+            break
+
+    return sample_list
+
+def strain_genotypes(strain_genofile: str) -> List:
+    """
+    Read genotypes from source strain .geno file
+
+    :param strain_genofile: string of genofile filename
+    :return: a list of dictionaries representing each marker's genotypes
+
+    Example output: [
+        {
+            'Chr': '1',
+            'Locus': 'marker1',
+            'Mb': '10.0',
+            'cM': '8.0',
+            'genotypes': [('BXD1', 'B'), ('BXD2', 'D'), ('BXD3', 'H'), ...]
+        },
+        ...
+    ]
+    """
+
+    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + strain_genofile
+
+    geno_start_col = None
+    header_columns = []
+    sample_list = []
+    marker_genotypes = []
+    with open(file_location, "r") as source_geno:
+        for i, line in enumerate(source_geno):
+            # Skip header lines
+            if line[0] in ["#", "@"] or not len(line):
+                continue
+
+            line_items = line.split("\t")
+
+            if "Chr" in line_items: # Header row
+                # Get the first column index containing genotypes
+                header_columns = line_items
+                for j, item in enumerate(line_items):
+                    if item not in ["Chr", "Locus", "Mb", "cM"]:
+                        geno_start_col = j
+                        break
+
+                sample_list = line_items[geno_start_col:]
+                if not geno_start_col:
+                    print("Check .geno file - expected columns not found")
+                    sys.exit()
+            else: # Marker rows
+                this_marker = {
+                    'Chr': line_items[header_columns.index("Chr")],
+                    'Locus': line_items[header_columns.index("Locus")],
+                    'Mb': line_items[header_columns.index("Mb")],
+                    'cM': line_items[header_columns.index("cM")],
+                    'genotypes': list(zip(sample_list, [item.strip() for item in line_items][geno_start_col:]))
+                }
+                marker_genotypes.append(this_marker)
+
+    return marker_genotypes
+            
+if __name__ == "__main__":
+    print("command line arguments:\n\t%s" % sys.argv)
+    main(sys.argv)
-- 
cgit v1.2.3


From 743a4623c53d30779cb884a69d0cf2c7ff411f0a Mon Sep 17 00:00:00 2001
From: zsloan
Date: Wed, 9 Mar 2022 19:13:59 +0000
Subject: Add function for getting strain name from sample name

---
 wqflask/maintenance/gen_ind_genofiles.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index abca4a4a..6e818945 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -7,9 +7,7 @@ import MySQLdb
 
 from wqflask import app
 
-from gn3.db.datasets import retrieve_group_samples
-
-def db_conn():
+def conn():
     return MySQLdb.Connect(db=app.config.get("DB_NAME"),
                            user=app.config.get("DB_USER"),
                            passwd=app.config.get("DB_PASS"),
@@ -32,6 +30,17 @@ def main(args):
     # Generate the output .geno files
     generate_new_genofiles(strain_genotypes(source_genofile), target_groups)
 
+def get_strain_for_sample(sample):
+    query = (
+        "SELECT CaseAttributeXRefNew.Value "
+        "FROM CaseAttributeXRefNew, Strain "
+        "WHERE CaseAttributeXRefNew.CaseAttributeId=11 "
+        "AND CaseAttributeXRef.New.StrainId = Strain.Id "
+        "AND Strain.Name = %(name)s" )
+
+    with conn.cursor() as cursor:
+        return cursor.execute(query, {"name": name}).fetchone()[0]
+
 def group_samples(target_group: str) -> List:
     """
     Get the group samples from its "dummy" .geno file (which still contains the sample list)
@@ -115,3 +124,4 @@ def strain_genotypes(strain_genofile: str) -> List:
 if __name__ == "__main__":
     print("command line arguments:\n\t%s" % sys.argv)
     main(sys.argv)
+
-- 
cgit v1.2.3


From 27530d5a59bded06f644e4704ef21cb6da491350 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Wed, 9 Mar 2022 19:41:55 +0000
Subject: Add function for mapping strain to sample pos + begin creating
 generate_new_genofiles function

---
 wqflask/maintenance/gen_ind_genofiles.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index 6e818945..b91660a4 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -28,7 +28,7 @@ def main(args):
         target_groups = [args[2]]
 
     # Generate the output .geno files
-    generate_new_genofiles(strain_genotypes(source_genofile), target_groups)
+    generate_new_genofiles(source_genofile, strain_genotypes(source_genofile), target_groups)
 
 def get_strain_for_sample(sample):
     query = (
@@ -41,6 +41,33 @@ def get_strain_for_sample(sample):
     with conn.cursor() as cursor:
         return cursor.execute(query, {"name": name}).fetchone()[0]
 
+def generate_new_genofiles(source_genofile, strain_genotypes, target_groups):
+    for group in target_groups:
+        base_samples = group_samples(source_genofile)
+        target_samples = group_samples(group)
+        strain_pos_map = map_strain_pos_to_target_group(base_samples, target_samples)
+
+        new_genofile = app.config.get("GENENETWORK_FILES") + "/genotype/_" + group
+
+
+def map_strain_pos_to_target_group(base_samples, target_samples):
+    """
+    Retrieve corresponding strain position for each sample in the target group
+
+    This is so the genotypes from the base genofile can be mapped to the samples in the target group
+
+    For example:
+    Base strains: BXD1, BXD2, BXD3
+    Target samples: BXD1_1, BXD1_2, BXD2_1, BXD3_1, BXD3_2, BXD3_3
+    Returns: [0, 0, 1, 2, 2, 2]
+    """
+    pos_map = []
+    for i, sample in enumerate(target_samples):
+        sample_strain = get_strain_for_sample(sample)
+        pos_map.append(base_samples.index(sample_strain))
+
+    return pos_map
+
 def group_samples(target_group: str) -> List:
     """
     Get the group samples from its "dummy" .geno file (which still contains the sample list)
-- 
cgit v1.2.3


From f72480dee99ee6ab107bb84c6f3b5c663a04cc86 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Wed, 9 Mar 2022 20:01:53 +0000
Subject: Fix the way target/source genofiles were being processed + some other
 changes

- I was mixing up source/target genofiles previously; the JSON file is for the source genofiles
- references to the app context are removed in favor of just taking input as arguments or environment variables
- Updated example commands
---
 wqflask/maintenance/gen_ind_genofiles.py | 64 ++++++++++++++++----------------
 1 file changed, 31 insertions(+), 33 deletions(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index b91660a4..b781d7d1 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -1,34 +1,41 @@
-# Example command: env GN2_PROFILE=/usr/local/guix-profiles/gn-latest-20220122 TMPDIR=/export/local/home/zas1024/gn2-zach/tmp WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG SERVER_PORT=5002 GENENETWORK_FILES=/export/local/home/zas1024/gn2-zach/genotype_files SQL_URI=mysql://webqtlout:webqtlout@localhost/db_webqtl ./bin/genenetwork2 ./etc/default_settings.py -c ./maintenance/gen_ind_genofiles.py
+# Example commands:
+# python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.json
+# python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.2.geno BXD.4.geno BXD.5.geno
 
+import os
 import sys
 from typing import List
 
 import MySQLdb
 
-from wqflask import app
-
 def conn():
-    return MySQLdb.Connect(db=app.config.get("DB_NAME"),
-                           user=app.config.get("DB_USER"),
-                           passwd=app.config.get("DB_PASS"),
-                           host=app.config.get("DB_HOST"))
+    return MySQLdb.Connect(db=os.environ.get("DB_NAME"),
+                           user=os.environ.get("DB_USER"),
+                           passwd=os.environ.get("DB_PASS"),
+                           host=os.environ.get("DB_HOST"))
 
 def main(args):
 
-    # The file of the "main" .geno file for the group in question
-    # For example: BXD.geno or BXD.6.geno if converting to BXD individual genofiles
-    source_genofile = args[1] 
+    # Directory in which .geno files are located
+    geno_dir = args[1]
+
+    # Directory in which to output new files
+    out_dir = args[2]
+
+    # The individuals group that we want to generate a .geno file for
+    target_file = geno_dir + args[3]
 
-    # The target individuals/samples group(s) we're generating the .geno files for
-    # This can be passed as either a specific .geno file, or as a JSON file
-    # containing a set of .geno files (and their corresponding file names and sample lists)
-    if ".json" in args[2]:
-        target_groups = json.load(args[2])['genofile']
+    # The source group(s) we're generating the .geno files from
+    # This can be passed as either a specific .geno file (or set of files as multiple arguments),
+    # or as a JSON file containing a set of .geno files (and their corresponding file names and sample lists)
+    if ".json" in args[4]:
+        source_files = [geno_dir + genofile['location'] for genofile in json.load(args[4])['genofile']]
     else:
-        target_groups = [args[2]]
+        source_files = [geno_dir + group + ".geno" if ".geno" not in group else group for group in args[4:]]
 
     # Generate the output .geno files
-    generate_new_genofiles(source_genofile, strain_genotypes(source_genofile), target_groups)
+    for source_file in source_files:
+        generate_new_genofile(source_file, target_file)
 
 def get_strain_for_sample(sample):
     query = (
@@ -41,13 +48,11 @@ def get_strain_for_sample(sample):
     with conn.cursor() as cursor:
         return cursor.execute(query, {"name": name}).fetchone()[0]
 
-def generate_new_genofiles(source_genofile, strain_genotypes, target_groups):
-    for group in target_groups:
-        base_samples = group_samples(source_genofile)
-        target_samples = group_samples(group)
-        strain_pos_map = map_strain_pos_to_target_group(base_samples, target_samples)
-
-        new_genofile = app.config.get("GENENETWORK_FILES") + "/genotype/_" + group
+def generate_new_genofiles(source_genofile, target_genofile):
+    base_samples = group_samples(source_genofile)
+    base_genotypes = strain_genotypes(source_genofile)
+    target_samples = group_samples(target_genofile)
+    strain_pos_map = map_strain_pos_to_target_group(base_samples, target_samples)
 
 
 def map_strain_pos_to_target_group(base_samples, target_samples):
@@ -68,18 +73,13 @@ def map_strain_pos_to_target_group(base_samples, target_samples):
 
     return pos_map
 
-def group_samples(target_group: str) -> List:
+def group_samples(target_file: str) -> List:
     """
     Get the group samples from its "dummy" .geno file (which still contains the sample list)
     """
 
-    # Allow for inputting the target group as either the group name or .geno file
-    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + target_group
-    if ".geno" not in target_group:
-        file_location += ".geno"
-
     sample_list = []
-    with open(file_location, "r") as target_geno:
+    with open(target_file, "r") as target_geno:
         for i, line in enumerate(target_geno):
             # Skip header lines
             if line[0] in ["#", "@"] or not len(line):
@@ -110,8 +110,6 @@ def strain_genotypes(strain_genofile: str) -> List:
     ]
     """
 
-    file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + strain_genofile
-
     geno_start_col = None
     header_columns = []
     sample_list = []
-- 
cgit v1.2.3


From 7e3b91d11ee59c34fc4d59c7ca94d6702ec7c5bd Mon Sep 17 00:00:00 2001
From: zsloan
Date: Wed, 9 Mar 2022 20:26:12 +0000
Subject: Generate JSON file for target genotypes

Also store parents/type metadata from source genofiles
---
 wqflask/maintenance/gen_ind_genofiles.py | 41 +++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 6 deletions(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index b781d7d1..9a97626d 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -33,9 +33,22 @@ def main(args):
     else:
         source_files = [geno_dir + group + ".geno" if ".geno" not in group else group for group in args[4:]]
 
+    if len(source_files) > 1:
+        # Generate a JSON file pointing to the new target genotype files, in situations where there are multiple source .geno files
+        target_json_loc = out_dir + args[3].split(".")[:-1] + ".json"
+        target_json = {'genofile': []}
+
     # Generate the output .geno files
     for source_file in source_files:
-        generate_new_genofile(source_file, target_file)
+        filename, samples = generate_new_genofile(source_file, target_file)
+
+        target_json['genofile'].append({
+            'location': filename.split("/")[-1],
+            'title': filename.split("/")[-1],
+            'sample_list': samples
+        })
+
+    json.dump(target_json, open(target_json_loc, "w"))
 
 def get_strain_for_sample(sample):
     query = (
@@ -67,7 +80,7 @@ def map_strain_pos_to_target_group(base_samples, target_samples):
     Returns: [0, 0, 1, 2, 2, 2]
     """
     pos_map = []
-    for i, sample in enumerate(target_samples):
+    for sample in target_samples:
         sample_strain = get_strain_for_sample(sample)
         pos_map.append(base_samples.index(sample_strain))
 
@@ -110,14 +123,28 @@ def strain_genotypes(strain_genofile: str) -> List:
     ]
     """
 
+    geno_dict = {}
+
     geno_start_col = None
     header_columns = []
     sample_list = []
     marker_genotypes = []
     with open(file_location, "r") as source_geno:
         for i, line in enumerate(source_geno):
-            # Skip header lines
-            if line[0] in ["#", "@"] or not len(line):
+            if line[0] == "@":
+                if "@type" in line:
+                    geno_dict['type'] = line.split(":")[1]
+                if "@mat" in line:
+                    geno_dict['mat'] = line.split(":")[1]
+                elif "@pat" in line:
+                    geno_dict['pat'] = line.split(":")[1]
+                elif "@het" in line:
+                    geno_dict['het'] = line.split(":")[1]
+                elif "@unk" in line:
+                    geno_dict['unk'] = line.split(":")[1]
+
+            # Skip other header lines
+            if line[0] == "#" or not len(line):
                 continue
 
             line_items = line.split("\t")
@@ -140,11 +167,13 @@ def strain_genotypes(strain_genofile: str) -> List:
                     'Locus': line_items[header_columns.index("Locus")],
                     'Mb': line_items[header_columns.index("Mb")],
                     'cM': line_items[header_columns.index("cM")],
-                    'genotypes': list(zip(sample_list, [item.strip() for item in line_items][geno_start_col:]))
+                    'genotypes': [item.strip() for item in line_items][geno_start_col:]
                 }
                 marker_genotypes.append(this_marker)
 
-    return marker_genotypes
+    geno_dict['genotypes'] = marker_genotypes
+
+    return geno_dict
             
 if __name__ == "__main__":
     print("command line arguments:\n\t%s" % sys.argv)
-- 
cgit v1.2.3


From a51f95bea5fa9a3b767aaebf75adfa706cf7940f Mon Sep 17 00:00:00 2001
From: zsloan
Date: Thu, 10 Mar 2022 00:45:11 +0000
Subject: Add code generating the new genotype files

Also made a large number of other fixes that proved necessary during
testing
---
 wqflask/maintenance/gen_ind_genofiles.py | 114 +++++++++++++++++++++++--------
 1 file changed, 85 insertions(+), 29 deletions(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index 9a97626d..e705119f 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -2,6 +2,7 @@
 # python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.json
 # python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.2.geno BXD.4.geno BXD.5.geno
 
+import json
 import os
 import sys
 from typing import List
@@ -28,23 +29,37 @@ def main(args):
     # The source group(s) we're generating the .geno files from
     # This can be passed as either a specific .geno file (or set of files as multiple arguments),
     # or as a JSON file containing a set of .geno files (and their corresponding file names and sample lists)
+    geno_json = {}
+    source_files = []
     if ".json" in args[4]:
-        source_files = [geno_dir + genofile['location'] for genofile in json.load(args[4])['genofile']]
+        geno_json = json.load(open(geno_dir + args[4], "r"))
+        par_f1s = {
+            "mat": geno_json['mat'],
+            "pat": geno_json['pat'],
+            "f1s": geno_json['f1s']
+        }
+
+        # List of file titles and locations from JSON
+        source_files = [{'title': genofile['title'], 'location': geno_dir + genofile['location']} for genofile in geno_json['genofile']]
     else:
-        source_files = [geno_dir + group + ".geno" if ".geno" not in group else group for group in args[4:]]
+        par_f1s = {}
+        # List of files directly taken from command line arguments, with titles just set to the filename
+        for group in args[4:]:
+            file_name = geno_dir + group + ".geno" if ".geno" not in group else group
+            source_files.append({'title': file_name[:-5], 'location': file_name})
 
     if len(source_files) > 1:
         # Generate a JSON file pointing to the new target genotype files, in situations where there are multiple source .geno files
-        target_json_loc = out_dir + args[3].split(".")[:-1] + ".json"
+        target_json_loc = out_dir + ".".join(args[3].split(".")[:-1]) + ".json"
         target_json = {'genofile': []}
 
     # Generate the output .geno files
     for source_file in source_files:
-        filename, samples = generate_new_genofile(source_file, target_file)
+        filename, samples = generate_new_genofile(source_file['location'], target_file, par_f1s, out_dir)
 
         target_json['genofile'].append({
             'location': filename.split("/")[-1],
-            'title': filename.split("/")[-1],
+            'title': source_file['title'],
             'sample_list': samples
         })
 
@@ -55,20 +70,59 @@ def get_strain_for_sample(sample):
         "SELECT CaseAttributeXRefNew.Value "
         "FROM CaseAttributeXRefNew, Strain "
         "WHERE CaseAttributeXRefNew.CaseAttributeId=11 "
-        "AND CaseAttributeXRef.New.StrainId = Strain.Id "
+        "AND CaseAttributeXRefNew.StrainId = Strain.Id "
         "AND Strain.Name = %(name)s" )
 
-    with conn.cursor() as cursor:
-        return cursor.execute(query, {"name": name}).fetchone()[0]
+    with conn().cursor() as cursor:
+        cursor.execute(query, {"name": sample.strip()})
+        return cursor.fetchone()[0]
 
-def generate_new_genofiles(source_genofile, target_genofile):
-    base_samples = group_samples(source_genofile)
-    base_genotypes = strain_genotypes(source_genofile)
+def generate_new_genofile(source_genofile, target_genofile, par_f1s, out_dir):
+    source_samples = group_samples(source_genofile)
+    source_genotypes = strain_genotypes(source_genofile)
     target_samples = group_samples(target_genofile)
-    strain_pos_map = map_strain_pos_to_target_group(base_samples, target_samples)
+    strain_pos_map = map_strain_pos_to_target_group(source_samples, target_samples, par_f1s)
 
+    if len(source_genofile.split("/")[-1].split(".")) > 2:
+        # The number in the source genofile; for example 4 in BXD.4.geno
+        source_num = source_genofile.split("/")[-1].split(".")[-2]
+        target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + "." + source_num + ".geno"
+    else:
+        target_filename = ".".join(target_genofile.split("/")[-1].split(".")[:-1]) + ".geno"
+
+    file_location = out_dir + target_filename
+
+    with open(file_location, "w") as fh:
+        for metadata in ["name", "type", "mat", "pat", "het", "unk"]:
+            fh.write("@" + metadata + ":" + source_genotypes[metadata] + "\n")
+
+        header_line = ["Chr", "Locus", "cM", "Mb"] + target_samples
+        fh.write("\t".join(header_line))
+
+        for marker in source_genotypes['markers']:
+            line_items = [
+                marker['Chr'],
+                marker['Locus'],
+                marker['cM'],
+                marker['Mb']
+            ]
+
+            for pos in strain_pos_map:
+                if isinstance(pos, int):
+                    line_items.append(marker['genotypes'][pos])
+                else:
+                    if pos in ["mat", "pat"]:
+                        line_items.append(source_genotypes[pos])
+                    elif pos == "f1s":
+                        line_items.append("H")
+                    else:
+                        line_items.append("U")
 
-def map_strain_pos_to_target_group(base_samples, target_samples):
+            fh.write("\t".join(line_items) + "\n")
+
+    return file_location, target_samples
+
+def map_strain_pos_to_target_group(source_samples, target_samples, par_f1s):
     """
     Retrieve corresponding strain position for each sample in the target group
 
@@ -82,7 +136,14 @@ def map_strain_pos_to_target_group(base_samples, target_samples):
     pos_map = []
     for sample in target_samples:
         sample_strain = get_strain_for_sample(sample)
-        pos_map.append(base_samples.index(sample_strain))
+        if sample_strain in source_samples:
+            pos_map.append(source_samples.index(sample_strain))
+        else:
+            val = "U"
+            for key in par_f1s.keys():
+                if sample_strain in par_f1s[key]:
+                    val = key
+            pos_map.append(val)
 
     return pos_map
 
@@ -128,27 +189,21 @@ def strain_genotypes(strain_genofile: str) -> List:
     geno_start_col = None
     header_columns = []
     sample_list = []
-    marker_genotypes = []
-    with open(file_location, "r") as source_geno:
+    markers = []
+    with open(strain_genofile, "r") as source_geno:
         for i, line in enumerate(source_geno):
             if line[0] == "@":
-                if "@type" in line:
-                    geno_dict['type'] = line.split(":")[1]
-                if "@mat" in line:
-                    geno_dict['mat'] = line.split(":")[1]
-                elif "@pat" in line:
-                    geno_dict['pat'] = line.split(":")[1]
-                elif "@het" in line:
-                    geno_dict['het'] = line.split(":")[1]
-                elif "@unk" in line:
-                    geno_dict['unk'] = line.split(":")[1]
+                metadata_type = line[1:].split(":")[0]
+                if metadata_type in ['name', 'type', 'mat', 'pat', 'het', 'unk']:
+                    geno_dict[metadata_type] = line.split(":")[1].strip()
+
+                continue
 
             # Skip other header lines
             if line[0] == "#" or not len(line):
                 continue
 
             line_items = line.split("\t")
-
             if "Chr" in line_items: # Header row
                 # Get the first column index containing genotypes
                 header_columns = line_items
@@ -169,9 +224,10 @@ def strain_genotypes(strain_genofile: str) -> List:
                     'cM': line_items[header_columns.index("cM")],
                     'genotypes': [item.strip() for item in line_items][geno_start_col:]
                 }
-                marker_genotypes.append(this_marker)
 
-    geno_dict['genotypes'] = marker_genotypes
+                markers.append(this_marker)
+
+    geno_dict['markers'] = markers
 
     return geno_dict
             
-- 
cgit v1.2.3


From f76bca81639027a87e2d2cc5697258714d7bf7d9 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Thu, 10 Mar 2022 00:50:14 +0000
Subject: Replace top comment with docstring

---
 wqflask/maintenance/gen_ind_genofiles.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index e705119f..0c4efba0 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -1,6 +1,19 @@
-# Example commands:
-# python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.json
-# python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.2.geno BXD.4.geno BXD.5.geno
+#!/usr/bin/env python3
+"""A script that generates the genotype files for groups of individuals, using an existing strain genotype file as a basis
+
+Example commands:
+python3 gen_ind_genofiles.py
+        /home/zas1024/gn2-zach/genotype_files/genotype/
+        /home/zas1024/gn2-zach/new_geno/
+        BXD-Micturition.geno
+        BXD.json
+python3 gen_ind_genofiles.py
+        /home/zas1024/gn2-zach/genotype_files/genotype/
+        /home/zas1024/gn2-zach/new_geno/
+        BXD-Micturition.geno
+        BXD.2.geno BXD.4.geno BXD.5.geno
+
+"""
 
 import json
 import os
-- 
cgit v1.2.3


From 9126eda6159c5d605c41aae276f5dd9ba8df3f01 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Thu, 10 Mar 2022 00:51:07 +0000
Subject: Remove unnecessary print statement

---
 wqflask/maintenance/gen_ind_genofiles.py | 1 -
 1 file changed, 1 deletion(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py
index 0c4efba0..8b958efa 100644
--- a/wqflask/maintenance/gen_ind_genofiles.py
+++ b/wqflask/maintenance/gen_ind_genofiles.py
@@ -245,6 +245,5 @@ def strain_genotypes(strain_genofile: str) -> List:
     return geno_dict
             
 if __name__ == "__main__":
-    print("command line arguments:\n\t%s" % sys.argv)
     main(sys.argv)
 
-- 
cgit v1.2.3


From ec1ca101b15421c83de6094984dcec985a395d71 Mon Sep 17 00:00:00 2001
From: BonfaceKilz
Date: Thu, 17 Mar 2022 16:20:32 +0300
Subject: Create a db connection correctly

* wqflask/maintenance/quantile_normalize.py: Fix how the cursor is
created.
---
 wqflask/maintenance/quantile_normalize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'wqflask/maintenance')

diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
index 2e2b0ec3..90ec72de 100644
--- a/wqflask/maintenance/quantile_normalize.py
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -100,7 +100,7 @@ def set_data(cursor, dataset_name):
 
 if __name__ == '__main__':
     with database_connection as conn:
-        with conn.cursor as cursor:
+        with conn.cursor() as cursor:
             success, _ = bulk(es, set_data(cursor, sys.argv[1]))
 
             response = es.search(
-- 
cgit v1.2.3