From 58f59b8f7df82969b58a604070aec095d17e0501 Mon Sep 17 00:00:00 2001
From: Muriithi Frederick Muriuki
Date: Mon, 30 Aug 2021 11:44:37 +0300
Subject: Fix issues with traits file format

* README.md: update header: Traits ==> Trait
* gn3/computations/qtlreaper.py: update header: Traits ==> Trait
* qtlfilesexport.py: Choose only BXD strains

  Rename the first column header from "Traits" to "Trait" to correspond with
  what `rust-qtlreaper` expects.

  Choose only the BXD strains for the proof-of-concept example - this helped
  bring out the fact that the traits file SHOULD NOT contain a strain column
  for a strain that does not exist in the genotype file in consideration.

  If the traits file has a strain column which does not exist in the genotype
  file, then `rust-qtlreaper` fails with a panic, since, from what I can tell,
  it tries to get a value from the genotype file for the non-existent strain,
  which results to a `None` type. Subsequent attempts at running an operation
  on the `None` type lead to the panic.
---
 README.md                     |  4 +++-
 gn3/computations/qtlreaper.py |  2 +-
 qtlfilesexport.py             | 31 ++++++++++++++++++++++++++++++-
 3 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 0e0e509..b54015f 100644
--- a/README.md
+++ b/README.md
@@ -136,8 +136,10 @@ Under the **"Trait"** column, the traits are numbered from **T1** to **T<n>** wh
 As an example, you could end up with a trait file like the following:
 
 ```txt
-Traits	BXD27	BXD32	DBA/2J	BXD21	...
+Trait	BXD27	BXD32	DBA/2J	BXD21	...
 T1	10.5735	9.27408	9.48255	9.18253	...
 T2	6.4471	6.7191	5.98015	6.68051	...
 ...
 ```
+
+It is very important that the column header names for the strains correspond to the genotype file used.
diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py
index a88659e..9b13a55 100644
--- a/gn3/computations/qtlreaper.py
+++ b/gn3/computations/qtlreaper.py
@@ -24,7 +24,7 @@ def generate_traits_file(strains, trait_values, traits_filename):
     traits_filename: The tab-separated value to put the values in for
         computation of QTLs.
     """
-    header = "Traits\t{}\n".format("\t".join(strains))
+    header = "Trait\t{}\n".format("\t".join(strains))
     data = [header] + [
         "T{}\t{}\n".format(i+1, "\t".join([str(i) for i in t]))
         for i, t in enumerate(trait_values[:-1])] + [
diff --git a/qtlfilesexport.py b/qtlfilesexport.py
index 0543dc9..adc5e77 100644
--- a/qtlfilesexport.py
+++ b/qtlfilesexport.py
@@ -41,7 +41,36 @@ def main():
         retrieve_trait_info(threshold, fullname, conn)
         for fullname in trait_fullnames()]
     traits_data_list = [retrieve_trait_data(t, conn) for t in traits]
-    strains = list(set([k for td in traits_data_list for k in td["data"].keys()]))
+    # strains = list(set([k for td in traits_data_list for k in td["data"].keys()]))
+    strains = [# Use only the strains in the BXD.geno genotype file
+        "BXD1", "BXD2", "BXD5", "BXD6", "BXD8", "BXD9", "BXD11", "BXD12",
+        "BXD13", "BXD14", "BXD15", "BXD16", "BXD18", "BXD19", "BXD20", "BXD21",
+        "BXD22", "BXD23", "BXD24", "BXD24a", "BXD25", "BXD27", "BXD28", "BXD29",
+        "BXD30", "BXD31", "BXD32", "BXD33", "BXD34", "BXD35", "BXD36", "BXD37",
+        "BXD38", "BXD39", "BXD40", "BXD41", "BXD42", "BXD43", "BXD44", "BXD45",
+        "BXD48", "BXD48a", "BXD49", "BXD50", "BXD51", "BXD52", "BXD53", "BXD54",
+        "BXD55", "BXD56", "BXD59", "BXD60", "BXD61", "BXD62", "BXD63", "BXD64",
+        "BXD65", "BXD65a", "BXD65b", "BXD66", "BXD67", "BXD68", "BXD69",
+        "BXD70", "BXD71", "BXD72", "BXD73", "BXD73a", "BXD73b", "BXD74",
+        "BXD75", "BXD76", "BXD77", "BXD78", "BXD79", "BXD81", "BXD83", "BXD84",
+        "BXD85", "BXD86", "BXD87", "BXD88", "BXD89", "BXD90", "BXD91", "BXD93",
+        "BXD94", "BXD95", "BXD98", "BXD99", "BXD100", "BXD101", "BXD102",
+        "BXD104", "BXD105", "BXD106", "BXD107", "BXD108", "BXD109", "BXD110",
+        "BXD111", "BXD112", "BXD113", "BXD114", "BXD115", "BXD116", "BXD117",
+        "BXD119", "BXD120", "BXD121", "BXD122", "BXD123", "BXD124", "BXD125",
+        "BXD126", "BXD127", "BXD128", "BXD128a", "BXD130", "BXD131", "BXD132",
+        "BXD133", "BXD134", "BXD135", "BXD136", "BXD137", "BXD138", "BXD139",
+        "BXD141", "BXD142", "BXD144", "BXD145", "BXD146", "BXD147", "BXD148",
+        "BXD149", "BXD150", "BXD151", "BXD152", "BXD153", "BXD154", "BXD155",
+        "BXD156", "BXD157", "BXD160", "BXD161", "BXD162", "BXD165", "BXD168",
+        "BXD169", "BXD170", "BXD171", "BXD172", "BXD173", "BXD174", "BXD175",
+        "BXD176", "BXD177", "BXD178", "BXD180", "BXD181", "BXD183", "BXD184",
+        "BXD186", "BXD187", "BXD188", "BXD189", "BXD190", "BXD191", "BXD192",
+        "BXD193", "BXD194", "BXD195", "BXD196", "BXD197", "BXD198", "BXD199",
+        "BXD200", "BXD201", "BXD202", "BXD203", "BXD204", "BXD205", "BXD206",
+        "BXD207", "BXD208", "BXD209", "BXD210", "BXD211", "BXD212", "BXD213",
+        "BXD214", "BXD215", "BXD216", "BXD217", "BXD218", "BXD219", "BXD220"
+    ]
     exported_traits_data_list = [
         export_trait_data(td, strains) for td in traits_data_list]
     slinked = slink(cluster_traits(exported_traits_data_list))
-- 
cgit v1.2.3