From 79ce392c5a482a0460e8a91344a0e63c9e9a8085 Mon Sep 17 00:00:00 2001
From: zsloan
Date: Tue, 24 Oct 2023 17:41:16 +0000
Subject: Add script for creating list of ProbeSet datasets with NAs for Max
 QTL

---
 scripts/find_dbs_with_NAs.py | 54 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 scripts/find_dbs_with_NAs.py

diff --git a/scripts/find_dbs_with_NAs.py b/scripts/find_dbs_with_NAs.py
new file mode 100644
index 00000000..a4ff3ee5
--- /dev/null
+++ b/scripts/find_dbs_with_NAs.py
@@ -0,0 +1,54 @@
+import os
+from typing import Tuple
+from urllib.parse import urlparse
+import MySQLdb as mdb
+from MySQLdb.cursors import DictCursor
+
+def parse_db_url(sql_uri: str) -> Tuple:
+    """function to parse SQL_URI"""
+    parsed_db = urlparse(sql_uri)
+    return (
+        parsed_db.hostname, parsed_db.username, parsed_db.password,
+        parsed_db.path[1:], parsed_db.port)
+
+sql_uri = os.environ.get("SQL_URI")
+host, user, passwd, db_name, port = parse_db_url(sql_uri)
+conn = mdb.connect(db=db_name,
+                   user=user,
+                   passwd=passwd,
+                   host=host,
+                   port=port)
+
+query = (
+    "SELECT psf.Id, psf.FullName "
+    "FROM ProbeSetXRef AS psx "
+    "INNER JOIN ProbeSetFreeze AS psf ON psx.ProbeSetFreezeId = psf.Id "
+    "INNER JOIN ProbeFreeze AS pf ON psf.ProbeFreezeId = pf.Id "
+    "INNER JOIN InbredSet AS ibs ON pf.InbredSetId = ibs.Id "
+    "WHERE psx.Locus IS NULL AND "
+    "ibs.MappingMethodId = 1"
+)
+
+dataset_NAs_counts = {}
+dataset_name_dict = {} # Mapping of IDs to names
+with conn.cursor(cursorclass=DictCursor) as cursor:
+    cursor.execute(query)
+    for row in cursor.fetchall():
+        dset_id = str(row['Id'])
+        if dset_id in dataset_NAs_counts:
+            dataset_NAs_counts[dset_id] += 1
+        else:
+            dataset_NAs_counts[dset_id] = 1
+        if dset_id not in dataset_name_dict:
+            dataset_name_dict[dset_id] = str(row['FullName'])
+
+sorted_NAs_counts = {k: v for k, v in sorted(dataset_NAs_counts.items(), key=lambda item: item[1], reverse=True)}
+
+output_path = os.path.join(os.environ.get("TMPDIR"), "filtered_NAs_list.csv")
+with open(output_path, "w") as out_file:
+    out_file.write("ID\tFullName\tCount\n")
+    for dset_id in sorted_NAs_counts:
+        out_file.write("\t".join([str(dset_id), str(dataset_name_dict[dset_id]), str(sorted_NAs_counts[dset_id])]) + "\n")
+
+
+        
\ No newline at end of file
-- 
cgit 1.4.1