aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorzsloan2023-10-24 17:41:16 +0000
committerzsloan2023-10-24 17:41:16 +0000
commit79ce392c5a482a0460e8a91344a0e63c9e9a8085 (patch)
tree36299a0b54123e150783d86c5316d5f654c616a1
parenta9dcc1091a3b02c79e9e6625f22dced79ee0a1a5 (diff)
downloadgenenetwork2-79ce392c5a482a0460e8a91344a0e63c9e9a8085.tar.gz
Add script for creating list of ProbeSet datasets with NAs for Max QTL
-rw-r--r--scripts/find_dbs_with_NAs.py54
1 files changed, 54 insertions, 0 deletions
diff --git a/scripts/find_dbs_with_NAs.py b/scripts/find_dbs_with_NAs.py
new file mode 100644
index 00000000..a4ff3ee5
--- /dev/null
+++ b/scripts/find_dbs_with_NAs.py
@@ -0,0 +1,54 @@
+import os
+from typing import Tuple
+from urllib.parse import urlparse
+import MySQLdb as mdb
+from MySQLdb.cursors import DictCursor
+
+def parse_db_url(sql_uri: str) -> Tuple:
+ """function to parse SQL_URI"""
+ parsed_db = urlparse(sql_uri)
+ return (
+ parsed_db.hostname, parsed_db.username, parsed_db.password,
+ parsed_db.path[1:], parsed_db.port)
+
+sql_uri = os.environ.get("SQL_URI")
+host, user, passwd, db_name, port = parse_db_url(sql_uri)
+conn = mdb.connect(db=db_name,
+ user=user,
+ passwd=passwd,
+ host=host,
+ port=port)
+
+query = (
+ "SELECT psf.Id, psf.FullName "
+ "FROM ProbeSetXRef AS psx "
+ "INNER JOIN ProbeSetFreeze AS psf ON psx.ProbeSetFreezeId = psf.Id "
+ "INNER JOIN ProbeFreeze AS pf ON psf.ProbeFreezeId = pf.Id "
+ "INNER JOIN InbredSet AS ibs ON pf.InbredSetId = ibs.Id "
+ "WHERE psx.Locus IS NULL AND "
+ "ibs.MappingMethodId = 1"
+)
+
+dataset_NAs_counts = {}
+dataset_name_dict = {} # Mapping of IDs to names
+with conn.cursor(cursorclass=DictCursor) as cursor:
+ cursor.execute(query)
+ for row in cursor.fetchall():
+ dset_id = str(row['Id'])
+ if dset_id in dataset_NAs_counts:
+ dataset_NAs_counts[dset_id] += 1
+ else:
+ dataset_NAs_counts[dset_id] = 1
+ if dset_id not in dataset_name_dict:
+ dataset_name_dict[dset_id] = str(row['FullName'])
+
+sorted_NAs_counts = {k: v for k, v in sorted(dataset_NAs_counts.items(), key=lambda item: item[1], reverse=True)}
+
+output_path = os.path.join(os.environ.get("TMPDIR"), "filtered_NAs_list.csv")
+with open(output_path, "w") as out_file:
+ out_file.write("ID\tFullName\tCount\n")
+ for dset_id in sorted_NAs_counts:
+ out_file.write("\t".join([str(dset_id), str(dataset_name_dict[dset_id]), str(sorted_NAs_counts[dset_id])]) + "\n")
+
+
+ \ No newline at end of file