Diffstat (limited to 'scripts')
-rw-r--r--   scripts/add_missing_columns.sh           |   3
-rw-r--r--   scripts/authentication/group.py          | 153
-rw-r--r--   scripts/authentication/resource.py       | 104
-rw-r--r--   scripts/convert_dol_genotypes.py         |  74
-rwxr-xr-x   scripts/maintenance/datastructure.py     | 177
-rwxr-xr-x   scripts/maintenance/load_phenotypes.py   |  43
-rwxr-xr-x   scripts/maintenance/utilities.py         |  89
7 files changed, 625 insertions, 18 deletions
diff --git a/scripts/add_missing_columns.sh b/scripts/add_missing_columns.sh
index 70d5fdeb..611e2dd6 100644
--- a/scripts/add_missing_columns.sh
+++ b/scripts/add_missing_columns.sh
@@ -13,6 +13,9 @@
 ALTER TABLE PublishXRef
 ADD mean double AFTER DataId;
 
+ALTER TABLE CaseAttribute
+ADD Description varchar(255) AFTER Name;
+
 -- This takes some time
 ALTER TABLE ProbeSet
 ADD UniProtID varchar(20) AFTER ProteinName;
diff --git a/scripts/authentication/group.py b/scripts/authentication/group.py
new file mode 100644
index 00000000..c8c2caad
--- /dev/null
+++ b/scripts/authentication/group.py
@@ -0,0 +1,153 @@
+"""A script for adding users to a specific group.
+
+Example:
+
+Assuming there are no groups and 'test@bonfacemunyoki.com' does not
+exist in Redis:
+
+.. code-block:: bash
+
+   python group.py -g "editors" -m "test@bonfacemunyoki.com"
+
+results in::
+
+    Successfully created the group: 'editors'
+    Data: '{"admins": [], "members": []}'
+
+If 'me@bonfacemunyoki.com' exists in 'users' in Redis and we run:
+
+.. code-block:: bash
+
+   python group.py -g "editors" -m "me@bonfacemunyoki.com"
+
+now results in::
+
+    No new group was created.
+    Updated Data: {'admins': [], 'members': ['me@bonfacemunyoki.com']}
+
+"""
+
+import argparse
+import datetime
+import redis
+import json
+import uuid
+
+from typing import Dict, Optional, Set
+
+
+def create_group_data(users: Dict, target_group: str,
+                      members: Optional[str] = None,
+                      admins: Optional[str] = None) -> Dict:
+    """Return a dictionary with the keys "key", "field" and "value"
+    that can be used in a Redis hash as follows: HSET key field value
+
+    The "field" return value is a unique id that is used to
+    distinguish the groups.
+
+    Parameters:
+
+    - `users`: a dict mapping user ids to user details, for example:
+
+    {'8ad942fe-490d-453e-bd37-56f252e41603':
+     '{"email_address": "me@test.com",
+       "full_name": "John Doe",
+       "organization": "Genenetwork",
+       "password": {"algorithm": "pbkdf2",
+                    "hashfunc": "sha256",
+                    "salt": "gJrd1HnPSSCmzB5veMPaVk2ozzDlS1Z7Ggcyl1+pciA=",
+                    "iterations": 100000, "keylength": 32,
+                    "created_timestamp": "2021-09-22T11:32:44.971912",
+                    "password": "edcdaa60e84526c6"},
+       "user_id": "8ad942fe", "confirmed": 1,
+       "registration_info": {
+           "timestamp": "2021-09-22T11:32:45.028833",
+           "ip_address": "127.0.0.1",
+           "user_agent": "Mozilla/5.0"}}'}
+
+    - `target_group`: the group name that will be stored inside the
+      "groups" hash in Redis.
+
+    - `members`: a comma-separated list of members of the
+      `target_group`, e.g. "me@test1.com, me@test2.com, me@test3.com"
+
+    - `admins`: a comma-separated list of administrators of the
+      `target_group`, e.g. "me@test1.com, me@test2.com, me@test3.com"
+
+    """
+    # Emails
+    _members: Set = set("".join(members.split()).split(",")
+                        if members else [])
+    _admins: Set = set("".join(admins.split()).split(",")
+                       if admins else [])
+
+    # Unique IDs
+    member_ids: Set = set()
+    admin_ids: Set = set()
+
+    for user_id, user_details in users.items():
+        _details = json.loads(user_details)
+        if _details.get("email_address") in _members:
+            member_ids.add(user_id)
+        if _details.get("email_address") in _admins:
+            admin_ids.add(user_id)
+
+    timestamp: str = datetime.datetime.utcnow().strftime('%b %d %Y %I:%M%p')
+    return {"key": "groups",
+            "field": str(uuid.uuid4()),
+            "value": json.dumps({
+                "name": target_group,
+                "admins": list(admin_ids),
+                "members": list(member_ids),
+                "changed_timestamp": timestamp,
+            })}
+
+
+if __name__ == "__main__":
+    # Initialising the parser CLI arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-g", "--group-name",
+                        help="This is the name of the GROUP mask")
+    parser.add_argument("-m", "--members",
+                        help="Members of the GROUP mask")
+    parser.add_argument("-a", "--admins",
+                        help="Admins of the GROUP mask")
+    args = parser.parse_args()
+
+    if not args.group_name:
+        exit("\nExiting. Please specify a group name to use!\n")
+
+    members = args.members if args.members else None
+    admins = args.admins if args.admins else None
+
+    REDIS_CONN = redis.Redis(decode_responses=True)
+    USERS = REDIS_CONN.hgetall("users")
+
+    if not any([members, admins]):
+        exit("\nExiting. Please provide a value for "
+             "MEMBERS(-m) or ADMINS(-a)!\n")
+
+    data = create_group_data(
+        users=USERS,
+        target_group=args.group_name,
+        members=members,
+        admins=admins)
+
+    if not REDIS_CONN.hget("groups", data.get("field")):
+        updated_data = json.loads(data["value"])
+        timestamp = datetime.datetime.utcnow().strftime('%b %d %Y %I:%M%p')
+        updated_data["created_timestamp"] = timestamp
+        data["value"] = json.dumps(updated_data)
+
+    created_p = REDIS_CONN.hset(data.get("key", ""),
+                                data.get("field", ""),
+                                data.get("value", ""))
+
+    groups = json.loads(REDIS_CONN.hget("groups",
+                                        data.get("field")))  # type: ignore
+    if created_p:
+        exit(f"\nSuccessfully created the group: '{args.group_name}'\n"
+             f"`HGETALL groups {args.group_name}`: {groups}\n")
+    exit("\nNo new group was created.\n"
+         f"`HGETALL groups {args.group_name}`: {groups}\n")
diff --git a/scripts/authentication/resource.py b/scripts/authentication/resource.py
new file mode 100644
index 00000000..4996f34c
--- /dev/null
+++ b/scripts/authentication/resource.py
@@ -0,0 +1,104 @@
+"""A script that:
+
+- Optionally restores data from a json back-up file.
+
+- Given a group id, adds that group (e.g. 'editors') to every
+resource. The group is granted the right to edit both metadata and
+data.
+
+- Optionally creates a back-up every time you edit a resource.
+
+
+To restore a back-up:
+
+.. code-block:: bash
+
+   python resource.py --restore <PATH/TO/RESOURCE/BACK-UP/FILE>
+
+To add editors to every resource without creating a back-up:
+
+.. code-block:: bash
+
+   python resource.py --group-id <GROUP-ID>
+
+To add editors to every resource while creating a back-up before any
+destructive edits:
+
+.. code-block:: bash
+
+   python resource.py --group-id <GROUP-ID> --enable-backup
+
+"""
+import argparse
+import json
+import redis
+import os
+
+from datetime import datetime
+
+
+def recover_hash(name: str, file_path: str, set_function) -> bool:
+    """Restore a Redis hash from a back-up file using `set_function`.
+
+    Parameters:
+
+    - `name`: the Redis hash into which `file_path` will be restored
+
+    - `file_path`: path of the back-up file the hash is sourced from
+
+    - `set_function`: function used to write the restored entries to
+      Redis, for example: HSET
+
+    """
+    try:
+        with open(file_path, "r") as f:
+            resources = json.load(f)
+            for resource_id, resource in resources.items():
+                set_function(name=name,
+                             key=resource_id,
+                             value=resource)
+            return True
+    except Exception as e:
+        print(e)
+        return False
+
+
+if __name__ == "__main__":
+    # Initialising the parser CLI arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--group-id",
+                        help="Add the group id to all resources")
+    parser.add_argument("--restore",
+                        help="Restore from a given backup")
+    parser.add_argument("--enable-backup", action="store_true",
+                        help="Create a back-up before edits")
+    args = parser.parse_args()
+
+    if args.restore:
+        if recover_hash(name="resources",
+                        file_path=args.restore,
+                        set_function=redis.Redis(decode_responses=True).hset):
+            exit(f"\n Done restoring {args.restore}!\n")
+        else:
+            exit(f"\n There was an error restoring {args.restore}!\n")
+
+    if not args.group_id:
+        exit("Please specify the group-id!\n")
+
+    REDIS_CONN = redis.Redis(decode_responses=True)
+    RESOURCES = REDIS_CONN.hgetall("resources")
+    BACKUP_DIR = os.path.join(os.getenv("HOME"), "redis")
+    if args.enable_backup:
+        FILENAME = ("resources-"
+                    f"{datetime.now().strftime('%Y-%m-%d-%I:%M:%S-%p')}"
+                    ".json")
+        if not os.path.exists(BACKUP_DIR):
+            os.mkdir(BACKUP_DIR)
+        with open(os.path.join(BACKUP_DIR, FILENAME), "w") as f:
+            json.dump(RESOURCES, f, indent=4)
+        print(f"\nDone backing up to {FILENAME}")
+
+    for resource_id, resource in RESOURCES.items():
+        _resource = json.loads(resource)  # str -> dict conversion
+        _resource["group_masks"] = {args.group_id: {"metadata": "edit",
+                                                    "data": "edit"}}
+        REDIS_CONN.hset("resources",
                        resource_id,
+                        json.dumps(_resource))
+    exit("Done updating `resources`\n")
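Both scripts above write straight into the live Redis store, and group.py keys each group under a fresh uuid4 field, so the practical way to find a group afterwards is to scan the 'groups' hash by name. A minimal read-only sanity check — a sketch assuming a local Redis with default connection settings and the 'editors' group from the examples:

    import json
    import redis

    r = redis.Redis(decode_responses=True)

    # Groups are keyed by a generated uuid4 field, so match on the stored name.
    for field, raw in r.hgetall("groups").items():
        group = json.loads(raw)
        if group["name"] == "editors":
            print(field, group["members"], group["admins"])

    # After resource.py runs, every resource should carry the edit masks.
    for resource_id, raw in r.hgetall("resources").items():
        print(resource_id, json.loads(raw).get("group_masks"))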
diff --git a/scripts/convert_dol_genotypes.py b/scripts/convert_dol_genotypes.py
new file mode 100644
index 00000000..81b3bd6d
--- /dev/null
+++ b/scripts/convert_dol_genotypes.py
@@ -0,0 +1,74 @@
+# This just converts the R/qtl2 format genotype files for DOL into a .geno file.
+# Everything is hard-coded, since I doubt this will be re-used and I just wanted
+# to generate the file quickly.
+
+import os
+
+geno_dir = "/home/zas1024/gn2-zach/DO_genotypes/"
+markers_file = "/home/zas1024/gn2-zach/DO_genotypes/SNP_Map.txt"
+gn_geno_path = "/home/zas1024/gn2-zach/DO_genotypes/DOL.geno"
+
+# Iterate through the SNP_Map.txt file to get marker positions
+marker_data = {}
+with open(markers_file, "r") as markers_fh:
+    for i, line in enumerate(markers_fh):
+        if i == 0:  # skip the header line
+            continue
+        line_items = line.split("\t")
+        this_marker = {}
+        this_marker['chr'] = line_items[2] if line_items[2] != "0" else "M"
+        this_marker['pos'] = f'{float(line_items[3])/1000000:.6f}'
+        marker_data[line_items[1]] = this_marker
+
+# Iterate through the R/qtl2 format genotype files and pull out the sample
+# list and the genotypes for each marker
+sample_names = []
+for filename in os.listdir(geno_dir):
+    if "gm4qtl2_geno" in filename:
+        with open(geno_dir + "/" + filename, "r") as rqtl_geno_fh:
+            for i, line in enumerate(rqtl_geno_fh):
+                line_items = line.split(",")
+                if i < 3:
+                    continue
+                elif not len(sample_names) and i == 3:
+                    sample_names = [item.strip().replace("TLB", "TB") for item in line_items[1:]]
+                elif i > 3:
+                    marker_data[line_items[0]]['genotypes'] = ["X" if item.strip() == "-" else item.strip() for item in line_items[1:]]
+
+# Generate a list of marker dicts to iterate through when writing the .geno file
+marker_list = []
+for key, value in marker_data.items():
+    if 'genotypes' in value:
+        this_marker = {
+            'chr': value['chr'],
+            'locus': key,
+            'pos': value['pos'],
+            'genotypes': value['genotypes']
+        }
+        marker_list.append(this_marker)
+
+def sort_func(e):
+    """Ensure that the X/Y chromosomes and mitochondria are sorted to the end correctly"""
+    try:
+        return float(e['chr'])*1000 + float(e['pos'])
+    except ValueError:
+        if e['chr'] == "X":
+            return 20000 + float(e['pos'])
+        elif e['chr'] == "Y":
+            return 21000 + float(e['pos'])
+        elif e['chr'] == "M":
+            return 22000 + float(e['pos'])
+
+# Sort markers by chromosome and position
+marker_list.sort(key=sort_func)
+
+# Write lines to the .geno file
+with open(gn_geno_path, "w") as gn_geno_fh:
+    gn_geno_fh.write("\t".join(["Chr", "Locus", "cM", "Mb"] + sample_names) + "\n")
+    for marker in marker_list:
+        row_contents = [
+            marker['chr'],
+            marker['locus'],
+            marker['pos'],  # the same position is written to both cM and Mb
+            marker['pos']
+        ] + marker['genotypes']
+        gn_geno_fh.write("\t".join(row_contents) + "\n")
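sort_func maps autosomes to chr*1000 + position and parks X, Y and the mitochondrial pseudo-chromosome behind them at fixed offsets. A quick illustration with made-up markers (sort_func as defined above):

    markers = [{'chr': 'X', 'pos': '10.000000'},
               {'chr': '2', 'pos': '5.000000'},
               {'chr': 'M', 'pos': '0.500000'},
               {'chr': '1', 'pos': '99.000000'}]
    markers.sort(key=sort_func)
    print([m['chr'] for m in markers])  # ['1', '2', 'X', 'M']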
diff --git a/scripts/maintenance/datastructure.py b/scripts/maintenance/datastructure.py
new file mode 100755
index 00000000..9f3e8b1e
--- /dev/null
+++ b/scripts/maintenance/datastructure.py
@@ -0,0 +1,177 @@
+import utilities
+
+def get_probesetfreezes(inbredsetid):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT ProbeSetFreeze.`Id`, ProbeSetFreeze.`Name`, ProbeSetFreeze.`FullName`
+        FROM ProbeSetFreeze, ProbeFreeze
+        WHERE ProbeSetFreeze.`ProbeFreezeId`=ProbeFreeze.`Id`
+        AND ProbeFreeze.`InbredSetId`=%s
+        """
+    cursor.execute(sql, (inbredsetid,))
+    return cursor.fetchall()
+
+def get_probesetfreeze(probesetfreezeid):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT ProbeSetFreeze.`Id`, ProbeSetFreeze.`Name`, ProbeSetFreeze.`FullName`
+        FROM ProbeSetFreeze
+        WHERE ProbeSetFreeze.`Id`=%s
+        """
+    cursor.execute(sql, (probesetfreezeid,))
+    return cursor.fetchone()
+
+def get_strains(inbredsetid):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT Strain.`Id`, Strain.`Name`
+        FROM StrainXRef, Strain
+        WHERE StrainXRef.`InbredSetId`=%s
+        AND StrainXRef.`StrainId`=Strain.`Id`
+        ORDER BY StrainXRef.`OrderId`
+        """
+    cursor.execute(sql, (inbredsetid,))
+    return cursor.fetchall()
+
+def get_inbredset(probesetfreezeid):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT InbredSet.`Id`, InbredSet.`Name`, InbredSet.`FullName`
+        FROM InbredSet, ProbeFreeze, ProbeSetFreeze
+        WHERE InbredSet.`Id`=ProbeFreeze.`InbredSetId`
+        AND ProbeFreeze.`Id`=ProbeSetFreeze.`ProbeFreezeId`
+        AND ProbeSetFreeze.`Id`=%s
+        """
+    cursor.execute(sql, (probesetfreezeid,))
+    return cursor.fetchone()
+
+def get_species(inbredsetid):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT Species.`Id`, Species.`Name`, Species.`MenuName`, Species.`FullName`
+        FROM InbredSet, Species
+        WHERE InbredSet.`Id`=%s
+        AND InbredSet.`SpeciesId`=Species.`Id`
+        """
+    cursor.execute(sql, (inbredsetid,))
+    return cursor.fetchone()
+
+def get_genofreeze_byinbredsetid(inbredsetid):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT GenoFreeze.`Id`, GenoFreeze.`Name`, GenoFreeze.`FullName`, GenoFreeze.`InbredSetId`
+        FROM GenoFreeze
+        WHERE GenoFreeze.`InbredSetId`=%s
+        """
+    cursor.execute(sql, (inbredsetid,))
+    return cursor.fetchone()
+
+def get_nextdataid_genotype():
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT GenoData.`Id`
+        FROM GenoData
+        ORDER BY GenoData.`Id` DESC
+        LIMIT 1
+        """
+    cursor.execute(sql)
+    re = cursor.fetchone()
+    dataid = re[0]
+    dataid += 1
+    return dataid
+
+def get_nextdataid_phenotype():
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT PublishData.`Id`
+        FROM PublishData
+        ORDER BY PublishData.`Id` DESC
+        LIMIT 1
+        """
+    cursor.execute(sql)
+    re = cursor.fetchone()
+    dataid = re[0]
+    dataid += 1
+    return dataid
+
+def get_nextorderid_strainxref(inbredsetid):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT StrainXRef.`OrderId`
+        FROM StrainXRef
+        WHERE StrainXRef.`InbredSetId`=%s
+        ORDER BY StrainXRef.`OrderId` DESC
+        LIMIT 1
+        """
+    cursor.execute(sql, (inbredsetid,))
+    re = cursor.fetchone()
+    if re:
+        orderid = re[0] + 1
+    else:
+        orderid = 1
+    return orderid
+
+def insert_strain(inbredsetid, strainname):
+    speciesid = get_species(inbredsetid)[0]
+    cursor, con = utilities.get_cursor()
+    sql = """
+        INSERT INTO Strain
+        SET
+        Strain.`Name`=%s,
+        Strain.`Name2`=%s,
+        Strain.`SpeciesId`=%s
+        """
+    cursor.execute(sql, (strainname, strainname, speciesid))
+
+def insert_strainxref(inbredsetid, strainid):
+    orderid = get_nextorderid_strainxref(inbredsetid)
+    cursor, con = utilities.get_cursor()
+    sql = """
+        INSERT INTO StrainXRef
+        SET
+        StrainXRef.`InbredSetId`=%s,
+        StrainXRef.`StrainId`=%s,
+        StrainXRef.`OrderId`=%s,
+        StrainXRef.`Used_for_mapping`=%s,
+        StrainXRef.`PedigreeStatus`=%s
+        """
+    cursor.execute(sql, (inbredsetid, strainid, orderid, "N", None))
+
+def get_strain(inbredsetid, strainname):
+    speciesid = get_species(inbredsetid)[0]
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT Strain.`Id`, Strain.`Name`
+        FROM Strain
+        WHERE Strain.`SpeciesId`=%s
+        AND Strain.`Name` LIKE %s
+        """
+    cursor.execute(sql, (speciesid, strainname))
+    return cursor.fetchone()
+
+def get_strainxref(inbredsetid, strainid):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT StrainXRef.`StrainId`
+        FROM StrainXRef
+        WHERE StrainXRef.`InbredSetId`=%s
+        AND StrainXRef.`StrainId`=%s
+        """
+    cursor.execute(sql, (inbredsetid, strainid))
+    return cursor.fetchone()
+
+def get_strain_sure(inbredsetid, strainname, updatestrainxref=None):
+    strain = get_strain(inbredsetid, strainname)
+    if not strain:
+        insert_strain(inbredsetid, strainname)
+        strain = get_strain(inbredsetid, strainname)
+    strainxref = get_strainxref(inbredsetid, strain[0])
+    if not strainxref and updatestrainxref:
+        insert_strainxref(inbredsetid, strain[0])
+    return strain
+
+def get_strains_bynames(inbredsetid, strainnames, updatestrainxref=None):
+    strains = []
+    for strainname in strainnames:
+        strains.append(get_strain_sure(inbredsetid, strainname, updatestrainxref))
+    return strains
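get_strain_sure and get_strains_bynames behave like upserts: a strain missing from Strain is inserted on the spot, and when updatestrainxref is set a missing StrainXRef row is added as well. A sketch of the intended call pattern (hypothetical inbred set id and strain names; note this writes to the live database for any strain it cannot find):

    import datastructure

    strains = datastructure.get_strains_bynames(
        inbredsetid=1,
        strainnames=["BXD1", "BXD2"],
        updatestrainxref="yes")
    for strain_id, strain_name in strains:
        print(strain_id, strain_name)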
diff --git a/scripts/maintenance/load_phenotypes.py b/scripts/maintenance/load_phenotypes.py
index 759d2eec..aa02d0cd 100755
--- a/scripts/maintenance/load_phenotypes.py
+++ b/scripts/maintenance/load_phenotypes.py
@@ -1,3 +1,11 @@
+# Load a Python3 environment with the GN2 utilities:
+#
+#   source /usr/local/guix-profiles/gn-latest-20210512/etc/profile
+#
+# and run:
+#
+#   python load_phenotypes.py [args...]
+
 import sys
 import csv
 
@@ -9,35 +17,34 @@ def main(argv):
     config = utilities.get_config(argv[1])
     print("config:")
     for item in config.items('config'):
-        print(("\t%s" % (str(item))))
+        print("\t%s" % (str(item)))
     # var
     inbredsetid = config.get('config', 'inbredsetid')
-    print(("inbredsetid: %s" % inbredsetid))
+    print("inbredsetid: %s" % inbredsetid)
     species = datastructure.get_species(inbredsetid)
     speciesid = species[0]
-    print(("speciesid: %s" % speciesid))
+    print("speciesid: %s" % speciesid)
     dataid = datastructure.get_nextdataid_phenotype()
-    print(("next data id: %s" % dataid))
+    print("next data id: %s" % dataid)
     cursor, con = utilities.get_cursor()
     # datafile
     datafile = open(config.get('config', 'datafile'), 'r')
     phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"')
     phenotypedata_head = next(phenotypedata)
-    print(("phenotypedata head:\n\t%s" % phenotypedata_head))
+    print("phenotypedata head:\n\t%s" % phenotypedata_head)
     strainnames = phenotypedata_head[1:]
     strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid,
                                                 strainnames=strainnames,
                                                 updatestrainxref="yes")
     # metafile
     metafile = open(config.get('config', 'metafile'), 'r')
     phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"')
     phenotypemeta_head = next(phenotypemeta)
-    print(("phenotypemeta head:\n\t%s" % phenotypemeta_head))
-    print()
+    print("phenotypemeta head:\n\t%s" % phenotypemeta_head)
     # load
     for metarow in phenotypemeta:
         #
         datarow_value = next(phenotypedata)
         datarow_se = next(phenotypedata)
         datarow_n = next(phenotypedata)
         # Phenotype
         sql = """
         INSERT INTO Phenotype
@@ -67,7 +74,7 @@ def main(argv):
             ))
         rowcount = cursor.rowcount
         phenotypeid = con.insert_id()
-        print(("INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid)))
+        print("INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid))
         # Publication
         publicationid = None  # reset
         pubmed_id = utilities.to_db_string(metarow[0], None)
@@ -81,7 +88,7 @@ def main(argv):
         re = cursor.fetchone()
         if re:
             publicationid = re[0]
-            print(("get Publication record: %d" % publicationid))
+            print("get Publication record: %d" % publicationid)
         if not publicationid:
             sql = """
             INSERT INTO Publication
@@ -109,7 +116,7 @@ def main(argv):
                 ))
             rowcount = cursor.rowcount
             publicationid = con.insert_id()
-            print(("INSERT INTO Publication: %d record: %d" % (rowcount, publicationid)))
+            print("INSERT INTO Publication: %d record: %d" % (rowcount, publicationid))
         # data
         for index, strain in enumerate(strains):
             #
@@ -158,14 +165,14 @@ def main(argv):
         cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, ""))
         rowcount = cursor.rowcount
         publishxrefid = con.insert_id()
-        print(("INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid)))
+        print("INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid))
         # for loop next
         dataid += 1
         print()
     # release
     con.close()
 
 if __name__ == "__main__":
-    print(("command line arguments:\n\t%s" % sys.argv))
+    print("command line arguments:\n\t%s" % sys.argv)
     main(sys.argv)
     print("exit successfully")
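load_phenotypes.py takes a single argument: an INI file that utilities.get_config reads with configparser. A minimal sketch of such a file, limited to the keys visible in the hunks above (the values are placeholders):

    [config]
    inbredsetid = 1
    datafile = /path/to/phenotype_values.tsv
    metafile = /path/to/phenotype_meta.tsv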
diff --git a/scripts/maintenance/utilities.py b/scripts/maintenance/utilities.py
new file mode 100644
index 00000000..886410c2
--- /dev/null
+++ b/scripts/maintenance/utilities.py
@@ -0,0 +1,89 @@
+import MySQLdb
+import re
+import configparser
+
+def get_cursor():
+    host = 'tux.uthsc.edu'
+    user = 'webqtlout'
+    passwd = 'webqtlout'
+    db = 'db_webqtl'
+    con = MySQLdb.Connect(db=db, host=host, user=user, passwd=passwd)
+    cursor = con.cursor()
+    return cursor, con
+
+def clearspaces(s, default=None):
+    if s:
+        s = re.sub(r'\s+', ' ', s)
+        s = s.strip()
+        return s
+    else:
+        return default
+
+def to_dic(keys, values):
+    dic = {}
+    for i in range(len(keys)):
+        key = keys[i]
+        value = values[i]
+        dic[key] = value
+    return dic
+
+def overlap(dic1, dic2):
+    keys = []
+    values1 = []
+    values2 = []
+    for key in dic1.keys():
+        if key in dic2:
+            value1 = dic1[key]
+            value2 = dic2[key]
+            if value1 and value2:
+                keys.append(key)
+                values1.append(value1)
+                values2.append(value2)
+    return keys, values1, values2
+
+def to_db_string(s, default):
+    if s:
+        s = s.strip()
+        if len(s) == 0:
+            return default
+        elif s == 'x':
+            return default
+        else:
+            return s
+    else:
+        return default
+
+def to_db_float(s, default):
+    if s:
+        s = s.strip()
+        if len(s) == 0:
+            return default
+        elif s == 'x':
+            return default
+        else:
+            try:
+                return float(s)
+            except ValueError:
+                return default
+    else:
+        return default
+
+def to_db_int(s, default):
+    if s:
+        s = s.strip()
+        if len(s) == 0:
+            return default
+        elif s == 'x':
+            return default
+        else:
+            try:
+                return int(s)
+            except ValueError:
+                return default
+    else:
+        return default
+
+def get_config(configfile):
+    config = configparser.ConfigParser()
+    config.read(configfile)
+    return config
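The to_db_* helpers normalise spreadsheet cells before they reach SQL: blank cells and the literal 'x' are treated as missing and fall back to the caller-supplied default, and unparseable numbers do the same. A few illustrative calls:

    import utilities

    utilities.to_db_string("x", None)      # None ('x' marks a missing cell)
    utilities.to_db_float(" 3.14 ", None)  # 3.14
    utilities.to_db_float("n/a", -1.0)     # -1.0 (unparseable -> default)
    utilities.clearspaces("a  b\tc")       # 'a b c'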