diff options
author | zsloan | 2022-03-09 20:01:53 +0000 |
---|---|---|
committer | zsloan | 2022-03-16 14:41:09 -0500 |
commit | f72480dee99ee6ab107bb84c6f3b5c663a04cc86 (patch) | |
tree | f28dc5672e38fa5708c14d9b3a98a4f81ea68963 /wqflask/maintenance/gen_ind_genofiles.py | |
parent | 27530d5a59bded06f644e4704ef21cb6da491350 (diff) | |
download | genenetwork2-f72480dee99ee6ab107bb84c6f3b5c663a04cc86.tar.gz |
Fix the way target/source genofiles were being processed + some other changes
- I was mixing up source/target genofiles previously; the JSON file is for the source genofiles
- references to the app context are removed in favor of just taking input as arguments or environment variables
- Updated example commands
Diffstat (limited to 'wqflask/maintenance/gen_ind_genofiles.py')
-rw-r--r-- | wqflask/maintenance/gen_ind_genofiles.py | 64 |
1 files changed, 31 insertions, 33 deletions
diff --git a/wqflask/maintenance/gen_ind_genofiles.py b/wqflask/maintenance/gen_ind_genofiles.py index b91660a4..b781d7d1 100644 --- a/wqflask/maintenance/gen_ind_genofiles.py +++ b/wqflask/maintenance/gen_ind_genofiles.py @@ -1,34 +1,41 @@ -# Example command: env GN2_PROFILE=/usr/local/guix-profiles/gn-latest-20220122 TMPDIR=/export/local/home/zas1024/gn2-zach/tmp WEBSERVER_MODE=DEBUG LOG_LEVEL=DEBUG SERVER_PORT=5002 GENENETWORK_FILES=/export/local/home/zas1024/gn2-zach/genotype_files SQL_URI=mysql://webqtlout:webqtlout@localhost/db_webqtl ./bin/genenetwork2 ./etc/default_settings.py -c ./maintenance/gen_ind_genofiles.py +# Example commands: +# python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.json +# python3 gen_ind_genofiles.py /home/zas1024/gn2-zach/genotype_files/genotype/ /home/zas1024/gn2-zach/new_geno/ BXD-Micturition.geno BXD.2.geno BXD.4.geno BXD.5.geno +import os import sys from typing import List import MySQLdb -from wqflask import app - def conn(): - return MySQLdb.Connect(db=app.config.get("DB_NAME"), - user=app.config.get("DB_USER"), - passwd=app.config.get("DB_PASS"), - host=app.config.get("DB_HOST")) + return MySQLdb.Connect(db=os.environ.get("DB_NAME"), + user=os.environ.get("DB_USER"), + passwd=os.environ.get("DB_PASS"), + host=os.environ.get("DB_HOST")) def main(args): - # The file of the "main" .geno file for the group in question - # For example: BXD.geno or BXD.6.geno if converting to BXD individual genofiles - source_genofile = args[1] + # Directory in which .geno files are located + geno_dir = args[1] + + # Directory in which to output new files + out_dir = args[2] + + # The individuals group that we want to generate a .geno file for + target_file = geno_dir + args[3] - # The target individuals/samples group(s) we're generating the .geno files for - # This can be passed as either a specific .geno file, or as a JSON file - # containing a set of .geno files (and their corresponding file names and sample lists) - if ".json" in args[2]: - target_groups = json.load(args[2])['genofile'] + # The source group(s) we're generating the .geno files from + # This can be passed as either a specific .geno file (or set of files as multiple arguments), + # or as a JSON file containing a set of .geno files (and their corresponding file names and sample lists) + if ".json" in args[4]: + source_files = [geno_dir + genofile['location'] for genofile in json.load(args[4])['genofile']] else: - target_groups = [args[2]] + source_files = [geno_dir + group + ".geno" if ".geno" not in group else group for group in args[4:]] # Generate the output .geno files - generate_new_genofiles(source_genofile, strain_genotypes(source_genofile), target_groups) + for source_file in source_files: + generate_new_genofile(source_file, target_file) def get_strain_for_sample(sample): query = ( @@ -41,13 +48,11 @@ def get_strain_for_sample(sample): with conn.cursor() as cursor: return cursor.execute(query, {"name": name}).fetchone()[0] -def generate_new_genofiles(source_genofile, strain_genotypes, target_groups): - for group in target_groups: - base_samples = group_samples(source_genofile) - target_samples = group_samples(group) - strain_pos_map = map_strain_pos_to_target_group(base_samples, target_samples) - - new_genofile = app.config.get("GENENETWORK_FILES") + "/genotype/_" + group +def generate_new_genofiles(source_genofile, target_genofile): + base_samples = group_samples(source_genofile) + base_genotypes = strain_genotypes(source_genofile) + target_samples = group_samples(target_genofile) + strain_pos_map = map_strain_pos_to_target_group(base_samples, target_samples) def map_strain_pos_to_target_group(base_samples, target_samples): @@ -68,18 +73,13 @@ def map_strain_pos_to_target_group(base_samples, target_samples): return pos_map -def group_samples(target_group: str) -> List: +def group_samples(target_file: str) -> List: """ Get the group samples from its "dummy" .geno file (which still contains the sample list) """ - # Allow for inputting the target group as either the group name or .geno file - file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + target_group - if ".geno" not in target_group: - file_location += ".geno" - sample_list = [] - with open(file_location, "r") as target_geno: + with open(target_file, "r") as target_geno: for i, line in enumerate(target_geno): # Skip header lines if line[0] in ["#", "@"] or not len(line): @@ -110,8 +110,6 @@ def strain_genotypes(strain_genofile: str) -> List: ] """ - file_location = app.config.get("GENENETWORK_FILES") + "/genotype/" + strain_genofile - geno_start_col = None header_columns = [] sample_list = [] |