diff options
-rw-r--r-- | gn3/computations/rqtl.py | 222 |
1 files changed, 146 insertions, 76 deletions
diff --git a/gn3/computations/rqtl.py b/gn3/computations/rqtl.py index 65ee6de..2f9c4a2 100644 --- a/gn3/computations/rqtl.py +++ b/gn3/computations/rqtl.py @@ -11,24 +11,34 @@ from gn3.commands import compose_rqtl_cmd from gn3.computations.gemma import generate_hash_of_string from gn3.fs_helpers import get_hash_of_files -def generate_rqtl_cmd(rqtl_wrapper_cmd: str, - rqtl_wrapper_kwargs: Dict, - rqtl_wrapper_bool_kwargs: list) -> Dict: + +def generate_rqtl_cmd( + rqtl_wrapper_cmd: str, + rqtl_wrapper_kwargs: Dict, + rqtl_wrapper_bool_kwargs: list, +) -> Dict: """Given the base rqtl_wrapper command and -dict of keyword arguments, return the full rqtl_wrapper command and an -output filename generated from a hash of the genotype and phenotype files""" + dict of keyword arguments, return the full rqtl_wrapper command and an + output filename generated from a hash of the genotype and phenotype files""" # Generate a hash from contents of the genotype and phenotype files _hash = get_hash_of_files( - [v for k, v in rqtl_wrapper_kwargs.items() if k in ["g", "p"]]) + [v for k, v in rqtl_wrapper_kwargs.items() if k in ["g", "p"]] + ) # Append to hash a hash of keyword arguments _hash += generate_hash_of_string( - ",".join([f"{k}:{v}" for k, v in rqtl_wrapper_kwargs.items() if k not in ["g", "p"]])) + ",".join( + [ + f"{k}:{v}" + for k, v in rqtl_wrapper_kwargs.items() + if k not in ["g", "p"] + ] + ) + ) # Append to hash a hash of boolean keyword arguments - _hash += generate_hash_of_string( - ",".join(rqtl_wrapper_bool_kwargs)) + _hash += generate_hash_of_string(",".join(rqtl_wrapper_bool_kwargs)) # Temporarily substitute forward-slashes in hash with underscores _hash = _hash.replace("/", "_") @@ -37,12 +47,12 @@ output filename generated from a hash of the genotype and phenotype files""" rqtl_wrapper_kwargs["filename"] = _output_filename return { - "output_file": - _output_filename, - "rqtl_cmd": - compose_rqtl_cmd(rqtl_wrapper_cmd=rqtl_wrapper_cmd, - rqtl_wrapper_kwargs=rqtl_wrapper_kwargs, - rqtl_wrapper_bool_kwargs=rqtl_wrapper_bool_kwargs) + "output_file": _output_filename, + "rqtl_cmd": compose_rqtl_cmd( + rqtl_wrapper_cmd=rqtl_wrapper_cmd, + rqtl_wrapper_kwargs=rqtl_wrapper_kwargs, + rqtl_wrapper_bool_kwargs=rqtl_wrapper_bool_kwargs, + ), } @@ -52,8 +62,13 @@ def process_rqtl_mapping(file_name: str) -> List: marker_obs = [] # Later I should probably redo this using csv.read to avoid the # awkwardness with removing quotes with [1:-1] - with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), - "output", file_name), "r", encoding="utf-8") as the_file: + with open( + os.path.join( + current_app.config.get("TMPDIR", "/tmp"), "output", file_name + ), + "r", + encoding="utf-8", + ) as the_file: for line in the_file: line_items = line.split(",") if line_items[1][1:-1] == "chr" or not line_items: @@ -71,53 +86,70 @@ def process_rqtl_mapping(file_name: str) -> List: "chr": the_chr, "cM": float(line_items[2]), "Mb": float(line_items[2]), - "lod_score": float(line_items[3]) + "lod_score": float(line_items[3]), } marker_obs.append(this_marker) return marker_obs + def process_rqtl_pairscan(file_name: str, geno_file: str) -> List: """Given an output file name, read in R/qtl pair-scan results and return -a list of both the JSON needed for the d3panels figure and a list of results -to be used when generating the results table (which will include marker names)""" + a list of both the JSON needed for the d3panels figure and a list of results + to be used when generating the results table (which will include marker names)""" figure_data = pairscan_for_figure(file_name) table_data = pairscan_for_table(file_name, geno_file) return [figure_data, table_data] + def pairscan_for_figure(file_name: str) -> Dict: """Given an output file name, read in R/qtl pair-scan results and return the JSON needed for the d3panels figure""" figure_data = {} # Open the file with the actual results, written as a list of lists - with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), - "output", file_name), "r") as the_file: + with open( + os.path.join( + current_app.config.get("TMPDIR", "/tmp"), "output", file_name + ), + "r", + encoding="utf8", + ) as the_file: lod_results = [] for i, line in enumerate(the_file): - if i == 0: # Skip first line + if i == 0: # Skip first line continue - line_items = [item.rstrip('\n') for item in line.split(",")] - lod_results.append(line_items[1:]) # Append all but first item in line - figure_data['lod'] = lod_results - - # Open the map file with the list of markers/pseudomarkers and their positions - with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), - "output", "MAP_" + file_name), "r") as the_file: - chr_list = [] # type: List - pos_list = [] # type: List + line_items = [item.rstrip("\n") for item in line.split(",")] + # Append all but first item in line + lod_results.append(line_items[1:]) + figure_data["lod"] = lod_results + + # Open the map file with the list of markers/pseudomarkers and their + # positions + with open( + os.path.join( + current_app.config.get("TMPDIR", "/tmp"), + "output", + "MAP_" + file_name, + ), + "r", + encoding="utf8", + ) as the_file: + chr_list = [] # type: List + pos_list = [] # type: List for i, line in enumerate(the_file): - if i == 0: # Skip first line + if i == 0: # Skip first line continue - line_items = [item.rstrip('\n') for item in line.split(",")] + line_items = [item.rstrip("\n") for item in line.split(",")] chr_list.append(line_items[1][1:-1]) pos_list.append(line_items[2]) - figure_data['chr'] = chr_list - figure_data['pos'] = pos_list + figure_data["chr"] = chr_list + figure_data["pos"] = pos_list return figure_data + def get_marker_list(map_file: str) -> List: """ Open the map file with the list of markers/pseudomarkers and create list of marker obs @@ -127,21 +159,31 @@ def get_marker_list(map_file: str) -> List: """ marker_list = [] - with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), - "output", map_file), "r") as map_fh: + with open( + os.path.join( + current_app.config.get("TMPDIR", "/tmp"), "output", map_file + ), + "r", + encoding="utf8", + ) as map_fh: for line in map_fh.readlines()[1:]: - line_items = [item.rstrip('\n') for item in line.split(",")] + line_items = [item.rstrip("\n") for item in line.split(",")] this_marker = { - 'name': line_items[0], - 'chr': line_items[1][1:-1], # Strip quotes from beginning and end of chr string - 'pos': line_items[2] + "name": line_items[0], + "chr": line_items[1][ + 1:-1 + ], # Strip quotes from beginning and end of chr string + "pos": line_items[2], } marker_list.append(this_marker) return marker_list -def generate_table_rows(results_file: str, marker_list: List, original_markers: Dict) -> List: + +def generate_table_rows( + results_file: str, marker_list: List, original_markers: Dict +) -> List: """ Open the file with the actual R/qtl pair-scan results and write them as they will be displayed in the results table @@ -152,38 +194,45 @@ def generate_table_rows(results_file: str, marker_list: List, original_markers: original_markers: Dict of markers from the .geno file, for finding proximal/distal markers to each pseudomarker """ - table_data = [] - with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), - "output", results_file), "r") as the_file: + with open( + os.path.join( + current_app.config.get("TMPDIR", "/tmp"), "output", results_file + ), + "r", + encoding="utf8", + ) as the_file: for i, line in enumerate(the_file.readlines()[1:]): marker_1 = marker_list[i] - marker_1['proximal'], marker_1['distal'] = find_nearest_marker(marker_1['chr'], - marker_1['pos'], - original_markers) - line_items = [item.rstrip('\n') for item in line.split(",")] + marker_1["proximal"], marker_1["distal"] = find_nearest_marker( + marker_1["chr"], marker_1["pos"], original_markers + ) + line_items = [item.rstrip("\n") for item in line.split(",")] for j, item in enumerate(line_items[1:]): marker_2 = marker_list[j] - marker_2['proximal'], marker_2['distal'] = find_nearest_marker(marker_2['chr'], - marker_2['pos'], - original_markers) + marker_2["proximal"], marker_2["distal"] = find_nearest_marker( + marker_2["chr"], marker_2["pos"], original_markers + ) try: lod_score = f"{float(item):.3f}" except ValueError: lod_score = f"{item}" - table_data.append({ - 'proximal1': marker_1['proximal'], - 'distal1': marker_1['distal'], - 'pos1': f"Chr {marker_1['chr']} @ {float(marker_1['pos']):.1f} cM", - 'lod': lod_score, - 'proximal2': marker_2['proximal'], - 'distal2': marker_2['distal'], - 'pos2': f"Chr {marker_2['chr']} @ {float(marker_2['pos']):.1f} cM" - }) + table_data.append( + { + "proximal1": marker_1["proximal"], + "distal1": marker_1["distal"], + "pos1": f"Chr {marker_1['chr']} @ {float(marker_1['pos']):.1f} cM", + "lod": lod_score, + "proximal2": marker_2["proximal"], + "distal2": marker_2["distal"], + "pos2": f"Chr {marker_2['chr']} @ {float(marker_2['pos']):.1f} cM", + } + ) return table_data + def pairscan_for_table(file_name: str, geno_file: str) -> List: """ Read in R/qtl pair-scan results and return as List representing @@ -204,37 +253,50 @@ def pairscan_for_table(file_name: str, geno_file: str) -> List: # they will be displayed in the results table table_data = generate_table_rows(file_name, marker_list, original_markers) - return sorted(table_data, key=lambda i: float(i['lod']), reverse=True)[:500] + return sorted(table_data, key=lambda i: float(i["lod"]), reverse=True)[:500] + def build_marker_pos_dict(genotype_file: str) -> Dict: """Gets list of markers and their positions from .geno file Basically a pared-down version of parse_genotype_file for R/qtl pair-scan""" - with open(genotype_file, "r") as infile: + with open(genotype_file, "r", encoding="utf8") as infile: contents = infile.readlines() # Get all lines after the metadata - lines = tuple(line for line in contents if - ((not line.strip().startswith("#")) and - (not line.strip().startswith("@")) and - (not line.strip() == ""))) + lines = tuple( + line + for line in contents + if ( + (not line.strip().startswith("#")) + and (not line.strip().startswith("@")) + and (not line.strip() == "") + ) + ) header_items = lines[0].split("\t") mb_exists = "Mb" in header_items - pos_column = header_items.index("Mb") if mb_exists else header_items.index("cM") + pos_column = ( + header_items.index("Mb") if mb_exists else header_items.index("cM") + ) - the_markers = {"1": {}} # type: Dict[str, Dict] - for line in lines[1:]: # The lines with markers + the_markers = {"1": {}} # type: Dict[str, Dict] + for line in lines[1:]: # The lines with markers line_items = line.split("\t") this_chr = line_items[0] if this_chr not in the_markers: the_markers[this_chr] = {} - the_markers[this_chr][str(float(line_items[pos_column]))] = line_items[1] + the_markers[this_chr][str(float(line_items[pos_column]))] = line_items[ + 1 + ] return the_markers -def find_nearest_marker(the_chr: str, the_pos: str, marker_list: Dict) -> Tuple[str, str]: + +def find_nearest_marker( + the_chr: str, the_pos: str, marker_list: Dict +) -> Tuple[str, str]: """Given a chromosome and position of a pseudomarker (from R/qtl pair-scan results), return the nearest real marker""" @@ -243,18 +305,26 @@ def find_nearest_marker(the_chr: str, the_pos: str, marker_list: Dict) -> Tuple[ # Get the position of the pseudomarker in the list of markers for the chr the_pos_index = bisect(pos_list, float(the_pos)) - proximal_marker = marker_list[the_chr][str(pos_list[the_pos_index-1])] + proximal_marker = marker_list[the_chr][str(pos_list[the_pos_index - 1])] distal_marker = marker_list[the_chr][str(pos_list[the_pos_index])] return proximal_marker, distal_marker + def process_perm_output(file_name: str) -> Tuple[List, float, float]: """Given base filename, read in R/qtl permutation output and calculate suggestive and significant thresholds""" perm_results = [] - with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), - "output", "PERM_" + file_name), "r", encoding="utf-8") as the_file: + with open( + os.path.join( + current_app.config.get("TMPDIR", "/tmp"), + "output", + "PERM_" + file_name, + ), + "r", + encoding="utf-8", + ) as the_file: for i, line in enumerate(the_file): if i == 0: # Skip header line |