Diffstat (limited to 'gn3/computations')
-rw-r--r-- | gn3/computations/correlations.py | 58
-rw-r--r-- | gn3/computations/correlations2.py | 36
-rw-r--r-- | gn3/computations/ctl.py | 30
-rw-r--r-- | gn3/computations/diff.py | 2
-rw-r--r-- | gn3/computations/gemma.py | 2
-rw-r--r-- | gn3/computations/parsers.py | 2
-rw-r--r-- | gn3/computations/partial_correlations.py | 628
-rw-r--r-- | gn3/computations/partial_correlations_optimised.py | 244
-rw-r--r-- | gn3/computations/pca.py | 189
-rw-r--r-- | gn3/computations/qtlreaper.py | 16
-rw-r--r-- | gn3/computations/rqtl.py | 5
-rw-r--r-- | gn3/computations/wgcna.py | 28
12 files changed, 1093 insertions, 147 deletions
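A pattern recurs in the hunks below: `normalize_values` and `filter_shared_sample_keys` become generators, and callers unpack the transposed pairs via `zip(*...)` inside a `try/except ValueError`, because unpacking the transpose of an empty generator raises `ValueError`. A minimal runnable sketch of the pattern — the function body is the one added in the diff, while the driver code and sample values are invented for illustration:

from typing import Generator

def normalize_values(a_values: list, b_values: list) -> Generator:
    # Yield (a_val, b_val) pairs where neither value is None.
    for a_val, b_val in zip(a_values, b_values):
        if (a_val is not None) and (b_val is not None):
            yield a_val, b_val

try:
    # Unpacking zip(*...) of an empty generator raises ValueError
    # ("not enough values to unpack") -- hence the guards in the diff.
    a_vals, b_vals = list(zip(*list(normalize_values(
        [2.3, None, 4.1], [3.4, 7.2, None]))))
except ValueError:
    a_vals, b_vals = (), ()  # no overlapping samples

assert (a_vals, b_vals) == ((2.3,), (3.4,))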
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py index c5c56db..a0da2c4 100644 --- a/gn3/computations/correlations.py +++ b/gn3/computations/correlations.py @@ -7,6 +7,7 @@ from typing import List from typing import Tuple from typing import Optional from typing import Callable +from typing import Generator import scipy.stats import pingouin as pg @@ -38,20 +39,15 @@ def map_shared_keys_to_values(target_sample_keys: List, return target_dataset_data -def normalize_values(a_values: List, - b_values: List) -> Tuple[List[float], List[float], int]: - """Trim two lists of values to contain only the values they both share Given - two lists of sample values, trim each list so that it contains only the - samples that contain a value in both lists. Also returns the number of - such samples. - - >>> normalize_values([2.3, None, None, 3.2, 4.1, 5], - [3.4, 7.2, 1.3, None, 6.2, 4.1]) - ([2.3, 4.1, 5], [3.4, 6.2, 4.1], 3) - +def normalize_values(a_values: List, b_values: List) -> Generator: + """ + :param a_values: list of primary strain values + :param b_values: list of target strain values + :return: yields (a_val, b_val) pairs where neither value is None """ + for a_val, b_val in zip(a_values, b_values): - if (a_val and b_val is not None): + if (a_val is not None) and (b_val is not None): yield a_val, b_val @@ -79,15 +75,18 @@ def compute_sample_r_correlation(trait_name, corr_method, trait_vals, """ - sanitized_traits_vals, sanitized_target_vals = list( - zip(*list(normalize_values(trait_vals, target_samples_vals)))) - num_overlap = len(sanitized_traits_vals) + try: + normalized_traits_vals, normalized_target_vals = list( + zip(*list(normalize_values(trait_vals, target_samples_vals)))) + num_overlap = len(normalized_traits_vals) + except ValueError: + return None if num_overlap > 5: (corr_coefficient, p_value) =\ - compute_corr_coeff_p_value(primary_values=sanitized_traits_vals, - target_values=sanitized_target_vals, + compute_corr_coeff_p_value(primary_values=normalized_traits_vals, + target_values=normalized_target_vals, corr_method=corr_method) if corr_coefficient is not None and not math.isnan(corr_coefficient): @@ -108,7 +107,7 @@ package :not packaged in guix def filter_shared_sample_keys(this_samplelist, - target_samplelist) -> Tuple[List, List]: + target_samplelist) -> Generator: """Given primary and target sample-lists for the base and target traits, filter the values using the shared keys @@ -134,9 +133,16 @@ def fast_compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - processed_values.append((trait_name, corr_method, - list(zip(*list(filter_shared_sample_keys( - this_trait_samples, target_trait_data)))))) + + try: + this_vals, target_vals = list(zip(*list(filter_shared_sample_keys( + this_trait_samples, target_trait_data)))) + + processed_values.append( + (trait_name, corr_method, this_vals, target_vals)) + except ValueError: + continue + with closing(multiprocessing.Pool()) as pool: results = pool.starmap(compute_sample_r_correlation, processed_values) @@ -168,8 +174,14 @@ def compute_all_sample_correlation(this_trait, for target_trait in target_dataset: trait_name = target_trait.get("trait_id") target_trait_data = target_trait["trait_sample_data"] - this_vals, target_vals = list(zip(*list(filter_shared_sample_keys( - this_trait_samples, target_trait_data)))) + + try: + this_vals, target_vals = list(zip(*list(filter_shared_sample_keys(
this_trait_samples, target_trait_data)))) + + except ValueError: + # case where no matching strain names + continue sample_correlation = compute_sample_r_correlation( trait_name=trait_name, diff --git a/gn3/computations/correlations2.py b/gn3/computations/correlations2.py index 93db3fa..d0222ae 100644 --- a/gn3/computations/correlations2.py +++ b/gn3/computations/correlations2.py @@ -6,45 +6,21 @@ FUNCTIONS: compute_correlation: TODO: Describe what the function does...""" -from math import sqrt -from functools import reduce +from scipy import stats ## From GN1: mostly for clustering and heatmap generation def __items_with_values(dbdata, userdata): """Retains only corresponding items in the data items that are not `None` values. This should probably be renamed to something sensible""" - def both_not_none(item1, item2): - """Check that both items are not the value `None`.""" - if (item1 is not None) and (item2 is not None): - return (item1, item2) - return None - def split_lists(accumulator, item): - """Separate the 'x' and 'y' items.""" - return [accumulator[0] + [item[0]], accumulator[1] + [item[1]]] - return reduce( - split_lists, - filter(lambda x: x is not None, map(both_not_none, dbdata, userdata)), - [[], []]) + filtered = [x for x in zip(dbdata, userdata) if x[0] is not None and x[1] is not None] + return tuple(zip(*filtered)) if filtered else ([], []) def compute_correlation(dbdata, userdata): - """Compute some form of correlation. + """Compute the Pearson correlation coefficient. This is extracted from https://github.com/genenetwork/genenetwork1/blob/master/web/webqtl/utility/webqtlUtil.py#L622-L647 """ x_items, y_items = __items_with_values(dbdata, userdata) - if len(x_items) < 6: - return (0.0, len(x_items)) - meanx = sum(x_items)/len(x_items) - meany = sum(y_items)/len(y_items) - def cal_corr_vals(acc, item): - xitem, yitem = item - return [ - acc[0] + ((xitem - meanx) * (yitem - meany)), - acc[1] + ((xitem - meanx) * (xitem - meanx)), - acc[2] + ((yitem - meany) * (yitem - meany))] - xyd, sxd, syd = reduce(cal_corr_vals, zip(x_items, y_items), [0.0, 0.0, 0.0]) - try: - return ((xyd/(sqrt(sxd)*sqrt(syd))), len(x_items)) - except ZeroDivisionError: - return(0, len(x_items)) + correlation = stats.pearsonr(x_items, y_items)[0] if len(x_items) >= 6 else 0 + return (correlation, len(x_items)) diff --git a/gn3/computations/ctl.py b/gn3/computations/ctl.py new file mode 100644 index 0000000..f881410 --- /dev/null +++ b/gn3/computations/ctl.py @@ -0,0 +1,30 @@ +"""module contains code to process ctl analysis data""" +import json +from gn3.commands import run_cmd + +from gn3.computations.wgcna import dump_wgcna_data +from gn3.computations.wgcna import compose_wgcna_cmd +from gn3.computations.wgcna import process_image + +from gn3.settings import TMPDIR + + +def call_ctl_script(data): + """function to call ctl script""" + data["imgDir"] = TMPDIR + temp_file_name = dump_wgcna_data(data) + cmd = compose_wgcna_cmd("ctl_analysis.R", temp_file_name) + + cmd_results = run_cmd(cmd) + with open(temp_file_name, "r", encoding="utf-8") as outputfile: + if cmd_results["code"] != 0: + return (cmd_results, None) + output_file_data = json.load(outputfile) + + output_file_data["image_data"] = process_image( + output_file_data["image_loc"]).decode("ascii") + + output_file_data["ctl_plots"] = [process_image(ctl_plot).decode("ascii") for + ctl_plot in output_file_data["ctl_plots"]] + + return (cmd_results, output_file_data) diff --git a/gn3/computations/diff.py b/gn3/computations/diff.py index 
af02f7f..0b6edd6 100644 --- a/gn3/computations/diff.py +++ b/gn3/computations/diff.py @@ -6,7 +6,7 @@ from gn3.commands import run_cmd def generate_diff(data: str, edited_data: str) -> Optional[str]: """Generate the diff between 2 files""" - results = run_cmd(f"diff {data} {edited_data}", success_codes=(1, 2)) + results = run_cmd(f'"diff {data} {edited_data}"', success_codes=(1, 2)) if results.get("code", -1) > 0: return results.get("output") return None diff --git a/gn3/computations/gemma.py b/gn3/computations/gemma.py index 0b22d3c..8036a7b 100644 --- a/gn3/computations/gemma.py +++ b/gn3/computations/gemma.py @@ -31,7 +31,7 @@ def generate_pheno_txt_file(trait_filename: str, # Early return if this already exists! if os.path.isfile(f"{tmpdir}/gn2/{trait_filename}"): return f"{tmpdir}/gn2/{trait_filename}" - with open(f"{tmpdir}/gn2/{trait_filename}", "w") as _file: + with open(f"{tmpdir}/gn2/{trait_filename}", "w", encoding="utf-8") as _file: for value in values: if value == "x": _file.write("NA\n") diff --git a/gn3/computations/parsers.py b/gn3/computations/parsers.py index 1af35d6..79e3955 100644 --- a/gn3/computations/parsers.py +++ b/gn3/computations/parsers.py @@ -15,7 +15,7 @@ def parse_genofile(file_path: str) -> Tuple[List[str], 'u': None, } genotypes, samples = [], [] - with open(file_path, "r") as _genofile: + with open(file_path, "r", encoding="utf-8") as _genofile: for line in _genofile: line = line.strip() if line.startswith(("#", "@")): diff --git a/gn3/computations/partial_correlations.py b/gn3/computations/partial_correlations.py index 07dc16d..5017796 100644 --- a/gn3/computations/partial_correlations.py +++ b/gn3/computations/partial_correlations.py @@ -5,12 +5,32 @@ It is an attempt to migrate over the partial correlations feature from GeneNetwork1. """ -from functools import reduce -from typing import Any, Tuple, Sequence +import math +import warnings +from functools import reduce, partial +from typing import Any, Tuple, Union, Sequence + +import numpy +import pandas +import pingouin from scipy.stats import pearsonr, spearmanr from gn3.settings import TEXTDIR +from gn3.random import random_string +from gn3.function_helpers import compose from gn3.data_helpers import parse_csv_line +from gn3.db.traits import export_informative +from gn3.db.datasets import retrieve_trait_dataset +from gn3.db.partial_correlations import traits_info, traits_data +from gn3.db.species import species_name, translate_to_mouse_gene_id +from gn3.db.correlations import ( + get_filename, + fetch_all_database_data, + check_for_literature_info, + fetch_tissue_correlations, + fetch_literature_correlations, + check_symbol_for_tissue_correlation, + fetch_gene_symbol_tissue_value_dict_for_trait) def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): """ @@ -40,7 +60,7 @@ def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): __process_sample__, sampleslist, (tuple(), tuple(), tuple())) return reduce( - lambda acc, item: ( + lambda acc, item: (# type: ignore[arg-type, return-value] acc[0] + (item[0],), acc[1] + (item[1],), acc[2] + (item[2],), @@ -49,22 +69,6 @@ def control_samples(controls: Sequence[dict], sampleslist: Sequence[str]): [__process_control__(trait_data) for trait_data in controls], (tuple(), tuple(), tuple(), tuple())) -def dictify_by_samples(samples_vals_vars: Sequence[Sequence]) -> Sequence[dict]: - """ - Build a sequence of dictionaries from a sequence of separate sequences of - samples, values and variances. 
- - This is a partial migration of - `web.webqtl.correlation.correlationFunction.fixStrains` function in GN1. - This implementation extracts code that will find common use, and that will - find use in more than one place. - """ - return tuple( - { - sample: {"sample_name": sample, "value": val, "variance": var} - for sample, val, var in zip(*trait_line) - } for trait_line in zip(*(samples_vals_vars[0:3]))) - def fix_samples(primary_trait: dict, control_traits: Sequence[dict]) -> Sequence[Sequence[Any]]: """ Corrects sample_names, values and variance such that they all contain only @@ -108,7 +112,7 @@ def find_identical_traits( return acc + ident[1] def __dictify_controls__(acc, control_item): - ckey = "{:.3f}".format(control_item[0]) + ckey = tuple(f"{item:.3f}" for item in control_item[0]) return {**acc, ckey: acc.get(ckey, tuple()) + (control_item[1],)} return (reduce(## for identical control traits @@ -148,11 +152,11 @@ def tissue_correlation( assert len(primary_trait_values) == len(target_trait_values), ( "The lengths of the `primary_trait_values` and `target_trait_values` " "must be equal") - assert method in method_fns.keys(), ( - "Method must be one of: {}".format(",".join(method_fns.keys()))) + assert method in method_fns, ( + f"Method must be one of: {','.join(method_fns.keys())}") corr, pvalue = method_fns[method](primary_trait_values, target_trait_values) - return (round(corr, 10), round(pvalue, 10)) + return (corr, pvalue) def batch_computed_tissue_correlation( primary_trait_values: Tuple[float, ...], target_traits_dict: dict, @@ -196,33 +200,19 @@ def good_dataset_samples_indexes( samples_from_file.index(good) for good in set(samples).intersection(set(samples_from_file)))) -def determine_partials( - primary_vals, control_vals, all_target_trait_names, - all_target_trait_values, method): - """ - This **WILL** be a migration of - `web.webqtl.correlation.correlationFunction.determinePartialsByR` function - in GeneNetwork1. - - The function in GeneNetwork1 contains code written in R that is then used to - compute the partial correlations. - """ - ## This function is not implemented at this stage - return tuple( - primary_vals, control_vals, all_target_trait_names, - all_target_trait_values, method) - -def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914] +def partial_correlations_fast(# pylint: disable=[R0913, R0914] samples, primary_vals, control_vals, database_filename, fetched_correlations, method: str, correlation_type: str) -> Tuple[ - float, Tuple[float, ...]]: + int, Tuple[float, ...]]: """ + Computes partial correlation coefficients using data from a CSV file. + This is a partial migration of the `web.webqtl.correlation.PartialCorrDBPage.getPartialCorrelationsFast` function in GeneNetwork1. """ assert method in ("spearman", "pearson") - with open(f"{TEXTDIR}/{database_filename}", "r") as dataset_file: + with open(database_filename, "r", encoding="utf-8") as dataset_file: # pylint: disable=[W1514] dataset = tuple(dataset_file.readlines()) good_dataset_samples = good_dataset_samples_indexes( samples, parse_csv_line(dataset[0])[1:]) @@ -245,7 +235,7 @@ def compute_partial_correlations_fast(# pylint: disable=[R0913, R0914] all_target_trait_names: Tuple[str, ...] = processed_trait_names_values[0] all_target_trait_values: Tuple[float, ...]
= processed_trait_names_values[1] - all_correlations = determine_partials( + all_correlations = compute_partial( primary_vals, control_vals, all_target_trait_values, all_target_trait_names, method) ## Line 772 to 779 in GN1 are the cause of the weird complexity in the @@ -254,36 +244,544 @@ ## `correlation_type` parameter return len(all_correlations), tuple( corr + ( - (fetched_correlations[corr[0]],) if correlation_type == "literature" - else fetched_correlations[corr[0]][0:2]) + (fetched_correlations[corr[0]],) # type: ignore[index] + if correlation_type == "literature" + else fetched_correlations[corr[0]][0:2]) # type: ignore[index] for idx, corr in enumerate(all_correlations)) -def partial_correlation_matrix( +def build_data_frame( xdata: Tuple[float, ...], ydata: Tuple[float, ...], - zdata: Tuple[float, ...], method: str = "pearsons", - omit_nones: bool = True) -> float: + zdata: Union[ + Tuple[float, ...], + Tuple[Tuple[float, ...], ...]]) -> pandas.DataFrame: + """ + Build a pandas DataFrame object from xdata, ydata and zdata + """ + x_y_df = pandas.DataFrame({"x": xdata, "y": ydata}) + if isinstance(zdata[0], float): + return x_y_df.join(pandas.DataFrame({"z": zdata})) + interm_df = x_y_df.join(pandas.DataFrame( + {f"z{i}": val for i, val in enumerate(zdata)})) + if interm_df.shape[1] == 3: + return interm_df.rename(columns={"z0": "z"}) + return interm_df + +def compute_trait_info(primary_vals, control_vals, target, method): """ - Computes the partial correlation coefficient using the - 'variance-covariance matrix' method + Compute the correlation values for the given arguments. + """ + targ_vals = target[0] + targ_name = target[1] + primary = [ + prim for targ, prim in zip(targ_vals, primary_vals) + if targ is not None] + + if len(primary) < 3: + return None + + def __remove_controls_for_target_nones(cont_targ): + return tuple(cont for cont, targ in cont_targ if targ is not None) + + datafrm = build_data_frame( + primary, + [targ for targ in targ_vals if targ is not None], + [__remove_controls_for_target_nones(tuple(zip(control, targ_vals))) + for control in control_vals]) + covariates = "z" if datafrm.shape[1] == 3 else [ + col for col in datafrm.columns if col not in ("x", "y")] + ppc = pingouin.partial_corr( + data=datafrm, x="x", y="y", covar=covariates, method=( + "pearson" if "pearson" in method.lower() else "spearman")) + pc_coeff = ppc["r"][0] + + zero_order_corr = pingouin.corr( + datafrm["x"], datafrm["y"], method=( + "pearson" if "pearson" in method.lower() else "spearman")) + + if math.isnan(pc_coeff): + return ( + targ_name, len(primary), pc_coeff, 1, zero_order_corr["r"][0], + zero_order_corr["p-val"][0]) + return ( + targ_name, len(primary), pc_coeff, + (ppc["p-val"][0] if not math.isnan(ppc["p-val"][0]) else ( + 0 if (abs(pc_coeff - 1) < 0.0000001) else 1)), + zero_order_corr["r"][0], zero_order_corr["p-val"][0]) + +def compute_partial( + primary_vals, control_vals, target_vals, target_names, + method: str) -> Tuple[ + Union[ + Tuple[str, int, float, float, float, float], None], + ...]: + """ + Compute the partial correlations. - This is a partial migration of the - `web.webqtl.correlation.correlationFunction.determinPartialsByR` function in - GeneNetwork1, specifically the `pcor.mat` function written in the R - programming language. + This is a re-implementation of the + `web.webqtl.correlation.correlationFunction.determinePartialsByR` function + in GeneNetwork1.
+ + This implementation reworks the child function `compute_partial` which will + then be used in the place of `determinPartialsByR`. + """ + return tuple( + result for result in ( + compute_trait_info( + primary_vals, control_vals, (tvals, tname), method) + for tvals, tname in zip(target_vals, target_names)) + if result is not None) + +def partial_correlations_normal(# pylint: disable=R0913 + primary_vals, control_vals, input_trait_gene_id, trait_database, + data_start_pos: int, db_type: str, method: str) -> Tuple[ + int, Tuple[Union[ + Tuple[str, int, float, float, float, float], None], + ...]]:#Tuple[float, ...] """ - return 0 + Computes the correlation coefficients. -def partial_correlation_recursive( - xdata: Tuple[float, ...], ydata: Tuple[float, ...], - zdata: Tuple[float, ...], method: str = "pearsons", - omit_nones: bool = True) -> float: + This is a migration of the + `web.webqtl.correlation.PartialCorrDBPage.getPartialCorrelationsNormal` + function in GeneNetwork1. """ - Computes the partial correlation coefficient using the 'recursive formula' - method + def __add_lit_and_tiss_corr__(item): + if method.lower() == "sgo literature correlation": + # if method is 'SGO Literature Correlation', `compute_partial` + # would give us LitCorr in the [1] position + return tuple(item) + trait_database[1] + if method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + # if method is 'Tissue Correlation, *', `compute_partial` would give + # us Tissue Corr in the [1] position and Tissue Corr P Value in the + # [2] position + return tuple(item) + (trait_database[1], trait_database[2]) + return item + + target_trait_names, target_trait_vals = reduce(# type: ignore[var-annotated] + lambda acc, item: (acc[0]+(item[0],), acc[1]+(item[data_start_pos:],)), + trait_database, (tuple(), tuple())) + + all_correlations = compute_partial( + primary_vals, control_vals, target_trait_vals, target_trait_names, + method) + + if (input_trait_gene_id and db_type == "ProbeSet" and method.lower() in ( + "sgo literature correlation", "tissue correlation, pearson's r", + "tissue correlation, spearman's rho")): + return ( + len(trait_database), + tuple( + __add_lit_and_tiss_corr__(item) + for idx, item in enumerate(all_correlations))) + + return len(trait_database), all_correlations + +def partial_corrs(# pylint: disable=[R0913] + conn, samples, primary_vals, control_vals, return_number, species, + input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, + method, dataset, database_filename): + """ + Compute the partial correlations, selecting the fast or normal method + depending on the existence of the database text file. This is a partial migration of the - `web.webqtl.correlation.correlationFunction.determinPartialsByR` function in - GeneNetwork1, specifically the `pcor.rec` function written in the R - programming language. + `web.webqtl.correlation.PartialCorrDBPage.__init__` function in + GeneNetwork1. 
+ """ + if database_filename: + return partial_correlations_fast( + samples, primary_vals, control_vals, database_filename, + ( + fetch_literature_correlations( + species, input_trait_geneid, dataset, return_number, conn) + if "literature" in method.lower() else + fetch_tissue_correlations( + dataset, input_trait_symbol, tissue_probeset_freeze_id, + method, return_number, conn)), + method, + ("literature" if method.lower() == "sgo literature correlation" + else ("tissue" if "tissue" in method.lower() else "genetic"))) + + trait_database, data_start_pos = fetch_all_database_data( + conn, species, input_trait_geneid, input_trait_symbol, samples, dataset, + method, return_number, tissue_probeset_freeze_id) + return partial_correlations_normal( + primary_vals, control_vals, input_trait_geneid, trait_database, + data_start_pos, dataset, method) + +def literature_correlation_by_list( + conn: Any, species: str, trait_list: Tuple[dict]) -> Tuple[dict, ...]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getLiteratureCorrelationByList` + function in GeneNetwork1. + """ + if any((lambda t: ( + bool(t.get("tissue_corr")) and + bool(t.get("tissue_p_value"))))(trait) + for trait in trait_list): + temporary_table_name = f"LITERATURE{random_string(8)}" + query1 = ( + f"CREATE TEMPORARY TABLE {temporary_table_name} " + "(GeneId1 INT(12) UNSIGNED, GeneId2 INT(12) UNSIGNED PRIMARY KEY, " + "value DOUBLE)") + query2 = ( + f"INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " + "SELECT GeneId1, GeneId2, value FROM LCorrRamin3 " + "WHERE GeneId1=%(geneid)s") + query3 = ( + "INSERT INTO {temporary_table_name}(GeneId1, GeneId2, value) " + "SELECT GeneId2, GeneId1, value FROM LCorrRamin3 " + "WHERE GeneId2=%s AND GeneId1 != %(geneid)s") + + def __set_mouse_geneid__(trait): + if trait.get("geneid"): + return { + **trait, + "mouse_geneid": translate_to_mouse_gene_id( + species, trait.get("geneid"), conn) + } + return {**trait, "mouse_geneid": 0} + + def __retrieve_lcorr__(cursor, geneids): + cursor.execute( + f"SELECT GeneId2, value FROM {temporary_table_name} " + "WHERE GeneId2 IN %(geneids)s", + geneids=geneids) + return dict(cursor.fetchall()) + + with conn.cursor() as cursor: + cursor.execute(query1) + cursor.execute(query2) + cursor.execute(query3) + + traits = tuple(__set_mouse_geneid__(trait) for trait in trait_list) + lcorrs = __retrieve_lcorr__( + cursor, ( + trait["mouse_geneid"] for trait in traits + if (trait["mouse_geneid"] != 0 and + trait["mouse_geneid"].find(";") < 0))) + return tuple( + {**trait, "l_corr": lcorrs.get(trait["mouse_geneid"], None)} + for trait in traits) + + return trait_list + return trait_list + +def tissue_correlation_by_list( + conn: Any, primary_trait_symbol: str, tissue_probeset_freeze_id: int, + method: str, trait_list: Tuple[dict]) -> Tuple[dict, ...]: + """ + This is a migration of the + `web.webqtl.correlation.CorrelationPage.getTissueCorrelationByList` + function in GeneNetwork1. 
+ """ + def __add_tissue_corr__(trait, primary_trait_values, trait_values): + result = pingouin.corr( + primary_trait_values, trait_values, + method=("spearman" if "spearman" in method.lower() else "pearson")) + return { + **trait, + "tissue_corr": result["r"], + "tissue_p_value": result["p-val"] + } + + if any((lambda t: bool(t.get("l_corr")))(trait) for trait in trait_list): + prim_trait_symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + (primary_trait_symbol,), tissue_probeset_freeze_id, conn) + if primary_trait_symbol.lower() in prim_trait_symbol_value_dict: + primary_trait_value = prim_trait_symbol_value_dict[ + primary_trait_symbol.lower()] + gene_symbol_list = tuple( + trait["symbol"] for trait in trait_list if "symbol" in trait.keys()) + symbol_value_dict = fetch_gene_symbol_tissue_value_dict_for_trait( + gene_symbol_list, tissue_probeset_freeze_id, conn) + return tuple( + __add_tissue_corr__( + trait, primary_trait_value, + symbol_value_dict[trait["symbol"].lower()]) + for trait in trait_list + if ("symbol" in trait and + bool(trait["symbol"]) and + trait["symbol"].lower() in symbol_value_dict)) + return tuple({ + **trait, + "tissue_corr": None, + "tissue_p_value": None + } for trait in trait_list) + return trait_list + +def trait_for_output(trait): + """ + Process a trait for output. + + Removes a lot of extraneous data from the trait, that is not needed for + the display of partial correlation results. + This function also removes all key-value pairs, for which the value is + `None`, because it is a waste of network resources to transmit the key-value + pair just to indicate it does not exist. + """ + def __nan_to_none__(val): + if val is None: + return None + if math.isnan(val) or numpy.isnan(val): + return None + return val + + trait = { + "trait_type": trait["db"]["dataset_type"], + "dataset_name": trait["db"]["dataset_name"], + "dataset_type": trait["db"]["dataset_type"], + "group": trait["db"]["group"], + "trait_fullname": trait["trait_fullname"], + "trait_name": trait["trait_name"], + "symbol": trait.get("symbol"), + "description": trait.get("description"), + "pre_publication_description": trait.get("Pre_publication_description"), + "post_publication_description": trait.get( + "Post_publication_description"), + "original_description": trait.get("Original_description"), + "authors": trait.get("Authors"), + "year": trait.get("Year"), + "probe_target_description": trait.get("Probe_target_description"), + "chr": trait.get("chr"), + "mb": trait.get("mb"), + "geneid": trait.get("geneid"), + "homologeneid": trait.get("homologeneid"), + "noverlap": trait.get("noverlap"), + "partial_corr": __nan_to_none__(trait.get("partial_corr")), + "partial_corr_p_value": __nan_to_none__( + trait.get("partial_corr_p_value")), + "corr": __nan_to_none__(trait.get("corr")), + "corr_p_value": __nan_to_none__(trait.get("corr_p_value")), + "rank_order": __nan_to_none__(trait.get("rank_order")), + "delta": ( + None if trait.get("partial_corr") is None + else (trait.get("partial_corr") - trait.get("corr"))), + "l_corr": __nan_to_none__(trait.get("l_corr")), + "tissue_corr": __nan_to_none__(trait.get("tissue_corr")), + "tissue_p_value": __nan_to_none__(trait.get("tissue_p_value")) + } + return {key: val for key, val in trait.items() if val is not None} + +def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] + conn: Any, primary_trait_name: str, + control_trait_names: Tuple[str, ...], method: str, + criteria: int, target_db_name: str) -> dict: + """ + This is the 
'orchestration' function for the partial-correlation feature. + + This function will dispatch the functions doing data fetches from the + database (and various other places) and feed that data to the functions + doing the conversions and computations. It will then return the results of + all of that work. + + This function is doing way too much. Look into splitting out the + functionality into smaller functions that do fewer things. """ + threshold = 0 + corr_min_informative = 4 + + all_traits = traits_info( + conn, threshold, (primary_trait_name,) + control_trait_names) + all_traits_data = traits_data(conn, all_traits) + + primary_trait = tuple( + trait for trait in all_traits + if trait["trait_fullname"] == primary_trait_name)[0] + if not primary_trait["haveinfo"]: + return { + "status": "not-found", + "message": f"Could not find primary trait {primary_trait['trait_fullname']}" + } + cntrl_traits = tuple( + trait for trait in all_traits + if trait["trait_fullname"] != primary_trait_name) + if not any(trait["haveinfo"] for trait in cntrl_traits): + return { + "status": "not-found", + "message": "None of the requested control traits were found."} + for trait in cntrl_traits: + if trait["haveinfo"] is False: + warnings.warn( + (f"Control trait {trait['trait_fullname']} was not found " + "- continuing without it."), + category=UserWarning) + + group = primary_trait["db"]["group"] + primary_trait_data = all_traits_data[primary_trait["trait_name"]] + primary_samples, primary_values, _primary_variances = export_informative( + primary_trait_data) + + cntrl_traits_data = tuple( + data for trait_name, data in all_traits_data.items() + if trait_name != primary_trait["trait_name"]) + species = species_name(conn, group) + + (cntrl_samples, + cntrl_values, + _cntrl_variances, + _cntrl_ns) = control_samples(cntrl_traits_data, primary_samples) + + common_primary_control_samples = primary_samples + fixed_primary_vals = primary_values + fixed_control_vals = cntrl_values + if not all(cnt_smp == primary_samples for cnt_smp in cntrl_samples): + (common_primary_control_samples, + fixed_primary_vals, + fixed_control_vals, + _primary_variances, + _cntrl_variances) = fix_samples(primary_trait, cntrl_traits) + + if len(common_primary_control_samples) < corr_min_informative: + return { + "status": "error", + "message": ( + f"Fewer than {corr_min_informative} samples data entered for " + f"{group} dataset. No calculation of correlation has been " + "attempted."), + "error_type": "Inadequate Samples"} + + identical_traits_names = find_identical_traits( + primary_trait_name, primary_values, control_trait_names, cntrl_values) + if len(identical_traits_names) > 0: + return { + "status": "error", + "message": ( + f"{identical_traits_names[0]} and {identical_traits_names[1]} " + f"have the same values for the {len(fixed_primary_vals)} " + "samples that will be used to compute the partial correlation " + "(common for all primary and control traits). In such cases, " + "partial correlation cannot be computed.
Please re-select your " + "traits."), + "error_type": "Identical Traits"} + + input_trait_geneid = primary_trait.get("geneid", 0) + input_trait_symbol = primary_trait.get("symbol", "") + input_trait_mouse_geneid = translate_to_mouse_gene_id( + species, input_trait_geneid, conn) + + tissue_probeset_freeze_id = 1 + db_type = primary_trait["db"]["dataset_type"] + + if db_type == "ProbeSet" and method.lower() in ( + "sgo literature correlation", + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + return { + "status": "error", + "message": ( + "Wrong correlation type: It is not possible to compute the " + f"{method} between your trait and data in the {target_db_name} " + "database. Please try again after selecting another type of " + "correlation."), + "error_type": "Correlation Type"} + + if (method.lower() == "sgo literature correlation" and ( + bool(input_trait_geneid) is False or + check_for_literature_info(conn, input_trait_mouse_geneid))): + return { + "status": "error", + "message": ( + "No Literature Information: This gene does not have any " + "associated Literature Information."), + "error_type": "Literature Correlation"} + + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and bool(input_trait_symbol) is False): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and check_symbol_for_tissue_correlation( + conn, tissue_probeset_freeze_id, input_trait_symbol)): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + target_dataset = retrieve_trait_dataset( + ("Temp" if "Temp" in target_db_name else + ("Publish" if "Publish" in target_db_name else + "Geno" if "Geno" in target_db_name else "ProbeSet")), + {"db": {"dataset_name": target_db_name}, "trait_name": "_"}, + threshold, + conn) + + database_filename = get_filename(conn, target_db_name, TEXTDIR) + _total_traits, all_correlations = partial_corrs( + conn, common_primary_control_samples, fixed_primary_vals, + fixed_control_vals, len(fixed_primary_vals), species, + input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, + method, {**target_dataset, "dataset_type": target_dataset["type"]}, database_filename) + + + def __make_sorter__(method): + def __by_lit_or_tiss_corr_then_p_val__(row): + return (row[6], row[3]) + + def __by_partial_corr_p_value__(row): + return row[3] + + if (("literature" in method.lower()) or ("tissue" in method.lower())): + return __by_lit_or_tiss_corr_then_p_val__ + + return __by_partial_corr_p_value__ + + add_lit_corr_and_tiss_corr = compose( + partial(literature_correlation_by_list, conn, species), + partial( + tissue_correlation_by_list, conn, input_trait_symbol, + tissue_probeset_freeze_id, method)) + + selected_results = sorted( + all_correlations, + key=__make_sorter__(method))[:criteria] + traits_list_corr_info = { + f"{target_dataset['dataset_name']}::{item[0]}": { + "noverlap": item[1], + "partial_corr": item[2], + "partial_corr_p_value": item[3], + "corr": item[4], + "corr_p_value": item[5], + "rank_order": (1 if "spearman" in method.lower() else 0), + **({ + "tissue_corr": item[6], + "tissue_p_value": 
item[7]} + if len(item) == 8 else {}), + **({"l_corr": item[6]} + if len(item) == 7 else {}) + } for item in selected_results} + + trait_list = add_lit_corr_and_tiss_corr(tuple( + {**trait, **traits_list_corr_info.get(trait["trait_fullname"], {})} + for trait in traits_info( + conn, threshold, + tuple( + f"{target_dataset['dataset_name']}::{item[0]}" + for item in selected_results)))) + + return { + "status": "success", + "results": { + "primary_trait": trait_for_output(primary_trait), + "control_traits": tuple( + trait_for_output(trait) for trait in cntrl_traits), + "correlations": tuple( + trait_for_output(trait) for trait in trait_list), + "dataset_type": target_dataset["type"], + "method": "spearman" if "spearman" in method.lower() else "pearson" + }} diff --git a/gn3/computations/partial_correlations_optimised.py b/gn3/computations/partial_correlations_optimised.py new file mode 100644 index 0000000..601289c --- /dev/null +++ b/gn3/computations/partial_correlations_optimised.py @@ -0,0 +1,244 @@ +""" +This contains an optimised version of the + `gn3.computations.partial_correlations.partial_correlations_entry` +function. +""" +from functools import partial +from typing import Any, Tuple + +from gn3.settings import TEXTDIR +from gn3.function_helpers import compose +from gn3.db.partial_correlations import traits_info, traits_data +from gn3.db.species import species_name, translate_to_mouse_gene_id +from gn3.db.traits import export_informative, retrieve_trait_dataset +from gn3.db.correlations import ( + get_filename, + check_for_literature_info, + check_symbol_for_tissue_correlation) +from gn3.computations.partial_correlations import ( + fix_samples, + partial_corrs, + control_samples, + trait_for_output, + find_identical_traits, + tissue_correlation_by_list, + literature_correlation_by_list) + +def partial_correlations_entry(# pylint: disable=[R0913, R0914, R0911] + conn: Any, primary_trait_name: str, + control_trait_names: Tuple[str, ...], method: str, + criteria: int, target_db_name: str) -> dict: + """ + This is the 'orchestration' function for the partial-correlation feature. + + This function will dispatch the functions doing data fetches from the + database (and various other places) and feed that data to the functions + doing the conversions and computations. It will then return the results of + all of that work. + + This function is doing way too much. Look into splitting out the + functionality into smaller functions that do fewer things.
+ """ + threshold = 0 + corr_min_informative = 4 + + all_traits = traits_info( + conn, threshold, (primary_trait_name,) + control_trait_names) + all_traits_data = traits_data(conn, all_traits) + + # primary_trait = retrieve_trait_info(threshold, primary_trait_name, conn) + primary_trait = tuple( + trait for trait in all_traits + if trait["trait_fullname"] == primary_trait_name)[0] + group = primary_trait["db"]["group"] + # primary_trait_data = retrieve_trait_data(primary_trait, conn) + primary_trait_data = all_traits_data[primary_trait["trait_name"]] + primary_samples, primary_values, _primary_variances = export_informative( + primary_trait_data) + + # cntrl_traits = tuple( + # retrieve_trait_info(threshold, trait_full_name, conn) + # for trait_full_name in control_trait_names) + # cntrl_traits_data = tuple( + # retrieve_trait_data(cntrl_trait, conn) + # for cntrl_trait in cntrl_traits) + cntrl_traits = tuple( + trait for trait in all_traits + if trait["trait_fullname"] != primary_trait_name) + cntrl_traits_data = tuple( + data for trait_name, data in all_traits_data.items() + if trait_name != primary_trait["trait_name"]) + species = species_name(conn, group) + + (cntrl_samples, + cntrl_values, + _cntrl_variances, + _cntrl_ns) = control_samples(cntrl_traits_data, primary_samples) + + common_primary_control_samples = primary_samples + fixed_primary_vals = primary_values + fixed_control_vals = cntrl_values + if not all(cnt_smp == primary_samples for cnt_smp in cntrl_samples): + (common_primary_control_samples, + fixed_primary_vals, + fixed_control_vals, + _primary_variances, + _cntrl_variances) = fix_samples(primary_trait, cntrl_traits) + + if len(common_primary_control_samples) < corr_min_informative: + return { + "status": "error", + "message": ( + f"Fewer than {corr_min_informative} samples data entered for " + f"{group} dataset. No calculation of correlation has been " + "attempted."), + "error_type": "Inadequate Samples"} + + identical_traits_names = find_identical_traits( + primary_trait_name, primary_values, control_trait_names, cntrl_values) + if len(identical_traits_names) > 0: + return { + "status": "error", + "message": ( + f"{identical_traits_names[0]} and {identical_traits_names[1]} " + f"have the same values for the {len(fixed_primary_vals)} " + "samples that will be used to compute the partial correlation " + "(common for all primary and control traits). In such cases, " + "partial correlation cannot be computed. Please re-select your " + "traits."), + "error_type": "Identical Traits"} + + input_trait_geneid = primary_trait.get("geneid", 0) + input_trait_symbol = primary_trait.get("symbol", "") + input_trait_mouse_geneid = translate_to_mouse_gene_id( + species, input_trait_geneid, conn) + + tissue_probeset_freeze_id = 1 + db_type = primary_trait["db"]["dataset_type"] + + if db_type == "ProbeSet" and method.lower() in ( + "sgo literature correlation", + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho"): + return { + "status": "error", + "message": ( + "Wrong correlation type: It is not possible to compute the " + f"{method} between your trait and data in the {target_db_name} " + "database.
Please try again after selecting another type of " + "correlation."), + "error_type": "Correlation Type"} + + if (method.lower() == "sgo literature correlation" and ( + bool(input_trait_geneid) is False or + check_for_literature_info(conn, input_trait_mouse_geneid))): + return { + "status": "error", + "message": ( + "No Literature Information: This gene does not have any " + "associated Literature Information."), + "error_type": "Literature Correlation"} + + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and bool(input_trait_symbol) is False): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + if ( + method.lower() in ( + "tissue correlation, pearson's r", + "tissue correlation, spearman's rho") + and check_symbol_for_tissue_correlation( + conn, tissue_probeset_freeze_id, input_trait_symbol)): + return { + "status": "error", + "message": ( + "No Tissue Correlation Information: This gene does not have " + "any associated Tissue Correlation Information."), + "error_type": "Tissue Correlation"} + + target_dataset = retrieve_trait_dataset( + ("Temp" if "Temp" in target_db_name else + ("Publish" if "Publish" in target_db_name else + "Geno" if "Geno" in target_db_name else "ProbeSet")), + {"db": {"dataset_name": target_db_name}, "trait_name": "_"}, + threshold, + conn) + + database_filename = get_filename(conn, target_db_name, TEXTDIR) + _total_traits, all_correlations = partial_corrs( + conn, common_primary_control_samples, fixed_primary_vals, + fixed_control_vals, len(fixed_primary_vals), species, + input_trait_geneid, input_trait_symbol, tissue_probeset_freeze_id, + method, {**target_dataset, "dataset_type": target_dataset["type"]}, database_filename) + + + def __make_sorter__(method): + def __sort_6__(row): + return row[6] + + def __sort_3__(row): + return row[3] + + if "literature" in method.lower(): + return __sort_6__ + + if "tissue" in method.lower(): + return __sort_6__ + + return __sort_3__ + + # sorted_correlations = sorted( + # all_correlations, key=__make_sorter__(method)) + + add_lit_corr_and_tiss_corr = compose( + partial(literature_correlation_by_list, conn, species), + partial( + tissue_correlation_by_list, conn, input_trait_symbol, + tissue_probeset_freeze_id, method)) + + selected_results = sorted( + all_correlations, + key=__make_sorter__(method))[:min(criteria, len(all_correlations))] + traits_list_corr_info = { + f"{target_dataset['dataset_name']}::{item[0]}": { + "noverlap": item[1], + "partial_corr": item[2], + "partial_corr_p_value": item[3], + "corr": item[4], + "corr_p_value": item[5], + "rank_order": (1 if "spearman" in method.lower() else 0), + **({ + "tissue_corr": item[6], + "tissue_p_value": item[7]} + if len(item) == 8 else {}), + **({"l_corr": item[6]} + if len(item) == 7 else {}) + } for item in selected_results} + + trait_list = add_lit_corr_and_tiss_corr(tuple( + {**trait, **traits_list_corr_info.get(trait["trait_fullname"], {})} + for trait in traits_info( + conn, threshold, + tuple( + f"{target_dataset['dataset_name']}::{item[0]}" + for item in selected_results)))) + + return { + "status": "success", + "results": { + "primary_trait": trait_for_output(primary_trait), + "control_traits": tuple( + trait_for_output(trait) for trait in cntrl_traits), + "correlations": tuple( + trait_for_output(trait) for trait in trait_list), + "dataset_type":
target_dataset["type"], + "method": "spearman" if "spearman" in method.lower() else "pearson" + }} diff --git a/gn3/computations/pca.py b/gn3/computations/pca.py new file mode 100644 index 0000000..35c9f03 --- /dev/null +++ b/gn3/computations/pca.py @@ -0,0 +1,189 @@ +"""module contains pca implementation using python""" + + +from typing import Any +from scipy import stats + +from sklearn.decomposition import PCA +from sklearn import preprocessing + +import numpy as np +import redis + + +from typing_extensions import TypeAlias + +fArray: TypeAlias = list[float] + + +def compute_pca(array: list[fArray]) -> dict[str, Any]: + """ + computes the principal component analysis + + Parameters: + + array(list[list]):a list of lists contains data to perform pca + + + Returns: + pca_dict(dict):dict contains the pca_object,pca components,pca scores + + + """ + + corr_matrix = np.array(array) + + pca_obj = PCA() + scaled_data = preprocessing.scale(corr_matrix) + + pca_obj.fit(scaled_data) + + return { + "pca": pca_obj, + "components": pca_obj.components_, + "scores": pca_obj.transform(scaled_data) + } + + +def generate_scree_plot_data(variance_ratio: fArray) -> tuple[list, fArray]: + """ + generates the scree data for plotting + + Parameters: + + variance_ratio(list[floats]):ratios for contribution of each pca + + Returns: + + coordinates(list[(x_coor,y_coord)]) + + + """ + + perc_var = [round(ratio*100, 1) for ratio in variance_ratio] + + x_coordinates = [f"PC{val}" for val in range(1, len(perc_var)+1)] + + return (x_coordinates, perc_var) + + +def generate_pca_traits_vals(trait_data_array: list[fArray], + corr_array: list[fArray]) -> list[list[Any]]: + """ + generates datasets from zscores of the traits and eigen_vectors\ + of correlation matrix + + Parameters: + + trait_data_array(list[floats]):an list of the traits + corr_array(list[list]): list of arrays for computing eigen_vectors + + Returns: + + pca_vals[list[list]]: + + + """ + + trait_zscores = stats.zscore(trait_data_array) + + if len(trait_data_array[0]) < 10: + trait_zscores = trait_data_array + + (eigen_values, corr_eigen_vectors) = np.linalg.eig(np.array(corr_array)) + idx = eigen_values.argsort()[::-1] + + return np.dot(corr_eigen_vectors[:, idx], trait_zscores) + + +def process_factor_loadings_tdata(factor_loadings, traits_num: int): + """ + + transform loadings for tables visualization + + Parameters: + factor_loading(numpy.ndarray) + traits_num(int):number of traits + + Returns: + tabular_loadings(list[list[float]]) + """ + + target_columns = 3 if traits_num > 2 else 2 + + trait_loadings = list(factor_loadings.T) + + return [list(trait_loading[:target_columns]) + for trait_loading in trait_loadings] + + +def generate_pca_temp_traits( + species: str, + group: str, + traits_data: list[fArray], + corr_array: list[fArray], + dataset_samples: list[str], + shared_samples: list[str], + create_time: str +) -> dict[str, list[Any]]: + """ + + + generate pca temp datasets + + """ + + # pylint: disable=too-many-arguments + + pca_trait_dict = {} + + pca_vals = generate_pca_traits_vals(traits_data, corr_array) + + for (idx, pca_trait) in enumerate(list(pca_vals)): + + trait_id = f"PCA{str(idx+1)}_{species}_{group}_{create_time}" + sample_vals = [] + + pointer = 0 + + for sample in dataset_samples: + if sample in shared_samples: + + sample_vals.append(str(pca_trait[pointer])) + pointer += 1 + + else: + sample_vals.append("x") + + pca_trait_dict[trait_id] = sample_vals + + return pca_trait_dict + + +def cache_pca_dataset(redis_conn: Any, exp_days: 
int, + pca_trait_dict: dict[str, list[Any]]): + """ + + caches pca dataset to redis + + Parameters: + + redis_conn(object) + exp_days(int): fo redis cache + pca_trait_dict(Dict): contains traits and traits vals to cache + + Returns: + + boolean(True if correct conn object False incase of exception) + + + """ + + try: + for trait_id, sample_data in pca_trait_dict.items(): + samples_str = " ".join([str(x) for x in sample_data]) + redis_conn.set(trait_id, samples_str, ex=exp_days) + return True + + except (redis.ConnectionError, AttributeError): + return False diff --git a/gn3/computations/qtlreaper.py b/gn3/computations/qtlreaper.py index d1ff4ac..b61bdae 100644 --- a/gn3/computations/qtlreaper.py +++ b/gn3/computations/qtlreaper.py @@ -27,7 +27,7 @@ def generate_traits_file(samples, trait_values, traits_filename): ["{}\t{}".format( len(trait_values), "\t".join([str(i) for i in t])) for t in trait_values[-1:]]) - with open(traits_filename, "w") as outfile: + with open(traits_filename, "w", encoding="utf8") as outfile: outfile.writelines(data) def create_output_directory(path: str): @@ -68,13 +68,13 @@ def run_reaper( The function will raise a `subprocess.CalledProcessError` exception in case of any errors running the `qtlreaper` command. """ - create_output_directory("{}/qtlreaper".format(output_dir)) - output_filename = "{}/qtlreaper/main_output_{}.txt".format( - output_dir, random_string(10)) + create_output_directory(f"{output_dir}/qtlreaper") + output_filename = ( + f"{output_dir}/qtlreaper/main_output_{random_string(10)}.txt") output_list = ["--main_output", output_filename] if separate_nperm_output: - permu_output_filename: Union[None, str] = "{}/qtlreaper/permu_output_{}.txt".format( - output_dir, random_string(10)) + permu_output_filename: Union[None, str] = ( + f"{output_dir}/qtlreaper/permu_output_{random_string(10)}.txt") output_list = output_list + [ "--permu_output", permu_output_filename] # type: ignore[list-item] else: @@ -135,7 +135,7 @@ def parse_reaper_main_results(results_file): """ Parse the results file of running QTLReaper into a list of dicts. """ - with open(results_file, "r") as infile: + with open(results_file, "r", encoding="utf8") as infile: lines = infile.readlines() def __parse_column_float_value(value): @@ -164,7 +164,7 @@ def parse_reaper_permutation_results(results_file): """ Parse the results QTLReaper permutations into a list of values. 
""" - with open(results_file, "r") as infile: + with open(results_file, "r", encoding="utf8") as infile: lines = infile.readlines() return [float(line.strip()) for line in lines] diff --git a/gn3/computations/rqtl.py b/gn3/computations/rqtl.py index e81aba3..65ee6de 100644 --- a/gn3/computations/rqtl.py +++ b/gn3/computations/rqtl.py @@ -53,7 +53,7 @@ def process_rqtl_mapping(file_name: str) -> List: # Later I should probably redo this using csv.read to avoid the # awkwardness with removing quotes with [1:-1] with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), - "output", file_name), "r") as the_file: + "output", file_name), "r", encoding="utf-8") as the_file: for line in the_file: line_items = line.split(",") if line_items[1][1:-1] == "chr" or not line_items: @@ -118,7 +118,6 @@ def pairscan_for_figure(file_name: str) -> Dict: return figure_data - def get_marker_list(map_file: str) -> List: """ Open the map file with the list of markers/pseudomarkers and create list of marker obs @@ -255,7 +254,7 @@ def process_perm_output(file_name: str) -> Tuple[List, float, float]: perm_results = [] with open(os.path.join(current_app.config.get("TMPDIR", "/tmp"), - "output", "PERM_" + file_name), "r") as the_file: + "output", "PERM_" + file_name), "r", encoding="utf-8") as the_file: for i, line in enumerate(the_file): if i == 0: # Skip header line diff --git a/gn3/computations/wgcna.py b/gn3/computations/wgcna.py index ab12fe7..c985491 100644 --- a/gn3/computations/wgcna.py +++ b/gn3/computations/wgcna.py @@ -19,7 +19,7 @@ def dump_wgcna_data(request_data: dict): request_data["TMPDIR"] = TMPDIR - with open(temp_file_path, "w") as output_file: + with open(temp_file_path, "w", encoding="utf-8") as output_file: json.dump(request_data, output_file) return temp_file_path @@ -31,20 +31,18 @@ def stream_cmd_output(socketio, request_data, cmd: str): socketio.emit("output", {"data": f"calling you script {cmd}"}, namespace="/", room=request_data["socket_id"]) - results = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) + with subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) as results: + if results.stdout is not None: + for line in iter(results.stdout.readline, b""): + socketio.emit("output", + {"data": line.decode("utf-8").rstrip()}, + namespace="/", room=request_data["socket_id"]) - if results.stdout is not None: - - for line in iter(results.stdout.readline, b""): - socketio.emit("output", - {"data": line.decode("utf-8").rstrip()}, - namespace="/", room=request_data["socket_id"]) - - socketio.emit( - "output", {"data": - "parsing the output results"}, namespace="/", - room=request_data["socket_id"]) + socketio.emit( + "output", {"data": + "parsing the output results"}, namespace="/", + room=request_data["socket_id"]) def process_image(image_loc: str) -> bytes: @@ -75,7 +73,7 @@ def call_wgcna_script(rscript_path: str, request_data: dict): run_cmd_results = run_cmd(cmd) - with open(generated_file, "r") as outputfile: + with open(generated_file, "r", encoding="utf-8") as outputfile: if run_cmd_results["code"] != 0: return run_cmd_results |