diff options
| author | Alexander Kabui | 2021-11-15 18:03:55 +0300 | 
|---|---|---|
| committer | Alexander Kabui | 2021-11-15 18:03:55 +0300 | 
| commit | 18b53441a0136071db94c72b112a746e056ef971 (patch) | |
| tree | b1a70e0470901bcce9fcc8dc4bc92bc881f40930 | |
| parent | 01d42255f52a61c6d3d007ffd1e5e02765a76730 (diff) | |
| download | genenetwork2-18b53441a0136071db94c72b112a746e056ef971.tar.gz | |
refactor function to fetch datasets data for precomputes
| -rw-r--r-- | wqflask/wqflask/correlation/pre_computes.py | 72 | 
1 file changed, 56 insertions, 16 deletions
| diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py index 01fa1a3d..e7147ddf 100644 --- a/wqflask/wqflask/correlation/pre_computes.py +++ b/wqflask/wqflask/correlation/pre_computes.py @@ -20,6 +20,7 @@ def generate_filename(base_dataset_name, target_dataset_name, base_timestamp, ta def cache_compute_results(base_dataset_type, base_dataset_name, target_dataset_name, + corr_method, correlation_results, trait_name): # pass @@ -34,43 +35,35 @@ def cache_compute_results(base_dataset_type, target_dataset_timestamp = base_timestamp - - file_name = generate_filename( base_dataset_name, target_dataset_name, base_timestamp, target_dataset_timestamp) - - file_path = os.path.join(TMPDIR,f"{file_name}.json") - + file_path = os.path.join(TMPDIR, f"{file_name}.json") try: - with open(file_path,"r+") as json_file_handler: + with open(file_path, "r+") as json_file_handler: data = json.load(json_file_handler) data[trait_name] = correlation_results json_file_handler.seek(0) - json.dump(data,json_file_handler) + json.dump(data, json_file_handler) json_file_handler.truncate() - + except FileNotFoundError: - with open(file_path,"w+") as file_handler: + with open(file_path, "w+") as file_handler: data = {} - data[trait_name] =correlation_results - - json.dump(data,file_handler) - + data[trait_name] = correlation_results + json.dump(data, file_handler) # create the file only if it does not exists # else open the file to cache the results - - def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_type, trait_name): """function to check for precomputed results""" @@ -84,7 +77,8 @@ def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ return else: - base_timestamp = target_dataset_timestamp = base_timestamp.decode("utf-8") + base_timestamp = target_dataset_timestamp = base_timestamp.decode( + "utf-8") file_name = generate_filename( base_dataset_name, target_dataset_name, @@ -103,3 +97,49 @@ 
def fetch_precompute_results(base_dataset_name, target_dataset_name, dataset_typ except FileNotFoundError: pass + + +def pre_compute_dataset_vs_dataset(base_dataset, target_dataset, corr_method): + """compute sample correlation between dataset vs dataset + wn:heavy function should be invoked less frequently + input:datasets_data(two dicts),corr_method + + output:correlation results for entire dataset against entire dataset + """ + dataset_correlation_results = {} + + for (trait_name, strain_values) in target_dataset.trait_data: + + this_trait_data = { + "trait_sample_data": strain_values, + "trait_id": trait_name + } + + trait_correlation_result = fast_compute_all_sample_correlation( + corr_method=corr_method, this_trait=this_trait_data, target_dataset=target_dataset_data) + + dataset_correlation_results[trait_name] = trait_correlation_result + + return dataset_correlation_results + + +def get_datasets_data(base_dataset, target_dataset_data): + """required to pass data in a given format to the pre compute + function + + output:two dicts for datasets with key==trait and value==strains + """ + target_traits_data = target_dataset.get_trait_data( + base_dataset.group.all_samples_ordered()) + + base_traits_data = base_dataset.get_trait_data( + base_dataset.group.all_samples_ordered()) + + samples_fetched = base_dataset.group.all_samples_ordered() + + target_results = map_shared_keys_to_values( + samples_fetched, target_traits_data) + base_results = map_shared_keys_to_values( + samples_fetched, base_traits_data) + + return (target_results, base_results) | 
