-rw-r--r--   wqflask/wqflask/correlation/pre_computes.py      | 59
-rw-r--r--   wqflask/wqflask/correlation/rust_correlation.py  |  5
2 files changed, 41 insertions, 23 deletions
diff --git a/wqflask/wqflask/correlation/pre_computes.py b/wqflask/wqflask/correlation/pre_computes.py
index ddcc5ba9..c995b471 100644
--- a/wqflask/wqflask/correlation/pre_computes.py
+++ b/wqflask/wqflask/correlation/pre_computes.py
@@ -16,34 +16,53 @@ from utility.tools import SQL_URI
 from json.decoder import JSONDecodeError
 
 
-def cache_trait_metadata(dataset_name, data):
+def to_generate_datasets(dataset_name, dataset_type, gen_type, species="mouse"):
+    try:
+        with lmdb.open(os.path.join("/tmp", "todolist_generate"), map_size=20971520) as env:
+            with env.begin(write=True) as txn:
+                data = txn.get(f"{gen_type}:{dataset_type}".encode())
+                if data:
+                    data = pickle.loads(data)
+                    data[dataset_name] = (
+                        dataset_type, dataset_name, species)
+                else:
+                    data = {dataset_name: (
+                        dataset_type, dataset_name, species)}
+
+                txn.put(f"{gen_type}:{dataset_type}".encode(), pickle.dumps(data))
+    except Exception as e:
+        pass
+
+
+def cache_trait_metadata(dataset_name, data):
     try:
-        with lmdb.open(os.path.join(TMPDIR,f"metadata_{dataset_name}"),map_size=20971520) as env:
-            with env.begin(write=True) as txn:
+        with lmdb.open(os.path.join(TMPDIR, f"metadata_{dataset_name}"), map_size=20971520) as env:
+            with env.begin(write=True) as txn:
                 data_bytes = pickle.dumps(data)
                 txn.put(f"{dataset_name}".encode(), data_bytes)
                 current_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                 txn.put(b"creation_date", current_date.encode())
                 return "success"
-    except lmdb.Error as error:
+    except lmdb.Error as error:
         pass
 
-def read_trait_metadata(dataset_name,dataset_type):
+
+def read_trait_metadata(dataset_name, dataset_type):
     try:
-        with lmdb.open(os.path.join("/tmp/",f"metadata_{dataset_type}"), readonly=True, lock=False) as env:
+        with lmdb.open(os.path.join("/tmp/", f"metadata_{dataset_type}"), readonly=True, lock=False) as env:
             with env.begin() as txn:
-                metadata = txn.get(dataset_name.encode())
+                metadata = txn.get(dataset_name.encode())
                 return (pickle.loads(metadata)["data"]
                         if metadata else {})
     except lmdb.Error as error:
         return {}
 
-
-def parse_lmdb_dataset(strain_names,target_strains,data):
-    _vals = []
+def parse_lmdb_dataset(strain_names, target_strains, data):
+    _vals = []
     _posit = [0]
+
     def __fetch_id_positions__(all_ids, target_ids):
         _vals = []
         _posit = [0]  # alternative for parsing
@@ -54,17 +73,18 @@ def parse_lmdb_dataset(strain_names,target_strains,data):
                 _posit.append(idx)
         return (_posit, _vals)
 
-    _posit,sample_vals = __fetch_id_positions__(strain_names,target_strains)
-    return (sample_vals,[[line[i] for i in _posit] for line in data.values()])
+    _posit, sample_vals = __fetch_id_positions__(strain_names, target_strains)
+    return (sample_vals, [[line[i] for i in _posit] for line in data.values()])
+
 
-def read_lmdb_strain_files(dataset_type,dataset_name,sql_uri=SQL_URI):
+def read_lmdb_strain_files(dataset_type, dataset_name, sql_uri=SQL_URI):
     # target file path for example probeset and name used to generate the name
 
     def __sanitise_filename__(filename):
         ttable = str.maketrans({" ": "_", "/": "_", "\\": "_"})
         return str.translate(filename, ttable)
 
-    def __generate_file_name__(db_name):
+    def __generate_file_name__(db_name):
         # todo add expiry time and checker
 
         with database_connection() as conn:
@@ -77,12 +97,12 @@ def read_lmdb_strain_files(dataset_type,dataset_name,sql_uri=SQL_URI):
                 f"ProbeSetFreezeId_{results[0]}_{results[1]}")
     try:
         # change this to tmpdir
-        with lmdb.open(os.path.join(TMPDIR,"Probesets"),readonly=True,lock=False) as env:
+        with lmdb.open(os.path.join(TMPDIR, "Probesets"), readonly=True, lock=False) as env:
             with env.begin() as txn:
-                filename = __generate_file_name__ (dataset_name)
+                filename = __generate_file_name__(dataset_name)
                 if filename:
                     meta = pickle.loads(txn.get(filename.encode()))
-                    return (meta["strain_names"],meta["data"])
+                    return (meta["strain_names"], meta["data"])
                 return {}
     except Exception as error:
         return {}
@@ -130,8 +150,6 @@ def generate_filename(*args, suffix="", file_ext="json"):
     return f"{hashlib.md5(string_unicode).hexdigest()}_{suffix}.{file_ext}"
 
 
-
-
 def fetch_text_file(dataset_name, conn, text_dir=TMPDIR):
     """fetch textfiles with strain vals if exists"""
 
@@ -154,9 +172,6 @@ def fetch_text_file(dataset_name, conn, text_dir=TMPDIR):
         pass
 
 
-
-
-
 def read_text_file(sample_dict, file_path):
 
     def __fetch_id_positions__(all_ids, target_ids):
diff --git a/wqflask/wqflask/correlation/rust_correlation.py b/wqflask/wqflask/correlation/rust_correlation.py
index ea63d244..5f024440 100644
--- a/wqflask/wqflask/correlation/rust_correlation.py
+++ b/wqflask/wqflask/correlation/rust_correlation.py
@@ -16,7 +16,7 @@ from wqflask.correlation.pre_computes import write_db_to_textfile
 from wqflask.correlation.pre_computes import read_trait_metadata
 from wqflask.correlation.pre_computes import cache_trait_metadata
 from wqflask.correlation.pre_computes import parse_lmdb_dataset
-
+from wqflask.correlation.pre_computes import to_generate_datasets
 from wqflask.correlation.pre_computes import read_lmdb_strain_files
 from gn3.computations.correlations import compute_all_lit_correlation
 from gn3.computations.rust_correlation import run_correlation
@@ -73,6 +73,7 @@ def get_metadata(dataset, traits):
     if cached_metadata:
         return {trait:cached_metadata.get(trait) for trait in traits}
     else:
+        to_generate_datasets(dataset.name, "ProbeSet", "metadata")
         return {**({trait_name: {
             "name": trait_name,
             "view": True,
@@ -262,6 +263,8 @@ def __compute_sample_corr__(
             (sample_vals,target_data) = parse_lmdb_dataset(results[0],sample_data,results[1])
             return run_correlation(target_data, sample_vals, method, ",", corr_type, n_top)
+        else:
+            to_generate_datasets(target_dataset.name, "ProbeSet", "textfile", target_dataset.group.species)
     target_dataset.get_trait_data(list(sample_data.keys()))
 
 
 def __merge_key_and_values__(rows, current):
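
Note: this commit only adds the producer side of the "/tmp/todolist_generate" queue; nothing in the diff reads the queued entries back. The sketch below is a minimal, hypothetical consumer that drains the entries written by to_generate_datasets, assuming the same LMDB path, key scheme (f"{gen_type}:{dataset_type}") and pickled dict layout used above. The drain_generate_queue name and the process_dataset callback are illustrative and not part of the codebase.

import os
import pickle

import lmdb


def drain_generate_queue(gen_type, dataset_type, process_dataset):
    """Hypothetical consumer for the queue written by to_generate_datasets().

    Loads the pickled {dataset_name: (dataset_type, dataset_name, species)}
    dict stored under the f"{gen_type}:{dataset_type}" key, hands each entry
    to the caller-supplied process_dataset callback, then clears the key.
    """
    key = f"{gen_type}:{dataset_type}".encode()
    with lmdb.open(os.path.join("/tmp", "todolist_generate"),
                   map_size=20971520) as env:
        with env.begin(write=True) as txn:
            raw = txn.get(key)
            if not raw:
                return
            for item in pickle.loads(raw).values():
                # item is (dataset_type, dataset_name, species)
                process_dataset(*item)
            # drop the key once every queued dataset has been handled
            txn.delete(key)

A periodic job could, for example, call drain_generate_queue("textfile", "ProbeSet", some_regeneration_function) to rebuild the missing text files; the actual regeneration step lives outside this diff.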