diff options
author | BonfaceKilz | 2022-02-24 09:00:49 +0300 |
---|---|---|
committer | BonfaceKilz | 2022-03-12 15:33:09 +0300 |
commit | a4ae3a4d1bb680f8a0d38d37f8ceffe0f3a1b2a9 (patch) | |
tree | a0ea343627038cbcb246a86e0d75965e04f18f61 /gn3/csvcmp.py | |
parent | 83bfe825ece64807a96280fda9cb4ef3e8ac0587 (diff) | |
download | genenetwork3-a4ae3a4d1bb680f8a0d38d37f8ceffe0f3a1b2a9.tar.gz |
Add methods for working with csv data
gn3/csvcmp.py: New file
(create_dirs_if_not_exists): From a list of dirs, create them if they don't
exist.
(remove_insignificant_edits): Given a dict with a "Modification" key, remove
edits with "delta < ε".
(csv_diff): Generate a csv_diff using the "csvdiff" tool packaged in guix.
tests/unit/test_csvcmp.py: Add some tests for "gn3/csvcmp.py"
Diffstat (limited to 'gn3/csvcmp.py')
-rw-r--r-- | gn3/csvcmp.py | 68 |
1 files changed, 68 insertions, 0 deletions
# gn3/csvcmp.py
"""Helpers for comparing csv data with the external ``csvdiff`` tool."""
import json
import os
import uuid
from gn3.commands import run_cmd

# Leading columns that identify the header row of a csv file.
_HEADER_PREFIX = "Strain Name,Value,SE,Count"


def create_dirs_if_not_exists(dirs: list):
    """Create each directory in ``dirs`` that does not exist yet.

    Paths that already exist are left untouched.
    """
    for dir_ in dirs:
        if not os.path.exists(dir_):
            # exist_ok guards against another process creating the
            # directory between the check above and this call.
            os.makedirs(dir_, exist_ok=True)


def _is_insignificant(original, current, epsilon):
    """Tell whether two csv fields are numbers less than ``epsilon`` apart.

    A field only counts as a number when it consists of digits and dots, so
    signed values and exponent notation are never treated as numeric.
    """
    # Short-circuit on purpose: float() must not be reached for fields such
    # as strain names, where it would raise ValueError.
    if not (original.replace(".", "").isdigit()
            and current.replace(".", "").isdigit()):
        return False
    try:
        return abs(float(original) - float(current)) < epsilon
    except ValueError:
        # Digits and dots only, yet not a number (e.g. "1.2.3").
        return False


def remove_insignificant_edits(diff_data, epsilon=0.001):
    """Drop modifications whose numeric changes are all smaller than ε.

    ``diff_data`` is a dict whose "Modifications" entry is a list of dicts
    with comma-separated "Original" and "Current" rows.  In every row, a
    field that changed by less than ``epsilon`` is reset to its original
    value; rows that end up identical to the original are removed.

    The dict is updated in place and also returned.  When it has no
    modifications it is returned unchanged.
    """
    modifications = diff_data.get("Modifications")
    if not modifications:
        return diff_data

    significant = []
    for mod in modifications:
        original = mod.get("Original").split(",")
        current = mod.get("Current").split(",")
        for i, (old, new) in enumerate(zip(original, current)):
            if _is_insignificant(old, new, epsilon):
                current[i] = old
        original_row = ",".join(original)
        current_row = ",".join(current)
        if original_row != current_row:
            significant.append({
                "Original": original_row,
                "Current": current_row,
            })
    diff_data["Modifications"] = significant
    return diff_data


def csv_diff(base_csv, delta_csv, tmp_dir="/tmp"):
    """Diff two csv texts with the ``csvdiff`` command line tool.

    When the header rows of the two texts differ, both are given the same
    header before diffing.  The texts are written to uniquely named files
    in ``tmp_dir``; those files are removed again before returning, even if
    running the command or parsing its output fails.

    Returns the result of ``run_cmd``.  When its "code" entry is 0, its
    "output" entry is replaced by the parsed JSON produced by ``csvdiff``.
    """
    base_csv_list = base_csv.strip().split("\n")
    delta_csv_list = delta_csv.strip().split("\n")

    _header1, _header2 = "", ""
    for i, line in enumerate(base_csv_list):
        if line.startswith(_HEADER_PREFIX):
            _header1 = line
            # The delta may be shorter than the base, or hold something
            # other than a header on this line; only accept a real header.
            if (i < len(delta_csv_list)
                    and delta_csv_list[i].startswith(_HEADER_PREFIX)):
                _header2 = delta_csv_list[i]
            break

    if _header1 and _header2 and _header1 != _header2:
        header = max(_header1, _header2)
        # Swap whole header lines.  Replacing just the common prefix would
        # duplicate the extra columns in the text that already has them.
        if _header1 != header:
            base_csv = base_csv.replace(_header1, header, 1)
        if _header2 != header:
            delta_csv = delta_csv.replace(_header2, header, 1)

    file_name1 = os.path.join(tmp_dir, str(uuid.uuid4()))
    file_name2 = os.path.join(tmp_dir, str(uuid.uuid4()))
    try:
        with open(file_name1, "w", encoding="utf-8") as f_:
            f_.write(base_csv)
        with open(file_name2, "w", encoding="utf-8") as f_:
            f_.write(delta_csv)

        # Now we can run the diff!
        _r = run_cmd(cmd=("csvdiff "
                          f"'{file_name1}' '{file_name2}' "
                          "--format json"))
        if _r.get("code") == 0:
            _r["output"] = json.loads(_r.get("output"))
        return _r
    finally:
        # Clean Up!
        for file_name in (file_name1, file_name2):
            if os.path.exists(file_name):
                os.remove(file_name)