diff options
author | BonfaceKilz | 2022-02-24 09:00:49 +0300 |
---|---|---|
committer | BonfaceKilz | 2022-03-12 15:33:09 +0300 |
commit | a4ae3a4d1bb680f8a0d38d37f8ceffe0f3a1b2a9 (patch) | |
tree | a0ea343627038cbcb246a86e0d75965e04f18f61 | |
parent | 83bfe825ece64807a96280fda9cb4ef3e8ac0587 (diff) | |
download | genenetwork3-a4ae3a4d1bb680f8a0d38d37f8ceffe0f3a1b2a9.tar.gz |
Add methods for working with csv data
gn3/csvcmp.py: New file
(create_dirs_if_not_exists): From a list of dirs, create them if they don't
exist.
(remove_insignificant_edits): Given a dict with a "Modifications" key, remove
edits with "delta < ε".
(csv_diff): Generate a diff of two CSV strings using the "csvdiff" tool packaged in guix.
tests/unit/test_csvcmp.py: Add some tests for "gn3/csvcmp.py"
-rw-r--r-- | gn3/csvcmp.py | 68 | ||||
-rw-r--r-- | tests/unit/test_csvcmp.py | 41 |
2 files changed, 109 insertions, 0 deletions
diff --git a/gn3/csvcmp.py b/gn3/csvcmp.py new file mode 100644 index 0000000..e033396 --- /dev/null +++ b/gn3/csvcmp.py @@ -0,0 +1,68 @@ +import json +import os +import uuid +from gn3.commands import run_cmd + + +def create_dirs_if_not_exists(dirs: list): + for dir_ in dirs: + if not os.path.exists(dir_): + os.makedirs(dir_) + + +def remove_insignificant_edits(diff_data, epsilon=0.001): + _mod = [] + for mod in diff_data.get("Modifications"): + original = mod.get("Original").split(",") + current = mod.get("Current").split(",") + for i, (x, y) in enumerate(zip(original, current)): + if all([ + x.replace('.', '').isdigit(), + y.replace('.', '').isdigit(), + abs(float(x) - float(y)) < epsilon, + ]): + current[i] = x + if not (__o := ",".join(original)) == (__c := ",".join(current)): + _mod.append({ + "Original": __o, + "Current": __c, + }) + diff_data['Modifications'] = _mod + return diff_data + + +def csv_diff(base_csv, delta_csv, tmp_dir="/tmp"): + base_csv_list = base_csv.strip().split("\n") + delta_csv_list = delta_csv.strip().split("\n") + + _header1, _header2 = "", "" + for i, line in enumerate(base_csv_list): + if line.startswith("Strain Name,Value,SE,Count"): + _header1, _header2 = line, delta_csv_list[i] + break + + if _header1 != _header2: + header = max(_header1, _header2) + base_csv = base_csv.replace("Strain Name,Value,SE,Count", + header, 1) + delta_csv = delta_csv.replace("Strain Name,Value,SE,Count", + header, 1) + file_name1 = os.path.join(tmp_dir, str(uuid.uuid4())) + file_name2 = os.path.join(tmp_dir, str(uuid.uuid4())) + with open(file_name1, "w") as f_: + f_.write(base_csv) + with open(file_name2, "w") as f_: + f_.write(delta_csv) + + # Now we can run the diff! + _r = run_cmd(cmd=("csvdiff " + f"'{file_name1}' '{file_name2}' " + "--format json")) + if _r.get("code") == 0: + _r["output"] = json.loads(_r.get("output")) + # Clean Up! 
+ if os.path.exists(file_name1): + os.remove(file_name1) + if os.path.exists(file_name2): + os.remove(file_name2) + return _r diff --git a/tests/unit/test_csvcmp.py b/tests/unit/test_csvcmp.py new file mode 100644 index 0000000..f73865d --- /dev/null +++ b/tests/unit/test_csvcmp.py @@ -0,0 +1,41 @@ +from gn3.csvcmp import csv_diff +from gn3.csvcmp import remove_insignificant_edits + +import pytest + + +@pytest.mark.unit_test +def test_remove_insignificant_data(): + diff_data = { + 'Additions': [], + 'Deletions': [], + 'Modifications': [ + {'Current': '1.000001,3', 'Original': '1,3'}, + {'Current': '1,3', 'Original': '1.000001,3'}, + {'Current': '2.000001,3', 'Original': '2,2'}, + {'Current': '1.01,3', 'Original': '1,2'} + ] + } + expected_json = { + 'Additions': [], + 'Deletions': [], + 'Modifications': [ + {'Current': '2,3', 'Original': '2,2'}, + {'Current': '1.01,3', 'Original': '1,2'} + ] + } + assert (remove_insignificant_edits(diff_data) == + expected_json) + + +@pytest.mark.unit_test +def test_csv_diff(): + test_results = csv_diff(base_csv="a,b\n1,2\n", + delta_csv="a,b\n1,3") + _json = { + 'Additions': [], + 'Deletions': [], + 'Modifications': [{'Current': '1,3', 'Original': '1,2'}] + } + assert(test_results.get("code") == 0 and + test_results.get("output") == _json) |