mechanical-rob: Parser for GN1 results and some sample results

* test/requests/correlation_results_text_files/*results.csv: CSV files with sample results from GN1 for the trait `1435464_at` in dataset `HC_M2_0606_P`.
* wqflask/scripts/parse_corr_gn1_results_to_csv.py: parser for results from GN1 when saved to an HTML file.
author: Frederick Muriuki Muriithi 2022-11-25 13:29:06 +0300
committer: Frederick Muriuki Muriithi 2022-11-25 13:38:12 +0300
commit: e33a3a78d33f409ae20c8486d0bd2453f189e2b1 (patch)
tree: 7df5d7db3b05ad7ef5a2be9078f08068a85e56e8 /wqflask/scripts
parent: d318b1d649a3ece1a5a6c24bcd3496e14dbc69cd (diff)
download: genenetwork2-e33a3a78d33f409ae20c8486d0bd2453f189e2b1.tar.gz
1 files changed, 86 insertions, 0 deletions
diff --git a/wqflask/scripts/parse_corr_gn1_results_to_csv.py b/wqflask/scripts/parse_corr_gn1_results_to_csv.py
new file mode 100644
index 00000000..b04a0378
--- /dev/null
+++ b/wqflask/scripts/parse_corr_gn1_results_to_csv.py
@@ -0,0 +1,86 @@
+import csv
+import sys
+from functools import reduce, partial
+import pathlib
+
+from lxml import etree
+
+def thread(value, *functions):
+    """Pipe ``value`` through ``functions``, applied left to right.
+
+    The first function receives ``value``; every later function receives
+    the result of the one before it. With no functions, ``value`` is
+    returned unchanged.
+    """
+    result = value
+    for step in functions:
+        result = step(result)
+    return result
+
+def parse_file(filename: pathlib.Path):
+    """Read ``filename`` as UTF-8 text and parse it with ``etree.HTML``.
+
+    Returns whatever ``etree.HTML`` produces for the file's full contents.
+    """
+    with open(filename, encoding="utf-8") as html_file:
+        return etree.HTML(html_file.read())
+
+def first_row_headers(table):
+    """Return the text of each ``td`` cell in the table's first body row.
+
+    For every cell matched by ``./tbody/tr[1]/td``, the text nodes found
+    under the cell are stripped of surrounding whitespace and joined with
+    single spaces. The result is a tuple holding one string per cell.
+    """
+    headers = []
+    for cell in table.xpath("./tbody/tr[1]/td"):
+        fragments = [text.strip() for text in cell.xpath(".//child::text()")]
+        headers.append(" ".join(fragments))
+    return tuple(headers)
+    
+
+def results_table(tables):
+    """Return the first table whose first row starts with the result headers.
+
+    A table matches when the first four entries returned by
+    ``first_row_headers`` are "Index", "Record ID", "Symbol" and
+    "Description", in that order.
+
+    Raises:
+        IndexError: if none of ``tables`` matches.
+    """
+    expected = ("Index", "Record ID", "Symbol", "Description")
+    # Stop at the first match; later tables do not need to be inspected.
+    for table in tables:
+        if first_row_headers(table)[0:4] == expected:
+            return table
+    # Fail with an explicit message rather than an opaque
+    # "tuple index out of range".
+    raise IndexError(
+        "No table found whose first row starts with the headers: "
+        + ", ".join(expected))
+
+def table_contents(table):
+    """Return the text of every cell in every ``./tbody/tr`` row of ``table``.
+
+    The result is a tuple of rows; each row is a tuple with one string per
+    child of the row element. A cell's string is built from the text nodes
+    found under it, each stripped and then joined with single spaces.
+    """
+    def cell_text(cell):
+        return " ".join(
+            text.strip() for text in cell.xpath(".//child::text()"))
+
+    rows = table.xpath("./tbody/tr")
+    return tuple(tuple(map(cell_text, row)) for row in rows)
+
+
+def to_dicts(contents):
+    """Turn table rows into dicts keyed by the first (header) row.
+
+    Args:
+        contents: sequence of rows. The first row supplies the keys; every
+            following row becomes one dict.
+
+    Returns:
+        A tuple of dicts, one per data row. It is empty when ``contents``
+        is empty or holds only the header row. Rows and header are paired
+        with ``zip``, so the longer of the two is truncated.
+    """
+    if not contents:
+        # No header row at all: there is nothing to build dicts from.
+        return ()
+    header, *rows = contents
+    return tuple(dict(zip(header, row)) for row in rows)
+
+def write_csv(input_file, to_output_file, contents):
+    """Write ``contents`` as CSV to stdout or to a file beside ``input_file``.
+
+    Args:
+        input_file: ``pathlib.Path`` of the parsed file; used only to derive
+            the output path ``<parent>/<stem>__results.csv``.
+        to_output_file: when falsy, write to ``sys.stdout``; otherwise write
+            to the derived output file, overwriting any existing one.
+        contents: sequence of dicts, one per row. The keys of the first
+            dict become the CSV header.
+
+    When ``contents`` is empty nothing is written to the stream; in file
+    mode the output file is still created, but left empty.
+    """
+    def __write__(stream):
+        if not contents:
+            # Field names come from the first row, so without rows there
+            # is no header to emit either.
+            return
+        writer = csv.DictWriter(
+            stream, fieldnames=list(contents[0].keys()),
+            dialect=csv.unix_dialect)
+        writer.writeheader()
+        writer.writerows(contents)
+
+    if not to_output_file:
+        return __write__(sys.stdout)
+
+    output_file = input_file.parent.joinpath(
+        f"{input_file.stem}__results.csv")
+    # newline="" leaves line endings to the csv writer, as the csv module
+    # documentation requires for file objects.
+    with open(output_file, "w", encoding="utf-8", newline="") as out_file:
+        return __write__(out_file)
+
+def output_stream(input_file=None, to_output_file=False):
+    """Yield the stream that results should be written to.
+
+    This is a generator that yields exactly one writable text stream, so
+    it can be wrapped with ``contextlib.contextmanager``.
+
+    Args:
+        input_file: ``pathlib.Path`` used to derive the output path
+            ``<parent>/<stem>.csv``. Ignored when writing to stdout.
+        to_output_file: when falsy (the default), or when no ``input_file``
+            is given, ``sys.stdout`` is yielded. Otherwise the derived file
+            is opened for writing and yielded; it is closed when the
+            generator is resumed or closed.
+    """
+    if not to_output_file or input_file is None:
+        # Yield rather than return: a value returned from a generator is
+        # never handed to the consumer.
+        yield sys.stdout
+        return
+
+    output_file = input_file.parent.joinpath(
+        f"{input_file.stem}.csv")
+    # Opened for writing; newline="" leaves line endings to a csv writer.
+    with open(output_file, "w", encoding="utf-8", newline="") as out_file:
+        yield out_file
+
+def run():
+    """Command-line entry point: convert a saved GN1 results page to CSV.
+
+    Expects exactly two command-line arguments after the script name:
+
+    * the input HTML file, and
+    * a flag: ``Y`` (in either case) writes ``<stem>__results.csv`` next to
+      the input file; any other value writes the CSV to stdout.
+
+    With any other number of arguments, a usage message is printed to
+    stderr and the process exits with status 1.
+    """
+    if len(sys.argv) != 3:
+        # Usage errors belong on stderr so they never mix with CSV output.
+        print(
+            f"Usage: python3 {pathlib.Path(__file__).name} "
+            "<input-file> <to-output-file: [Y/n]>",
+            file=sys.stderr)
+        sys.exit(1)
+
+    _this_file, input_file, to_output_file = sys.argv
+    input_file = pathlib.Path(input_file).absolute()
+    # Accept "y" as well as "Y", matching the [Y/n] hint in the usage text.
+    write_to_file = to_output_file.strip().upper() == "Y"
+    thread(
+        input_file,
+        parse_file,
+        lambda tree: tree.xpath("//table"),
+        results_table,
+        table_contents,
+        to_dicts,
+        partial(write_csv, input_file, write_to_file)
+    )
+
+# Run the parser only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    run()
author	Frederick Muriuki Muriithi	2022-11-25 13:29:06 +0300
committer	Frederick Muriuki Muriithi	2022-11-25 13:38:12 +0300
commit	e33a3a78d33f409ae20c8486d0bd2453f189e2b1 (patch)
tree	7df5d7db3b05ad7ef5a2be9078f08068a85e56e8 /wqflask/scripts
parent	d318b1d649a3ece1a5a6c24bcd3496e14dbc69cd (diff)
download	genenetwork2-e33a3a78d33f409ae20c8486d0bd2453f189e2b1.tar.gz