aboutsummaryrefslogtreecommitdiff
path: root/wqflask
diff options
context:
space:
mode:
author Frederick Muriuki Muriithi 2022-11-25 13:29:06 +0300
committer Frederick Muriuki Muriithi 2022-11-25 13:38:12 +0300
commit e33a3a78d33f409ae20c8486d0bd2453f189e2b1 (patch)
tree 7df5d7db3b05ad7ef5a2be9078f08068a85e56e8 /wqflask
parent d318b1d649a3ece1a5a6c24bcd3496e14dbc69cd (diff)
download genenetwork2-e33a3a78d33f409ae20c8486d0bd2453f189e2b1.tar.gz
mechanical-rob: Parser for GN1 results and some sample results
* test/requests/correlation_results_text_files/*results.csv: CSV files with sample results from GN1 for the trait `1435464_at` in dataset `HC_M2_0606_P`.
* wqflask/scripts/parse_corr_gn1_results_to_csv.py: parser for results from GN1 when saved to an HTML file.
Diffstat (limited to 'wqflask')
-rw-r--r-- wqflask/scripts/parse_corr_gn1_results_to_csv.py 86
1 files changed, 86 insertions, 0 deletions
diff --git a/wqflask/scripts/parse_corr_gn1_results_to_csv.py b/wqflask/scripts/parse_corr_gn1_results_to_csv.py
new file mode 100644
index 00000000..b04a0378
--- /dev/null
+++ b/wqflask/scripts/parse_corr_gn1_results_to_csv.py
@@ -0,0 +1,86 @@
+import csv
+import sys
+from functools import reduce, partial
+import pathlib
+
+from lxml import etree
+
def thread(value, *functions):
    """Pass ``value`` through ``functions`` one after another.

    The first function receives ``value``; every later function receives
    the result of the one before it. The result of the last function is
    returned. With no functions, ``value`` is returned unchanged.
    """
    result = value
    for func in functions:
        result = func(result)
    return result
+
def parse_file(filename: pathlib.Path):
    """Read ``filename`` as UTF-8 text and parse it with ``etree.HTML``.

    Returns whatever ``etree.HTML`` produces for the file's contents.
    """
    with open(filename, encoding="utf-8") as source:
        markup = source.read()

    document = etree.HTML(markup)
    return document
+
def first_row_headers(table):
    """Return the text of each ``td`` cell in the first row of ``table``.

    For every cell matched by ``./tbody/tr[1]/td`` the text nodes beneath
    it are stripped and joined with single spaces. The result is a tuple
    with one string per cell.
    """
    headers = []
    for cell in table.xpath("./tbody/tr[1]/td"):
        fragments = [text.strip() for text in cell.xpath(".//child::text()")]
        headers.append(" ".join(fragments))
    return tuple(headers)
+
+
def results_table(tables):
    """Return the first table in ``tables`` that looks like the results table.

    A table qualifies when the first four cells of its first row, as
    reported by ``first_row_headers``, read "Index", "Record ID", "Symbol"
    and "Description". Scanning stops at the first match.

    Args:
        tables: an iterable of table elements accepted by
            ``first_row_headers``.

    Raises:
        IndexError: if no table in ``tables`` has the expected first row.
            The exception type matches what an empty lookup raised before,
            but it now carries a message that says what was being sought.
    """
    expected = ("Index", "Record ID", "Symbol", "Description")
    for table in tables:
        if first_row_headers(table)[0:4] == expected:
            return table

    raise IndexError(
        f"No table whose first row starts with {expected!r} was found.")
+
def table_contents(table):
    """Return the text of every cell of ``table`` as a tuple of row tuples.

    Rows are the elements matched by ``./tbody/tr``. Each child of a row
    contributes one string: the text nodes beneath it, stripped and joined
    with single spaces.
    """
    rows = []
    for row in table.xpath("./tbody/tr"):
        cells = []
        for cell in row:
            fragments = [
                text.strip() for text in cell.xpath(".//child::text()")]
            cells.append(" ".join(fragments))
        rows.append(tuple(cells))
    return tuple(rows)
+
+
def to_dicts(contents):
    """Convert table rows into dicts keyed by the first (header) row.

    Args:
        contents: an iterable of rows; the first row supplies the keys and
            every following row supplies the values.

    Returns:
        A tuple with one dict per data row. Empty input, or input holding
        only the header row, gives an empty tuple.

    Note:
        ``zip`` stops at the shorter of header and row, so surplus cells on
        either side are dropped. Repeated header names collapse into one
        key that keeps the last value.
    """
    # Materialise first so that iterators and generators are accepted too.
    rows = tuple(contents)
    if not rows:
        return ()

    header, data_rows = rows[0], rows[1:]
    return tuple(dict(zip(header, row)) for row in data_rows)
+
def write_csv(input_file, to_output_file, contents):
    """Write ``contents`` as CSV to standard output or next to ``input_file``.

    Args:
        input_file: ``pathlib.Path`` of the parsed file. It is only used to
            derive the output location: ``<stem>__results.csv`` in the same
            directory.
        to_output_file: when truthy, write to that file; otherwise write to
            ``sys.stdout``.
        contents: a sequence of dicts. The keys of the first dict become
            the CSV columns, in their existing order.

    Returns:
        None. When ``contents`` is empty nothing is written and no file is
        created, because there is no row to take the column names from.
    """
    if not contents:
        return None

    def _write(stream):
        writer = csv.DictWriter(
            stream, fieldnames=list(contents[0]),
            dialect=csv.unix_dialect)
        writer.writeheader()
        writer.writerows(contents)

    if not to_output_file:
        _write(sys.stdout)
        return None

    output_file = input_file.parent.joinpath(
        f"{input_file.stem}__results.csv")
    # newline="" lets the csv writer control line endings itself, so the
    # dialect's "\n" terminator is not translated by the text layer.
    with open(output_file, "w", encoding="utf-8", newline="") as out_file:
        _write(out_file)
    return None
+
def output_stream(input_file=None, to_output_file=False):
    """Return a context manager that yields the stream to write results to.

    The previous version read ``input_file`` and ``to_output_file`` without
    receiving them, mixed ``return`` with ``yield`` and opened the output
    file for reading. Both values are now parameters with defaults, so a
    call without arguments still works.

    Args:
        input_file: ``pathlib.Path`` of the parsed file. The output file is
            ``<stem>.csv`` in the same directory.
        to_output_file: when truthy and ``input_file`` is given, the context
            manager is the output file opened for writing; otherwise it
            wraps ``sys.stdout``.

    Returns:
        A context manager. The file variant is closed on exit; the
        ``sys.stdout`` variant leaves the stream open.

    Note:
        ``write_csv`` in this file uses the suffix ``__results.csv``, while
        this function keeps the ``.csv`` suffix it always used.
    """
    # Imported here so the function does not rely on a file-level import.
    import contextlib

    if not to_output_file or input_file is None:
        return contextlib.nullcontext(sys.stdout)

    output_file = input_file.parent.joinpath(
        f"{input_file.stem}.csv")
    return open(output_file, "w", encoding="utf-8", newline="")
+
def run():
    """Parse the HTML file named on the command line and emit its results as CSV.

    Expects exactly two arguments after the script name: the input file and
    a flag saying whether to write to a file. ``Y``, ``y`` and ``yes`` (in
    any case) select the file; anything else sends the CSV to standard
    output.

    With the wrong number of arguments, a usage message is printed to
    standard error and the process exits with status 1.
    """
    if len(sys.argv) != 3:
        script = (
            sys.argv[0] if sys.argv else "parse_corr_gn1_results_to_csv.py")
        print(
            f"Usage: python3 {script} <input-file> <to-output-file: [Y/n]>",
            file=sys.stderr)
        sys.exit(1)

    _this_file, input_file, to_output_file = sys.argv
    input_file = pathlib.Path(input_file).absolute()
    # The usage text advertises "Y/n", so the flag is matched regardless
    # of case.
    write_to_file = to_output_file.strip().lower() in ("y", "yes")
    thread(
        input_file,
        parse_file,
        lambda tree: tree.xpath("//table"),
        results_table,
        table_contents,
        to_dicts,
        partial(write_csv, input_file, write_to_file)
    )


if __name__ == "__main__":
    run()