diff options
author | Frederick Muriuki Muriithi | 2022-12-05 12:17:59 +0300 |
---|---|---|
committer | Frederick Muriuki Muriithi | 2022-12-05 12:17:59 +0300 |
commit | 47f032329f967440f44889ee856f1a8510fb773c (patch) | |
tree | 212686d805cb4a3cd4b1fd767f5af92cb38aae46 /wqflask/scripts | |
parent | 1a651da72faf987f7e3d0c7ba4c60cafe850c7a0 (diff) | |
download | genenetwork2-47f032329f967440f44889ee856f1a8510fb773c.tar.gz |
scripts: add GN2 results parsing to parser script
* wqflask/scripts/parse_corr_gn1_results_to_csv.py: Rename script
* wqflask/scripts/parse_corr_html_results_to_csv.py:
* Use argparse to parse CLI arguments
* Add parsing for GN2 results
Diffstat (limited to 'wqflask/scripts')
-rw-r--r-- | wqflask/scripts/parse_corr_gn1_results_to_csv.py | 86 | ||||
-rw-r--r-- | wqflask/scripts/parse_corr_html_results_to_csv.py | 172 |
2 files changed, 172 insertions, 86 deletions
"""Parse correlation results from GN1/GN2 HTML pages into CSV.

Post-commit content of wqflask/scripts/parse_corr_html_results_to_csv.py
(this commit deletes parse_corr_gn1_results_to_csv.py and re-creates it,
extended, under this name).  CLI arguments are handled with argparse
subcommands ("gn1" and "gn2") instead of raw sys.argv, and GN2 result
pages -- which embed their data as JSON in a `var tableJson` script
variable -- are supported alongside the GN1 HTML-table format.
"""
import csv
import sys
import json
from pathlib import Path
from functools import reduce, partial
from typing import Any, Union, Sequence, Optional
from argparse import Action, Namespace, ArgumentError, ArgumentParser

from lxml import etree


def thread(value, *functions):
    """Thread `value` through `functions` left to right (Clojure-style ->)."""
    return reduce(lambda result, func: func(result), functions, value)


def parse_file(filename: Path):
    """Read the HTML file at `filename` and return its parsed element tree."""
    with open(filename, encoding="utf-8") as inpfl:
        return etree.HTML(inpfl.read())


def first_row_headers(table) -> tuple:
    """Return the joined text of each cell in the first body row of `table`."""
    return tuple(
        " ".join(text.strip() for text in cell.xpath(".//child::text()"))
        for cell in table.xpath("./tbody/tr[1]/td"))


def results_table(tables):
    """Return the first table in `tables` whose leading columns match the
    GN1 correlation-results header.

    Raises IndexError when no table matches.
    """
    found = tuple(
        table for table in tables
        if first_row_headers(table)[0:4] == (
            "Index", "Record ID", "Symbol", "Description"))
    return found[0]


def table_contents(table) -> tuple:
    """Return the joined text of every cell, row by row, of `table`'s body."""
    return tuple(
        tuple(" ".join(text.strip() for text in cell.xpath(".//child::text()"))
              for cell in row)
        for row in table.xpath("./tbody/tr"))


def to_dicts(contents: Sequence[Sequence[str]]) -> tuple:
    """Zip each data row against the header (first) row into a dict.

    Returns an empty tuple for empty input instead of raising IndexError.
    """
    if not contents:
        return tuple()
    header = contents[0]
    return tuple(dict(zip(header, row)) for row in contents[1:])


def write_csv(
        input_file: Path, output_dir: Union[bool, Path],
        contents: Sequence[dict]) -> None:
    """Write `contents` (a sequence of same-keyed dicts) as CSV.

    Output goes to `<output_dir>/<input stem>__results.csv` when
    `output_dir` is a directory, or to stdout when it is falsy (the
    argparse default of `False`).

    Raises ValueError when there is nothing to write (e.g. parsing found
    no results), instead of the obscure TypeError/IndexError a bare
    `contents[0]` access would produce.
    """
    if not contents:
        raise ValueError(f"No results were parsed from '{input_file}'.")

    def __write__(stream):
        # fieldnames come from the first row; all rows share the same keys.
        writer = csv.DictWriter(
            stream, fieldnames=list(contents[0].keys()),
            dialect=csv.unix_dialect)
        writer.writeheader()
        writer.writerows(contents)

    if not bool(output_dir):
        return __write__(sys.stdout)

    output_file = output_dir.joinpath(f"{input_file.stem}__results.csv")
    with open(output_file, "w", encoding="utf-8") as out_file:
        return __write__(out_file)


# NOTE(review): the vestigial `output_stream()` helper has been removed.  It
# referenced the undefined globals `to_output_file`/`input_file` (so any call
# could only raise NameError) and opened its output file without write mode;
# it was dead code carried over from the deleted GN1-only script.


class FileCheck(Action):
    """argparse Action that verifies the given path is an existing file and
    stores it on the namespace as an absolute, resolved `Path`."""

    def __call__(# pylint: disable=[signature-differs]
            self, parser: ArgumentParser, namespace: Namespace,
            values: Union[str, Sequence[Any], None],
            option_string: Optional[str] = "") -> None:
        """Check existence of a given file path and set it, or raise an
        exception."""
        the_file = Path(str(values or "")).absolute().resolve()
        if not the_file.is_file():
            raise ArgumentError(
                self,
                f"The file '{values}' does not exist or is a folder/directory.")
        setattr(namespace, self.dest, the_file)


class DirectoryCheck(Action):
    """argparse Action that verifies the given path is an existing directory
    and stores it on the namespace as an absolute, resolved `Path`."""

    def __call__(
            self, parser: ArgumentParser, namespace: Namespace,
            values: Union[str, Sequence[Any], None],
            option_string: Optional[str] = "") -> None:
        """Check existence of a given directory and set it, or raise an
        exception."""
        the_dir = Path(str(values or "")).absolute().resolve()
        if not the_dir.is_dir():
            raise ArgumentError(
                self, f"The directory '{the_dir}' does not exist!")
        setattr(namespace, self.dest, the_dir)


def _add_common_arguments(parser: ArgumentParser) -> None:
    """Attach the input-file/output-dir arguments shared by both subcommands."""
    parser.add_argument(
        "inputfile", help="The HTML file to parse", action=FileCheck)
    parser.add_argument(
        "--outputdir", help="Path to output directory", action=DirectoryCheck,
        default=False)


def gn1_parser(subparsers) -> None:
    """Register the `gn1` subcommand: parse GN1 HTML-table results."""
    parser = subparsers.add_parser("gn1")
    _add_common_arguments(parser)
    parser.set_defaults(
        func=lambda args: thread(
            args.inputfile,
            parse_file,
            lambda tree: tree.xpath("//table"),
            results_table,
            table_contents,
            to_dicts,
            partial(write_csv, args.inputfile, args.outputdir)))


def tablejson_script(scripts):
    """Return the JSON embedded as `var tableJson = ...` in the first of
    `scripts` that defines it, or None when no script does."""
    for script in scripts:
        content = "".join(script.xpath(".//child::text()")).strip()
        if "var tableJson" in content:
            # The results JSON follows the assignment prefix verbatim.
            return json.loads(content[len("var tableJson = "):].strip())
    return None


def gn2_parser(subparsers) -> None:
    """Register the `gn2` subcommand: parse GN2 results embedded as JSON."""
    parser = subparsers.add_parser("gn2")
    _add_common_arguments(parser)
    parser.set_defaults(
        func=lambda args: thread(
            args.inputfile,
            parse_file,
            lambda tree: tree.xpath("//script"),
            tablejson_script,
            partial(write_csv, args.inputfile, args.outputdir)))


def parse_cli_args() -> Namespace:
    """Build the CLI parser with the gn1/gn2 subcommands and parse argv."""
    parser = ArgumentParser(
        "parse_corr_html_results_to_csv",
        description="Parse correlation results from the given HTML file.")
    subparsers = parser.add_subparsers(
        title="subcommands", description="Valid subcommands",
        help="additional help")
    gn1_parser(subparsers)
    gn2_parser(subparsers)

    return parser.parse_args()


def run() -> None:
    """Entry point: dispatch to the selected subcommand's handler."""
    args = parse_cli_args()
    if not hasattr(args, "func"):
        # No subcommand supplied: argparse does not make subcommands
        # mandatory by default, so fail with usage info rather than an
        # AttributeError on `args.func`.
        print("usage: parse_corr_html_results_to_csv {gn1,gn2} ...",
              file=sys.stderr)
        sys.exit(1)
    args.func(args)


if __name__ == "__main__":
    run()