diff options
Diffstat (limited to 'r_qtl')
-rw-r--r-- | r_qtl/exceptions.py (renamed from r_qtl/errors.py) | 2 | ||||
-rw-r--r-- | r_qtl/fileerrors.py | 9 | ||||
-rw-r--r-- | r_qtl/r_qtl2.py | 228 | ||||
-rw-r--r-- | r_qtl/r_qtl2_qc.py | 62 |
4 files changed, 273 insertions, 28 deletions
diff --git a/r_qtl/errors.py b/r_qtl/exceptions.py index 417eb58..9620cf4 100644 --- a/r_qtl/errors.py +++ b/r_qtl/exceptions.py @@ -6,7 +6,7 @@ class RQTLError(Exception): class InvalidFormat(RQTLError): """Raised when the format of the file(s) is invalid.""" -class MissingFileError(InvalidFormat): +class MissingFileException(InvalidFormat): """ Raise when at least one file listed in the control file is missing from the R/qtl2 bundle. diff --git a/r_qtl/fileerrors.py b/r_qtl/fileerrors.py index e76676c..c253d71 100644 --- a/r_qtl/fileerrors.py +++ b/r_qtl/fileerrors.py @@ -1,5 +1,14 @@ """QC errors as distinguished from actual exceptions""" from collections import namedtuple +InvalidValue = namedtuple( + "InvalidValue", + ("filename", + "rowtitle", + "coltitle", + "cellvalue", + "message")) + + MissingFile = namedtuple( "MissingFile", ("controlfilekey", "filename", "message")) diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index 0a96e7c..0ef487f 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -1,21 +1,26 @@ """The R/qtl2 parsing and processing code.""" import io +import os import csv import json from pathlib import Path -from zipfile import ZipFile from functools import reduce, partial +from zipfile import ZipFile, is_zipfile from typing import Union, Iterator, Iterable, Callable, Optional import yaml from functional_tools import take, chain -from r_qtl.errors import InvalidFormat, MissingFileError +from r_qtl.exceptions import InvalidFormat, MissingFileException FILE_TYPES = ( "geno", "founder_geno", "pheno", "covar", "phenocovar", "gmap", "pmap", - "phenose") + "phenose", "phenonum") + +__CONTROL_FILE_ERROR_MESSAGE__ = ( + "The zipped bundle that was provided does not contain a valid control file " + "in either JSON or YAML format.") def __special_file__(filename): @@ -30,7 +35,81 @@ def __special_file__(filename): return (is_macosx_special_file or is_nix_hidden_file) -def control_data(zfile: ZipFile) -> dict: +def extract(zfile: ZipFile, outputdir: Path) 
-> tuple[Path, ...]: + """Extract a ZipFile + + This function will extract a zipfile `zfile` to the directory `outputdir`. + + Parameters + ---------- + zfile: zipfile.ZipFile object - the zipfile to extract. + outputdir: pathlib.Path object - where the extracted files go. + + Returns + ------- + A tuple of Path objects, each pointing to a member in the zipfile. + """ + outputdir.mkdir(parents=True, exist_ok=True) + return tuple(Path(zfile.extract(member, outputdir)) + for member in zfile.namelist() + if not __special_file__(member)) + + +def transpose_csv( + inpath: Path, + linesplitterfn: Callable, + linejoinerfn: Callable, + outpath: Path) -> Path: + """Transpose a file: Make its rows into columns and its columns into rows. + + This function will create a new file, `outpath`, with the same content as + the original, `inpath`, except transposed, i.e. the rows of `inpath` are the + columns of `outpath` and the columns of `inpath` are the rows of `outpath`. + + Parameters + ---------- + inpath: The CSV file to transpose. + linesplitterfn: A function to use for splitting each line into columns + linejoinerfn: A function to use to rebuild the lines + outpath: The path where the transposed data is stored + """ + def __read_by_line__(_path): + with open(_path, "r", encoding="utf8") as infile: + for line in infile: + if line.startswith("#"): + continue + yield line + + transposed_data= (f"{linejoinerfn(items)}\n" for items in zip(*( + linesplitterfn(line) for line in __read_by_line__(inpath)))) + + with open(outpath, "w", encoding="utf8") as outfile: + for line in transposed_data: + outfile.write(line) + + return outpath + + +def transpose_csv_with_rename(inpath: Path, + linesplitterfn: Callable, + linejoinerfn: Callable) -> Path: + """Renames input file and creates new transposed file with the original name + of the input file. + + Parameters + ---------- + inpath: Path to the input file. Should be a pathlib.Path object. 
+ linesplitterfn: A function to use for splitting each line into columns + linejoinerfn: A function to use to rebuild the lines + """ + transposedfilepath = Path(inpath) + origbkp = inpath.parent.joinpath(f"{inpath.stem}___original{inpath.suffix}") + os.rename(inpath, origbkp) + return transpose_csv( + origbkp, linesplitterfn, linejoinerfn, transposedfilepath) + + +def __control_data_from_zipfile__(zfile: ZipFile) -> dict: """Retrieve the control file from the zip file info.""" files = tuple(filename for filename in zfile.namelist() @@ -39,7 +118,7 @@ def control_data(zfile: ZipFile) -> dict: or filename.endswith(".json")))) num_files = len(files) if num_files == 0: - raise InvalidFormat("Expected a json or yaml control file.") + raise InvalidFormat(__CONTROL_FILE_ERROR_MESSAGE__) if num_files > 1: raise InvalidFormat("Found more than one possible control file.") @@ -56,6 +135,88 @@ def control_data(zfile: ZipFile) -> dict: else yaml.safe_load(zfile.read(files[0]))) } +def __control_data_from_dirpath__(dirpath: Path): + """Load control data from a given directory path.""" + files = tuple(path for path in dirpath.iterdir() + if (not __special_file__(path.name) + and (path.suffix in (".yaml", ".json")))) + num_files = len(files) + if num_files == 0: + raise InvalidFormat(__CONTROL_FILE_ERROR_MESSAGE__) + + if num_files > 1: + raise InvalidFormat("Found more than one possible control file.") + + with open(files[0], "r", encoding="utf8") as infile: + return { + "na.strings": ["NA"], + "comment.char": "#", + "sep": ",", + **{ + f"{key}_transposed": False for key in FILE_TYPES + }, + **(json.loads(infile.read()) + if files[0].suffix == ".json" + else yaml.safe_load(infile.read())) + } + + +def control_data(control_src: Union[Path, ZipFile]) -> dict: + """Read the R/qtl2 bundle control file. + + Parameters + ---------- + control_src: Path object or ZipFile object. 
+ If a directory path is provided, this function will read the control + data from the control file in that directory. + It is important that the Path be a directory and contain data from one + and only one R/qtl2 bundle. + + If a ZipFile object is provided, then the control data is read from the + control file within the zip file. We are moving away from parsing data + directly from ZipFile objects, and this is retained only until the + transition to using extracted files is complete. + + Returns + ------- + Returns a dict object with the control data that determines what the files + in the bundle are and how to parse them. + + Raises + ------ + r_qtl.exceptions.InvalidFormat + """ + def __cleanup__(cdata): + _cdata = { + **cdata, + **dict((filetype, + ([cdata[filetype]] if isinstance(cdata[filetype], str) + else cdata[filetype]) + ) for filetype in + (typ for typ in cdata.keys() if typ in FILE_TYPES)) + } + if "na.string" in _cdata:# handle common error in file. + _cdata = { + **cdata, + "na.strings": list(set( + _cdata["na.string"] + _cdata["na.strings"])) + } + + return _cdata + + if isinstance(control_src, ZipFile): + return __cleanup__(__control_data_from_zipfile__(control_src)) + if isinstance(control_src, Path): + if is_zipfile(control_src): + return __cleanup__( + __control_data_from_zipfile__(ZipFile(control_src))) + if control_src.is_dir(): + return __cleanup__(__control_data_from_dirpath__(control_src)) + raise InvalidFormat( + "Expects either a zipped bundle of files or a path-like object " + "pointing to the zipped R/qtl2 bundle.") + + def replace_na_strings(cdata, val): """Replace values indicated in `na.strings` with `None`.""" return (None if val in cdata.get("na.strings", ["NA"]) else val) @@ -250,24 +411,21 @@ def file_data(zfile: ZipFile, try: if isinstance(cdata[member_key], list): - for row in (line for lines in + yield from (line for lines in (file_data( zfile, member_key, {**cdata, member_key: innerfile}, process_value, 
process_transposed_value) for innerfile in cdata[member_key]) - for line in lines): - yield row + for line in lines) return if not cdata.get(f"{member_key}_transposed", False): - for row in with_non_transposed(zfile, member_key, cdata, process_value): - yield row + yield from with_non_transposed(zfile, member_key, cdata, process_value) return - for row in with_transposed( - zfile, member_key, cdata, process_transposed_value): - yield row + yield from with_transposed( + zfile, member_key, cdata, process_transposed_value) except KeyError as exc: - raise MissingFileError(*exc.args) from exc + raise MissingFileException(*exc.args) from exc def cross_information(zfile: ZipFile, cdata: dict) -> Iterator[dict]: """Load cross information where present.""" @@ -316,8 +474,7 @@ def raw_file_data(zipfilepath: Union[str, Path], with (ZipFile(str(zipfilepath), "r") as zfile, zfile.open(memberfilename) as innerfile): wrappedfile = io.TextIOWrapper(innerfile) - for line in wrappedfile: - yield line + yield from wrappedfile def strip_comments(rawdata: Iterator[str], commentchar) -> Iterator[str]: """Remove comments from raw text.""" @@ -401,3 +558,42 @@ def load_samples(zipfilepath: Union[str, Path], pass return tuple(samples) + + + +def read_text_file(filepath: Union[str, Path]) -> Iterator[str]: + """Read the raw text from a text file.""" + with open(filepath, "r", encoding="utf8") as _file: + yield from _file + + +def read_csv_file(filepath: Union[str, Path], + separator: str = ",", + comment_char: str = "#") -> Iterator[tuple[str, ...]]: + """Read a file as a csv file. 
This does not process the N/A values.""" + for line in read_text_file(filepath): + if line.startswith(comment_char): + continue + yield tuple(field.strip() for field in line.split(separator)) + + +def read_csv_file_headers( + filepath: Union[str, Path], + transposed: bool, + separator: str = ",", + comment_char: str = "#" +) -> tuple[str, ...]: + """Read the 'true' headers of a CSV file.""" + headers = tuple() + for line in read_text_file(filepath): + if line.startswith(comment_char): + continue + + line = tuple(field.strip() for field in line.split(separator)) + if not transposed: + return line + + headers = headers + (line[0],) + continue + + return headers diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py index be1eac4..2d9e9a8 100644 --- a/r_qtl/r_qtl2_qc.py +++ b/r_qtl/r_qtl2_qc.py @@ -1,12 +1,14 @@ """Quality control checks for R/qtl2 data bundles.""" -from zipfile import ZipFile +from pathlib import Path from functools import reduce, partial +from zipfile import ZipFile, is_zipfile from typing import Union, Iterator, Optional, Callable -from r_qtl import errors as rqe from r_qtl import r_qtl2 as rqtl2 +from r_qtl import exceptions as rqe from r_qtl.r_qtl2 import FILE_TYPES from r_qtl.fileerrors import MissingFile +from r_qtl.exceptions import InvalidFormat from quality_control.errors import InvalidValue from quality_control.checks import decimal_points_error @@ -39,11 +41,10 @@ def bundle_files_list(cdata: dict) -> tuple[tuple[str, str], ...]: return fileslist -def missing_files(zfile: ZipFile) -> tuple[tuple[str, str], ...]: - """ - Retrieve a list of files listed in the control file that do not exist in the - bundle. 
- """ + +def __missing_from_zipfile__( + zfile: ZipFile, cdata: dict) -> tuple[tuple[str, str], ...]: + """Check for missing files from a still-compressed zip file.""" def __missing_p__(filedetails: tuple[str, str]): _cfkey, thefile = filedetails try: @@ -52,14 +53,53 @@ def missing_files(zfile: ZipFile) -> tuple[tuple[str, str], ...]: except KeyError: return True - return tuple(afile for afile in bundle_files_list(rqtl2.control_data(zfile)) + return tuple(afile for afile in bundle_files_list(cdata) if __missing_p__(afile)) -def validate_bundle(zfile: ZipFile): + +def __missing_from_dirpath__( + dirpath: Path, cdata: dict) -> tuple[tuple[str, str], ...]: + """Check for missing files from an extracted bundle.""" + allfiles = tuple(_file.name for _file in dirpath.iterdir()) + return tuple(afile for afile in bundle_files_list(cdata) + if afile[1] not in allfiles) + + +def missing_files(bundlesrc: Union[Path, ZipFile]) -> tuple[tuple[str, str], ...]: + """ + Retrieve a list of files listed in the control file that do not exist in the + bundle. + + Parameters + ---------- + bundlesrc: Path object or ZipFile object: This is the bundle under check. + + Returns + ------- + A tuple of names listed in the control file that do not exist in the bundle. 
+ + Raises + ------ + r_qtl.exceptions.InvalidFormat + """ + cdata = rqtl2.control_data(bundlesrc) + if isinstance(bundlesrc, ZipFile): + return __missing_from_zipfile__(bundlesrc, cdata) + if isinstance(bundlesrc, Path): + if is_zipfile(bundlesrc): + return __missing_from_zipfile__(ZipFile(bundlesrc), cdata) + if bundlesrc.is_dir(): + return __missing_from_dirpath__(bundlesrc, cdata) + raise InvalidFormat( + "Expects either a zipfile.ZipFile object or a pathlib.Path object " + "pointing to a directory containing the R/qtl2 bundle.") + + +def validate_bundle(zfile: Union[Path, ZipFile]): """Ensure the R/qtl2 bundle is valid.""" missing = missing_files(zfile) if len(missing) > 0: - raise rqe.MissingFileError( + raise rqe.MissingFileException( "The following files do not exist in the bundle: " + ", ".join(mfile[1] for mfile in missing)) @@ -111,6 +151,6 @@ def retrieve_errors(zfile: ZipFile, filetype: str, checkers: tuple[Callable]) -> if value is not None: for checker in checkers: yield checker(lineno=lineno, field=field, value=value) - except rqe.MissingFileError: + except rqe.MissingFileException: fname = cdata.get(filetype) yield MissingFile(filetype, fname, f"Missing '{filetype}' file '{fname}'.") |