diff options
Diffstat (limited to 'r_qtl')
-rw-r--r-- | r_qtl/exceptions.py (renamed from r_qtl/errors.py) | 2 | ||||
-rw-r--r-- | r_qtl/fileerrors.py | 9 | ||||
-rw-r--r-- | r_qtl/r_qtl2.py | 173 | ||||
-rw-r--r-- | r_qtl/r_qtl2_qc.py | 60 |
4 files changed, 228 insertions, 16 deletions
diff --git a/r_qtl/errors.py b/r_qtl/exceptions.py index 417eb58..9620cf4 100644 --- a/r_qtl/errors.py +++ b/r_qtl/exceptions.py @@ -6,7 +6,7 @@ class RQTLError(Exception): class InvalidFormat(RQTLError): """Raised when the format of the file(s) is invalid.""" -class MissingFileError(InvalidFormat): +class MissingFileException(InvalidFormat): """ Raise when at least one file listed in the control file is missing from the R/qtl2 bundle. diff --git a/r_qtl/fileerrors.py b/r_qtl/fileerrors.py index e76676c..c253d71 100644 --- a/r_qtl/fileerrors.py +++ b/r_qtl/fileerrors.py @@ -1,5 +1,14 @@ """QC errors as distinguished from actual exceptions""" from collections import namedtuple +InvalidValue = namedtuple( + "InvalidValue", + ("filename", + "rowtitle", + "coltitle", + "cellvalue", + "message")) + + MissingFile = namedtuple( "MissingFile", ("controlfilekey", "filename", "message")) diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py index 87491d0..9da4081 100644 --- a/r_qtl/r_qtl2.py +++ b/r_qtl/r_qtl2.py @@ -1,27 +1,115 @@ """The R/qtl2 parsing and processing code.""" import io +import os import csv import json from pathlib import Path -from zipfile import ZipFile from functools import reduce, partial +from zipfile import ZipFile, is_zipfile from typing import Union, Iterator, Iterable, Callable, Optional import yaml from functional_tools import take, chain -from r_qtl.errors import InvalidFormat, MissingFileError +from r_qtl.exceptions import InvalidFormat, MissingFileException FILE_TYPES = ( "geno", "founder_geno", "pheno", "covar", "phenocovar", "gmap", "pmap", "phenose") -def control_data(zfile: ZipFile) -> dict: + +def __special_file__(filename): + """ + Check whether the file is special in some ways, e.g. MacOSX seems to include + files in a directory `__MACOSX` that share parts of the name, and extensions + with the main files in the bundle. 
+ """ + is_macosx_special_file = filename.startswith("__MACOSX") + is_nix_hidden_file = Path(filename).name.startswith(".") + + return (is_macosx_special_file or is_nix_hidden_file) + + +def extract(zfile: ZipFile, outputdir: Path) -> tuple[Path, ...]: + """Extract a ZipFile + + This function will extract a zipfile `zfile` to the directory `outputdir`. + + Parameters + ---------- + zfile: zipfile.ZipFile object - the zipfile to extract. + outputdir: pathlib.Path object - where the extracted files go. + + Returns + ------- + A tuple of Path objects, each pointing to a member in the zipfile. + """ + outputdir.mkdir(parents=True, exist_ok=True) + return tuple(Path(zfile.extract(member, outputdir)) + for member in zfile.namelist() + if not __special_file__(member)) + + +def transpose_csv( + inpath: Path, + linesplitterfn: Callable, + linejoinerfn: Callable, + outpath: Path) -> Path: + """Transpose a file: Make its rows into columns and its columns into rows. + + This function will create a new file, `outpath`, with the same content as + the original, `inpath`, except transposed, i.e. the rows of `inpath` are the + columns of `outpath` and the columns of `inpath` are the rows of `outpath`. + + Parameters + ---------- + inpath: The CSV file to transpose. 
+ linesplitterfn: A function to use for splitting each line into columns + linejoinerfn: A function to use to rebuild the lines + outpath: The path where the transposed data is stored + """ + def __read_by_line__(_path): + with open(_path, "r", encoding="utf8") as infile: + for line in infile: + yield line + + transposed_data= (f"{linejoinerfn(items)}\n" for items in zip(*( + linesplitterfn(line) for line in __read_by_line__(inpath)))) + + with open(outpath, "w", encoding="utf8") as outfile: + for line in transposed_data: + outfile.write(line) + + return outpath + + +def transpose_csv_with_rename(inpath: Path, + linesplitterfn: Callable, + linejoinerfn: Callable) -> Path: + """Renames input file and creates new transposed file with the original name + of the input file. + + Parameters + ---------- + inpath: Path to the input file. Should be a pathlib.Path object. + linesplitterfn: A function to use for splitting each line into columns + linejoinerfn: A function to use to rebuild the lines + """ + transposedfilepath = Path(inpath) + origbkp = inpath.parent.joinpath(f"{inpath.stem}___original{inpath.suffix}") + os.rename(inpath, origbkp) + return transpose_csv( + origbkp, linesplitterfn, linejoinerfn, transposedfilepath) + + +def __control_data_from_zipfile__(zfile: ZipFile) -> dict: """Retrieve the control file from the zip file info.""" files = tuple(filename for filename in zfile.namelist() - if (filename.endswith(".yaml") or filename.endswith(".json"))) + if (not __special_file__(filename) + and (filename.endswith(".yaml") + or filename.endswith(".json")))) num_files = len(files) if num_files == 0: raise InvalidFormat("Expected a json or yaml control file.") @@ -41,6 +129,81 @@ def control_data(zfile: ZipFile) -> dict: else yaml.safe_load(zfile.read(files[0]))) } + +def __control_data_from_dirpath__(dirpath: Path): + """Load control data from a given directory path.""" + files = tuple(path for path in dirpath.iterdir() + if (not __special_file__(path.name) + and 
(path.suffix in (".yaml", ".json")))) + num_files = len(files) + if num_files == 0: + raise InvalidFormat("Expected a json or yaml control file.") + + if num_files > 1: + raise InvalidFormat("Found more than one possible control file.") + + with open(files[0], "r", encoding="utf8") as infile: + return { + "na.strings": ["NA"], + "comment.char": "#", + "sep": ",", + **{ + f"{key}_transposed": False for key in FILE_TYPES + }, + **(json.loads(infile.read()) + if files[0].suffix == ".json" + else yaml.safe_load(infile.read())) + } + + +def control_data(control_src: Union[Path, ZipFile]) -> dict: + """Read the R/qtl2 bundle control file. + + Parameters + ---------- + control_src: Path object or ZipFile object. + If a directory path is provided, this function will read the control + data from the control file in that directory. + It is important that the Path be a directory and contain data from one + and only one R/qtl2 bundle. + + If a ZipFile object is provided, then the control data is read from the + control file within the zip file. We are moving away from parsing data + directly from ZipFile objects, and this is retained only until the + transition to using extracted files is complete. + + Returns + ------- + Returns a dict object with the control data that determines what the files + in the bundle are and how to parse them. 
+ + Raises + ------ + r_qtl.exceptions.InvalidFormat + """ + def __cleanup__(cdata): + return { + **cdata, + **dict((filetype, + ([cdata[filetype]] if isinstance(cdata[filetype], str) + else cdata[filetype]) + ) for filetype in + (typ for typ in cdata.keys() if typ in FILE_TYPES)) + } + + if isinstance(control_src, ZipFile): + return __cleanup__(__control_data_from_zipfile__(control_src)) + if isinstance(control_src, Path): + if is_zipfile(control_src): + return __cleanup__( + __control_data_from_zipfile__(ZipFile(control_src))) + if control_src.is_dir(): + return __cleanup__(__control_data_from_dirpath__(control_src)) + raise InvalidFormat( + "Expects either a zipfile.ZipFile object or a pathlib.Path object " + "pointing to a directory containing the R/qtl2 bundle.") + + def replace_na_strings(cdata, val): """Replace values indicated in `na.strings` with `None`.""" return (None if val in cdata.get("na.strings", ["NA"]) else val) @@ -252,7 +415,7 @@ def file_data(zfile: ZipFile, zfile, member_key, cdata, process_transposed_value): yield row except KeyError as exc: - raise MissingFileError(*exc.args) from exc + raise MissingFileException(*exc.args) from exc def cross_information(zfile: ZipFile, cdata: dict) -> Iterator[dict]: """Load cross information where present.""" diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py index be1eac4..7b26b50 100644 --- a/r_qtl/r_qtl2_qc.py +++ b/r_qtl/r_qtl2_qc.py @@ -1,12 +1,14 @@ """Quality control checks for R/qtl2 data bundles.""" -from zipfile import ZipFile +from pathlib import Path from functools import reduce, partial +from zipfile import ZipFile, is_zipfile from typing import Union, Iterator, Optional, Callable -from r_qtl import errors as rqe from r_qtl import r_qtl2 as rqtl2 +from r_qtl import exceptions as rqe from r_qtl.r_qtl2 import FILE_TYPES from r_qtl.fileerrors import MissingFile +from r_qtl.exceptions import InvalidFormat from quality_control.errors import InvalidValue from quality_control.checks import 
decimal_points_error @@ -39,11 +41,10 @@ def bundle_files_list(cdata: dict) -> tuple[tuple[str, str], ...]: return fileslist -def missing_files(zfile: ZipFile) -> tuple[tuple[str, str], ...]: - """ - Retrieve a list of files listed in the control file that do not exist in the - bundle. - """ + +def __missing_from_zipfile__( + zfile: ZipFile, cdata: dict) -> tuple[tuple[str, str], ...]: + """Check for missing files from a still-compressed zip file.""" def __missing_p__(filedetails: tuple[str, str]): _cfkey, thefile = filedetails try: @@ -52,14 +53,53 @@ def missing_files(zfile: ZipFile) -> tuple[tuple[str, str], ...]: except KeyError: return True - return tuple(afile for afile in bundle_files_list(rqtl2.control_data(zfile)) + return tuple(afile for afile in bundle_files_list(cdata) if __missing_p__(afile)) + +def __missing_from_dirpath__( + dirpath: Path, cdata: dict) -> tuple[tuple[str, str], ...]: + """Check for missing files from an extracted bundle.""" + allfiles = tuple(_file.name for _file in dirpath.iterdir()) + return tuple(afile for afile in bundle_files_list(cdata) + if afile[1] not in allfiles) + + +def missing_files(bundlesrc: Union[Path, ZipFile]) -> tuple[tuple[str, str], ...]: + """ + Retrieve a list of files listed in the control file that do not exist in the + bundle. + + Parameters + ---------- + bundlesrc: Path object or ZipFile object: This is the bundle under check. + + Returns + ------- + A tuple of names listed in the control file that do not exist in the bundle. 
+ + Raises + ------ + r_qtl.exceptions.InvalidFormat + """ + cdata = rqtl2.control_data(bundlesrc) + if isinstance(bundlesrc, ZipFile): + return __missing_from_zipfile__(bundlesrc, cdata) + if isinstance(bundlesrc, Path): + if is_zipfile(bundlesrc): + return __missing_from_zipfile__(ZipFile(bundlesrc), cdata) + if bundlesrc.is_dir(): + return __missing_from_dirpath__(bundlesrc, cdata) + raise InvalidFormat( + "Expects either a zipfile.ZipFile object or a pathlib.Path object " + "pointing to a directory containing the R/qtl2 bundle.") + + def validate_bundle(zfile: ZipFile): """Ensure the R/qtl2 bundle is valid.""" missing = missing_files(zfile) if len(missing) > 0: - raise rqe.MissingFileError( + raise rqe.MissingFileException( "The following files do not exist in the bundle: " + ", ".join(mfile[1] for mfile in missing)) @@ -111,6 +151,6 @@ def retrieve_errors(zfile: ZipFile, filetype: str, checkers: tuple[Callable]) -> if value is not None: for checker in checkers: yield checker(lineno=lineno, field=field, value=value) - except rqe.MissingFileError: + except rqe.MissingFileException: fname = cdata.get(filetype) yield MissingFile(filetype, fname, f"Missing '{filetype}' file '{fname}'.") |