4 files changed, 273 insertions, 28 deletions
diff --git a/r_qtl/errors.py b/r_qtl/exceptions.py
index 417eb58..9620cf4 100644
--- a/r_qtl/errors.py
+++ b/r_qtl/exceptions.py
@@ -6,7 +6,7 @@ class RQTLError(Exception):
 class InvalidFormat(RQTLError):
     """Raised when the format of the file(s) is invalid."""
 
-class MissingFileError(InvalidFormat):
+class MissingFileException(InvalidFormat):
     """
     Raise when at least one file listed in the control file is missing from the
     R/qtl2 bundle.
diff --git a/r_qtl/fileerrors.py b/r_qtl/fileerrors.py
index e76676c..c253d71 100644
--- a/r_qtl/fileerrors.py
+++ b/r_qtl/fileerrors.py
@@ -1,5 +1,14 @@
 """QC errors as distinguished from actual exceptions"""
 from collections import namedtuple
 
+InvalidValue = namedtuple(  # QC error record, not an exception (see module doc)
+    "InvalidValue",
+    ("filename",  # NOTE(review): presumably the file holding the bad cell
+     "rowtitle",  # presumably the title of the cell's row -- confirm
+     "coltitle",  # presumably the title of the cell's column -- confirm
+     "cellvalue",  # presumably the offending value as read from the file
+     "message"))  # presumably a human-readable description of the problem
+
+
 MissingFile = namedtuple(
     "MissingFile", ("controlfilekey", "filename", "message"))
diff --git a/r_qtl/r_qtl2.py b/r_qtl/r_qtl2.py
index 0a96e7c..0ef487f 100644
--- a/r_qtl/r_qtl2.py
+++ b/r_qtl/r_qtl2.py
@@ -1,21 +1,26 @@
 """The R/qtl2 parsing and processing code."""
 import io
+import os
 import csv
 import json
 from pathlib import Path
-from zipfile import ZipFile
 from functools import reduce, partial
+from zipfile import ZipFile, is_zipfile
 from typing import Union, Iterator, Iterable, Callable, Optional
 
 import yaml
 
 from functional_tools import take, chain
 
-from r_qtl.errors import InvalidFormat, MissingFileError
+from r_qtl.exceptions import InvalidFormat, MissingFileException
 
 FILE_TYPES = (
     "geno", "founder_geno", "pheno", "covar", "phenocovar", "gmap", "pmap",
-    "phenose")
+    "phenose", "phenonum")
+
+__CONTROL_FILE_ERROR_MESSAGE__ = (
+    "The zipped bundle that was provided does not contain a valid control file "
+    "in either JSON or YAML format.")
 
 
 def __special_file__(filename):
@@ -30,7 +35,81 @@ def __special_file__(filename):
     return (is_macosx_special_file or is_nix_hidden_file)
 
 
-def control_data(zfile: ZipFile) -> dict:
+def extract(zfile: ZipFile, outputdir: Path) -> tuple[Path, ...]:
+    """Extract the members of a ZipFile into a directory.
+
+    Members for which `__special_file__` is true are skipped, not extracted.
+
+    Parameters
+    ----------
+    zfile: zipfile.ZipFile object - the zipfile to extract.
+    outputdir: pathlib.Path object - target directory; created if missing.
+
+    Returns
+    -------
+    A tuple of Path objects, one per extracted member, as extracted on disk.
+    """
+    outputdir.mkdir(parents=True, exist_ok=True)
+    return tuple(Path(zfile.extract(member, outputdir))
+                 for member in zfile.namelist()
+                 if not __special_file__(member))
+
+
+def transpose_csv(
+        inpath: Path,
+        linesplitterfn: Callable,
+        linejoinerfn: Callable,
+        outpath: Path) -> Path:
+    """Transpose a file: Make its rows into columns and its columns into rows.
+
+    Writes a new file at `outpath` holding the content of `inpath` transposed.
+    Lines starting with "#" are dropped. Rows are combined with `zip`, so the
+    output is truncated to the length of the shortest row. Returns `outpath`.
+
+    Parameters
+    ----------
+    inpath: The CSV file to transpose.
+    linesplitterfn: Called with each raw line (newline included) -> its columns
+    linejoinerfn: Called with each tuple of transposed items -> one output line
+    outpath: The path where the transposed data is stored
+    """
+    def __read_by_line__(_path):
+        with open(_path, "r", encoding="utf8") as infile:
+            for line in infile:
+                if line.startswith("#"):
+                    continue
+                yield line
+
+    transposed_data= (f"{linejoinerfn(items)}\n" for items in zip(*(
+        linesplitterfn(line) for line in __read_by_line__(inpath))))
+
+    with open(outpath, "w", encoding="utf8") as outfile:
+        for line in transposed_data:
+            outfile.write(line)
+
+    return outpath
+
+
+def transpose_csv_with_rename(inpath: Path,
+                              linesplitterfn: Callable,
+                              linejoinerfn: Callable) -> Path:
+    """Back up `inpath` as `<stem>___original<suffix>` beside it, then write
+    the transposed data to the original path. Returns the original path.
+
+    Parameters
+    ----------
+    inpath: Path to the input file. Should be a pathlib.Path object.
+    linesplitterfn: A function to use for splitting each line into columns
+    linejoinerfn: A function to use to rebuild the lines
+    """
+    transposedfilepath = Path(inpath)
+    origbkp = inpath.parent.joinpath(f"{inpath.stem}___original{inpath.suffix}")
+    os.rename(inpath, origbkp)  # frees the original name for the transpose
+    return transpose_csv(
+        origbkp, linesplitterfn, linejoinerfn, transposedfilepath)
+
+
+def __control_data_from_zipfile__(zfile: ZipFile) -> dict:
     """Retrieve the control file from the zip file info."""
     files = tuple(filename
                   for filename in zfile.namelist()
@@ -39,7 +118,7 @@ def control_data(zfile: ZipFile) -> dict:
                            or filename.endswith(".json"))))
     num_files = len(files)
     if num_files == 0:
-        raise InvalidFormat("Expected a json or yaml control file.")
+        raise InvalidFormat(__CONTROL_FILE_ERROR_MESSAGE__)
 
     if num_files > 1:
         raise InvalidFormat("Found more than one possible control file.")
@@ -56,6 +135,88 @@ def control_data(zfile: ZipFile) -> dict:
             else yaml.safe_load(zfile.read(files[0])))
     }
 
+def __control_data_from_dirpath__(dirpath: Path):
+    """Load control data from the lone .yaml/.json file in `dirpath`."""
+    files = tuple(path for path in dirpath.iterdir()
+                  if (not __special_file__(path.name)
+                      and (path.suffix in (".yaml", ".json"))))
+    num_files = len(files)
+    if num_files == 0:
+        raise InvalidFormat(__CONTROL_FILE_ERROR_MESSAGE__)
+
+    if num_files > 1:
+        raise InvalidFormat("Found more than one possible control file.")
+
+    with open(files[0], "r", encoding="utf8") as infile:
+        return {  # defaults first; keys in the control file override them
+            "na.strings": ["NA"],
+            "comment.char": "#",
+            "sep": ",",
+            **{
+                f"{key}_transposed": False for key in FILE_TYPES
+            },
+            **(json.loads(infile.read())
+               if files[0].suffix == ".json"
+               else yaml.safe_load(infile.read()))
+        }
+
+
+def control_data(control_src: Union[Path, ZipFile]) -> dict:
+    """Read the R/qtl2 bundle control file.
+
+    Parameters
+    ----------
+    control_src: Path object or ZipFile object.
+        If a directory path is provided, this function will read the control
+        data from the control file in that directory.
+        It is important that the directory contain data from one and only one
+        R/qtl2 bundle.
+
+        If a ZipFile object (or a Path to a zip file) is provided, the control
+        data is read from the control file within the zip file. We are moving
+        away from parsing data directly from ZipFile objects, and this is
+        retained only until the transition to extracted files is complete.
+
+    Returns
+    -------
+    Returns a dict object with the control data that determines what the files
+    in the bundle are and how to parse them.
+
+    Raises
+    ------
+    r_qtl.exceptions.InvalidFormat
+    """
+    def __cleanup__(cdata):
+        # Normalise file entries: a lone filename becomes a one-item list.
+        _cdata = {
+            **cdata,
+            **{filetype: ([value] if isinstance(value, str) else value)
+               for filetype, value in cdata.items()
+               if filetype in FILE_TYPES}
+        }
+        if "na.string" in _cdata:  # handle common error in file.
+            # Build on `_cdata`, not `cdata`, so that the normalisation above
+            # is not thrown away when the misspelt key is merged in.
+            _cdata = {
+                **_cdata,
+                "na.strings": list(set(
+                    _cdata["na.string"] + _cdata["na.strings"]))
+            }
+        return _cdata
+
+    if isinstance(control_src, ZipFile):
+        return __cleanup__(__control_data_from_zipfile__(control_src))
+    if isinstance(control_src, Path):
+        if is_zipfile(control_src):
+            with ZipFile(control_src) as zfile:  # close the handle we opened
+                return __cleanup__(__control_data_from_zipfile__(zfile))
+        if control_src.is_dir():
+            return __cleanup__(__control_data_from_dirpath__(control_src))
+    raise InvalidFormat(
+        "Expects either a zipped bundle of files or a path-like object "
+        "pointing to the zipped R/qtl2 bundle.")
+
+
 def replace_na_strings(cdata, val):
     """Replace values indicated in `na.strings` with `None`."""
     return (None if val in cdata.get("na.strings", ["NA"]) else val)
@@ -250,24 +411,21 @@ def file_data(zfile: ZipFile,
 
     try:
         if isinstance(cdata[member_key], list):
-            for row in (line for lines in
+            yield from (line for lines in
                         (file_data(
                             zfile, member_key, {**cdata, member_key: innerfile},
                             process_value, process_transposed_value)
                          for innerfile in cdata[member_key])
-                        for line in lines):
-                yield row
+                        for line in lines)
             return
         if not cdata.get(f"{member_key}_transposed", False):
-            for row in with_non_transposed(zfile, member_key, cdata, process_value):
-                yield row
+            yield from with_non_transposed(zfile, member_key, cdata, process_value)
             return
 
-        for row in with_transposed(
-                zfile, member_key, cdata, process_transposed_value):
-            yield row
+        yield from with_transposed(
+            zfile, member_key, cdata, process_transposed_value)
     except KeyError as exc:
-        raise MissingFileError(*exc.args) from exc
+        raise MissingFileException(*exc.args) from exc
 
 def cross_information(zfile: ZipFile, cdata: dict) -> Iterator[dict]:
     """Load cross information where present."""
@@ -316,8 +474,7 @@ def raw_file_data(zipfilepath: Union[str, Path],
     with (ZipFile(str(zipfilepath), "r") as zfile,
           zfile.open(memberfilename) as innerfile):
         wrappedfile = io.TextIOWrapper(innerfile)
-        for  line in wrappedfile:
-            yield line
+        yield from wrappedfile
 
 def strip_comments(rawdata: Iterator[str], commentchar) -> Iterator[str]:
     """Remove comments from raw text."""
@@ -401,3 +558,42 @@ def load_samples(zipfilepath: Union[str, Path],
             pass
 
     return tuple(samples)
+
+
+
+def read_text_file(filepath: Union[str, Path]) -> Iterator[str]:
+    """Lazily yield each line of a UTF-8 text file, line endings included."""
+    with open(filepath, "r", encoding="utf8") as _file:
+        yield from _file
+
+
+def read_csv_file(filepath: Union[str, Path],
+                  separator: str = ",",
+                  comment_char: str = "#") -> Iterator[tuple[str, ...]]:
+    """Yield the stripped fields of each non-comment line; N/A values kept."""
+    for line in read_text_file(filepath):
+        if line.startswith(comment_char):
+            continue  # skip comment lines
+        yield tuple(field.strip() for field in line.split(separator))
+
+
+def read_csv_file_headers(
+        filepath: Union[str, Path],
+        transposed: bool,
+        separator: str = ",",
+        comment_char: str = "#"
+) -> tuple[str, ...]:
+    """Read the headers: first data row, or first column if `transposed`."""
+    headers = tuple()
+    for line in read_text_file(filepath):
+        if line.startswith(comment_char):
+            continue
+
+        line = tuple(field.strip() for field in line.split(separator))
+        if not transposed:
+            return line  # not transposed: first non-comment row is the header
+
+        headers = headers + (line[0],)  # transposed: first field of each row
+        continue
+
+    return headers
diff --git a/r_qtl/r_qtl2_qc.py b/r_qtl/r_qtl2_qc.py
index be1eac4..2d9e9a8 100644
--- a/r_qtl/r_qtl2_qc.py
+++ b/r_qtl/r_qtl2_qc.py
@@ -1,12 +1,14 @@
 """Quality control checks for R/qtl2 data bundles."""
-from zipfile import ZipFile
+from pathlib import Path
 from functools import reduce, partial
+from zipfile import ZipFile, is_zipfile
 from typing import Union, Iterator, Optional, Callable
 
-from r_qtl import errors as rqe
 from r_qtl import r_qtl2 as rqtl2
+from r_qtl import exceptions as rqe
 from r_qtl.r_qtl2 import FILE_TYPES
 from r_qtl.fileerrors import MissingFile
+from r_qtl.exceptions import InvalidFormat
 
 from quality_control.errors import InvalidValue
 from quality_control.checks import decimal_points_error
@@ -39,11 +41,10 @@ def bundle_files_list(cdata: dict) -> tuple[tuple[str, str], ...]:
 
     return fileslist
 
-def missing_files(zfile: ZipFile) -> tuple[tuple[str, str], ...]:
-    """
-    Retrieve a list of files listed in the control file that do not exist in the
-    bundle.
-    """
+
+def __missing_from_zipfile__(
+        zfile: ZipFile, cdata: dict) -> tuple[tuple[str, str], ...]:
+    """Check for missing files from a still-compressed zip file."""
     def __missing_p__(filedetails: tuple[str, str]):
         _cfkey, thefile = filedetails
         try:
@@ -52,14 +53,53 @@ def missing_files(zfile: ZipFile) -> tuple[tuple[str, str], ...]:
         except KeyError:
             return True
 
-    return tuple(afile for afile in bundle_files_list(rqtl2.control_data(zfile))
+    return tuple(afile for afile in bundle_files_list(cdata)
                  if __missing_p__(afile))
 
-def validate_bundle(zfile: ZipFile):
+
+def __missing_from_dirpath__(
+        dirpath: Path, cdata: dict) -> tuple[tuple[str, str], ...]:
+    """Return the bundle files in `cdata` not found directly in `dirpath`."""
+    allfiles = tuple(_file.name for _file in dirpath.iterdir())  # top level only
+    return tuple(afile for afile in bundle_files_list(cdata)
+                 if afile[1] not in allfiles)  # afile[1]: presumably the filename
+
+
+def missing_files(bundlesrc: Union[Path, ZipFile]) -> tuple[tuple[str, str], ...]:
+    """
+    Retrieve files listed in the control file that do not exist in the bundle.
+
+    Parameters
+    ----------
+    bundlesrc: Path object or ZipFile object: This is the bundle under check.
+
+    Returns
+    -------
+    A tuple of the entries from `bundle_files_list` that are not in the bundle.
+
+    Raises
+    ------
+    r_qtl.exceptions.InvalidFormat
+    """
+    cdata = rqtl2.control_data(bundlesrc)
+    if isinstance(bundlesrc, ZipFile):
+        return __missing_from_zipfile__(bundlesrc, cdata)
+    if isinstance(bundlesrc, Path):
+        if is_zipfile(bundlesrc):
+            with ZipFile(bundlesrc) as zfile:  # close the handle we opened
+                return __missing_from_zipfile__(zfile, cdata)
+        if bundlesrc.is_dir():
+            return __missing_from_dirpath__(bundlesrc, cdata)
+    raise InvalidFormat(
+        "Expects either a zipfile.ZipFile object or a pathlib.Path object "
+        "pointing to a directory containing the R/qtl2 bundle.")
+
+
+def validate_bundle(zfile: Union[Path, ZipFile]):
     """Ensure the R/qtl2 bundle is valid."""
     missing = missing_files(zfile)
     if len(missing) > 0:
-        raise rqe.MissingFileError(
+        raise rqe.MissingFileException(
                         "The following files do not exist in the bundle: " +
                         ", ".join(mfile[1] for mfile in missing))
 
@@ -111,6 +151,6 @@ def retrieve_errors(zfile: ZipFile, filetype: str, checkers: tuple[Callable]) ->
                 if value is not None:
                     for checker in checkers:
                         yield checker(lineno=lineno, field=field, value=value)
-    except rqe.MissingFileError:
+    except rqe.MissingFileException:
         fname = cdata.get(filetype)
         yield MissingFile(filetype, fname, f"Missing '{filetype}' file '{fname}'.")