Diffstat (limited to 'scripts')
-rwxr-xr-x  scripts/index-genenetwork        |  18
-rw-r--r--  scripts/lmdb_matrix.py           | 437
-rw-r--r--  scripts/partial_correlations.py  |  10
-rw-r--r--  scripts/pub_med.py               |   6
-rw-r--r--  scripts/rqtl2_wrapper.R          | 358
-rw-r--r--  scripts/rqtl_wrapper.R           |   8
-rwxr-xr-x  scripts/update_rif_table.py      |   2
7 files changed, 828 insertions, 11 deletions
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index 2779abc..0544bc7 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -514,9 +514,14 @@ def xapian_compact(combined_index: pathlib.Path, indices: List[pathlib.Path]) ->
 @click.argument("xapian_directory")
 @click.argument("sql_uri")
 @click.argument("sparql_uri")
+@click.option("-v", "--virtuoso-ttl-directory",
+              type=pathlib.Path,
+              default=pathlib.Path("/var/lib/data/"),
+              show_default=True)
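+# virtuoso_ttl_directory holds the RDF (TTL) files whose md5 hash is
+# compared against the stored "generif-checksum" metadata below.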
 def is_data_modified(xapian_directory: str,
                      sql_uri: str,
-                     sparql_uri: str) -> None:
+                     sparql_uri: str,
+                     virtuoso_ttl_directory: pathlib.Path) -> None:
     dir_ = pathlib.Path(xapian_directory)
     with locked_xapian_writable_database(dir_) as db, database_connection(sql_uri) as conn:
         checksums = "-1"
@@ -529,7 +534,7 @@ def is_data_modified(xapian_directory: str,
             ])
         # Return a zero exit status code when the data has changed;
         # otherwise exit with a 1 exit status code.
-        generif = pathlib.Path("/var/lib/data/")
+        generif = virtuoso_ttl_directory
         if (db.get_metadata("generif-checksum").decode() == md5hash_ttl_dir(generif) and
             db.get_metadata("checksums").decode() == checksums):
             sys.exit(1)
@@ -540,9 +545,14 @@ def is_data_modified(xapian_directory: str,
 @click.argument("xapian_directory")
 @click.argument("sql_uri")
 @click.argument("sparql_uri")
+@click.option("-v", "--virtuoso-ttl-directory",
+              type=pathlib.Path,
+              default=pathlib.Path("/var/lib/data/"),
+              show_default=True)
 # pylint: disable=missing-function-docstring
 def create_xapian_index(xapian_directory: str, sql_uri: str,
-                        sparql_uri: str) -> None:
+                        sparql_uri: str,
+                        virtuoso_ttl_directory: pathlib.Path) -> None:
     logging.basicConfig(level=os.environ.get("LOGLEVEL", "DEBUG"),
                         format='%(asctime)s %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S %Z')
@@ -587,7 +597,7 @@ def create_xapian_index(xapian_directory: str, sql_uri: str,
                 logging.info("Writing generif checksums into index")
                 db.set_metadata(
                     "generif-checksum",
-                    md5hash_ttl_dir(pathlib.Path("/var/lib/data/")).encode())
+                    md5hash_ttl_dir(virtuoso_ttl_directory).encode())
         for child in combined_index.iterdir():
             shutil.move(child, xapian_directory)
     logging.info("Index built")
diff --git a/scripts/lmdb_matrix.py b/scripts/lmdb_matrix.py
new file mode 100644
index 0000000..22173af
--- /dev/null
+++ b/scripts/lmdb_matrix.py
@@ -0,0 +1,437 @@
+"""This scripts reads and store genotype files to an LMDB store.
+Similarly, it can be use to read this data.
+
+Example:
+
+guix shell python-click python-lmdb python-wrapper python-numpy -- \
+     python lmdb_matrix.py import-genotype \
+     <path-to-genotype-file> <path-to-lmdb-store>
+
+guix shell python-click python-lmdb python-wrapper python-numpy -- \
+     python lmdb_matrix.py print-current-matrix \
+     <path-to-lmdb-store>
+
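+guix shell python-click python-lmdb python-wrapper python-numpy -- \
+     python lmdb_matrix.py import-directory \
+     <path-to-genotype-directory> <path-to-lmdb-store>
+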
+"""
+from dataclasses import dataclass
+from pathlib import Path
+from subprocess import check_output
+
+import json
+import click
+import lmdb
+import numpy as np
+
+
+@dataclass
+class GenotypeMatrix:
+    """Store the actual Genotype Matrix"""
+    matrix: np.ndarray
+    metadata: dict
+    file_info: dict
+
+
+def count_trailing_newlines(file_path):
+    """Count trailing newlines in a file"""
+    with open(file_path, 'rb') as stream:
+        stream.seek(0, 2)  # Move to the end of the file
+        file_size = stream.tell()
+        if file_size == 0:
+            return 0
+        chunk_size = 1024  # Read in small chunks
+        empty_lines = 0
+
+        # Read chunks from the end backward
+        while stream.tell() > 0:
+            # Don't read beyond start
+            chunk_size = min(chunk_size, stream.tell())
+            stream.seek(-chunk_size, 1)  # Move backward
+            chunk = stream.read(chunk_size)
+            stream.seek(-chunk_size, 1)  # Move back to start of chunk
+            # Decode chunk to text.  errors='ignore' drops the bytes of
+            # any multi-byte character split across a chunk boundary,
+            # which is harmless here since only (ASCII) newlines matter.
+            chunk_text = chunk.decode('utf-8', errors='ignore')
+
+            # Split into lines from the end
+            lines = chunk_text.splitlines()
+
+            # Count empty lines from the end
+            for line in reversed(lines):
+                if line.strip() != "":
+                    return empty_lines
+                empty_lines += 1
+            if stream.tell() == 0:
+                return empty_lines
+        return empty_lines
+
+
+def wc(filename):
+    """Get total file count of a file"""
+    return int(check_output(["wc", "-l", filename]).split()[0]) - \
+        count_trailing_newlines(filename)
+
+
+def get_genotype_metadata(genotype_file: str) -> tuple[dict, dict]:
+    """Parse metadata from a genotype file, separating '@'-prefixed
+    and '#'-prefixed entries.
+
+    This function reads a genotype file and extracts two types of metadata:
+    - '@'-prefixed metadata (e.g., '@name:BXD'), stored as key-value
+      pairs for dataset attributes.
+    - '#'-prefixed metadata (e.g., '# File name: BXD_Geno...'), stored
+      as key-value pairs for file information.  Lines starting with
+      '#' without a colon are skipped as comments. Parsing stops at
+      the first non-metadata line.
+
+    Args:
+        genotype_file (str): Path to the genotype file to be parsed.
+
+    Returns:
+        tuple[dict, dict]: A tuple containing two dictionaries:
+            - First dict: '@'-prefixed metadata (e.g., {'name': 'BXD',
+              'type': 'riset'}).
+            - Second dict: '#'-prefixed metadata with colons (e.g.,
+              {'File name': 'BXD_Geno...', 'Citation': '...'}).
+
+    Example:
+        >>> meta, file_info = get_genotype_metadata("BXD.small.geno")
+        >>> print(meta)
+        {'name': 'BXD', 'type': 'riset', 'mat': 'B', 'pat': 'D',
+        'het': 'H', 'unk': 'U'}
+        >>> print(file_info)
+        {'File name': 'BXD_Geno-19Jan2017b_forGN.xls', 'Metadata':
+        'Please retain...'}
+
+    """
+    metadata = {}
+    file_metadata = {}
+    with open(genotype_file, "r", encoding="utf-8") as stream:
+        while True:
+            line = stream.readline().strip()
+            match line:
+                case meta if line.startswith("#") and ":" in line:
+                    key, value = meta[2:].split(":", 1)
+                    file_metadata[key] = value
+                case _ if line.startswith("#"):
+                    # A '#' line without a colon is a plain comment.
+                    continue
+                case meta if line.startswith("@") and ":" in line:
+                    key, value = meta[1:].split(":", 1)
+                    if value:
+                        metadata[key] = value.strip()
+                case _:
+                    break
+    return metadata, file_metadata
+
+
+def get_genotype_dimensions(genotype_file: str) -> tuple[int, int]:
+    """Calculate the dimensions of the data section in a genotype
+    file.
+
+    This function determines the number of data rows and columns in a
+    genotype file.  It skips metadata lines (starting with '#' or '@')
+    and uses the first non-metadata line as the header to count
+    columns. The total number of lines is counted in binary mode to
+    efficiently handle large files, and the number of data rows is
+    calculated by subtracting metadata and header lines. Accounts for
+    a potential trailing newline.
+
+    Args:
+        genotype_file (str): Path to the genotype file to be analyzed.
+
+    Returns:
+        tuple[int, int]: A tuple containing:
+            - First int: Number of data rows (excluding metadata and
+              header).
+            - Second int: Number of columns (based on the header row).
+
+    Example:
+        >>> rows, cols = get_genotype_dimensions("BXD.small.geno")
+        >>> print(rows, cols)
+        2, 202  # Example: 2 data rows, 202 columns (from header)
+
+    Note:
+        Assumes the first non-metadata line is the header row, split
+        by whitespace.  A trailing newline may be included in the line
+        count but is accounted for in the returned row count.
+
+    """
+    counter = 0
+    rows = []
+
+    with open(genotype_file, "r", encoding="utf-8") as stream:
+        while True:
+            line = stream.readline()
+            counter += 1
+            match line:
+                case "":  # EOF reached before a header line was found
+                    break
+                case _ if line.startswith(("#", "@", "\n")):
+                    continue
+                case _:
+                    rows = line.split()
+                    break
+    return wc(genotype_file) - counter, len(rows)
+
+
+def read_genotype_headers(genotype_file: str) -> list[str]:
+    """Extract the header row from a genotype file.
+
+    This function reads a genotype file and returns the first
+    non-metadata line as a list of column headers. It skips lines
+    starting with '#' (comments), '@' (metadata), or empty lines,
+    assuming the first non-skipped line contains the header (e.g.,
+    'Chr', 'Locus', 'cM', 'Mb', followed by strain names like 'BXD1',
+    'BXD2', etc.). The header is split by whitespace to create the
+    list of column names.
+
+    Args:
+        genotype_file (str): Path to the genotype file to be parsed.
+
+    Returns:
+        list[str]: A list of column headers from the first non-metadata line.
+
+    Example:
+        >>> headers = read_genotype_headers("BXD.small.geno")
+        >>> print(headers)
+        ['Chr', 'Locus', 'cM', 'Mb', 'BXD1', 'BXD2', ..., 'BXD220']
+    """
+    rows = []
+    with open(genotype_file, "r", encoding="utf-8") as stream:
+        while True:
+            line = stream.readline()
+            match line:
+                case "":  # EOF: no header line found
+                    break
+                case _ if line.startswith(("#", "@", "\n")):
+                    continue
+                case _:
+                    rows = line.split()
+                    break
+    return rows
+
+
+# pylint: disable=too-many-locals
+def read_genotype_file(genotype_file: str) -> GenotypeMatrix:
+    """Read a genotype file and construct a GenotypeMatrix object.
+
+    This function parses a genotype file to extract metadata and
+    genotype data, creating a numerical matrix of genotype values and
+    associated metadata. It processes:
+    - '@'-prefixed metadata (e.g., '@mat:B') for dataset attributes
+      like maternal/paternal alleles.
+    - '#'-prefixed metadata (e.g., '# File name:...') for file
+      information.
+    - Header row for column names (e.g., 'Chr', 'Locus', BXD strains).
+    - Data rows, converting genotype symbols (e.g., 'B', 'D', 'H',
+    'U') to numeric values (0, 1, 2, 3) based on metadata mappings.
+
+    The function skips comment lines ('#'), metadata lines ('@'), and
+    empty lines, and organizes the data into a GenotypeMatrix with a
+    numpy array and metadata dictionaries.
+
+    Args:
+        genotype_file (str): Path to the genotype file to be parsed.
+
+    Returns:
+        GenotypeMatrix: An object containing:
+            - matrix: A numpy array (nrows x ncols) with genotype
+              values (0: maternal, 1: paternal, 2: heterozygous,
+              3: unknown).
+            - metadata: A dictionary with '@'-prefixed metadata,
+              row/column counts, individuals (BXD strains), metadata
+              columns (e.g., 'Chr', 'Locus'), and lists of metadata
+              values per row.
+            - file_info: A dictionary with '#'-prefixed metadata
+              (e.g., 'File name', 'Citation').
+
+    Note:
+        Unrecognized genotype symbols (e.g. float values in some geno
+        files) do not raise an error; the row entry is skipped and the
+        matrix cell is left at 0.  See the KLUDGE notes in the loop
+        below.
+
+    Example:
+        >>> geno_matrix = read_genotype_file("BXD.small.geno")
+        >>> print(geno_matrix.matrix.shape)
+        (2, 198)  # Example: 2 rows, 198 BXD strains
+        >>> print(geno_matrix.metadata["name"])
+        'BXD'
+        >>> print(geno_matrix.file_info["File name"])
+        'BXD_Geno-19Jan2017b_forGN.xls'
+    """
+    header = read_genotype_headers(genotype_file)
+
+    # Count the leading metadata columns; the remaining header entries
+    # are the individuals (strain names).
+    counter = 0
+    for el in header:
+        if el not in ("Chr", "Locus", "cM", "Mb"):
+            break
+        counter += 1
+
+    metadata_columns, individuals = header[:counter], header[counter:]
+    nrows, ncols = get_genotype_dimensions(genotype_file)
+    ncols -= len(metadata_columns)
+    matrix = np.zeros((nrows, ncols), dtype=np.uint8)
+
+    metadata, file_metadata = get_genotype_metadata(genotype_file)
+    metadata = metadata | {
+        "nrows": nrows,
+        "ncols": ncols,
+        "individuals": individuals,
+        "metadata_columns": metadata_columns
+    }
+    for key in metadata_columns:
+        metadata[key] = []
+
+    maternal = metadata.get("mat")
+    paternal = metadata.get("pat")
+    heterozygous = metadata.get("het")
+    unknown = metadata.get("unk")
+    i = 0
+    sentinel = True
+    with open(genotype_file, "r", encoding="utf-8") as stream:
+        while True:
+            if i == nrows:
+                break
+            raw = stream.readline()
+            if raw == "":  # EOF: stop even if fewer rows than expected
+                break
+            line = raw.split()
+            if line and line[0] in metadata_columns:
+                # Skip the header row.  Data rows after this point may
+                # start with '#' (see the KLUDGE below), so '#' stops
+                # being treated as a comment marker.
+                line = stream.readline().split()
+                sentinel = False
+            if len(line) == 0 or (line[0].startswith("#") and sentinel) \
+               or line[0].startswith("@"):
+                continue
+            n_meta = len(metadata_columns)
+            meta, data = line[:n_meta], line[n_meta:]
+            # KLUDGE: It's not clear whether chromosome rows that
+            # start with a '#' should be a comment or not.  For some
+            # there's a mismatch between (E.g. B6D2F2_mm8) the size of
+            # the data values and ncols.  For now, skip them.
+            if len(data) != ncols:
+                i += 1
+                continue
+            for j, el in enumerate(data):
+                match el:
+                    case _ if el.isdigit():
+                        matrix[i, j] = int(el)
+                    case _ if maternal == el:
+                        matrix[i, j] = 0
+                    case _ if paternal == el:
+                        matrix[i, j] = 1
+                    case _ if heterozygous == el:
+                        matrix[i, j] = 2
+                    case _ if unknown == el:
+                        matrix[i, j] = 3
+                    case _:
+                        # KLUDGE: It's not clear how to handle float
+                        # types in a geno file
+                        # E.g. HSNIH-Palmer_true.geno which has float
+                        # values such as:  0.997.  Ideally maybe:
+                        # raise ValueError
+                        continue
+            i += 1
+            __map = dict(zip(metadata_columns, meta))
+            for key in metadata_columns:
+                metadata[key].append(__map.get(key))
+
+    genotype_matrix = GenotypeMatrix(
+        matrix=matrix,
+        metadata=metadata,
+        file_info=file_metadata
+    )
+    return genotype_matrix
+
+
+def create_database(db_path: str) -> lmdb.Environment:
+    """Create or open an LMDB environment."""
+    return lmdb.open(db_path, map_size=100 * 1024 * 1024 * 1024, create=True)
+
+
+def genotype_db_put(db: lmdb.Environment, genotype: GenotypeMatrix) -> bool:
+    """Put genotype GENOTYPEMATRIX from DB environment"""
+    metadata = json.dumps(genotype.metadata).encode("utf-8")
+    file_info = json.dumps(genotype.file_info).encode("utf-8")
+    with db.begin(write=True) as txn:
+        txn.put(b"matrix", genotype.matrix.tobytes())
+        txn.put(b"metadata", metadata)
+        # XXXX: KLUDGE: Put this in RDF instead
+        txn.put(b"file_info", file_info)
+    return True
+
+
+def genotype_db_get(db: lmdb.Environment) -> GenotypeMatrix:
+    """Get genotype GENOTYPEMATRIX from DB environment"""
+    with db.begin() as txn:
+        metadata = json.loads(txn.get(b"metadata").decode("utf-8"))
+        nrows, ncols = metadata.get("nrows"), metadata.get("ncols")
+        matrix = np.frombuffer(
+            txn.get(b"matrix"), dtype=np.uint8).reshape(nrows, ncols)
+        return GenotypeMatrix(
+            matrix=matrix,
+            metadata=metadata,
+            file_info=json.loads(txn.get(b"file_info"))
+        )
+
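+# A minimal round-trip sketch (assumes a genotype file "BXD.small.geno"
+# and a writable store directory "./bxd-geno.lmdb"):
+#
+#     geno = read_genotype_file("BXD.small.geno")
+#     with create_database("./bxd-geno.lmdb") as db:
+#         genotype_db_put(db, geno)
+#         assert np.array_equal(genotype_db_get(db).matrix, geno.matrix)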
+
+def get_genotype_files(directory: str) -> list[tuple[str, int]]:
+    """Return a list of all the genotype files from a given
+    directory."""
+    geno_files = [
+        (_file.as_posix(), _file.stat().st_size)
+        for _file in Path(directory).glob("*.geno") if _file.is_file()]
+    return sorted(geno_files, key=lambda x: x[1])
+
+
+def __import_directory(directory: str, lmdb_path: str):
+    """Import all the genotype files from a given directory into
+    LMDB."""
+    for file_, file_size in get_genotype_files(directory):
+        genofile = Path(file_)
+        size_mb = file_size / (1024 ** 2)
+        lmdb_store = (Path(lmdb_path) / genofile.stem).as_posix()
+        print(f"Processing file: {genofile.name}")
+        with create_database(lmdb_store) as db:
+            genotype_db_put(
+                db=db, genotype=read_genotype_file(genofile.as_posix()))
+        print(f"\nSuccessfuly created: [{size_mb:.2f} MB] {genofile.stem}")
+
+
+@click.command(help="Import the genotype directory")
+@click.argument("genotype_directory")
+@click.argument("lmdb_path")
+def import_directory(genotype_directory: str, lmdb_path: str):
+    "Import a genotype directory into genotype_database path"
+    __import_directory(directory=genotype_directory, lmdb_path=lmdb_path)
+
+
+@click.command(help="Import the genotype file")
+@click.argument("geno_file")
+@click.argument("genotype_database")
+def import_genotype(geno_file: str, genotype_database: str):
+    "Import a genotype file into genotype_database path"
+    with create_database(genotype_database) as db:
+        genotype_db_put(db=db, genotype=read_genotype_file(geno_file))
+
+
+@click.command(help="Print the current matrix")
+@click.argument("database_directory")
+def print_current_matrix(database_directory: str):
+    """Print the current matrix in the database."""
+    with create_database(database_directory) as db:
+        current = genotype_db_get(db)
+        print(f"Matrix: {current.matrix}")
+        print(f"Metadata: {current.metadata}")
+        print(f"File Info: {current.file_info}")
+
+
+# pylint: disable=missing-function-docstring
+@click.group()
+def cli():
+    pass
+
+
+cli.add_command(print_current_matrix)
+cli.add_command(import_genotype)
+cli.add_command(import_directory)
+
+if __name__ == "__main__":
+    cli()
diff --git a/scripts/partial_correlations.py b/scripts/partial_correlations.py
index aab8f08..f47d9d6 100644
--- a/scripts/partial_correlations.py
+++ b/scripts/partial_correlations.py
@@ -1,7 +1,7 @@
 """Script to run partial correlations"""
-
 import json
 import traceback
+from pathlib import Path
 from argparse import ArgumentParser
 
 from gn3.db_utils import database_connection
@@ -48,7 +48,8 @@ def pcorrs_against_traits(dbconn, args):
 
 def pcorrs_against_db(dbconn, args):
     """Run partial correlations agaist the entire dataset provided."""
-    return partial_correlations_with_target_db(dbconn, **process_db_args(args))
+    return partial_correlations_with_target_db(
+        dbconn, **process_db_args(args), textdir=args.textdir)
 
 def run_pcorrs(dbconn, args):
     """Run the selected partial correlations function."""
@@ -89,6 +90,11 @@ def against_db_parser(parent_parser):
         "--criteria",
         help="Number of results to return",
         type=int, default=500)
+    parser.add_argument(
+        "--textdir",
+        help="Directory to read text files from",
+        type=Path,
+        default=Path("/tmp/"))
     parser.set_defaults(func=pcorrs_against_db)
     return parent_parser
 
diff --git a/scripts/pub_med.py b/scripts/pub_med.py
index 82b1730..0a94355 100644
--- a/scripts/pub_med.py
+++ b/scripts/pub_med.py
@@ -155,8 +155,8 @@ def fetch_id_lossy_search(query, db_name, max_results):
 
     try:
         response = requests.get(f"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db={db_name}&retmode=json&retmax={max_results}&term={query}",
-                                headers={"content-type": "application/json"}
-                                )
+                                headers={"content-type": "application/json"},
+                                timeout=300)
         return response["esearchresult"]["idlist"]
 
     except requests.exceptions.RequestException as error:
@@ -174,7 +174,7 @@ def search_pubmed_lossy(pubmed_id, db_name):
     - dict: Records fetched based on PubMed ID.
     """
     url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db_name}&id={",".join(pubmed_id)}&retmode=json'
-    response = requests.get(url)
+    response = requests.get(url, timeout=300)
     response.raise_for_status()
     data = response.json()
     if db_name.lower() == "pmc":
diff --git a/scripts/rqtl2_wrapper.R b/scripts/rqtl2_wrapper.R
new file mode 100644
index 0000000..af13efa
--- /dev/null
+++ b/scripts/rqtl2_wrapper.R
@@ -0,0 +1,358 @@
+library(qtl2)
+library(rjson)
+library(stringi)
+library(optparse)
+
+# Define command-line options
+option_list <- list(
+  make_option(c("-d", "--directory"), action = "store", default = NULL, type = "character", 
+              help = "Temporary working directory: should also host the input file."),
+  make_option(c("-i", "--input_file"), action = "store", default = NULL, type = 'character', 
+              help = "A YAML or JSON file with required data to create the cross file."),
+  make_option(c("-o", "--output_file"), action = "store", default = NULL, type = 'character', 
+              help = "A file path of where to write the output JSON results."),
+  make_option(c("-c", "--cores"), type = "integer", default = 1, 
+              help = "Number of cores to use while making computation."),
+  make_option(c("-p", "--nperm"), type = "integer", default = 0, 
+              help = "Number of permutations."),
+  make_option(c("-m", "--method"), action = "store", default = "HK", type = "character", 
+              help = "Scan Mapping Method - HK (Haley Knott), LMM (Linear Mixed Model), LOCO (Leave One Chromosome Out)."),
+  make_option(c("--pstrata"), action = "store_true", default = NULL, 
+              help = "Use permutation strata."),
+  make_option(c("-t", "--threshold"), type = "integer", default = 1, 
+              help = "Minimum LOD score for a Peak.")
+)
+
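+# Example invocation (a sketch; the paths are hypothetical, and the
+# input JSON must describe an R/qtl2 cross; see generate_cross_object
+# below for the keys read from it):
+#
+#   Rscript rqtl2_wrapper.R --directory /tmp/rqtl2 \
+#       --input_file /tmp/rqtl2/input.json \
+#       --output_file /tmp/rqtl2/output.json \
+#       --cores 4 --nperm 1000 --method LOCO
+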
+# Parse command-line arguments
+opt_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(opt_parser)
+
+# Assign parsed arguments to variables
+NO_OF_CORES <- opt$cores
+SCAN_METHOD <- opt$method
+NO_OF_PERMUTATION <- opt$nperm
+
+# Validate input and output file paths
+validate_file_paths <- function(opt) {
+  if (is.null(opt$directory) || !dir.exists(opt$directory)) {
+    print_help(opt_parser)
+    stop("The working directory does not exist or is NULL.\n")
+  }
+  
+  INPUT_FILE_PATH <- opt$input_file
+  OUTPUT_FILE_PATH <- opt$output_file
+  
+  if (!file.exists(INPUT_FILE_PATH)) {
+    print_help(opt_parser)
+    stop("The input file ", INPUT_FILE_PATH, " you provided does not exist.\n")
+  } else {
+    cat("Input file exists. Reading the input file...\n")
+  }
+  
+  if (!file.exists(OUTPUT_FILE_PATH)) {
+    print_help(opt_parser)
+    stop("The output file ", OUTPUT_FILE_PATH, " you provided does not exist.\n")
+  } else {
+    cat("Output file exists...", OUTPUT_FILE_PATH, "\n")
+  }
+  
+  return(list(input = INPUT_FILE_PATH, output = OUTPUT_FILE_PATH))
+}
+
+file_paths <- validate_file_paths(opt)
+INPUT_FILE_PATH <- file_paths$input
+OUTPUT_FILE_PATH <- file_paths$output
+
+# Utility function to generate random file names
+genRandomFileName <- function(prefix, string_size = 9, file_ext = ".txt") {
+  randStr <- paste(prefix, stri_rand_strings(1, string_size, pattern = "[A-Za-z0-9]"), sep = "_")
+  return(paste(randStr, file_ext, sep = ""))
+}
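+# e.g. genRandomFileName("control", file_ext = ".json") might yield
+# "control_x7Kp2qR9z.json" (the suffix is a random alphanumeric string).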
+
+# Generate control file path
+control_file_path <- file.path(opt$directory, genRandomFileName(prefix = "control", file_ext = ".json"))
+
+cat("Generated the control file path at", control_file_path, "\n")
+# Read and parse the input file
+cat("Reading and parsing the input file.\n")
+json_data <- fromJSON(file = INPUT_FILE_PATH)
+
+# Set default values for JSON data
+set_default_values <- function(json_data) {
+  if (is.null(json_data$sep)) {
+    cat("Using ',' as a default separator for cross file.\n")
+    json_data$sep <- ","
+  }
+  if (is.null(json_data$na.strings)) {
+    cat("Using '-' and 'NA' as the default na.strings.\n")
+    json_data$na.strings <- c("-", "NA")
+  }
+  
+  default_keys <- c("geno_transposed", "founder_geno_transposed", "pheno_transposed", 
+                    "covar_transposed", "phenocovar_transposed")
+  
+  for (item in default_keys) {
+    if (!(item %in% names(json_data))) {
+      cat("Using FALSE as default parameter for", item, "\n")
+      json_data[item] <- FALSE
+    }
+  }
+  
+  return(json_data)
+}
+
+json_data <- set_default_values(json_data)
+
+# Function to generate the cross object
+generate_cross_object <- function(control_file_path, json_data) {
+  write_control_file(
+    control_file_path,
+    crosstype = json_data$crosstype,
+    geno_file = json_data$geno_file,
+    pheno_file = json_data$pheno_file,
+    gmap_file = json_data$geno_map_file,
+    pmap_file = json_data$physical_map_file,
+    phenocovar_file = json_data$phenocovar_file,
+    geno_codes = json_data$geno_codes,
+    alleles = json_data$alleles,
+    na.strings = json_data$na.strings,
+    sex_file = json_data$sex_file,
+    founder_geno_file = json_data$founder_geno_file,
+    covar_file = json_data$covar_file,
+    sex_covar = json_data$sex_covar,
+    sex_codes = json_data$sex_codes,
+    crossinfo_file = json_data$crossinfo_file,
+    crossinfo_covar = json_data$crossinfo_covar,
+    crossinfo_codes = json_data$crossinfo_codes,
+    xchr = json_data$xchr,
+    overwrite = TRUE,
+    founder_geno_transposed = json_data$founder_geno_transposed,
+    geno_transposed = json_data$geno_transposed
+  )
+}
+
+# Generate the cross object
+cat("Generating the cross object at", control_file_path, "\n")
+generate_cross_object(control_file_path, json_data)
+
+# Read the cross object
+cat("Reading the cross object from", control_file_path, "\n")
+
+cross <- read_cross2(control_file_path, quiet = FALSE)
+
+# Check the integrity of the cross object
+cat("Checking the integrity of the cross object.\n")
+if (check_cross2(cross)) {
+  cat("Cross meets required specifications for a cross.\n")
+} else {
+  cat("Cross does not meet required specifications.\n")
+}
+
+# Print cross summary
+cat("A summary about the cross you provided:\n")
+summary(cross)
+
+# Function to compute genetic probabilities
+perform_genetic_pr <- function(cross, cores = NO_OF_CORES, step = 1, map = NULL, 
+                               map_function = c("haldane", "kosambi", "c-f", "morgan"), 
+                               error_prob = 0.002) {
+  calc_genoprob(cross, map = map, error_prob = error_prob, map_function = map_function, 
+                quiet = FALSE, cores = cores)
+}
+
+# Insert pseudomarkers to the genetic map
+cat("Inserting pseudomarkers to the genetic map with step 1 and stepwidth fixed.\n")
+
+MAP <- insert_pseudomarkers(cross$gmap, step = 1, stepwidth = "fixed", cores = NO_OF_CORES)
+
+# Calculate genetic probabilities
+cat("Calculating the genetic probabilities.\n")
+Pr <- perform_genetic_pr(cross, map = MAP)
+
+# Calculate allele probabilities for 4-way cross
+if (cross$crosstype == "4way") {
+  cat("Calculating allele genetic probability for 4-way cross.\n")
+  aPr <- genoprob_to_alleleprob(Pr)
+}
+
+# Calculate genotyping error LOD scores
+cat("Calculating the genotype error LOD scores.\n")
+error_lod <- calc_errorlod(cross, Pr, quiet = FALSE, cores = NO_OF_CORES)
+error_lod <- do.call("cbind", error_lod)
+
+# Get phenotypes and covariates
+cat("Getting the phenotypes and covariates.\n")
+pheno <- cross$pheno
+# covar <- match(cross$covar$sex, c("f", "m")) # make numeric
+# TODO rework on this
+covar <- NULL
+if (!is.null(covar)) {
+  names(covar) <- rownames(cross$covar)
+}
+
+Xcovar <- get_x_covar(cross)
+cat("The covariates are:\n")
+print(covar)
+cat("The Xcovar are:\n")
+print(Xcovar)
+
+# Function to calculate kinship
+get_kinship <- function(probability, method = "LMM") {
+  if (method == "LMM") {
+    kinship <- calc_kinship(probability)
+  } else if (method == "LOCO") {
+    kinship <- calc_kinship(probability, "loco")
+  } else {
+    kinship <- NULL
+  }
+  return(kinship)
+}
+
+# Calculate kinship for the genetic probability
+cat("Calculating the kinship for the genetic probability.\n")
+if (cross$crosstype == "4way") {
+  kinship <- get_kinship(aPr, opt$method)
+} else {
+  kinship <- get_kinship(Pr, SCAN_METHOD)
+}
+
+# Function to perform genome scan
+perform_genome_scan <- function(cross, genome_prob, method, addcovar = NULL, intcovar = NULL, 
+                                kinship = NULL, model = c("normal", "binary"), Xcovar = NULL) {
+  if (method == "LMM") {
+    cat("Performing scan1 using Linear Mixed Model.\n")
+    out <- scan1(genome_prob, cross$pheno, kinship = kinship, model = model, cores = NO_OF_CORES)
+  } else if (method == "LOCO") {
+    cat("Performing scan1 using Leave One Chromosome Out.\n")
+    out <- scan1(genome_prob, cross$pheno, kinship = kinship, model = model, cores = NO_OF_CORES)
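+    # This is the same call as the LMM branch above: the LOCO behaviour
+    # comes from `kinship` being a per-chromosome list (see get_kinship)
+    # rather than a single overall matrix.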
+  } else if (method == "HK") {
+    cat("Performing scan1 using Haley Knott.\n")
+    out <- scan1(genome_prob, cross$pheno, addcovar = addcovar, intcovar = intcovar, 
+                 model = model, Xcovar = Xcovar, cores = NO_OF_CORES)
+  }
+  return(out)
+}
+
+# Perform the genome scan for the cross object
+if (cross$crosstype == "4way") {
+  sex <- setNames((cross$covar$Sex == "male") * 1, rownames(cross$covar))
+  scan_results <- perform_genome_scan(cross = cross, genome_prob = aPr, kinship = kinship,
+                                      method = "LOCO", addcovar = sex)
+} else {
+  scan_results <- perform_genome_scan(cross = cross, genome_prob = Pr, kinship = kinship, 
+                                     method = SCAN_METHOD)
+}
+
+# Save scan results
+scan_file <- file.path(opt$directory, "scan_results.csv")
+write.csv(scan_results, scan_file)
+
+# Function to perform permutation tests
+perform_permutation_test <- function(cross, genome_prob, n_perm, method = opt$method, 
+                                     covar = NULL, Xcovar = NULL, addcovar = NULL, 
+                                     intcovar = NULL, perm_Xsp = FALSE, kinship = NULL, 
+                                     model = c("normal", "binary"), chr_lengths = NULL, 
+                                     perm_strata = NULL) {
+  scan1perm(genome_prob, cross$pheno, kinship = kinship, Xcovar = Xcovar, intcovar = intcovar,
+            addcovar = addcovar, n_perm = n_perm, perm_Xsp = perm_Xsp, model = model,
+            chr_lengths = chr_lengths, perm_strata = perm_strata, cores = NO_OF_CORES)
+}
+
+# Check if permutation strata is needed
+if (!is.null(opt$pstrata) && !is.null(Xcovar)) {
+  perm_strata <- mat2strata(Xcovar)
+} else {
+  perm_strata <- NULL
+}
+
+# Perform permutation test if requested
+permutation_results_file <- file.path(opt$directory, "permutation.csv")
+significance_results_file <- file.path(opt$directory, "significance.csv")
+
+if (NO_OF_PERMUTATION > 0) {
+  cat("Performing permutation test for the cross object with", NO_OF_PERMUTATION, "permutations.\n")
+  perm <- perform_permutation_test(cross, Pr, n_perm = NO_OF_PERMUTATION, perm_strata = perm_strata, 
+                                   method = opt$method)
+  
+  # Function to get LOD significance thresholds
+  get_lod_significance <- function(perm, thresholds = c(0.01, 0.05, 0.63)) {
+    cat("Getting the permutation summary with significance thresholds:", thresholds, "\n")
+    summary(perm, alpha = thresholds)
+  }
+  
+  # Compute LOD significance
+  lod_significance <- get_lod_significance(perm)
+  
+  # Save results
+  write.csv(lod_significance, significance_results_file)
+  write.csv(perm, permutation_results_file)
+}
+
+# Function to get QTL effects
+get_qtl_effect <- function(chromosome, geno_prob, pheno, covar = NULL, LOCO = NULL) {
+  cat("Finding the QTL effect for chromosome", chromosome, "\n")
+  chr_Pr <- geno_prob[, chromosome]
+  if (!is.null(chr_Pr)) {
+    if (!is.null(LOCO)) {
+      cat("Finding QTL effect for chromosome", chromosome, "with LOCO.\n")
+      kinship <- calc_kinship(chr_Pr, "loco")[[chromosome]]
+      return(scan1coef(chr_Pr, pheno, kinship, addcovar = covar))
+    } else {
+      return(scan1coef(chr_Pr, pheno, addcovar = covar))
+    }
+  }
+  return(NULL)
+}
+
+# Get QTL effects for each chromosome
+# TODO
+
+# Prepare output data
+gmap_file <- file.path(opt$directory, json_data$geno_map_file)
+pmap_file <- file.path(opt$directory, json_data$physical_map_file)
+
+# Construct the Map object from cross with columns (Marker, chr, cM, Mb)
+gmap <- cross$gmap  # Genetic map in cM
+pmap <- cross$pmap  # Physical map in Mb
+# Convert lists to data frames
+gmap_df <- data.frame(
+  marker = unlist(lapply(gmap, names)), 
+  chr = rep(names(gmap), sapply(gmap, length)),  # Add chromosome info
+  CM = unlist(gmap), 
+  stringsAsFactors = FALSE
+)
+
+pmap_df <- data.frame(
+  marker = unlist(lapply(pmap, names)), 
+  chr = rep(names(pmap), sapply(pmap, length)),  # Add chromosome info
+  MB = unlist(pmap), 
+  stringsAsFactors = FALSE
+)
+# Merge using full outer join (by marker and chromosome)
+merged_map <- merge(gmap_df, pmap_df, by = c("marker", "chr"), all = TRUE)
+map_file <- file.path(opt$directory, "map.csv")
+write.csv(merged_map, map_file, row.names = FALSE)
+
+output <- list(
+  permutation_file = permutation_results_file,
+  significance_file = significance_results_file,
+  scan_file = scan_file,
+  gmap_file = gmap_file,
+  pmap_file = pmap_file,
+  map_file  = map_file,
+  permutations = NO_OF_PERMUTATION,
+  scan_method = SCAN_METHOD
+)
+
+# Write output to JSON file
+output_json_data <- toJSON(output)
+cat("The output file path generated is", OUTPUT_FILE_PATH, "\n")
+cat("Writing to the output file.\n")
+write(output_json_data, file = OUTPUT_FILE_PATH)
\ No newline at end of file
diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R
index 31c1277..0b39a6a 100644
--- a/scripts/rqtl_wrapper.R
+++ b/scripts/rqtl_wrapper.R
@@ -4,6 +4,9 @@ library(stringi)
 library(stringr)
 
 
+
+cat("Running the qtl script.\n")
+
 tmp_dir = Sys.getenv("TMPDIR")
 if (!dir.exists(tmp_dir)) {
   tmp_dir = "/tmp"
@@ -24,7 +27,7 @@ option_list = list(
   make_option(c("--control"), type="character", default=NULL, help="Name of marker (contained in genotype file) to be used as a control"),
   make_option(c("-o", "--outdir"), type="character", default=file.path(tmp_dir, "gn3"), help="Directory in which to write result file"),
   make_option(c("-f", "--filename"), type="character", default=NULL, help="Name to use for result file"),
-  make_option(c("-v", "--verbose"), action="store_true", default=NULL, help="Show extra information")
+  make_option(c("-v", "--verbose"), action="store_true", default=TRUE, help="Show extra information")
 );
 
 opt_parser = OptionParser(option_list=option_list);
@@ -353,5 +356,8 @@ if (!is.null(opt$pairscan)) {
     colnames(qtl_results)[4:7] <- c("AC", "AD", "BC", "BD")
   }
 
+
   write.csv(qtl_results, out_file)
 }
+
+cat("End of script. Now working on processing the results.\n")
diff --git a/scripts/update_rif_table.py b/scripts/update_rif_table.py
index 24edf3d..f936f5b 100755
--- a/scripts/update_rif_table.py
+++ b/scripts/update_rif_table.py
@@ -35,7 +35,7 @@ VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
 
 def download_file(url: str, dest: pathlib.Path):
     """Saves the contents of url in dest"""
-    with requests.get(url, stream=True) as resp:
+    with requests.get(url, stream=True, timeout=300) as resp:
         resp.raise_for_status()
         with open(dest, "wb") as downloaded_file:
             for chunk in resp.iter_content(chunk_size=8192):