Diffstat (limited to 'scripts')
-rwxr-xr-x  scripts/index-genenetwork        |  18
-rw-r--r--  scripts/lmdb_matrix.py           | 437
-rw-r--r--  scripts/partial_correlations.py  |  10
-rw-r--r--  scripts/pub_med.py               |   6
-rw-r--r--  scripts/rqtl2_wrapper.R          | 358
-rw-r--r--  scripts/rqtl_wrapper.R           |   8
-rwxr-xr-x  scripts/update_rif_table.py      |   2
7 files changed, 828 insertions, 11 deletions
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index 2779abc..0544bc7 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -514,9 +514,14 @@ def xapian_compact(combined_index: pathlib.Path, indices: List[pathlib.Path]) ->
 @click.argument("xapian_directory")
 @click.argument("sql_uri")
 @click.argument("sparql_uri")
+@click.option("-v", "--virtuoso-ttl-directory",
+              type=pathlib.Path,
+              default=pathlib.Path("/var/lib/data/"),
+              show_default=True)
 def is_data_modified(xapian_directory: str,
                      sql_uri: str,
-                     sparql_uri: str) -> None:
+                     sparql_uri: str,
+                     virtuoso_ttl_directory: pathlib.Path) -> None:
     dir_ = pathlib.Path(xapian_directory)
     with locked_xapian_writable_database(dir_) as db, database_connection(sql_uri) as conn:
         checksums = "-1"
@@ -529,7 +534,7 @@ def is_data_modified(xapian_directory: str,
         ])
     # Return a zero exit status code when the data has changed;
     # otherwise exit with a 1 exit status code.
-    generif = pathlib.Path("/var/lib/data/")
+    generif = virtuoso_ttl_directory
     if (db.get_metadata("generif-checksum").decode() == md5hash_ttl_dir(generif)
             and db.get_metadata("checksums").decode() == checksums):
         sys.exit(1)
@@ -540,9 +545,14 @@ def is_data_modified(xapian_directory: str,
 @click.argument("xapian_directory")
 @click.argument("sql_uri")
 @click.argument("sparql_uri")
+@click.option("-v", "--virtuoso-ttl-directory",
+              type=pathlib.Path,
+              default=pathlib.Path("/var/lib/data/"),
+              show_default=True)
 # pylint: disable=missing-function-docstring
 def create_xapian_index(xapian_directory: str, sql_uri: str,
-                        sparql_uri: str) -> None:
+                        sparql_uri: str,
+                        virtuoso_ttl_directory: pathlib.Path) -> None:
     logging.basicConfig(level=os.environ.get("LOGLEVEL", "DEBUG"),
                         format='%(asctime)s %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S %Z')
@@ -587,7 +597,7 @@ def create_xapian_index(xapian_directory: str, sql_uri: str,
         logging.info("Writing generif checksums into index")
         db.set_metadata(
             "generif-checksum",
-            md5hash_ttl_dir(pathlib.Path("/var/lib/data/")).encode())
+            md5hash_ttl_dir(virtuoso_ttl_directory).encode())
     for child in combined_index.iterdir():
         shutil.move(child, xapian_directory)
     logging.info("Index built")
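Both indexing commands now take the TTL directory as an option rather than hard-coding /var/lib/data/. A hypothetical invocation (assuming the click group exposes these subcommands under the usual dash-separated names, with placeholder arguments in the style of the lmdb_matrix.py examples below):

./scripts/index-genenetwork create-xapian-index \
    --virtuoso-ttl-directory <path-to-ttl-directory> \
    <xapian-directory> <sql-uri> <sparql-uri>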
diff --git a/scripts/lmdb_matrix.py b/scripts/lmdb_matrix.py
new file mode 100644
index 0000000..22173af
--- /dev/null
+++ b/scripts/lmdb_matrix.py
@@ -0,0 +1,437 @@
+"""This script reads and stores genotype files in an LMDB store.
+Similarly, it can be used to read this data back.
+
+Example:
+
+guix shell python-click python-lmdb python-wrapper python-numpy -- \
+    python lmdb_matrix.py import-genotype \
+    <path-to-genotype-file> <path-to-lmdb-store>
+
+guix shell python-click python-lmdb python-wrapper python-numpy -- \
+    python lmdb_matrix.py print-current-matrix \
+    <path-to-lmdb-store>
+
+"""
+from dataclasses import dataclass
+from pathlib import Path
+from subprocess import check_output
+
+import json
+import click
+import lmdb
+import numpy as np
+
+
+@dataclass
+class GenotypeMatrix:
+    """Store the actual Genotype Matrix"""
+    matrix: np.ndarray
+    metadata: dict
+    file_info: dict
+
+
+def count_trailing_newlines(file_path):
+    """Count trailing newlines in a file"""
+    # Binary mode: open() takes no encoding argument here.
+    with open(file_path, 'rb') as stream:
+        stream.seek(0, 2)  # Move to the end of the file
+        file_size = stream.tell()
+        if file_size == 0:
+            return 0
+        chunk_size = 1024  # Read in small chunks
+        empty_lines = 0
+        _buffer = b""
+
+        # Read chunks from the end backward
+        while stream.tell() > 0:
+            # Don't read beyond start
+            chunk_size = min(chunk_size, stream.tell())
+            stream.seek(-chunk_size, 1)  # Move backward
+            chunk = stream.read(chunk_size) + _buffer
+            stream.seek(-chunk_size, 1)  # Move back to start of chunk
+            # Decode chunk to text
+            try:
+                chunk_text = chunk.decode('utf-8', errors='ignore')
+            except UnicodeDecodeError:
+                # If decoding fails, keep some bytes for the next
+                # chunk.  Keep the last 16 bytes to avoid splitting
+                # characters.
+                _buffer = chunk[-16:]
+                continue
+
+            # Split into lines from the end
+            lines = chunk_text.splitlines()
+
+            # Count empty lines from the end
+            for line in reversed(lines):
+                if line.strip() != "":
+                    return empty_lines
+                empty_lines += 1
+            if stream.tell() == 0:
+                return empty_lines
+    return empty_lines
+
+
+def wc(filename):
+    """Get the total line count of a file, excluding trailing newlines"""
+    return int(check_output(["wc", "-l", filename]).split()[0]) - \
+        count_trailing_newlines(filename)
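Note: wc() subtracts the trailing blank lines so the dimension calculation further down is not inflated by a final newline. A minimal sketch of the behaviour the two helpers are designed for (the path and file contents are hypothetical):

# Illustration only: a file whose last two lines are empty.
from pathlib import Path
sample = Path("/tmp/sample.geno")  # hypothetical path
sample.write_text("@name:BXD\nChr Locus cM Mb BXD1\n1 rs1 1.0 3.0 B\n\n\n")
print(count_trailing_newlines(sample))  # 2 trailing empty lines
print(wc(str(sample)))                  # 5 raw lines - 2 trailing = 3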
+
+
+def get_genotype_metadata(genotype_file: str) -> tuple[dict, dict]:
+    """Parse metadata from a genotype file, separating '@'-prefixed
+    and '#'-prefixed entries.
+
+    This function reads a genotype file and extracts two types of metadata:
+    - '@'-prefixed metadata (e.g., '@name:BXD'), stored as key-value
+      pairs for dataset attributes.
+    - '#'-prefixed metadata (e.g., '# File name: BXD_Geno...'), stored
+      as key-value pairs for file information.  Lines starting with
+      '#' without a colon are skipped as comments.  Parsing stops at
+      the first non-metadata line.
+
+    Args:
+        genotype_file (str): Path to the genotype file to be parsed.
+
+    Returns:
+        tuple[dict, dict]: A tuple containing two dictionaries:
+        - First dict: '@'-prefixed metadata (e.g., {'name': 'BXD',
+          'type': 'riset'}).
+        - Second dict: '#'-prefixed metadata with colons (e.g.,
+          {'File name': 'BXD_Geno...', 'Citation': '...'}).
+
+    Example:
+        >>> meta, file_info = get_genotype_metadata("BXD.small.geno")
+        >>> print(meta)
+        {'name': 'BXD', 'type': 'riset', 'mat': 'B', 'pat': 'D',
+         'het': 'H', 'unk': 'U'}
+        >>> print(file_info)
+        {'File name': 'BXD_Geno-19Jan2017b_forGN.xls', 'Metadata':
+         'Please retain...'}
+
+    """
+    metadata = {}
+    file_metadata = {}
+    with open(genotype_file, "r", encoding="utf-8") as stream:
+        while True:
+            line = stream.readline().strip()
+            match line:
+                case meta if line.startswith("#"):
+                    # '#' lines with a colon carry file metadata;
+                    # plain '#' comments are simply skipped.
+                    if ":" in meta:
+                        key, value = meta[2:].split(":", 1)
+                        file_metadata[key] = value
+                case meta if line.startswith("@") and ":" in line:
+                    key, value = meta[1:].split(":", 1)
+                    if value:
+                        metadata[key] = value.strip()
+                case _:
+                    break
+    return metadata, file_metadata
+
+
+def get_genotype_dimensions(genotype_file: str) -> tuple[int, int]:
+    """Calculate the dimensions of the data section in a genotype
+    file.
+
+    This function determines the number of data rows and columns in a
+    genotype file.  It skips metadata lines (starting with '#' or '@')
+    and uses the first non-metadata line as the header to count
+    columns.  The total number of lines is obtained with the wc()
+    helper (which discounts trailing newlines), and the number of
+    data rows is calculated by subtracting metadata and header lines.
+
+    Args:
+        genotype_file (str): Path to the genotype file to be analyzed.
+
+    Returns:
+        tuple[int, int]: A tuple containing:
+        - First int: Number of data rows (excluding metadata and
+          header).
+        - Second int: Number of columns (based on the header row).
+
+    Example:
+        >>> rows, cols = get_genotype_dimensions("BXD.small.geno")
+        >>> print(rows, cols)
+        2, 202  # Example: 2 data rows, 202 columns (from header)
+
+    Note:
+        Assumes the first non-metadata line is the header row, split
+        by whitespace.  A trailing newline may be included in the line
+        count but is accounted for in the returned row count.
+
+    """
+    counter = 0
+    rows = []
+
+    with open(genotype_file, "r", encoding="utf-8") as stream:
+        while True:
+            line = stream.readline()
+            counter += 1
+            match line:
+                case _ if line.startswith(("#", "@", "\n")):
+                    continue
+                case _:
+                    rows = line.split()
+                    break
+    return wc(genotype_file) - counter, len(rows)
+
+
+def read_genotype_headers(genotype_file: str) -> list[str]:
+    """Extract the header row from a genotype file.
+
+    This function reads a genotype file and returns the first
+    non-metadata line as a list of column headers.  It skips lines
+    starting with '#' (comments), '@' (metadata), or empty lines,
+    assuming the first non-skipped line contains the header (e.g.,
+    'Chr', 'Locus', 'cM', 'Mb', followed by strain names like 'BXD1',
+    'BXD2', etc.).  The header is split by whitespace to create the
+    list of column names.
+
+    Args:
+        genotype_file (str): Path to the genotype file to be parsed.
+
+    Returns:
+        list[str]: A list of column headers from the first non-metadata line.
+
+    Example:
+        >>> headers = read_genotype_headers("BXD.small.geno")
+        >>> print(headers)
+        ['Chr', 'Locus', 'cM', 'Mb', 'BXD1', 'BXD2', ..., 'BXD220']
+    """
+    rows = []
+    with open(genotype_file, "r", encoding="utf-8") as stream:
+        while True:
+            line = stream.readline()
+            match line:
+                case "":
+                    # EOF: stop rather than loop forever on a file
+                    # with no header row.
+                    break
+                case _ if line.startswith(("#", "@", "\n")):
+                    continue
+                case _:
+                    rows = line.split()
+                    break
+    return rows
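For reference, the layout these parsers expect is a .geno file along the following lines (marker names and values are illustrative, modelled on the BXD example used in the docstrings):

# File name: BXD_Geno-19Jan2017b_forGN.xls
@name:BXD
@type:riset
@mat:B
@pat:D
@het:H
@unk:U
Chr   Locus        cM    Mb     BXD1  BXD2
1     rs31443144   1.5   3.010  B     D
1     rs6269442    1.5   3.492  H     U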
+
+
+# pylint: disable=too-many-locals
+def read_genotype_file(genotype_file: str) -> GenotypeMatrix:
+    """Read a genotype file and construct a GenotypeMatrix object.
+
+    This function parses a genotype file to extract metadata and
+    genotype data, creating a numerical matrix of genotype values and
+    associated metadata.  It processes:
+    - '@'-prefixed metadata (e.g., '@mat:B') for dataset attributes
+      like maternal/paternal alleles.
+    - '#'-prefixed metadata (e.g., '# File name:...') for file
+      information.
+    - Header row for column names (e.g., 'Chr', 'Locus', BXD strains).
+    - Data rows, converting genotype symbols (e.g., 'B', 'D', 'H',
+      'U') to numeric values (0, 1, 2, 3) based on metadata mappings.
+
+    The function skips comment lines ('#'), metadata lines ('@'), and
+    empty lines, and organizes the data into a GenotypeMatrix with a
+    numpy array and metadata dictionaries.
+
+    Args:
+        genotype_file (str): Path to the genotype file to be parsed.
+
+    Returns:
+        GenotypeMatrix: An object containing:
+        - matrix: A numpy array (nrows x ncols) with genotype values (0:
+          maternal, 1: paternal, 2: heterozygous, 3: unknown).
+        - metadata: A dictionary with '@'-prefixed metadata, row/column
+          counts, individuals (BXD strains), metadata columns (e.g., 'Chr',
+          'Locus'), and lists of metadata values per row.
+        - file_info: A dictionary with '#'-prefixed metadata (e.g., 'File
+          name', 'Citation').
+
+    Raises:
+        ValueError: If an unrecognized genotype symbol is encountered in
+        the data.
+
+    Example:
+        >>> geno_matrix = read_genotype_file("BXD.small.geno")
+        >>> print(geno_matrix.matrix.shape)
+        (2, 198)  # Example: 2 rows, 198 BXD strains
+        >>> print(geno_matrix.metadata["name"])
+        'BXD'
+        >>> print(geno_matrix.file_info["File name"])
+        'BXD_Geno-19Jan2017b_forGN.xls'
+    """
+    header = read_genotype_headers(genotype_file)
+
+    # Count the leading metadata columns; everything after them names
+    # an individual.
+    counter = 0
+    for el in header:
+        if el not in ("Chr", "Locus", "cM", "Mb"):
+            break
+        counter += 1
+
+    metadata_columns, individuals = header[:counter], header[counter:]
+    nrows, ncols = get_genotype_dimensions(genotype_file)
+    ncols -= len(metadata_columns)
+    matrix = np.zeros((nrows, ncols), dtype=np.uint8)
+
+    metadata, file_metadata = get_genotype_metadata(genotype_file)
+    metadata = metadata | {
+        "nrows": nrows,
+        "ncols": ncols,
+        "individuals": individuals,
+        "metadata_columns": metadata_columns
+    }
+    for key in metadata_columns:
+        metadata[key] = []
+
+    maternal = metadata.get("mat")
+    paternal = metadata.get("pat")
+    heterozygous = metadata.get("het")
+    unknown = metadata.get("unk")
+    i = 0
+    sentinel = True
+    with open(genotype_file, "r", encoding="utf-8") as stream:
+        while True:
+            if i == nrows:
+                break
+            line = stream.readline().split()
+            meta, data = [], []
+            if line and line[0] in metadata_columns:
+                # Skip the header row
+                line = stream.readline().split()
+                sentinel = False
+            if len(line) == 0 or (line[0].startswith("#") and sentinel) \
+               or line[0].startswith("@"):
+                continue
+            meta, data = line[:len(metadata_columns)], line[len(metadata_columns):]
+            # KLUDGE: It's not clear whether chromosome rows that
+            # start with a '#' should be a comment or not.  For some
+            # files (e.g. B6D2F2_mm8) there's a mismatch between the
+            # size of the data values and ncols.  For now, skip them.
+            if len(data) != ncols:
+                i += 1
+                continue
+            for j, el in enumerate(data):
+                match el:
+                    case _ if el.isdigit():
+                        matrix[i, j] = int(el)
+                    case _ if maternal == el:
+                        matrix[i, j] = 0
+                    case _ if paternal == el:
+                        matrix[i, j] = 1
+                    case _ if heterozygous == el:
+                        matrix[i, j] = 2
+                    case _ if unknown == el:
+                        matrix[i, j] = 3
+                    case _:
+                        # KLUDGE: It's not clear how to handle float
+                        # values in a geno file,
+                        # e.g. HSNIH-Palmer_true.geno which has float
+                        # values such as 0.997.
+                        # Ideally, maybe:
+                        #     raise ValueError
+                        continue
+            i += 1
+            __map = dict(zip(metadata_columns, meta))
+            for key in metadata_columns:
+                metadata[key].append(__map.get(key))
+
+    genotype_matrix = GenotypeMatrix(
+        matrix=matrix,
+        metadata=metadata,
+        file_info=file_metadata
+    )
+    return genotype_matrix
+
+
+def create_database(db_path: str) -> lmdb.Environment:
+    """Create or open an LMDB environment."""
+    return lmdb.open(db_path, map_size=100 * 1024 * 1024 * 1024, create=True)
+
+
+def genotype_db_put(db: lmdb.Environment, genotype: GenotypeMatrix) -> bool:
+    """Put genotype GENOTYPEMATRIX into DB environment"""
+    metadata = json.dumps(genotype.metadata).encode("utf-8")
+    file_info = json.dumps(genotype.file_info).encode("utf-8")
+    with db.begin(write=True) as txn:
+        txn.put(b"matrix", genotype.matrix.tobytes())
+        txn.put(b"metadata", metadata)
+        # XXXX: KLUDGE: Put this in RDF instead
+        txn.put(b"file_info", file_info)
+    return True
+
+
+def genotype_db_get(db: lmdb.Environment) -> GenotypeMatrix:
+    """Get genotype GENOTYPEMATRIX from DB environment"""
+    with db.begin() as txn:
+        metadata = json.loads(txn.get(b"metadata").decode("utf-8"))
+        nrows, ncols = metadata.get("nrows"), metadata.get("ncols")
+        matrix = np.frombuffer(
+            txn.get(b"matrix"), dtype=np.uint8).reshape(nrows, ncols)
+        return GenotypeMatrix(
+            matrix=matrix,
+            metadata=metadata,
+            file_info=json.loads(txn.get(b"file_info"))
+        )
+
+
+def get_genotype_files(directory: str) -> list[tuple[str, int]]:
+    """Return a list of all the genotype files from a given
+    directory."""
+    geno_files = [
+        (_file.as_posix(), _file.stat().st_size)
+        for _file in Path(directory).glob("*.geno") if _file.is_file()]
+    return sorted(geno_files, key=lambda x: x[1])
+
+
+def __import_directory(directory: str, lmdb_path: str):
+    """Import all the genotype files from a given directory into
+    LMDB."""
+    for file_, file_size in get_genotype_files(directory):
+        genofile = Path(file_)
+        size_mb = file_size / (1024 ** 2)
+        lmdb_store = (Path(lmdb_path) / genofile.stem).as_posix()
+        print(f"Processing file: {genofile.name}")
+        with create_database(lmdb_store) as db:
+            genotype_db_put(
+                db=db, genotype=read_genotype_file(genofile.as_posix()))
+        print(f"\nSuccessfully created: [{size_mb:.2f} MB] {genofile.stem}")
+
+
+@click.command(help="Import the genotype directory")
+@click.argument("genotype_directory")
+@click.argument("lmdb_path")
+def import_directory(genotype_directory: str, lmdb_path: str):
+    "Import a genotype directory into genotype_database path"
+    __import_directory(directory=genotype_directory, lmdb_path=lmdb_path)
+
+
+@click.command(help="Import the genotype file")
+@click.argument("geno_file")
+@click.argument("genotype_database")
+def import_genotype(geno_file: str, genotype_database: str):
+    "Import a genotype file into genotype_database path"
+    with create_database(genotype_database) as db:
+        genotype_db_put(db=db, genotype=read_genotype_file(geno_file))
+
+
+@click.command(help="Print the current matrix")
+@click.argument("database_directory")
+def print_current_matrix(database_directory: str):
+    """Print the current matrix in the database."""
+    with create_database(database_directory) as db:
+        current = genotype_db_get(db)
+        print(f"Matrix: {current.matrix}")
+        print(f"Metadata: {current.metadata}")
+        print(f"File Info: {current.file_info}")
+
+
+# pylint: disable=missing-function-docstring
+@click.group()
+def cli():
+    pass
+
+
+cli.add_command(print_current_matrix)
+cli.add_command(import_genotype)
+cli.add_command(import_directory)
+
+if __name__ == "__main__":
+    cli()
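Each store written by these helpers holds three keys: matrix (the raw uint8 buffer) plus metadata and file_info (both JSON); shape and dtype are recovered from the JSON metadata on the way out, since tobytes() keeps only the raw buffer. A minimal round-trip sketch, assuming a hypothetical store path and the BXD.small.geno file from the docstring examples:

# Illustration: import one geno file, then read it back.
with create_database("/tmp/bxd-lmdb") as db:
    genotype_db_put(db=db, genotype=read_genotype_file("BXD.small.geno"))

with create_database("/tmp/bxd-lmdb") as db:
    geno = genotype_db_get(db)
    print(geno.matrix.shape, geno.matrix.dtype)  # e.g. (2, 198) uint8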
diff --git a/scripts/partial_correlations.py b/scripts/partial_correlations.py
index aab8f08..f47d9d6 100644
--- a/scripts/partial_correlations.py
+++ b/scripts/partial_correlations.py
@@ -1,7 +1,7 @@
 """Script to run partial correlations"""
-
 import json
 import traceback
+from pathlib import Path
 from argparse import ArgumentParser
 
 from gn3.db_utils import database_connection
@@ -48,7 +48,8 @@ def pcorrs_against_traits(dbconn, args):
 
 def pcorrs_against_db(dbconn, args):
     """Run partial correlations against the entire dataset provided."""
-    return partial_correlations_with_target_db(dbconn, **process_db_args(args))
+    return partial_correlations_with_target_db(
+        dbconn, **process_db_args(args), textdir=args.textdir)
 
 def run_pcorrs(dbconn, args):
     """Run the selected partial correlations function."""
@@ -89,6 +90,11 @@ def against_db_parser(parent_parser):
         "--criteria",
         help="Number of results to return",
         type=int,
         default=500)
+    parser.add_argument(
+        "--textdir",
+        help="Directory to read text files from",
+        type=Path,
+        default=Path("/tmp/"))
     parser.set_defaults(func=pcorrs_against_db)
     return parent_parser
diff --git a/scripts/pub_med.py b/scripts/pub_med.py
index 82b1730..0a94355 100644
--- a/scripts/pub_med.py
+++ b/scripts/pub_med.py
@@ -155,8 +155,8 @@ def fetch_id_lossy_search(query, db_name, max_results):
     try:
         response = requests.get(f"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db={db_name}&retmode=json&retmax={max_results}&term={query}",
-                                headers={"content-type": "application/json"}
-                                )
+                                headers={"content-type": "application/json"},
+                                timeout=300)
         return response["esearchresult"]["idlist"]
 
     except requests.exceptions.RequestException as error:
@@ -174,7 +174,7 @@ def search_pubmed_lossy(pubmed_id, db_name):
     - dict: Records fetched based on PubMed ID.
""" url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db={db_name}&id={",".join(pubmed_id)}&retmode=json' - response = requests.get(url) + response = requests.get(url, timeout=300) response.raise_for_status() data = response.json() if db_name.lower() == "pmc": diff --git a/scripts/rqtl2_wrapper.R b/scripts/rqtl2_wrapper.R new file mode 100644 index 0000000..af13efa --- /dev/null +++ b/scripts/rqtl2_wrapper.R @@ -0,0 +1,358 @@ +library(qtl2) +library(rjson) +library(stringi) +library(optparse) + +# Define command-line options +option_list <- list( + make_option(c("-d", "--directory"), action = "store", default = NULL, type = "character", + help = "Temporary working directory: should also host the input file."), + make_option(c("-i", "--input_file"), action = "store", default = NULL, type = 'character', + help = "A YAML or JSON file with required data to create the cross file."), + make_option(c("-o", "--output_file"), action = "store", default = NULL, type = 'character', + help = "A file path of where to write the output JSON results."), + make_option(c("-c", "--cores"), type = "integer", default = 1, + help = "Number of cores to use while making computation."), + make_option(c("-p", "--nperm"), type = "integer", default = 0, + help = "Number of permutations."), + make_option(c("-m", "--method"), action = "store", default = "HK", type = "character", + help = "Scan Mapping Method - HK (Haley Knott), LMM (Linear Mixed Model), LOCO (Leave One Chromosome Out)."), + make_option(c("--pstrata"), action = "store_true", default = NULL, + help = "Use permutation strata."), + make_option(c("-t", "--threshold"), type = "integer", default = 1, + help = "Minimum LOD score for a Peak.") +) + +# Parse command-line arguments +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) + +# Assign parsed arguments to variables +NO_OF_CORES <- opt$cores +SCAN_METHOD <- opt$method +NO_OF_PERMUTATION <- opt$nperm + +NO_OF_CORES <- 20 + +# Validate input and output file paths +validate_file_paths <- function(opt) { + if (is.null(opt$directory) || !dir.exists(opt$directory)) { + print_help(opt_parser) + stop("The working directory does not exist or is NULL.\n") + } + + INPUT_FILE_PATH <- opt$input_file + OUTPUT_FILE_PATH <- opt$output_file + + if (!file.exists(INPUT_FILE_PATH)) { + print_help(opt_parser) + stop("The input file ", INPUT_FILE_PATH, " you provided does not exist.\n") + } else { + cat("Input file exists. 
+    cat("Input file exists. Reading the input file...\n")
+  }
+
+  if (!file.exists(OUTPUT_FILE_PATH)) {
+    print_help(opt_parser)
+    stop("The output file ", OUTPUT_FILE_PATH, " you provided does not exist.\n")
+  } else {
+    cat("Output file exists...", OUTPUT_FILE_PATH, "\n")
+  }
+
+  return(list(input = INPUT_FILE_PATH, output = OUTPUT_FILE_PATH))
+}
+
+file_paths <- validate_file_paths(opt)
+INPUT_FILE_PATH <- file_paths$input
+OUTPUT_FILE_PATH <- file_paths$output
+
+# Utility function to generate random file names
+genRandomFileName <- function(prefix, string_size = 9, file_ext = ".txt") {
+  randStr <- paste(prefix, stri_rand_strings(1, string_size, pattern = "[A-Za-z0-9]"), sep = "_")
+  return(paste(randStr, file_ext, sep = ""))
+}
+
+# Generate control file path
+control_file_path <- file.path(opt$directory, genRandomFileName(prefix = "control", file_ext = ".json"))
+
+cat("Generated the control file path at", control_file_path, "\n")
+# Read and parse the input file
+cat("Reading and parsing the input file.\n")
+json_data <- fromJSON(file = INPUT_FILE_PATH)
+
+# Set default values for JSON data
+set_default_values <- function(json_data) {
+  if (is.null(json_data$sep)) {
+    cat("Using ',' as a default separator for cross file.\n")
+    json_data$sep <- ","
+  }
+  if (is.null(json_data$na.strings)) {
+    cat("Using '-' and 'NA' as the default na.strings.\n")
+    json_data$na.strings <- c("-", "NA")
+  }
+
+  default_keys <- c("geno_transposed", "founder_geno_transposed", "pheno_transposed",
+                    "covar_transposed", "phenocovar_transposed")
+
+  for (item in default_keys) {
+    if (!(item %in% names(json_data))) {
+      cat("Using FALSE as default parameter for", item, "\n")
+      json_data[item] <- FALSE
+    }
+  }
+
+  return(json_data)
+}
+
+json_data <- set_default_values(json_data)
+
+# Function to generate the cross object
+generate_cross_object <- function(control_file_path, json_data) {
+  write_control_file(
+    control_file_path,
+    crosstype = json_data$crosstype,
+    geno_file = json_data$geno_file,
+    pheno_file = json_data$pheno_file,
+    gmap_file = json_data$geno_map_file,
+    pmap_file = json_data$physical_map_file,
+    phenocovar_file = json_data$phenocovar_file,
+    geno_codes = json_data$geno_codes,
+    alleles = json_data$alleles,
+    na.strings = json_data$na.strings,
+    sex_file = json_data$sex_file,
+    founder_geno_file = json_data$founder_geno_file,
+    covar_file = json_data$covar_file,
+    sex_covar = json_data$sex_covar,
+    sex_codes = json_data$sex_codes,
+    crossinfo_file = json_data$crossinfo_file,
+    crossinfo_covar = json_data$crossinfo_covar,
+    crossinfo_codes = json_data$crossinfo_codes,
+    xchr = json_data$xchr,
+    overwrite = TRUE,
+    founder_geno_transposed = json_data$founder_geno_transposed,
+    geno_transposed = json_data$geno_transposed
+  )
+}
+
+# Generate the cross object
+cat("Generating the cross object at", control_file_path, "\n")
+generate_cross_object(control_file_path, json_data)
+
+# Read the cross object
+cat("Reading the cross object from", control_file_path, "\n")
+cross <- read_cross2(control_file_path, quiet = FALSE)
+
+# Check the integrity of the cross object
+cat("Checking the integrity of the cross object.\n")
+if (check_cross2(cross)) {
+  cat("Cross meets required specifications for a cross.\n")
+} else {
+  cat("Cross does not meet required specifications.\n")
+}
+
+# Print cross summary
+cat("A summary of the cross you provided:\n")
+summary(cross)
+
+# Function to compute genetic probabilities
+perform_genetic_pr <- function(cross, cores = NO_OF_CORES, step = 1, map = NULL,
+                               map_function = c("haldane", "kosambi", "c-f", "morgan"),
"kosambi", "c-f", "morgan"), + error_prob = 0.002) { + calc_genoprob(cross, map = map, error_prob = error_prob, map_function = map_function, + quiet = FALSE, cores = cores) +} + +# Insert pseudomarkers to the genetic map +cat("Inserting pseudomarkers to the genetic map with step 1 and stepwidth fixed.\n") + +MAP <- insert_pseudomarkers(cross$gmap, step = 1, stepwidth = "fixed", cores = NO_OF_CORES) + +# Calculate genetic probabilities +cat("Calculating the genetic probabilities.\n") +Pr <- perform_genetic_pr(cross) + +# Calculate allele probabilities for 4-way cross +if (cross$crosstype == "4way") { + cat("Calculating allele genetic probability for 4-way cross.\n") + aPr <- genoprob_to_alleleprob(Pr) +} + +# Calculate genotyping error LOD scores +cat("Calculating the genotype error LOD scores.\n") +error_lod <- calc_errorlod(cross, Pr, quiet = FALSE, cores = NO_OF_CORES) +error_lod <- do.call("cbind", error_lod) + +# Get phenotypes and covariates +cat("Getting the phenotypes and covariates.\n") +pheno <- cross$pheno +# covar <- match(cross$covar$sex, c("f", "m")) # make numeric +# TODO rework on this +covar <- NULL +if (!is.null(covar)) { + names(covar) <- rownames(cross$covar) +} + +Xcovar <- get_x_covar(cross) +cat("The covariates are:\n") +print(covar) +cat("The Xcovar are:\n") +print(Xcovar) + +# Function to calculate kinship +get_kinship <- function(probability, method = "LMM") { + if (method == "LMM") { + kinship <- calc_kinship(probability) + } else if (method == "LOCO") { + kinship <- calc_kinship(probability, "loco") + } else { + kinship <- NULL + } + return(kinship) +} + +# Calculate kinship for the genetic probability +cat("Calculating the kinship for the genetic probability.\n") +if (cross$crosstype == "4way") { + kinship <- get_kinship(aPr, opt$method) +} else { + kinship <- get_kinship(Pr, "loco") +} + +# Function to perform genome scan +perform_genome_scan <- function(cross, genome_prob, method, addcovar = NULL, intcovar = NULL, + kinship = NULL, model = c("normal", "binary"), Xcovar = NULL) { + if (method == "LMM") { + cat("Performing scan1 using Linear Mixed Model.\n") + out <- scan1(genome_prob, cross$pheno, kinship = kinship, model = model, cores = NO_OF_CORES) + } else if (method == "LOCO") { + cat("Performing scan1 using Leave One Chromosome Out.\n") + out <- scan1(genome_prob, cross$pheno, kinship = kinship, model = model, cores = NO_OF_CORES) + } else if (method == "HK") { + cat("Performing scan1 using Haley Knott.\n") + out <- scan1(genome_prob, cross$pheno, addcovar = addcovar, intcovar = intcovar, + model = model, Xcovar = Xcovar, cores = NO_OF_CORES) + } + return(out) +} + +# Perform the genome scan for the cross object +if (cross$crosstype == "4way") { + sex <- setNames((cross$covar$Sex == "male") * 1, rownames(cross$covar)) + scan_results <- perform_genome_scan(aPr, cross, kinship = kinship, method = "LOCO", addcovar = sex) +} else { + scan_results <- perform_genome_scan(cross = cross, genome_prob = Pr, kinship = kinship, + method = SCAN_METHOD) +} + +# Save scan results +scan_file <- file.path(opt$directory, "scan_results.csv") +write.csv(scan_results, scan_file) + +# Function to perform permutation tests +perform_permutation_test <- function(cross, genome_prob, n_perm, method = opt$method, + covar = NULL, Xcovar = NULL, addcovar = NULL, + intcovar = NULL, perm_Xsp = FALSE, kinship = NULL, + model = c("normal", "binary"), chr_lengths = NULL, + perm_strata = NULL) { + scan1perm(genome_prob, cross$pheno, kinship = kinship, Xcovar = Xcovar, intcovar = intcovar, + 
+            addcovar = addcovar, n_perm = n_perm, perm_Xsp = perm_Xsp, model = model,
+            perm_strata = perm_strata, chr_lengths = chr_lengths, cores = NO_OF_CORES)
+}
+
+# Check if permutation strata is needed
+if (!is.null(opt$pstrata) && !is.null(Xcovar)) {
+  perm_strata <- mat2strata(Xcovar)
+} else {
+  perm_strata <- NULL
+}
+
+# Perform permutation test if requested
+permutation_results_file <- file.path(opt$directory, "permutation.csv")
+significance_results_file <- file.path(opt$directory, "significance.csv")
+
+if (NO_OF_PERMUTATION > 0) {
+  cat("Performing permutation test for the cross object with", NO_OF_PERMUTATION, "permutations.\n")
+  perm <- perform_permutation_test(cross, Pr, n_perm = NO_OF_PERMUTATION, perm_strata = perm_strata,
+                                   method = opt$method)
+
+  # Function to get LOD significance thresholds
+  get_lod_significance <- function(perm, thresholds = c(0.01, 0.05, 0.63)) {
+    cat("Getting the permutation summary with significance thresholds:", thresholds, "\n")
+    summary(perm, alpha = thresholds)
+  }
+
+  # Compute LOD significance
+  lod_significance <- get_lod_significance(perm)
+
+  # Save results
+  write.csv(lod_significance, significance_results_file)
+  write.csv(perm, permutation_results_file)
+}
+
+# Function to get QTL effects
+get_qtl_effect <- function(chromosome, geno_prob, pheno, covar = NULL, LOCO = NULL) {
+  cat("Finding the QTL effect for chromosome", chromosome, "\n")
+  chr_Pr <- geno_prob[, chromosome]
+  if (!is.null(chr_Pr)) {
+    if (!is.null(LOCO)) {
+      cat("Finding QTL effect for chromosome", chromosome, "with LOCO.\n")
+      kinship <- calc_kinship(chr_Pr, "loco")[[chromosome]]
+      return(scan1coef(chr_Pr, pheno, kinship, addcovar = covar))
+    } else {
+      return(scan1coef(chr_Pr, pheno, addcovar = covar))
+    }
+  }
+  return(NULL)
+}
+
+# Get QTL effects for each chromosome
+# TODO
+
+# Prepare output data
+gmap_file <- file.path(opt$directory, json_data$geno_map_file)
+pmap_file <- file.path(opt$directory, json_data$physical_map_file)
+
+# Construct the Map object from cross with columns (Marker, chr, cM, Mb)
+gmap <- cross$gmap  # Genetic map in cM
+pmap <- cross$pmap  # Physical map in Mb
+# Convert lists to data frames
+gmap_df <- data.frame(
+  marker = unlist(lapply(gmap, names)),
+  chr = rep(names(gmap), sapply(gmap, length)),  # Add chromosome info
+  CM = unlist(gmap),
+  stringsAsFactors = FALSE
+)
+
+pmap_df <- data.frame(
+  marker = unlist(lapply(pmap, names)),
+  chr = rep(names(pmap), sapply(pmap, length)),  # Add chromosome info
+  MB = unlist(pmap),
+  stringsAsFactors = FALSE
+)
+# Merge using full outer join (by marker and chromosome)
+merged_map <- merge(gmap_df, pmap_df, by = c("marker", "chr"), all = TRUE)
+map_file <- file.path(opt$directory, "map.csv")
+write.csv(merged_map, map_file, row.names = FALSE)
+
+output <- list(
+  permutation_file = permutation_results_file,
+  significance_file = significance_results_file,
+  scan_file = scan_file,
+  gmap_file = gmap_file,
+  pmap_file = pmap_file,
+  map_file = map_file,
+  permutations = NO_OF_PERMUTATION,
+  scan_method = SCAN_METHOD
+)
+
+# Write output to JSON file
+output_json_data <- toJSON(output)
+cat("The output file path generated is", OUTPUT_FILE_PATH, "\n")
+cat("Writing to the output file.\n")
+write(output_json_data, file = OUTPUT_FILE_PATH)
\ No newline at end of file
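A typical invocation of the new wrapper, with illustrative paths (the input JSON supplies the cross type, genotype, phenotype and map file names consumed by generate_cross_object above):

Rscript scripts/rqtl2_wrapper.R \
    --directory /tmp/rqtl2-work \
    --input_file /tmp/rqtl2-work/input.json \
    --output_file /tmp/rqtl2-work/output.json \
    --cores 4 --nperm 1000 --method HK

Note that validate_file_paths() insists the output file already exists, so it must be created (e.g. with touch) before the run.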
diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R
index 31c1277..0b39a6a 100644
--- a/scripts/rqtl_wrapper.R
+++ b/scripts/rqtl_wrapper.R
@@ -4,6 +4,9 @@ library(stringi)
 library(stringr)
 
+
+cat("Running the qtl script.\n")
+
 tmp_dir = Sys.getenv("TMPDIR")
 if (!dir.exists(tmp_dir)) {
   tmp_dir = "/tmp"
@@ -24,7 +27,7 @@ option_list = list(
   make_option(c("--control"), type="character", default=NULL, help="Name of marker (contained in genotype file) to be used as a control"),
   make_option(c("-o", "--outdir"), type="character", default=file.path(tmp_dir, "gn3"), help="Directory in which to write result file"),
   make_option(c("-f", "--filename"), type="character", default=NULL, help="Name to use for result file"),
-  make_option(c("-v", "--verbose"), action="store_true", default=NULL, help="Show extra information")
+  make_option(c("-v", "--verbose"), action="store_true", default=TRUE, help="Show extra information")
 );
 
 opt_parser = OptionParser(option_list=option_list);
@@ -353,5 +356,8 @@ if (!is.null(opt$pairscan)) {
     colnames(qtl_results)[4:7] <- c("AC", "AD", "BC", "BD")
   }
 
+  write.csv(qtl_results, out_file)
 }
+
+cat("End of script. Now working on processing the results.\n")
diff --git a/scripts/update_rif_table.py b/scripts/update_rif_table.py
index 24edf3d..f936f5b 100755
--- a/scripts/update_rif_table.py
+++ b/scripts/update_rif_table.py
@@ -35,7 +35,7 @@ VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
 
 def download_file(url: str, dest: pathlib.Path):
     """Saves the contents of url in dest"""
-    with requests.get(url, stream=True) as resp:
+    with requests.get(url, stream=True, timeout=300) as resp:
         resp.raise_for_status()
         with open(dest, "wb") as downloaded_file:
             for chunk in resp.iter_content(chunk_size=8192):
