diff options
author | Muriithi Frederick Muriuki | 2021-08-31 06:51:18 +0300 |
---|---|---|
committer | Muriithi Frederick Muriuki | 2021-08-31 06:55:10 +0300 |
commit | 6ab866183aeac8553fdcda9217e4445da2b4836b (patch) | |
tree | a94e70179af04b5cffe04b27f27b29e69c104997 /gn3/db | |
parent | b8777bcfee70325263d5389367e3a93ec2842f69 (diff) | |
download | genenetwork3-6ab866183aeac8553fdcda9217e4445da2b4836b.tar.gz |
Provide utilities for genotype files
Issue:
https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/clustering.gmi
* gn3/db/genotypes.py: New module
* gn3/settings.py: Add new configuration variable
* qtlfilesexport.py: Test out new code
Add a module containing functions dealing with the genotype files.
Add a configuration variable to point to the location of the genotype files.
Diffstat (limited to 'gn3/db')
-rw-r--r-- | gn3/db/genotypes.py | 69 |
1 files changed, 69 insertions, 0 deletions
diff --git a/gn3/db/genotypes.py b/gn3/db/genotypes.py new file mode 100644 index 0000000..610ddde --- /dev/null +++ b/gn3/db/genotypes.py @@ -0,0 +1,69 @@ +"""Genotype utilities""" + +import os +import gzip +from gn3.settings import GENOTYPE_FILES + +def build_genotype_file( + geno_name: str, base_dir: str = GENOTYPE_FILES, + extension: str = "geno"): + """Build the absolute path for the genotype file.""" + return "{}/{}.{}".format(os.path.abspath(base_dir), geno_name, extension) + +def load_genotype_samples(genotype_filename: str, file_type: str = "geno"): + """ + Load sample of strains from genotype files. + + DESCRIPTION: + Traits can contain a varied number of strains, some of which do not exist in + certain genotypes. In order to compute QTLs, GEMMAs, etc, we need to ensure + to pick only those strains that exist in the genotype under consideration + for the traits used in the computation. + + This function loads a list of samples from the genotype files for use in + filtering out unusable strains. + + + PARAMETERS: + genotype_filename: The absolute path to the genotype file to load the + samples from. + file_type: The type of file. Currently supported values are 'geno' and + 'plink'. + """ + file_type_fns = { + "geno": __load_genotype_samples_from_geno, + "plink": __load_genotype_samples_from_plink + } + return file_type_fns[file_type](genotype_filename) + +def __load_genotype_samples_from_geno(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.geno' files. + """ + gzipped_filename = "{}.gz".format(genotype_filename) + if os.path.isfile(gzipped_filename): + genofile = gzip.open(gzipped_filename) + else: + genofile = open(genotype_filename) + + for row in genofile: + line = row.strip() + if (not line) or (line.startswith(("#", "@"))): + continue + break + + headers = line.split("\t") + if headers[3] == "Mb": + return headers[4:] + return headers[3:] + +def __load_genotype_samples_from_plink(genotype_filename: str): + """ + Helper function for `load_genotype_samples` function. + + Loads samples from '.plink' files. + """ + genofile = open(genotype_filename) + return [line.split(" ")[1] for line in genofile] |