From 6aaefdaae3a9fb068278d9b94d8cdf25d4f8d852 Mon Sep 17 00:00:00 2001 From: Zachary Sloan Date: Fri, 19 Jul 2013 16:13:47 -0500 Subject: Created file gen_group_samplelists that iterates through all genofiles and builds each groups' samplelist --- wqflask/base/data_set.py | 50 ++++++++++++++++++++++++++++++++++++++------ wqflask/base/webqtlConfig.py | 8 +++---- 2 files changed, 48 insertions(+), 10 deletions(-) (limited to 'wqflask/base') diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py index 30221503..cf219fda 100755 --- a/wqflask/base/data_set.py +++ b/wqflask/base/data_set.py @@ -16,8 +16,6 @@ # Contact Drs. Robert W. Williams and Xiaodong Zhou (2010) # at rwilliams@uthsc.edu and xzhou15@uthsc.edu # -#we -# # This module is used by GeneNetwork project (www.genenetwork.org) from __future__ import absolute_import, print_function, division @@ -27,6 +25,7 @@ import string import collections import json +import gzip import cPickle as pickle import itertools @@ -52,8 +51,6 @@ from pprint import pformat as pf DS_NAME_MAP = {} def create_dataset(dataset_name, dataset_type = None): - - print("dataset_type:", dataset_type) if not dataset_type: dataset_type = Dataset_Getter(dataset_name) #dataset_type = get_dataset_type_from_json(dataset_name) @@ -129,7 +126,7 @@ def create_datasets_list(): for result in g.db.execute(query).fetchall(): #The query at the beginning of this function isn't necessary here, but still would #rather just reuse it - print("type: {}\tname: {}".format(dataset_type, result.Name)) + #print("type: {}\tname: {}".format(dataset_type, result.Name)) dataset = create_dataset(result.Name, dataset_type) datasets.append(dataset) @@ -261,6 +258,36 @@ class DatasetGroup(object): if maternal and paternal: self.parlist = [maternal, paternal] + def get_sample_list(self): + genofilename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno')) + genofile = open(genofilename, "r") + for line in genofile: + line = line.strip() + if line.startswith(("#", "@")): + continue + headline = line + break + headers = headline.split("\t") + if headers[3] == "Mb": + self.samplelist = headers[4:] + else: + self.samplelist = headers[3:] + + #if genotype_1.type == "group" and self.parlist: + # genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1) + #else: + # genotype_2 = genotype_1 + + #determine default genotype object + #if self.incparentsf1 and genotype_1.type != "intercross": + # genotype = genotype_2 + #else: + # self.incparentsf1 = 0 + # genotype = genotype_1 + + #self.samplelist = list(genotype.prgy) + + def read_genotype_file(self): '''Read genotype from .geno file instead of database''' #if self.group == 'BXD300': @@ -275,7 +302,18 @@ class DatasetGroup(object): # reaper barfs on unicode filenames, so here we ensure it's a string full_filename = str(os.path.join(webqtlConfig.GENODIR, self.name + '.geno')) - genotype_1.read(full_filename) + if os.path.isfile(full_filename): + print("Reading file: ", full_filename) + genotype_1.read(full_filename) + print("File read") + else: + try: + full_filename = str(os.path.join(webqtlConfig.TMPDIR, self.name + '.geno')) + #print("Reading file") + genotype_1.read(full_filename) + #print("File read") + except IOError: + print("File doesn't exist!") if genotype_1.type == "group" and self.parlist: genotype_2 = genotype_1.add(Mat=self.parlist[0], Pat=self.parlist[1]) #, F1=_f1) diff --git a/wqflask/base/webqtlConfig.py b/wqflask/base/webqtlConfig.py index d4511212..67a9c63f 100755 --- a/wqflask/base/webqtlConfig.py +++ b/wqflask/base/webqtlConfig.py @@ -35,7 +35,7 @@ NCBI_LOCUSID = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrie UCSC_REFSEQ = "http://genome.cse.ucsc.edu/cgi-bin/hgGene?db=%s&hgg_gene=%s&hgg_chrom=chr%s&hgg_start=%s&hgg_end=%s" GENBANK_ID = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=Nucleotide&cmd=search&doptcmdl=DocSum&term=%s" OMIM_ID = "http://www.ncbi.nlm.nih.gov/omim/%s" -UNIGEN_ID = "http://www.ncbi.nlm.nih.gov/UniGene/clust.cgi?ORG=%s&CID=%s" +UNIGEN_ID = "http://www.ncbi.nlm.nih.gov/UniGene/clust.cgi?ORG=%s&CID=%s"; HOMOLOGENE_ID = "http://www.ncbi.nlm.nih.gov/sites/entrez?Db=homologene&Cmd=DetailsSearch&Term=%s" PUBMEDLINK_URL = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=%s&dopt=Abstract" UCSC_POS = "http://genome.ucsc.edu/cgi-bin/hgTracks?clade=mammal&org=%s&db=%s&position=chr%s:%s-%s&pix=800&Submit=submit" @@ -53,12 +53,12 @@ GNROOT = "/home/zas1024/gene/" # Will remove this and dependent items later SECUREDIR = GNROOT + 'secure/' COMMON_LIB = GNROOT + 'support/admin' HTMLPATH = GNROOT + 'web/' -PYLMM_PATH = '/home/zas1024/plink/' -SNP_PATH = '/home/zas1024/snps/' +PYLMM_PATH = '/home/zas1024/' +SNP_PATH = '/mnt/xvdf1/snps/' IMGDIR = HTMLPATH +'image/' IMAGESPATH = HTMLPATH + 'images/' UPLOADPATH = IMAGESPATH + 'upload/' -TMPDIR = HTMLPATH + 'tmp/' +TMPDIR = '/tmp/' GENODIR = HTMLPATH + 'genotypes/' NEWGENODIR = HTMLPATH + 'new_genotypes/' GENO_ARCHIVE_DIR = GENODIR + 'archive/' -- cgit v1.2.3