about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-xscripts/maintenance/QTL_Reaper_v6.py108
-rw-r--r--scripts/maintenance/README.txt66
-rw-r--r--scripts/maintenance/Update_Case_Attributes_MySQL_tab.py27
-rwxr-xr-xscripts/maintenance/delete_genotypes.py33
-rwxr-xr-xscripts/maintenance/delete_phenotypes.py35
-rwxr-xr-xscripts/maintenance/load_genotypes.py154
-rwxr-xr-xscripts/maintenance/load_phenotypes.py171
-rwxr-xr-xscripts/maintenance/readProbeSetMean_v7.py274
-rwxr-xr-xscripts/maintenance/readProbeSetSE_v7.py254
9 files changed, 1122 insertions, 0 deletions
diff --git a/scripts/maintenance/QTL_Reaper_v6.py b/scripts/maintenance/QTL_Reaper_v6.py
new file mode 100755
index 00000000..e50dbd40
--- /dev/null
+++ b/scripts/maintenance/QTL_Reaper_v6.py
@@ -0,0 +1,108 @@
+####### To run this program do: python QTL_Reaper_v2.py 235
+#Where 163 is the ProbeSetFreeze Id of the database that we want to calculate #the LRS
+#!/usr/bin/python
+import sys
+import os
+import reaper
+import MySQLdb
+import time
+
+con = MySQLdb.Connect(db='db_webqtl',user='username',passwd='', host="localhost")
+cursor = con.cursor()
+
+genotypeDir = '/gnshare/gn/web/genotypes/'
+genotype_1 = reaper.Dataset()
+
+#####get all of the genotypes
+cursor.execute('select Id, Name from InbredSet')
+results = cursor.fetchall()
+InbredSets = {}
+for item in results:
+	InbredSets[item[0]] = genotypeDir+str(item[1])+'.geno'
+
+ProbeSetFreezeIds=sys.argv[1:]
+if ProbeSetFreezeIds:
+	#####convert the Ids to integer
+	ProbeSetFreezeIds=map(int, ProbeSetFreezeIds)
+
+else:
+	#####get all of the dataset that need be updated
+	cursor.execute('select distinct(ProbeSetFreezeId) from ProbeSetXRef where pValue is NULL order by ProbeSetFreezeId desc')
+	results = cursor.fetchall()
+	ProbeSetFreezeIds = []
+	for item in results:
+		ProbeSetFreezeIds.append(item[0])
+
+####human dataset can NOT use this program calculate the LRS, ignore it
+if ProbeSetFreezeIds.__contains__(215):
+	ProbeSetFreezeIds.remove(215)
+
+#output_file = open ('/home/xzhou/work/DatabaseTools/cal_LRS_Additive_result.txt', 'w')
+
+#####update 
+for ProbeSetFreezeId in ProbeSetFreezeIds:
+	cursor.execute("""
+		select InbredSetId 
+		from ProbeFreeze, ProbeSetFreeze 
+		where ProbeFreeze.Id=ProbeSetFreeze.ProbeFreezeId and ProbeSetFreeze.Id=%d
+	"""%ProbeSetFreezeId);
+
+	InbredSetId = cursor.fetchone()[0]
+	if InbredSetId==3:
+		InbredSetId=1
+	#if InbredSetId==12:
+	#	InbredSetId=2
+
+	print ProbeSetFreezeId, InbredSets[InbredSetId]
+
+	genotype_1.read(InbredSets[InbredSetId])
+	locuses = []
+	for geno in genotype_1:
+		for locus in geno:
+			locuses.append(locus.name)
+
+	cursor.execute('select ProbeSetId, Locus, DataId from ProbeSetXRef where ProbeSetFreezeId=%s'%ProbeSetFreezeId)
+	ProbeSetXRefInfos = cursor.fetchall()
+
+	kj=0
+	for aProbeSetXRef in ProbeSetXRefInfos:
+		ProbeSetId, Locus, DataId = aProbeSetXRef
+		prgy = genotype_1.prgy
+
+		cursor.execute("select Strain.Name, ProbeSetData.value from Strain, ProbeSetData where Strain.Id = ProbeSetData.StrainId and ProbeSetData.Id = %d" % DataId)
+		results = cursor.fetchall()
+		if not results:
+			continue
+		_strains = []
+		_values = []
+		for item2 in results:
+			strain, value = item2
+			if strain in prgy:
+				_strains.append(strain)
+				_values.append(value)
+		if not _strains or not _values:
+			continue
+		
+		if len(_strains) < 8:
+			continue
+		qtlresults = genotype_1.regression(strains = _strains, trait = _values)
+		_max = max(qtlresults)
+		_locus = _max.locus.name
+		_additive = _max.additive
+		_max = _max.lrs
+
+		#output_file.write('%s\t%s\t%s\t%s\t%s\n' % (ProbeSetFreezeId, ProbeSetId, _locus, _max, _additive))
+		
+		# _max(LRS) maybe is infinite sometimes, so define it as a very big number
+		if _max == float('inf'):
+			_max = 10000
+
+		cursor.execute('update ProbeSetXRef set Locus=%s, LRS=%s, additive=%s where ProbeSetId=%s and ProbeSetFreezeId=%s', \
+				(_locus, _max, _additive, ProbeSetId, ProbeSetFreezeId))
+
+		kj += 1
+		if kj%1000==0:
+			print ProbeSetFreezeId, InbredSets[InbredSetId],kj
+
+
+	print ProbeSetFreezeIds
diff --git a/scripts/maintenance/README.txt b/scripts/maintenance/README.txt
new file mode 100644
index 00000000..8604a5c9
--- /dev/null
+++ b/scripts/maintenance/README.txt
@@ -0,0 +1,66 @@
+Theses are Python2 scripts used for uploading data into the MySQL
+database (current as per April 2018)
+
+
+Last updated by A.Centeno 4-2-18
+==========================
+load_genotypes.py
+==========================
+Mainly used to enter genotype batch records
+Run load_genotypes.py as:
+python delete_genotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Load_Genotypes/config.ini
+
+==========================
+delete_genotypes.py
+==========================
+Mainly used to delete genotype batch records
+Run delete_genotypes.py as:
+python delete_genotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Delete_Genotypes/config.ini
+
+==========================
+load_phenotypes.py
+==========================
+Mainly used to enter phenotype trait batch records
+Run load_phenotypes.py as:
+python load_phenotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Load_Phenotypes/config.ini
+
+==========================
+delete_phenotypes.py
+==========================
+Mainly used to delete phenotype trait full records
+Run delete_phenotypes.py as:
+python delete_phenotypes.py /home/acenteno/copyfrom_spring211/Maintenance/dataset/Arthur-Geno-Pheno-Data/Delete_Phenotypes/config.ini
+
+==========================
+QTL_Reaper_v6.py
+==========================
+Mainly used to perform QTL reaper and obtain Max LRS values.
+Run QTL_Reaper_v6.py as:
+python QTL_Reaper_v6.py (and GN accession number here)
+
+==========================
+Update_Case_Attributes_MySQL_tab.py
+==========================
+Mainly used to enter Case attributes.
+Run Update_Case_Attributes_MySQL_tab.py as:
+python Update_Case_Attributes_MySQL_tab.py
+
+==========================
+readProbeSetMean_v7.py
+==========================
+Mainly used to enter mean expression data.
+Run readProbeSetMean_v7.py as:
+python readProbeSetMean_v7.py
+
+==========================
+readProbeSetSE_v7.py
+==========================
+Mainly used to enter Standard Error values from expression data.
+Run readProbeSetSE_v7.py as:
+python readProbeSetSE_v7.py
+
+==========================
+MYSQL command CALCULATE MEANS NEW Structure:
+==========================
+Mainly used to calculate mean from expression data values.
+update ProbeSetXRef set mean = (select AVG(value) from ProbeSetData where ProbeSetData.Id = ProbeSetXRef.DataId) where ProbeSetXRef.ProbeSetFreezeId = 811;
diff --git a/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py b/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py
new file mode 100644
index 00000000..0f8602c9
--- /dev/null
+++ b/scripts/maintenance/Update_Case_Attributes_MySQL_tab.py
@@ -0,0 +1,27 @@
+#!/usr/bin/python2
+########################################################################
+# Last Updated Apr 11, 2016 By Arthur and Zach
+########################################################################
+import string
+import sys
+import MySQLdb
+import getpass
+import time
+import csv
+########################################################################
+
+mydb = MySQLdb.connect(host='localhost',
+    user='username',
+    passwd='',
+    db='db_webqtl')
+cursor = mydb.cursor()
+
+csv_data = csv.reader(file('GN711_pvalues.txt'), delimiter ="\t")
+for row in csv_data:
+
+    cursor.execute("""UPDATE ProbeSetXRef SET pValue = %s WHERE ProbeSetFreezeId = %s AND ProbeSetId = %s """,
+          (row))
+#close the connection to the database.
+mydb.commit()
+cursor.close()
+print "Done"
\ No newline at end of file
diff --git a/scripts/maintenance/delete_genotypes.py b/scripts/maintenance/delete_genotypes.py
new file mode 100755
index 00000000..fa693f0f
--- /dev/null
+++ b/scripts/maintenance/delete_genotypes.py
@@ -0,0 +1,33 @@
+import sys
+import csv
+
+import utilities
+import datastructure
+import genotypes
+
+def main(argv):
+    # config
+    config = utilities.get_config(argv[1])
+    print "config:"
+    for item in config.items('config'):
+        print "\t%s" % (str(item))
+    # var
+    print "variable:"
+    inbredsetid = config.get('config', 'inbredsetid')
+    print "\tinbredsetid: %s" % inbredsetid
+    # datafile
+    datafile = open(config.get('config', 'datafile'), 'r')
+    datafile = csv.reader(datafile, delimiter='\t', quotechar='"')
+    datafile.next()
+    delrowcount = 0
+    for row in datafile:
+        if len(row) == 0:
+            continue
+        genoname = row[0]
+        delrowcount += genotypes.delete(genoname, inbredsetid)
+    print "deleted %d genotypes" % (delrowcount)
+
+if __name__ == "__main__":
+    print "command line arguments:\n\t%s" % sys.argv
+    main(sys.argv)
+    print "exit successfully"
diff --git a/scripts/maintenance/delete_phenotypes.py b/scripts/maintenance/delete_phenotypes.py
new file mode 100755
index 00000000..326c466e
--- /dev/null
+++ b/scripts/maintenance/delete_phenotypes.py
@@ -0,0 +1,35 @@
+import sys
+import csv
+
+import utilities
+import datastructure
+import phenotypes
+
+def main(argv):
+    # config
+    config = utilities.get_config(argv[1])
+    print "config:"
+    for item in config.items('config'):
+        print "\t%s" % (str(item))
+    # var
+    print "variable:"
+    inbredsetid = config.get('config', 'inbredsetid')
+    print "\tinbredsetid: %s" % inbredsetid
+    # datafile
+    datafile = open(config.get('config', 'datafile'), 'r')
+    datafile = csv.reader(datafile, delimiter='\t', quotechar='"')
+    delrowcount = 0
+    for row in datafile:
+        if len(row) == 0:
+            continue
+        try:
+            publishxrefid = int(row[0])
+        except:
+            continue
+        delrowcount += phenotypes.delete(publishxrefid=publishxrefid, inbredsetid=inbredsetid)
+    print "deleted %d phenotypes" % (delrowcount)
+
+if __name__ == "__main__":
+    print "command line arguments:\n\t%s" % sys.argv
+    main(sys.argv)
+    print "exit successfully"
diff --git a/scripts/maintenance/load_genotypes.py b/scripts/maintenance/load_genotypes.py
new file mode 100755
index 00000000..338483f4
--- /dev/null
+++ b/scripts/maintenance/load_genotypes.py
@@ -0,0 +1,154 @@
+import sys
+import re
+
+import utilities
+import datastructure
+
+def main(argv):
+    config = utilities.get_config(argv[1])
+    print("config file:")
+    for item in config.items('config'):
+        print("\t%s" % str(item))
+    parse_genofile(config, fetch_parameters(config))
+
+def fetch_parameters(config):
+    config_dic = {}
+    config_dic['inbredsetid'] = config.get('config', 'inbredsetid')
+    config_dic["speciesid"] = datastructure.get_species(config_dic['inbredsetid'])[0]
+    config_dic['genofreezeid'] = datastructure.get_genofreeze_byinbredsetid(config_dic['inbredsetid'])[0]
+    config_dic['dataid'] = datastructure.get_nextdataid_genotype()
+    config_dic['genofile'] = config.get('config', 'genofile')
+    print("config dictionary:")
+    for k, v in config_dic.items():
+        print("\t%s: %s" % (k, v))
+    return config_dic
+
+def parse_genofile(config, config_dic):
+    genofile = open(config_dic['genofile'], 'r')
+    meta_dic = {}
+    for line in genofile:
+        line = line.strip()
+        if len(line) == 0:
+            continue
+        if line.startswith('#'):
+            continue
+        if line.startswith('@'):
+            line = line.strip('@')
+            items = line.split(';')
+            for item in items:
+                kv = re.split(':|=', item)
+                meta_dic[kv[0].strip()] = kv[1].strip()
+            continue
+        if line.lower().startswith("chr"):
+            #
+            print("geno file meta dictionary:")
+            for k, v in meta_dic.items():
+                print("\t%s: %s" % (k, v))
+            #
+            print("geno file head:\n\t%s" % line)
+            strainnames = line.split()[4:]
+            config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes")
+            continue
+        # geno file line, marker
+        marker_dic = parse_marker(line)
+        marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic)
+        if check_genoxref(config_dic, marker_dic):
+            continue
+        insert_genodata(config, config_dic, marker_dic)
+        insert_genoxref(config_dic, marker_dic)
+        config_dic['dataid'] += 1
+    genofile.close()
+
+def parse_marker(line):
+    marker_dic = {}
+    cells = line.split()
+    marker_dic['chromosome'] = cells[0]
+    marker_dic['locus'] = cells[1]
+    marker_dic['cm'] = cells[2]
+    marker_dic['mb'] = cells[3]
+    marker_dic['values'] = cells[4:]
+    return marker_dic
+        
+def check_or_insert_geno(config_dic, marker_dic):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        SELECT Geno.`Id`
+        FROM Geno
+        WHERE Geno.`SpeciesId`=%s
+        AND Geno.`Name` like %s
+        """
+    cursor.execute(sql, (config_dic["speciesid"], marker_dic['locus']))
+    result = cursor.fetchone()
+    if result:
+        genoid = result[0]
+        print("get geno record: %d" % genoid)
+    else:
+        sql = """
+            INSERT INTO Geno
+            SET
+            Geno.`SpeciesId`=%s,
+            Geno.`Name`=%s,
+            Geno.`Marker_Name`=%s,
+            Geno.`Chr`=%s,
+            Geno.`Mb`=%s
+            """
+        cursor.execute(sql, (config_dic['speciesid'], marker_dic['locus'], marker_dic['locus'], marker_dic['chromosome'], marker_dic['mb']))
+        rowcount = cursor.rowcount
+        genoid = con.insert_id()
+        print("INSERT INTO Geno: %d record: %d" % (rowcount, genoid))
+    return genoid
+
+def check_genoxref(config_dic, marker_dic):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        select GenoXRef.*
+        from GenoXRef
+        where GenoXRef.`GenoFreezeId`=%s
+        AND GenoXRef.`GenoId`=%s
+        """
+    cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid']))
+    rowcount = cursor.rowcount
+    return rowcount
+
+def insert_genodata(config, config_dic, marker_dic):
+    cursor, con = utilities.get_cursor()
+    for index, strain in enumerate(config_dic['strains']):
+        strainid = strain[0]
+        value = utilities.to_db_string(marker_dic['values'][index], None)
+        if not value:
+            continue
+        value = config.get('config', "genovalue_" + value)
+        try:
+            number = int(value)
+        except:
+            continue
+        if not number in [-1, 0, 1]:
+            continue
+        sql = """
+            INSERT INTO GenoData
+            SET
+            GenoData.`Id`=%s,
+            GenoData.`StrainId`=%s,
+            GenoData.`value`=%s
+            """
+        cursor.execute(sql, (config_dic['dataid'], strainid, number))
+
+def insert_genoxref(config_dic, marker_dic):
+    cursor, con = utilities.get_cursor()
+    sql = """
+        INSERT INTO GenoXRef
+        SET
+        GenoXRef.`GenoFreezeId`=%s,
+        GenoXRef.`GenoId`=%s,
+        GenoXRef.`DataId`=%s,
+        GenoXRef.`cM`=%s,
+        GenoXRef.`Used_for_mapping`=%s
+        """
+    cursor.execute(sql, (config_dic['genofreezeid'], marker_dic['genoid'], config_dic['dataid'], marker_dic['cm'], 'N'))
+    rowcount = cursor.rowcount
+    print("INSERT INTO GenoXRef: %d record" % (rowcount))
+
+if __name__ == "__main__":
+    print("command line arguments:\n\t%s" % sys.argv)
+    main(sys.argv)
+    print("exit successfully")
diff --git a/scripts/maintenance/load_phenotypes.py b/scripts/maintenance/load_phenotypes.py
new file mode 100755
index 00000000..c3c6570b
--- /dev/null
+++ b/scripts/maintenance/load_phenotypes.py
@@ -0,0 +1,171 @@
+import sys
+import csv
+
+import utilities
+import datastructure
+
+def main(argv):
+    # config
+    config = utilities.get_config(argv[1])
+    print "config:"
+    for item in config.items('config'):
+        print "\t%s" % (str(item))
+    # var
+    inbredsetid = config.get('config', 'inbredsetid')
+    print "inbredsetid: %s" % inbredsetid
+    species = datastructure.get_species(inbredsetid)
+    speciesid = species[0]
+    print "speciesid: %s" % speciesid
+    dataid = datastructure.get_nextdataid_phenotype()
+    print "next data id: %s" % dataid
+    cursor, con = utilities.get_cursor()
+    # datafile
+    datafile = open(config.get('config', 'datafile'), 'r')
+    phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"')
+    phenotypedata_head = phenotypedata.next()
+    print "phenotypedata head:\n\t%s" % phenotypedata_head
+    strainnames = phenotypedata_head[1:]
+    strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes")
+    # metafile
+    metafile = open(config.get('config', 'metafile'), 'r')
+    phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"')
+    phenotypemeta_head = phenotypemeta.next()
+    print "phenotypemeta head:\n\t%s" % phenotypemeta_head
+    print
+    # load
+    for metarow in phenotypemeta:
+        #
+        datarow_value = phenotypedata.next()
+        datarow_se = phenotypedata.next()
+        datarow_n = phenotypedata.next()
+        # Phenotype
+        sql = """
+            INSERT INTO Phenotype
+            SET
+            Phenotype.`Pre_publication_description`=%s,
+            Phenotype.`Post_publication_description`=%s,
+            Phenotype.`Original_description`=%s,
+            Phenotype.`Pre_publication_abbreviation`=%s,
+            Phenotype.`Post_publication_abbreviation`=%s,
+            Phenotype.`Lab_code`=%s,
+            Phenotype.`Submitter`=%s,
+            Phenotype.`Owner`=%s,
+            Phenotype.`Authorized_Users`=%s,
+            Phenotype.`Units`=%s
+            """
+        cursor.execute(sql, (
+            utilities.to_db_string(metarow[1], None),
+            utilities.to_db_string(metarow[2], None),
+            utilities.to_db_string(metarow[3], None),
+            utilities.to_db_string(metarow[4], None),
+            utilities.to_db_string(metarow[5], None),
+            utilities.to_db_string(metarow[6], None),
+            utilities.to_db_string(metarow[7], None),
+            utilities.to_db_string(metarow[8], None),
+            utilities.to_db_string(metarow[9], ""),
+            utilities.to_db_string(metarow[18], ""),
+            ))
+        rowcount = cursor.rowcount
+        phenotypeid = con.insert_id()
+        print "INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid)
+        # Publication
+        publicationid = None # reset
+        pubmed_id = utilities.to_db_string(metarow[0], None)
+        if pubmed_id:
+            sql = """
+                SELECT Publication.`Id`
+                FROM Publication
+                WHERE Publication.`PubMed_ID`=%s
+                """
+            cursor.execute(sql, (pubmed_id))
+            re = cursor.fetchone()
+            if re:
+                publicationid = re[0]
+                print "get Publication record: %d" % publicationid
+        if not publicationid:
+            sql = """
+                INSERT INTO Publication
+                SET
+                Publication.`PubMed_ID`=%s,
+                Publication.`Abstract`=%s,
+                Publication.`Authors`=%s,
+                Publication.`Title`=%s,
+                Publication.`Journal`=%s,
+                Publication.`Volume`=%s,
+                Publication.`Pages`=%s,
+                Publication.`Month`=%s,
+                Publication.`Year`=%s
+                """
+            cursor.execute(sql, (
+                utilities.to_db_string(metarow[0], None),
+                utilities.to_db_string(metarow[12], None),
+                utilities.to_db_string(metarow[10], ""),
+                utilities.to_db_string(metarow[11], None),
+                utilities.to_db_string(metarow[13], None),
+                utilities.to_db_string(metarow[14], None),
+                utilities.to_db_string(metarow[15], None),
+                utilities.to_db_string(metarow[16], None),
+                utilities.to_db_string(metarow[17], ""),
+                ))
+            rowcount = cursor.rowcount
+            publicationid = con.insert_id()
+            print "INSERT INTO Publication: %d record: %d" % (rowcount, publicationid)
+        # data
+        for index, strain in enumerate(strains):
+            #
+            strainid = strain[0]
+            value   = utilities.to_db_float(datarow_value[index+1], None)
+            se      = utilities.to_db_float(datarow_se[index+1], None)
+            n       = utilities.to_db_int(datarow_n[index+1], None)
+            #
+            if value is not None:
+                sql = """
+                    INSERT INTO PublishData
+                    SET
+                    PublishData.`Id`=%s,
+                    PublishData.`StrainId`=%s,
+                    PublishData.`value`=%s
+                    """
+                cursor.execute(sql, (dataid, strainid, value))
+            if se is not None:
+                sql = """
+                    INSERT INTO PublishSE
+                    SET
+                    PublishSE.`DataId`=%s,
+                    PublishSE.`StrainId`=%s,
+                    PublishSE.`error`=%s
+                    """
+                cursor.execute(sql, (dataid, strainid, se))
+            if n is not None:
+                sql = """
+                    INSERT INTO NStrain
+                    SET
+                    NStrain.`DataId`=%s,
+                    NStrain.`StrainId`=%s,
+                    NStrain.`count`=%s
+                    """
+                cursor.execute(sql, (dataid, strainid, n))
+        # PublishXRef
+        sql = """
+            INSERT INTO PublishXRef
+            SET
+            PublishXRef.`InbredSetId`=%s,
+            PublishXRef.`PhenotypeId`=%s,
+            PublishXRef.`PublicationId`=%s,
+            PublishXRef.`DataId`=%s,
+            PublishXRef.`comments`=%s
+            """
+        cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, ""))
+        rowcount = cursor.rowcount
+        publishxrefid = con.insert_id()
+        print "INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid)
+        # for loop next
+        dataid += 1
+        print
+    # release
+    con.close()
+
+if __name__ == "__main__":
+    print "command line arguments:\n\t%s" % sys.argv
+    main(sys.argv)
+    print "exit successfully"
diff --git a/scripts/maintenance/readProbeSetMean_v7.py b/scripts/maintenance/readProbeSetMean_v7.py
new file mode 100755
index 00000000..e9c8f25c
--- /dev/null
+++ b/scripts/maintenance/readProbeSetMean_v7.py
@@ -0,0 +1,274 @@
+#!/usr/bin/python2
+"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value"""
+
+########################################################################
+# Last Updated Sep 27, 2011 by Xiaodong
+########################################################################
+import string
+import sys
+import MySQLdb
+import getpass
+import time
+#import pdb
+
+#pdb.set_trace()
+
+########################################################################
+
+def translateAlias(str):
+	if str == "B6":
+		return "C57BL/6J"
+	elif str == "D2":
+		return "DBA/2J"
+	else:
+		return str
+
+########################################################################
+#
+#  Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile
+#
+########################################################################
+
+dataStart = 1
+
+GeneChipId = int( raw_input("Enter GeneChipId:") )
+ProbeSetFreezeId = int( raw_input("Enter ProbeSetFreezeId:") )
+input_file_name = raw_input("Enter file name with suffix:")
+
+fp = open("%s" % input_file_name, 'rb')
+
+try:
+        passwd = getpass.getpass('Please enter mysql password here : ') 
+        con = MySQLdb.Connect(db='db_webqtl',host='localhost', user='username',passwd=passwd)
+
+        db = con.cursor()
+        print "You have successfully connected to mysql.\n"
+except:
+        print "You entered incorrect password.\n"
+        sys.exit(0)
+
+time0 = time.time()
+
+#########################################################################
+#
+#  Check if each line have same number of members
+#  generate the gene list of expression data here
+#
+#########################################################################
+print 'Checking if each line have same number of members'
+
+GeneList = []
+isCont = 1
+header = fp.readline()
+header = string.split(string.strip(header),'\t')
+header = map(string.strip, header)
+nfield = len(header)
+line = fp.readline()
+
+kj=0
+while line:
+	line2 = string.split(string.strip(line),'\t')
+	line2 = map(string.strip, line2)
+	if len(line2) != nfield:
+		print "Error : " + line
+		isCont = 0
+
+	GeneList.append(line2[0])
+	line = fp.readline()
+
+	kj+=1
+	if kj%100000 == 0:
+		print 'checked ',kj,' lines'
+
+GeneList = map(string.lower, GeneList)
+GeneList.sort()
+	
+if isCont==0:
+	sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+#########################################################################
+#
+#  Check if each strain exist in database
+#  generate the string id list of expression data here
+#
+#########################################################################
+print 'Checking if each strain exist in database'
+
+isCont = 1
+fp.seek(0)
+header = fp.readline()
+header = string.split(string.strip(header),'\t')
+header = map(string.strip, header)
+header = map(translateAlias, header)
+header = header[dataStart:]
+Ids = []
+for item in header:
+	try:
+		db.execute('select Id from Strain where Name = "%s"' % item)
+		Ids.append(db.fetchall()[0][0])
+	except:
+		print item,'does not exist, check the if the strain name is correct'
+		isCont=0
+
+if isCont==0:
+	sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+########################################################################
+#
+# Check if each ProbeSet exist in database
+#
+########################################################################
+print 'Check if each ProbeSet exist in database'
+
+##---- find PID is name or target ----##
+line = fp.readline()
+line = fp.readline()
+line2 = string.split(string.strip(line),'\t')
+line2 = map(string.strip, line2)
+PId = line2[0]
+
+db.execute('select Id from ProbeSet where Name="%s" and ChipId=%d' % (PId, GeneChipId) )
+results = db.fetchall()
+IdStr = 'TargetId'
+if len(results)>0:
+	IdStr = 'Name'
+
+
+##---- get Name/TargetId list from database ----##
+db.execute('select distinct(%s) from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr))
+results = db.fetchall()
+	
+Names = []
+for item in results:
+	Names.append(item[0])
+	
+print Names
+
+Names = map(string.lower, Names)
+
+Names.sort() # -- Fixed the lower case problem of ProbeSets affx-mur_b2_at  doesn't exist --#
+
+
+##---- compare genelist with names ----##
+x=y=0
+x1=-1
+GeneList2=[]
+while x<len(GeneList) and y<len(Names):
+	if GeneList[x]==Names[y]:
+		x += 1
+		y += 1
+	elif GeneList[x]<Names[y]:
+		if x!=x1:
+			GeneList2.append(GeneList[x])
+			x1 = x
+		x += 1
+	elif GeneList[x]>Names[y]:
+		y += 1
+
+	if x%100000==0:
+		print 'check Name, checked %d lines'%x
+
+while x<len(GeneList):
+	GeneList2.append(GeneList[x])
+	x += 1
+
+isCont=1
+ferror = open("ProbeSetError.txt", "wb")
+for item in GeneList2:
+	ferror.write(item + " doesn't exist \n")
+	print item, " doesn't exist, check if the ProbeSet name is correct"
+	isCont = 0
+        
+if isCont==0:
+	sys.exit(0)
+
+
+print 'used ',time.time()-time0,' seconds'
+#########################################################################
+#
+# Insert data into database
+#
+#########################################################################
+print 'getting ProbeSet/Id'
+
+
+#---- get Name/Id map ----#
+db.execute('select %s, Id from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr))
+results = db.fetchall()
+NameIds = {}
+for item in results:
+	NameIds[item[0]] = item[1]
+print 'used ',time.time()-time0,' seconds'
+
+
+print 'inserting data'
+
+##---- get old max dataId ----##
+db.execute('select max(Id) from ProbeSetData')
+maxDataId = int(db.fetchall()[0][0])
+bmax = maxDataId
+print "old_max = %d\n" % bmax
+
+##---- insert data ----##
+fp.seek(0)
+line = fp.readline()
+line = fp.readline()
+kj = 0
+
+values1 = []
+values2 = []
+while line:
+	line2 = string.split(string.strip(line),'\t')
+	line2 = map(string.strip, line2)
+	PId = line2[0]
+	recordId = NameIds[PId]
+
+	maxDataId += 1
+	datasorig = line2[dataStart:]
+	
+	###### Data Table items ######
+	i=0
+	for item in datasorig:
+		try:
+			values1.append('(%d,%d,%s)' % (maxDataId, Ids[i], float(item)))
+      		except:
+			pass
+		i += 1
+
+	values2.append("(%d,%d,%d)" % (ProbeSetFreezeId, recordId, maxDataId))
+
+
+	##---- insert into table ----##
+	kj += 1
+	if kj % 100 == 0:
+		cmd = ','.join(values1)
+		cmd = 'insert into ProbeSetData values %s' % cmd
+		db.execute(cmd)
+
+		cmd = ','.join(values2)
+		cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd
+		db.execute(cmd)
+
+		values1=[]
+		values2=[]
+		print 'Inserted ', kj,' lines'
+		print 'used ',time.time()-time0,' seconds'
+	
+	line = fp.readline()
+
+
+
+if len(values1)>0:
+	cmd = ','.join(values1)
+	cmd = 'insert into ProbeSetData values %s' % cmd
+	db.execute(cmd)
+
+	cmd = ','.join(values2)
+	cmd = 'insert into ProbeSetXRef(ProbeSetFreezeId, ProbeSetId, DataId) values %s' % cmd
+	db.execute(cmd)
+
+con.close()
diff --git a/scripts/maintenance/readProbeSetSE_v7.py b/scripts/maintenance/readProbeSetSE_v7.py
new file mode 100755
index 00000000..fd6f0bb8
--- /dev/null
+++ b/scripts/maintenance/readProbeSetSE_v7.py
@@ -0,0 +1,254 @@
+#!/usr/bin/python2

+"""This script use the nearest marker to the transcript as control, increasing permutation rounds according to the p-value"""

+########################################################################

+# Last Updated Sep 27, 2011 by Xiaodong

+# This version fix the bug that incorrectly exclude the first 2 probesetIDs

+########################################################################

+

+import string

+import sys

+import MySQLdb

+import getpass

+import time

+

+

+def translateAlias(str):

+	if str == "B6":

+		return "C57BL/6J"

+	elif str == "D2":

+		return "DBA/2J"

+	else:

+		return str

+

+########################################################################

+#

+#  Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile

+#

+########################################################################

+

+dataStart = 1

+

+GeneChipId = int( raw_input("Enter GeneChipId:") )

+ProbeSetFreezeId = int( raw_input("Enter ProbeSetFreezeId:") )

+input_file_name = raw_input("Enter file name with suffix:")

+

+fp = open("%s" % input_file_name, 'rb')

+

+

+try:

+	passwd = getpass.getpass('Please enter mysql password here : ')

+        con = MySQLdb.Connect(db='db_webqtl',host='localhost', user='username',passwd=passwd)

+

+	db = con.cursor()

+        print "You have successfully connected to mysql.\n"

+except:

+	print "You entered incorrect password.\n"

+        sys.exit(0)

+	

+time0 = time.time()

+########################################################################

+#

+#  Indicate Data Start Position, ProbeFreezeId, GeneChipId, DataFile

+#

+########################################################################

+

+#GeneChipId = 4 

+#dataStart = 1 	

+#ProbeSetFreezeId = 359 #JAX Liver 6C Affy M430 2.0 (Jul11) MDP

+#fp = open("GSE10493_AllSamples_6C_Z_AvgSE.txt", 'rb')

+

+

+#########################################################################

+#

+#  Check if each line have same number of members

+#  generate the gene list of expression data here

+#

+#########################################################################

+print 'Checking if each line have same number of members'

+

+GeneList = []

+isCont = 1

+header = fp.readline()

+header = string.split(string.strip(header),'\t')

+header = map(string.strip, header)

+nfield = len(header)

+line = fp.readline()

+

+kj=0

+while line:

+	line2 = string.split(string.strip(line),'\t')

+	line2 = map(string.strip, line2)

+	if len(line2) != nfield:

+		print "Error : " + line

+		isCont = 0

+

+	GeneList.append(line2[0])

+	line = fp.readline()

+

+	kj+=1

+	if kj%100000 == 0:

+		print 'checked ',kj,' lines'

+

+GeneList = map(string.lower, GeneList)

+GeneList.sort()

+	

+if isCont==0:

+	sys.exit(0)

+

+

+print 'used ',time.time()-time0,' seconds'

+#########################################################################

+#

+#  Check if each strain exist in database

+#  generate the string id list of expression data here

+#

+#########################################################################

+print 'Checking if each strain exist in database'

+

+isCont = 1

+fp.seek(0)

+header = fp.readline()

+header = string.split(string.strip(header),'\t')

+header = map(string.strip, header)

+header = map(translateAlias, header)

+header = header[dataStart:]

+Ids = []

+for item in header:

+	try:

+		db.execute('select Id from Strain where Name = "%s"' % item)

+		Ids.append(db.fetchall()[0][0])

+	except:

+		print item,'does not exist, check the if the strain name is correct'

+		isCont=0

+

+if isCont==0:

+	sys.exit(0)

+

+

+print 'used ',time.time()-time0,' seconds'

+########################################################################

+#

+# Check if each ProbeSet exist in database

+#

+########################################################################

+print 'Check if each ProbeSet exist in database'

+

+##---- find PID is name or target ----##

+line = fp.readline()

+line = fp.readline()

+line2 = string.split(string.strip(line),'\t')

+line2 = map(string.strip, line2)

+PId = line2[0]

+

+db.execute('select Id from ProbeSet where Name="%s" and ChipId=%d' % (PId, GeneChipId))

+results = db.fetchall()

+IdStr = 'TargetId'

+if len(results)>0:

+	IdStr = 'Name'

+

+

+##---- get Name/TargetId list from database ----##

+db.execute('select distinct(%s) from ProbeSet where ChipId=%d order by %s' % (IdStr, GeneChipId, IdStr))

+results = db.fetchall()

+	

+Names = []

+for item in results:

+	Names.append(item[0])

+Names = map(string.lower, Names)

+Names.sort() # -- Fixed the lower case problem of ProbeSets affx-mur_b2_at  doesn't exist --#

+

+##---- compare genelist with names ----##

+x=y=0

+x1=-1

+GeneList2=[]

+while x<len(GeneList) and y<len(Names):

+	if GeneList[x]==Names[y]:

+		x += 1

+		y += 1

+	elif GeneList[x]<Names[y]:

+		if x!=x1:

+			GeneList2.append(GeneList[x])

+			x1 = x

+		x += 1

+	elif GeneList[x]>Names[y]:

+		y += 1

+

+	if x%100000==0:

+		print 'check Name, checked %d lines'%x

+

+while x<len(GeneList):

+	GeneList2.append(GeneList[x])

+	x += 1

+

+isCont=1

+ferror = open("ProbeSetError.txt", "wb")

+for item in GeneList2:

+	ferror.write(item + " doesn't exist \n")

+	print item, " doesn't exist"

+	isCont = 0

+        

+if isCont==0:

+	sys.exit(0)

+

+

+print 'used ',time.time()-time0,' seconds'

+#############################

+#Insert new Data into SE

+############################

+db.execute("""

+	select ProbeSet.%s, ProbeSetXRef.DataId from ProbeSet, ProbeSetXRef 

+	where ProbeSet.Id=ProbeSetXRef.ProbeSetId and ProbeSetXRef.ProbeSetFreezeId=%d"""

+	 % (IdStr, ProbeSetFreezeId))

+results = db.fetchall()

+

+ProbeNameId = {}

+for Name, Id in results:

+	ProbeNameId[Name] = Id

+

+ferror = open("ProbeError.txt", "wb")

+

+DataValues = []

+

+fp.seek(0) #XZ add this line

+line = fp.readline() #XZ add this line

+line = fp.readline()

+

+kj = 0

+while line:

+	line2 = string.split(string.strip(line),'\t')

+	line2 = map(string.strip, line2)

+

+	CellId = line2[0]

+	if not ProbeNameId.has_key(CellId):

+		ferror.write(CellId + " doesn't exist\n")

+		print CellId, " doesn't exist"

+	else:

+		DataId = ProbeNameId[CellId]

+		datasorig = line2[dataStart:]

+

+		i = 0

+		for item in datasorig:

+			if item != '':

+				value = '('+str(DataId)+','+str(Ids[i])+','+str(item)+')'

+				DataValues.append(value)

+			i += 1

+

+	kj += 1

+	if kj % 100 == 0:

+		Dataitems = ','.join(DataValues)

+		cmd = 'insert ProbeSetSE values %s' % Dataitems

+		db.execute(cmd)

+

+		DataValues = []

+		print 'inserted ',kj,' lines'

+		print 'used ',time.time()-time0,' seconds'

+	line = fp.readline()

+

+if len(DataValues)>0:

+	DataValues = ','.join(DataValues)

+	cmd = 'insert ProbeSetSE values %s' % DataValues

+	db.execute(cmd)

+

+con.close()

+

+