aboutsummaryrefslogtreecommitdiff
path: root/scripts/addRif
blob: 4d33af3fe424944c8eb9efa1471cc34fa7aad8b9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Copyright (C) University of Tennessee Health Science Center, Memphis, TN.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License
# as published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# This program is available from Source Forge: at GeneNetwork Project
# (sourceforge.net/projects/genenetwork/).
#
# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010)
# at rwilliams@uthsc.edu and xzhou15@uthsc.edu
#
#
#
# This module is used by GeneNetwork project (www.genenetwork.org)
#
# Created by GeneNetwork Core Team 2010/08/10
#
# Last updated by Lei Yan 2011/02/08

# created by Lei Yan 02/08/2011

import os
import sys
import MySQLdb

# Directory layout, derived from the location of this script:
#   path1 - absolute path of the directory containing this script
#   path2 - path1 followed by "..", placed first on sys.path
#   path3 - path1 followed by "../../tmp", the working directory fetchrif uses
path1 = os.path.abspath(os.path.dirname(__file__))
path2 = "/".join([path1, ".."])
path3 = "/".join([path1, "..", "..", "tmp"])
sys.path.insert(0, path2)


def _refresh_download(name, url):
    """Replace file ``name`` in the current directory with a fresh download.

    ``url`` must point at the gzip archive that wget will save as
    ``name + ".gz"``; the archive is unpacked in place with gunzip.

    Raises:
        RuntimeError: if wget or gunzip exits with a non-zero status.
    """
    # Remove a leftover archive too: if "<name>.gz" survives an earlier failed
    # run, wget stores the new download as "<name>.gz.1" and gunzip would then
    # unpack the stale archive.
    os.system("rm -vf %s %s.gz" % (name, name))
    for command in ("wget %s" % url, "gunzip %s.gz" % name):
        if os.system(command) != 0:
            raise RuntimeError("command failed: %s" % command)


def fetchrif():
    """Load new NCBI GeneRIF rows into the ``GeneRIF_BASIC`` MySQL table.

    Steps:
      1. Connect to database ``gn3`` on localhost.
      2. Change into ``path3`` and download NCBI ``gene_info``; build a map
         from its second column to its third column (stored as ``symbol``)
         for rows whose first column is a supported taxonomy id.
      3. Download ``generifs_basic``; for every row of a supported taxonomy
         id with at least five columns, insert it unless a row with the same
         SpeciesId, GeneId, PubMed_ID, createtime and comment already exists.

    Exits the process with status 1 if the database connection fails.

    Raises:
        RuntimeError: if a download or decompression step fails.
    """
    try:
        con = MySQLdb.Connect(db="gn3", host="localhost", user="gn2", passwd="password")
    except MySQLdb.Error as err:
        # Report the real cause (it is not always a bad password) and signal
        # failure to the calling shell with a non-zero status.
        print("Could not connect to mysql (wrong password?): %s\n" % (err,))
        sys.exit(1)
    print("You have successfully connected to mysql.\n")

    # NCBI taxonomy id -> SpeciesId stored in GeneRIF_BASIC
    # (10090 mouse, 9606 human, 10116 rat, 3702 Arabidopsis).
    taxIds = {"10090": 1, "9606": 4, "10116": 2, "3702": 3}

    count_sql = """
        SELECT COUNT(*)
        FROM GeneRIF_BASIC
        WHERE GeneRIF_BASIC.`SpeciesId`=%s
        AND GeneRIF_BASIC.`GeneId`=%s
        AND GeneRIF_BASIC.`PubMed_ID`=%s
        AND GeneRIF_BASIC.`createtime`=%s
        AND GeneRIF_BASIC.`comment`=%s
        """
    insert_sql = """
        INSERT INTO GeneRIF_BASIC
        SET GeneRIF_BASIC.`SpeciesId`=%s,
            GeneRIF_BASIC.`GeneId`=%s,
            GeneRIF_BASIC.`symbol`=%s,
            GeneRIF_BASIC.`PubMed_ID`=%s,
            GeneRIF_BASIC.`createtime`=%s,
            GeneRIF_BASIC.`comment`=%s
        """

    try:
        cursor = con.cursor()

        os.chdir(path3)
        print("path3: %s" % (path3))

        # Pass 1: gene id -> symbol for the supported species.
        _refresh_download(
            "gene_info", "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz"
        )
        genedict = {}
        i = 0
        with open("gene_info", "r") as gene_info:
            for line in gene_info:
                line = line.strip()
                if line.startswith("#"):
                    continue
                fields = line.split("\t")
                # The length guard keeps a truncated row from raising IndexError.
                if fields[0] in taxIds and len(fields) >= 3:
                    genedict[fields[1]] = fields[2]
                i += 1
                if i % 1000000 == 0:
                    print("finished: %d" % (i))
        print("finished all: %d" % (i))

        # Pass 2: insert every GeneRIF row that is not in the table yet.
        _refresh_download(
            "generifs_basic",
            "ftp://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/generifs_basic.gz",
        )
        i = 0
        with open("generifs_basic", "r") as generifs:
            for line in generifs:
                line = line.strip()
                if line.startswith("#"):
                    continue
                fields = line.split("\t")
                if fields[0] in taxIds and len(fields) >= 5:
                    species_id = taxIds[fields[0]]
                    gene_id, pubmed_id, createtime, comment = fields[1:5]
                    # Genes missing from gene_info get an empty symbol.
                    symbol = genedict.get(gene_id, "")
                    cursor.execute(
                        count_sql,
                        (species_id, gene_id, pubmed_id, createtime, comment),
                    )
                    if cursor.fetchone()[0] == 0:
                        print("to insert...")
                        cursor.execute(
                            insert_sql,
                            (species_id, gene_id, symbol, pubmed_id, createtime, comment),
                        )
                i += 1
                if i % 100000 == 0:
                    # Commit in batches so a late failure keeps earlier work.
                    con.commit()
                    print("finished: %d" % (i))
        # MySQLdb starts with autocommit off; without an explicit commit the
        # INSERTs are discarded on transactional tables when the connection
        # closes.
        con.commit()
        print("finished all: %d" % (i))
        cursor.close()
    finally:
        con.close()


# Usage: /usr/bin/python addRif.py

# Script entry point: echo the command-line arguments, run the GeneRIF
# import, then report completion.
if __name__ == "__main__":
    banner = "command line arguments:\n\t%s"
    print(banner % (sys.argv,))
    fetchrif()
    print("exit successfully")