From d40664196495a76ce344d94bdf283b512004ce67 Mon Sep 17 00:00:00 2001
From: Hao Chen
Date: Sun, 19 May 2019 12:17:50 -0500
Subject: search for top addiction genes
---
extract_gene_symb_name_from_ncbi_gene_info.sh | 4 -
templates/topAddictionGene.html | 355 ++++++++++++---------
topGene_step0_extract_gene_alias_from_gene_info.sh | 4 +
topGene_step1_cnt_abstracts.py | 57 ++--
topGene_step2_cnt_sentences.py | 16 +-
topGene_step3_generate_html.py | 15 +-
topGene_step4_get_pmids_for_all_top_genes.py | 33 ++
7 files changed, 292 insertions(+), 192 deletions(-)
delete mode 100644 extract_gene_symb_name_from_ncbi_gene_info.sh
create mode 100755 topGene_step0_extract_gene_alias_from_gene_info.sh
create mode 100755 topGene_step4_get_pmids_for_all_top_genes.py
diff --git a/extract_gene_symb_name_from_ncbi_gene_info.sh b/extract_gene_symb_name_from_ncbi_gene_info.sh
deleted file mode 100644
index 4d3118b..0000000
--- a/extract_gene_symb_name_from_ncbi_gene_info.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-#-e "s/\(|\)/ /g" -e "s/\[|\]/ /g"
-grep ^9606 ~/Downloads/gene_info |cut -f 3,5,12|grep -v ^LOC|grep -v -i pseudogene |sed -e "s/\t-//" -e "s/\t/|/2" -e "s/\t-//" -e "s/\t/\|/" -e "s/(\|)\|\[\|\]\|{\|}/ /g" | sort >ncbi_gene_symb_syno_name_txid9606.txt
-
diff --git a/templates/topAddictionGene.html b/templates/topAddictionGene.html
index 5709bba..e427560 100644
--- a/templates/topAddictionGene.html
+++ b/templates/topAddictionGene.html
@@ -1,164 +1,213 @@
+
{% extends "layout.html" %}
{% block content %}
-
Top 150 addiction related genes
+ Top addiction related genes
These genes are ranked by the number of PubMed abstracts that contain the name of the gene and one or more addiction related keyword.
-
+- CNR1 (CNRS1; PROTOCADHERIN ALPHA CLUSTER, COMPLEX LOCUS)
+ - OPRM1 (LMOR; M-OR-1; MOP; MOR; MOR1; OPRM; OPIOID RECEPTOR MU 1)
+ - FOS (C-FOS; P55; FOS PROTO-ONCOGENE, AP-1 TRANSCRIPTION FACTOR SUBUNIT)
+ - SLC6A3 (DAT; DAT1; PKDYS; PKDYS1; SOLUTE CARRIER FAMILY 6 MEMBER 3)
+ - CRH (CRF; CRH1; CORTICOTROPIN RELEASING HORMONE)
+ - BDNF (ANON2; BULN2; BRAIN DERIVED NEUROTROPHIC FACTOR)
+ - DRD2 (D2DR; D2R; DOPAMINE RECEPTOR D2)
+ - POMC (ACTH; OBAIRH; PROOPIOMELANOCORTIN)
+ - CREB1 (CREB; CREB-1; CAMP RESPONSIVE ELEMENT BINDING PROTEIN 1)
+ - OPRK1 (K-OR-1; KOR; KOR-1; OPRK; OPIOID RECEPTOR KAPPA 1)
+ - TNF (TNF-ALPHA; TNFA; TNFSF2; TNLG1F; TUMOR NECROSIS FACTOR)
+ - MAPK1 (ERK; ERK-2; ERK2; ERT1; MAPK2; P42MAPK; PRKM1; PRKM2; P41; P41MAPK; P42-MAPK; MITOGEN-ACTIVATED PROTEIN KINASE 1)
+ - SLC6A4 (5-HTT; 5-HTTLPR; 5HTT; OCD1; SERT; SERT1; HSERT; SOLUTE CARRIER FAMILY 6 MEMBER 4)
+ - CNR2 (CB-2; CB2; CX5; CANNABINOID RECEPTOR 2)
+ - OXT (OT-NPI; OXT-NPI; OXYTOCIN; NEUROPHYSIN I PREPROPEPTIDE)
+ - GRM5 (GPRC1E; MGLUR5; PPP1R86; MGLU5; GLUTAMATE METABOTROPIC RECEPTOR 5)
+ - NPY (PYY4; NEUROPEPTIDE Y)
+ - NOS1 (IHPS1; N-NOS; NC-NOS; NOS; BNOS; NNOS; NITRIC OXIDE SYNTHASE 1)
+ - ALDH2 (ALDH-E2; ALDHI; ALDM; ALDEHYDE DEHYDROGENASE 2 FAMILY MEMBER)
+ - PNOC (NOP; OFQ; PPNOC; PREPRONOCICEPTIN)
+ - GRIA1 (GLUH1; GLUR1; GLURA; GLUA1; HBGR1; GLUTAMATE IONOTROPIC RECEPTOR AMPA TYPE SUBUNIT 1)
+ - CHRNA5 (LNCR2; CHOLINERGIC RECEPTOR NICOTINIC ALPHA 5 SUBUNIT)
+ - GRIN2B (EIEE27; GLUN2B; MRD6; NMDAR2B; NR2B; NR3; HNR3; GLUTAMATE IONOTROPIC RECEPTOR NMDA TYPE SUBUNIT 2B)
+ - IL6 (BSF-2; BSF2; HGF; HSF; IFN-BETA-2; IFNB2; IL-6; INTERLEUKIN 6)
+ - PDYN (ADCA; PENKB; SCA23; PRODYNORPHIN)
+ - GRM2 (GLUR2; GPRC1B; MGLUR2; MGLU2; GLUTAMATE METABOTROPIC RECEPTOR 2)
+ - HTR1A (5-HT-1A; 5-HT1A; 5HT1A; ADRB2RL1; ADRBRL1; PFMCD; 5-HYDROXYTRYPTAMINE RECEPTOR 1A)
+ - COMT (HEL-S-98N; CATECHOL-O-METHYLTRANSFERASE)
+ - CRHR1 (CRF-R; CRF-R-1; CRF-R1; CRF1; CRFR-1; CRFR1; CRH-R-1; CRH-R1; CRHR; CRHR1L; CORTICOTROPIN RELEASING HORMONE RECEPTOR 1)
+ - GGT1 (CD224; D22S672; D22S732; GGT; GGT 1; GGTD; GAMMA-GLUTAMYLTRANSFERASE 1)
+ - MAPK3 (ERK-1; ERK1; ERT2; HS44KDAP; HUMKER1A; P44ERK1; P44MAPK; PRKM3; P44-ERK1; P44-MAPK; MITOGEN-ACTIVATED PROTEIN KINASE 3)
+ - CYP2A6 (CPA6; CYP2A; CYP2A3; CYPIIA6; P450C2A; P450PB; CYTOCHROME P450 FAMILY 2 SUBFAMILY A MEMBER 6)
+ - ADH1B (ADH2; HEL-S-117; ALCOHOL DEHYDROGENASE 1B CLASS I , BETA POLYPEPTIDE)
+ - GRIN1 (GLUN1; MRD8; NDHMSD; NDHMSR; NMD-R1; NMDA1; NMDAR1; NR1; GLUTAMATE IONOTROPIC RECEPTOR NMDA TYPE SUBUNIT 1)
+ - DRD4 (D4DR; DOPAMINE RECEPTOR D4)
+ - NPS (NEUROPEPTIDE S)
+ - HTR2A (5-HT2A; HTR2; 5-HYDROXYTRYPTAMINE RECEPTOR 2A)
+ - GAD1 (CPSQ1; GAD; SCP; GLUTAMATE DECARBOXYLASE 1)
+ - PENK (PENK-A; PROENKEPHALIN)
+ - GRIA2 (GLUR2; GLURB; GLUA2; GLUR-K2; HBGR2; GLUTAMATE IONOTROPIC RECEPTOR AMPA TYPE SUBUNIT 2)
+ - TLR4 (ARMD10; CD284; TLR-4; TOLL LIKE RECEPTOR 4)
+ - MAOA (BRNRS; MAO-A; MONOAMINE OXIDASE A)
+ - ARC (ARG3.1; HARC; ACTIVITY REGULATED CYTOSKELETON ASSOCIATED PROTEIN)
+ - CHRNA3 (LNCR2; NACHRA3; PAOD2; CHOLINERGIC RECEPTOR NICOTINIC ALPHA 3 SUBUNIT)
+ - FOSB (G0S3; GOS3; GOSB; FOSB PROTO-ONCOGENE, AP-1 TRANSCRIPTION FACTOR SUBUNIT)
+ - CYP2E1 (CPE1; CYP2E; P450-J; P450C2E; CYTOCHROME P450 FAMILY 2 SUBFAMILY E MEMBER 1)
+ - TRPV1 (TRANSIENT RECEPTOR POTENTIAL CATION CHANNEL SUBFAMILY V MEMBER 1)
+ - IFNA1 (IFL; IFN; IFN-ALPHA; IFN-ALPHAD; IFNA13; IFNA@; INTERFERON ALPHA 1)
+ - HTR2C (5-HT1C; 5-HT2C; 5-HTR2C; 5HTR2C; HTR1C; 5-HYDROXYTRYPTAMINE RECEPTOR 2C)
+ - DBH (ORTHYP1; DOPAMINE BETA-HYDROXYLASE)
+ - NGF (BETA-NGF; HSAN5; NGFB; NERVE GROWTH FACTOR)
+ - GABRA2 (GAMMA-AMINOBUTYRIC ACID TYPE A RECEPTOR ALPHA2 SUBUNIT)
+ - NTRK2 (EIEE58; GP145-TRKB; OBHD; TRKB; TRK-B; NEUROTROPHIC RECEPTOR TYROSINE KINASE 2)
+ - SLC1A2 (EAAT2; EIEE41; GLT-1; HBGT; SOLUTE CARRIER FAMILY 1 MEMBER 2)
+ - GRIN2A (EPND; FESD; GLUN2A; LKS; NMDAR2A; NR2A; GLUTAMATE IONOTROPIC RECEPTOR NMDA TYPE SUBUNIT 2A)
+ - CRP (PTX1; C-REACTIVE PROTEIN)
+ - HTR1B (5-HT-1B; 5-HT-1D-BETA; 5-HT1B; 5-HT1DB; HTR1D2; HTR1DB; 5-HYDROXYTRYPTAMINE RECEPTOR 1B)
+ - ANKK1 (PKK2; SGK288; ANKYRIN REPEAT AND KINASE DOMAIN CONTAINING 1)
+ - OPRD1 (DOP; DOR1; OPRD; OPIOID RECEPTOR DELTA 1)
+ - GFAP (ALXDRD; GLIAL FIBRILLARY ACIDIC PROTEIN)
+ - CHRNB4 (CHOLINERGIC RECEPTOR NICOTINIC BETA 4 SUBUNIT)
+ - CALCA (CALC1; CGRP; CGRP-I; CGRP1; PCT; CALCITONIN RELATED POLYPEPTIDE ALPHA)
+ - IL10 (CSIF; GVHDS; IL-10; IL10A; TGIF; INTERLEUKIN 10)
+ - JUN (AP1; C-JUN; JUN PROTO-ONCOGENE, AP-1 TRANSCRIPTION FACTOR SUBUNIT)
+ - PSENEN (ACNINV2; MDS033; MSTP064; PEN-2; PEN2; PRESENILIN ENHANCER, GAMMA-SECRETASE SUBUNIT)
+ - PTGS2 (COX-2; COX2; GRIPGHS; PGG/HS; PGHS-2; PHS-2; HCOX-2; PROSTAGLANDIN-ENDOPEROXIDE SYNTHASE 2)
+ - CYP3A4 (CP33; CP34; CYP3A; CYP3A3; CYPIIIA3; CYPIIIA4; HLP; NF-25; P450C3; P450PCN1; CYTOCHROME P450 FAMILY 3 SUBFAMILY A MEMBER 4)
+ - ADH1C (ADH3; ALCOHOL DEHYDROGENASE 1C CLASS I , GAMMA POLYPEPTIDE)
+ - GDNF (ATF1; ATF2; HFB1-GDNF; HSCR3; GLIAL CELL DERIVED NEUROTROPHIC FACTOR)
+ - GRM1 (GPRC1A; MGLU1; MGLUR1; PPP1R85; SCA44; SCAR13; GLUTAMATE METABOTROPIC RECEPTOR 1)
+ - DRD1 (DADR; DRD1A; DOPAMINE RECEPTOR D1)
+ - MTOR (FRAP; FRAP1; FRAP2; RAFT1; RAPT1; SKS; MECHANISTIC TARGET OF RAPAMYCIN KINASE)
+ - BCHE (BCHED; CHE1; CHE2; BUTYRYLCHOLINESTERASE)
+ - PPARA (NR1C1; PPAR; PPARALPHA; HPPAR; PEROXISOME PROLIFERATOR ACTIVATED RECEPTOR ALPHA)
+ - TRH (PRO-TRH; THYROTROPIN RELEASING HORMONE)
+ - OPRL1 (KOR-3; NOCIR; NOPR; ORL1; OPIOID RELATED NOCICEPTIN RECEPTOR 1)
+ - IL2 (IL-2; TCGF; INTERLEUKIN 2)
+ - RTN4 (NOGO; NSP; NSP-CL; NBLA00271; NBLA10545; RTN-X; RTN4-A; RTN4-B1; RTN4-B2; RTN4-C; RETICULON 4)
+ - PPP1R1B (DARPP-32; DARPP32; PROTEIN PHOSPHATASE 1 REGULATORY INHIBITOR SUBUNIT 1B)
+ - DRD3 (D3DR; ETM1; FET1; DOPAMINE RECEPTOR D3)
+ - CYP2B6 (CPB6; CYP2B; CYP2B7; CYP2B7P; CYPIIB6; EFVM; IIB1; CYTOCHROME P450 FAMILY 2 SUBFAMILY B MEMBER 6)
+ - ABCB1 (ABC20; CD243; CLCS; GP170; MDR1; P-GP; PGY1; ATP BINDING CASSETTE SUBFAMILY B MEMBER 1)
+ - MGLL (HU-K5; HUK5; MAGL; MGL; MONOGLYCERIDE LIPASE)
+ - CCL2 (GDCF-2; HC11; HSMCR30; MCAF; MCP-1; MCP1; SCYA2; SMC-CF; C-C MOTIF CHEMOKINE LIGAND 2)
+ - NOS2 (HEP-NOS; INOS; NOS2A; NITRIC OXIDE SYNTHASE 2)
+ - EGFR (ERBB; ERBB1; HER1; NISBD2; PIG61; MENA; EPIDERMAL GROWTH FACTOR RECEPTOR)
+ - HOMER1 (HOMER; HOMER1A; HOMER1B; HOMER1C; SYN47; VES-1; HOMER SCAFFOLD PROTEIN 1)
+ - BCL2 (BCL-2; PPP1R50; BCL2 APOPTOSIS REGULATOR)
+ - PLAT (T-PA; TPA; PLASMINOGEN ACTIVATOR, TISSUE TYPE)
+ - IL1B (IL-1; IL1-BETA; IL1F2; INTERLEUKIN 1 BETA)
+ - HCRTR1 (OX1R; HYPOCRETIN RECEPTOR 1)
+ - CDK5 (LIS7; PSSALRE; CYCLIN DEPENDENT KINASE 5)
+ - ADA2 (ADGF; CECR1; IDGFL; PAN; SNEDS; VAIHS; ADENOSINE DEAMINASE 2)
+ - CHAT (CHOACTASE; CMS1A; CMS1A2; CMS6; CHOLINE O-ACETYLTRANSFERASE)
+ - MAP2K7 (JNKK2; MAPKK7; MEK; MEK 7; MKK7; PRKMK7; SAPKK-4; SAPKK4; MITOGEN-ACTIVATED PROTEIN KINASE KINASE 7)
+ - IL4 (BCGF-1; BCGF1; BSF-1; BSF1; IL-4; INTERLEUKIN 4)
+ - BAX (BCL2L4; BCL2 ASSOCIATED X, APOPTOSIS REGULATOR)
+ - SLC18A2 (PKDYS2; SVAT; SVMT; VAT2; VMAT2; SOLUTE CARRIER FAMILY 18 MEMBER A2)
+ - MAPK8 (JNK; JNK-46; JNK1; JNK1A2; JNK21B1/2; PRKM8; SAPK1; SAPK1C; MITOGEN-ACTIVATED PROTEIN KINASE 8)
+ - ADORA2A (A2AR; ADORA2; RDC8; ADENOSINE A2A RECEPTOR)
+ - PIK3CD (APDS; IMD14; P110DELTA; PI3K; P110D; PHOSPHATIDYLINOSITOL-4,5-BISPHOSPHATE 3-KINASE CATALYTIC SUBUNIT DELTA)
+ - CHRNA4 (BFNC; EBN1; NACHRA4; NACRA4; CHOLINERGIC RECEPTOR NICOTINIC ALPHA 4 SUBUNIT)
+ - SRC (SRC1; THC6; C-SRC; P60-SRC; SRC PROTO-ONCOGENE, NON-RECEPTOR TYROSINE KINASE)
+ - NPFF (FMRFAL; NEUROPEPTIDE FF-AMIDE PEPTIDE PRECURSOR)
+ - HOMER2 (ACPD; DFNA68; HOMER-2; VESL-2; HOMER SCAFFOLD PROTEIN 2)
+ - EGR1 (AT225; G0S30; KROX-24; NGFI-A; TIS8; ZIF-268; ZNF225; EARLY GROWTH RESPONSE 1)
+ - ACHE (ACEE; ARACHE; N-ACHE; ACETYLCHOLINESTERASE CARTWRIGHT BLOOD GROUP)
+ - NKS1 ('EC-1; NATURAL KILLER CELL SUSCEPTIBILITY 1)
+ - GNRH1 (GNRH; LHRH; LNRH; GONADOTROPIN RELEASING HORMONE 1)
+ - EGF (HOMG4; EPIDERMAL GROWTH FACTOR)
+ - CRHR2 (CRF-RB; CRF2; CRFR2; HM-CRF; CORTICOTROPIN RELEASING HORMONE RECEPTOR 2)
+ - TAAR1 (TAR1; TRAR1; TRACE AMINE ASSOCIATED RECEPTOR 1)
+ - ADA (ADENOSINE DEAMINASE)
+ - MECP2 (AUTSX3; MRX16; MRX79; MRXS13; MRXSL; PPMX; METHYL-CPG BINDING PROTEIN 2)
+ - APOE (APO-E; APOE4; LDLCQ5; APOLIPOPROTEIN E)
+ - NFKB1 (CVID12; EBP-1; KBF1; NF-KB1; NF-KAPPA-B1; NF-KAPPAB; NFKB-P105; NFKB-P50; NFKAPPAB; NUCLEAR FACTOR KAPPA B SUBUNIT 1)
+ - TPH2 (ADHD7; NTPH; TRYPTOPHAN HYDROXYLASE 2)
+ - PRKAB1 (AMPK; HAMPKB; PROTEIN KINASE AMP-ACTIVATED NON-CATALYTIC SUBUNIT BETA 1)
+ - PRKAA2 (AMPK; AMPK2; AMPKA2; PRKAA; PROTEIN KINASE AMP-ACTIVATED CATALYTIC SUBUNIT ALPHA 2)
+ - PRKAA1 (AMPK; AMPKA1; PROTEIN KINASE AMP-ACTIVATED CATALYTIC SUBUNIT ALPHA 1)
+ - MMP9 (CLG4B; GELB; MANDP2; MMP-9; MATRIX METALLOPEPTIDASE 9)
+ - ADH7 (ADH4; ALCOHOL DEHYDROGENASE 7 CLASS IV , MU OR SIGMA POLYPEPTIDE)
+ - MPO (MYELOPEROXIDASE)
+ - VEGFA (MVCD1; VEGF; VASCULAR ENDOTHELIAL GROWTH FACTOR A)
+ - HCRT (NRCLP1; PPOX; HYPOCRETIN NEUROPEPTIDE PRECURSOR)
+ - CYP1A1 (AHRR; CP11; CYP1; CYPIA1; P1-450; P450-C; P450DX; CYTOCHROME P450 FAMILY 1 SUBFAMILY A MEMBER 1)
+ - CHRNB3 (CHOLINERGIC RECEPTOR NICOTINIC BETA 3 SUBUNIT)
+ - CHRNB2 (EFNL3; NACHRB2; CHOLINERGIC RECEPTOR NICOTINIC BETA 2 SUBUNIT)
+ - VIP (PHM27; VASOACTIVE INTESTINAL PEPTIDE)
+ - TACR1 (NK1R; NKIR; TAC1R; TACHYKININ RECEPTOR 1)
+ - RGS9 (PERRS; RGS9L; REGULATOR OF G PROTEIN SIGNALING 9)
+ - NR4A2 (HZF-3; NURR1; RNR1; TINUR; NUCLEAR RECEPTOR SUBFAMILY 4 GROUP A MEMBER 2)
+ - HCRTR2 (OX2R; HYPOCRETIN RECEPTOR 2)
+ - GRM3 (GLUR3; GPRC1C; MGLUR3; MGLU3; GLUTAMATE METABOTROPIC RECEPTOR 3)
+ - CYP1A2 (CP12; CYPIA2; P3-450; P450 PA ; CYTOCHROME P450 FAMILY 1 SUBFAMILY A MEMBER 2)
+ - AP5B1 (AP-5; PP1030; ADAPTOR RELATED PROTEIN COMPLEX 5 SUBUNIT BETA 1)
+ - PRKCG (PKC-GAMMA; PKCC; PKCG; PKCI 3 ; PKCGAMMA; SCA14; PROTEIN KINASE C GAMMA)
+ - SIRT1 (SIR2; SIR2L1; SIR2ALPHA; SIRTUIN 1)
+ - RGS4 (RGP4; SCZD9; REGULATOR OF G PROTEIN SIGNALING 4)
+ - PTH (PTH1; PARATHYROID HORMONE)
+ - PER2 (FASPS; FASPS1; PERIOD CIRCADIAN REGULATOR 2)
+ - FGF2 (BFGF; FGF-2; FGFB; HBGF-2; FIBROBLAST GROWTH FACTOR 2)
+ - ADH1A (ADH1; ALCOHOL DEHYDROGENASE 1A CLASS I , ALPHA POLYPEPTIDE)
+ - GRM7 (GLUR7; GPRC1G; MGLU7; MGLUR7; PPP1R87; GLUTAMATE METABOTROPIC RECEPTOR 7)
+ - ADH4 (ADH-2; HEL-S-4; ALCOHOL DEHYDROGENASE 4 CLASS II , PI POLYPEPTIDE)
+ - NOS3 (ECNOS; ENOS; NITRIC OXIDE SYNTHASE 3)
+ - ITGAM (CD11B; CR3A; MAC-1; MAC1A; MO1A; SLEB6; INTEGRIN SUBUNIT ALPHA M)
+ - GSTM1 (GST1; GSTM1-1; GSTM1A-1A; GSTM1B-1B; GTH4; GTM1; GLUTATHIONE S-TRANSFERASE MU 1)
+ - GPR55 (LPIR1; G PROTEIN-COUPLED RECEPTOR 55)
+ - CYP2C19 (CPCJ; CYP2C; CYPIIC17; CYPIIC19; P450C2C; P450IIC19; CYTOCHROME P450 FAMILY 2 SUBFAMILY C MEMBER 19)
+ - CCKBR ('xxCCK-B; CCK2R; GASR; CHOLECYSTOKININ B RECEPTOR)
+ - TANK (I-TRAF; ITRAF; TRAF2; TRAF FAMILY MEMBER ASSOCIATED NFKB ACTIVATOR)
+ - NCAM1 (CD56; MSK39; NCAM; NEURAL CELL ADHESION MOLECULE 1)
+ - KCNJ6 (BIR1; GIRK-2; GIRK2; KATP-2; KATP2; KCNJ7; KIR3.2; KPLBS; HIGIRK2; POTASSIUM VOLTAGE-GATED CHANNEL SUBFAMILY J MEMBER 6)
+ - HMGB1 (HMG-1; HMG1; HMG3; SBP-1; HIGH MOBILITY GROUP BOX 1)
+ - HDAC2 (KDAC2; RPD3; YAF1; HISTONE DEACETYLASE 2)
+ - CHRM2 (CHOLINERGIC RECEPTOR MUSCARINIC 2)
+ - CCL5 (D17S136E; RANTES; SCYA5; SIS-DELTA; SISD; TCP228; EOCP; C-C MOTIF CHEMOKINE LIGAND 5)
+ - ADCYAP1 (PACAP; ADENYLATE CYCLASE ACTIVATING POLYPEPTIDE 1)
+ - SULT2A1 (DHEA-ST; DHEAS; ST2A1; ST2A3; HSTA; SULFOTRANSFERASE FAMILY 2A MEMBER 1)
+ - PER1 (RIGUI; HPER; PERIOD CIRCADIAN REGULATOR 1)
+ - MMP2 (CLG4; CLG4A; MMP-2; MMP-II; MONA; TBE-1; MATRIX METALLOPEPTIDASE 2)
+ - ARRB2 (ARB2; ARR2; BARR2; ARRESTIN BETA 2)
+ - AGRP (AGRT; ASIP2; AGOUTI RELATED NEUROPEPTIDE)
+ - SOD2 (IPO-B; IPOB; MNSOD; MVCD6; MN-SOD; SUPEROXIDE DISMUTASE 2)
+ - MBP (MYELIN BASIC PROTEIN)
+ - GAD2 (GAD65; GLUTAMATE DECARBOXYLASE 2)
+ - EIF4EBP1 (4E-BP1; 4EBP1; BP-1; PHAS-I; EUKARYOTIC TRANSLATION INITIATION FACTOR 4E BINDING PROTEIN 1)
+ - CREM (CREM-2; ICER; HCREM-2; CAMP RESPONSIVE ELEMENT MODULATOR)
+ - CHRNA6 (CHNRA6; CHOLINERGIC RECEPTOR NICOTINIC ALPHA 6 SUBUNIT)
+ - ALK (CD246; NBLST3; ALK RECEPTOR TYROSINE KINASE)
+ - UGT2B7 (UDPGT 2B7; UDPGT 2B9; UDPGT2B7; UDPGTH2; UDPGTH-2; UGT2B9; UDP GLUCURONOSYLTRANSFERASE FAMILY 2 MEMBER B7)
+ - UGT1A4 (GNT1; HUG-BR2; UDPGT; UDPGT 1-4; UGT-1A; UGT-1D; UGT1; UGT1-01; UGT1-04; UGT1.1; UGT1.4; UGT1A; UGT1A1; UGT1A4S; UGT1D; HUG-BR1; UDP GLUCURONOSYLTRANSFERASE FAMILY 1 MEMBER A4)
+ - SLC1A3 (EAAT1; GLAST; GLAST1; SOLUTE CARRIER FAMILY 1 MEMBER 3)
+ - SHBG (TEBG; SEX HORMONE BINDING GLOBULIN)
+ - PARP1 (ADPRT; ADPRT 1; ADPRT1; ARTD1; PARP; PARP-1; PPOL; PADPRT-1; POLY ADP-RIBOSE POLYMERASE 1)
+ - NLRP3 (AGTAVPRL; C1ORF7; CIAS1; CLR1.1; DFNA34; FCAS; FCAS1; KEFH; NALP3; PYPAF1; NLR FAMILY PYRIN DOMAIN CONTAINING 3)
+ - NFE2L2 (HEBP1; IMDDHH; NRF2; NUCLEAR FACTOR, ERYTHROID 2 LIKE 2)
+ - IL1RN (DIRA; ICIL-1RA; IL-1RN; IL-1RA; IL-1RA3; IL1F3; IL1RA; IRAP; MVCD4; INTERLEUKIN 1 RECEPTOR ANTAGONIST)
+ - GABPA (E4TF1-60; E4TF1A; NFT2; NRF2; NRF2A; RCH04A07; GA BINDING PROTEIN TRANSCRIPTION FACTOR SUBUNIT ALPHA)
+ - FKBP5 (AIG6; FKBP51; FKBP54; PPIASE; PTG-10; FKBP PROLYL ISOMERASE 5)
+ - ALDH1A1 (ALDC; ALDH-E1; ALDH1; ALDH11; HEL-9; HEL-S-53E; HEL12; PUMB1; RALDH1; ALDEHYDE DEHYDROGENASE 1 FAMILY MEMBER A1)
+ - UGT1A6 (GNT1; HLUGP; HLUGP1; UDPGT; UDPGT 1-6; UGT-1A; UGT-1C; UGT-1E; UGT1; UGT1-01; UGT1-03; UGT1-05; UGT1.1; UGT1.3; UGT1.5; UGT1A; UGT1A1; UGT1A3; UGT1A5; UGT1A6S; UGT1C; UGT1E; UGT1F; HUG-BR1; UDP GLUCURONOSYLTRANSFERASE FAMILY 1 MEMBER A6)
+ - SIGMAR1 (ALS16; DSMA2; OPRS1; SIG-1R; SR-BP; SR-BP1; SRBP; HSIGMAR1; SIGMA1R; SIGMA NON-OPIOID INTRACELLULAR RECEPTOR 1)
+ - RAC1 (MIG5; MRD48; RAC-1; TC-25; P21-RAC1; RAC FAMILY SMALL GTPASE 1)
+ - P4HTM (EGLN4; HIFPH4; P4H-TM; 'PH-4; PHD4; PROLYL 4-HYDROXYLASE, TRANSMEMBRANE)
+ - KRAS (C-K-RAS; CFC2; K-RAS2A; K-RAS2B; K-RAS4A; K-RAS4B; K-RAS; KI-RAS; KRAS1; KRAS2; RALD; RASK2; C-KI-RAS2; KRAS PROTO-ONCOGENE, GTPASE)
+ - IL18 (IGIF; IL-18; IL-1G; IL1F4; INTERLEUKIN 18)
+ - HTR3A (5-HT-3; 5-HT3A; 5-HT3R; 5HT3R; HTR3; 5-HYDROXYTRYPTAMINE RECEPTOR 3A)
+ - HSPA4 (APG-2; HEL-S-5A; HS24/P52; HSPH2; HSP70; HSP70RY; HEAT SHOCK PROTEIN FAMILY A HSP70 MEMBER 4)
+ - FYN (P59-FYN; FYN PROTO-ONCOGENE, SRC FAMILY TYROSINE KINASE)
+ - COL11A2 (DFNA13; DFNB53; FBCG2; HKE5; OSMEDA; OSMEDB; PARP; STL3; COLLAGEN TYPE XI ALPHA 2 CHAIN)
+ - UGT1A7 (GNT1; UDPGT; UDPGT 1-7; UGT-1A; UGT-1G; UGT1; UGT1-01; UGT1-07; UGT1.1; UGT1.7; UGT1A; UGT1A1; UGT1G; HUG-BR1; UDP GLUCURONOSYLTRANSFERASE FAMILY 1 MEMBER A7)
+ - TRPA1 (ANKTM1; FEPS; FEPS1; TRANSIENT RECEPTOR POTENTIAL CATION CHANNEL SUBFAMILY A MEMBER 1)
+ - STAT3 (ADMIO; ADMIO1; APRF; HIES; SIGNAL TRANSDUCER AND ACTIVATOR OF TRANSCRIPTION 3)
+ - PRKCA (AAG6; PKC-ALPHA; PKCA; PKCI+/-; PKCALPHA; PRKACA; PROTEIN KINASE C ALPHA)
+ - MTHFR (METHYLENETETRAHYDROFOLATE REDUCTASE)
+ - GRK2 (ADRBK1; BARK1; BETA-ARK1; G PROTEIN-COUPLED RECEPTOR KINASE 2)
+ - DNMT1 (ADCADN; CXXC9; DNMT; HSN1E; MCMT; M.HSAI; DNA METHYLTRANSFERASE 1)
+ - UGT1A8 (GNT1; UDPGT; UDPGT 1-8; UGT-1A; UGT-1H; UGT1; UGT1-01; UGT1-08; UGT1.1; UGT1.8; UGT1A; UGT1A1; UGT1A8S; UGT1H; HUG-BR1; UDP GLUCURONOSYLTRANSFERASE FAMILY 1 MEMBER A8)
- - FOS
- - BDNF
- - DRD2
- - OPRM1
- - TH
- - ALDH2
- - TNF
- - CHRNA5
- - NPY
- - FAAH
- - CCK
- - COMT
- - CYP2A6
- - DRD4
- - ARC
- - CHRNA3
- - FOSB
- - CRH
- - TLR4
- - TAT
- - APP
- - TRPV1
- - GABRA2
- - CYP2E1
- - CYP2D6
- - ANKK1
- - CORT
- - PDYN
- - POMC
- - CNR1
- - CNR1
- - CHRNB4
- - SLC6A3
- - HTT
- - JUN
- - NGF
- - GDNF
- - DBH
- - GFAP
- - AVP
- - MTOR
- - DRD3
- - SLC6A4
- - MAOA
- - CYP2B6
- - TRH
- - OPRK1
- - DRD1
- - OPRD1
- - CDK5
- - BCHE
- - CRP
- - BAX
- - PENK
- - CHRNA4
- - CHAT
- - NPFF
- - SRC
- - CYP3A4
- - ABCB1
- - MECP2
- - HOMER2
- - TAAR1
- - EGFR
- - TPH2
- - HTR2A
- - CCL2
- - OXT
- - HCRT
- - CHRNB3
- - CHRNB2
- - RGS9
- - PER2
- - CRHR1
- - RGS4
- - MIF
- - SIRT1
- - DBI
- - CHRM2
- - MIP
- - CCL4
- - APOE
- - SNCA
- - PTH
- - PER1
- - GSTM1
- - CHRNA6
- - HTR1B
- - HMGB1
- - ANG
- - HDAC2
- - GPR55
- - FYN
- - AGRP
- - STAT3
- - MTHFR
- - HOMER1
- - GRIN1
- - GAD1
- - GABRA1
- - FKBP5
- - EGR1
- - CYP2C19
- - VIP
- - TRPA1
- - NLRP3
- - GRK2
- - GRIN2B
- - ARRB2
- - ALDH1A1
- - CYP1A2
- - CYP1A1
- - CNR2
- - MBP
- - JUNB
- - GRIN2A
- - GRIA1
- - FGF2
- - SP1
- - NTRK2
- - HTR3A
- - OPRL1
- - NR4A2
- - EGF
- - ALK
- - NCAM1
- - GAD2
- - CD14
- - MAP2
- - HTR1A
- - GRM3
- - CCL5
- - TACR1
- - NOS1
- - KCNJ6
- - HCRTR1
- - CREM
- - CREB1
- - BCL2
- - SOD2
- - SNAP25
- - DLG4
- - SLC18A2
- - MGLL
- - IL4
- - IL1B
- - IL10
- - GRM7
- - GRIA2
- - DNMT1
-
{% endblock %}
diff --git a/topGene_step0_extract_gene_alias_from_gene_info.sh b/topGene_step0_extract_gene_alias_from_gene_info.sh
new file mode 100755
index 0000000..4d3118b
--- /dev/null
+++ b/topGene_step0_extract_gene_alias_from_gene_info.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Keep gene_info lines starting with 9606, take fields 3,5,12, drop lines starting with LOC or mentioning pseudogene, turn the tabs into "|" (dropping "\t-" fields), replace ()[]{} with spaces, sort. Unused earlier variant: -e "s/\(|\)/ /g" -e "s/\[|\]/ /g"
+grep ^9606 ~/Downloads/gene_info |cut -f 3,5,12|grep -v ^LOC|grep -v -i pseudogene |sed -e "s/\t-//" -e "s/\t/|/2" -e "s/\t-//" -e "s/\t/\|/" -e "s/(\|)\|\[\|\]\|{\|}/ /g" | sort >ncbi_gene_symb_syno_name_txid9606.txt
+
diff --git a/topGene_step1_cnt_abstracts.py b/topGene_step1_cnt_abstracts.py
index a9dd23f..420c9cf 100755
--- a/topGene_step1_cnt_abstracts.py
+++ b/topGene_step1_cnt_abstracts.py
@@ -31,7 +31,6 @@ def saveStopWord(w):
swf.write(w+"\n")
return
-
# either start with ncbi_gene_symb_syno_name_txid9606 for fresh new counts
# or recount the results after adding additional stopwords
@@ -39,7 +38,6 @@ if len(sys.argv)==2:
input_f=sys.argv[1]
else:
input_f="./ncbi_gene_symb_syno_name_txid9606.txt"
- input_f="./ncbi_gene_symb_syno_name_txid9606_p2.txt"
addiction=undic(addiction_d)
drug=undic(drug_d)
@@ -53,50 +51,57 @@ with open (stopword_f, "r") as swf:
with open (input_f, "r") as f:
for line in f:
- rerun=0
- count=-1
+ do_search=0
inputline=line
+ line=line.replace("-","\ ")
+ # remove the annotated stopword
if "'" in line:
+ do_search=1
words=line.split("|")
line=str()
for word in words:
# ' is used to mark/annotate a word is a stop word in the results
+ # remove the ' mark
if "'" in word:
word=word.replace("'","")
stopWords.append(word)
saveStopWord(word)
- rerun=1
- # remove the ' mark
line+="|"+word
line=line[1:]
line=removeStopWords(line)
# tab is added if there are abstracts counts
if "\t" in line:
(gene, count)=line.split("\t")
- if int(count)<100:
- rerun=1
+ # rerun if count is low, these are less annotated
+ # if int(count)<50:
+ # do_search=1
else:
+ #no count,
gene=line.strip()
- # remove synonyms with only two letters
- if "|" in gene:
- synos=gene.split("|")
- gene=str()
- for syno in synos:
- if len(syno)>2:
- gene+="|"+syno
- gene=gene[1:]
- gene_q=gene.replace("|", "\"[tiab] OR \"")
- gene_q+="[tiab]"
- if rerun==1 or count== -1 :
+ do_search=1
+ if do_search==1:
+ # remove synonyms with only two letters
+ if "|" in gene:
+ synos=gene.split("|")
+ # keep the gene name regardless number of characters
+ gene=synos[0]
+ #print ("gene: "+gene + " synos -->" + str(synos[1:]))
+ for syno in synos[1:]:
+ #synonyms must be at least 3 characters -- NOTE(review): the len(syno)>3 check below actually requires 4; confirm which is intended
+ if len(syno)>3:
+ gene+="|"+syno
+ gene_q=gene.replace("|", "\"[tiab] OR \"")
+ gene_q+="[tiab]"
count=gene_addiction_cnt(gene_q)
- print("original line->\t"+inputline.strip())
- print("stopword rmed->\t"+line.strip())
- print("final result->\t"+gene+"\t"+count)
- # only save the non_zero results
- if (int(count)>0):
+ print("original line->\t"+inputline.strip())
+ print("stopword rmed->\t"+line.strip())
+ print("final result->\t"+gene+"\t"+count)
out.write(gene+"\t"+count)
+ else:
+ print("original resl->\t"+inputline.strip())
+ out.write(inputline)
-sorted_f=out_f.replace(".txt","_sorted.txt")
-os.system("sort -k2 -t$'\t' -rn " + out_f + " > " + sorted_f )
+sorted_f=output_f.replace(".txt","_sorted.txt")
+os.system("sort -k2 -t$'\t' -rn " + output_f + " > " + sorted_f )
diff --git a/topGene_step2_cnt_sentences.py b/topGene_step2_cnt_sentences.py
index fe97cdd..b05aa7a 100755
--- a/topGene_step2_cnt_sentences.py
+++ b/topGene_step2_cnt_sentences.py
@@ -1,5 +1,5 @@
#!/bin/env python3
-import os
+import os, sys
import re
import time
from nltk.tokenize import sent_tokenize
@@ -42,11 +42,18 @@ addiction=undic(addiction_d)
drug=undic(drug_d)
-out=open("gene_addiction_sentences.tab", "w+")
+out=open("topGene_addiction_sentences.tab", "w+")
cnt=0
-with open ("./ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_sorted.txt", "r") as f:
+
+if len(sys.argv) != 2:
+ print ("Please provide a sorted gene count file at the command line")
+ sys.exit()
+
+sorted_file=sys.argv[1] # ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_sorted_absCnt_sorted.txt
+with open (sorted_file, "r") as f:
for line in f:
(genes, abstractCount)=line.strip().split("\t")
+ genes=genes.replace("-","\ ")
if int(abstractCount)>20:
symb=genes.split("|")[0]
print(symb+"-->"+genes)
@@ -55,5 +62,4 @@ with open ("./ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absC
out.write(sentences)
out.close()
-os.system("cut -f 1,4 gene_addiction_sentences.tab |uniq |cut -f 1 |uniq -c |sort -rn > topGeneAbstractCount.tab")
-
+os.system("cut -f 1,4 topGene_addiction_sentences.tab |uniq |cut -f 1 |sort |uniq -c |sort -rn > topGeneAbstractCount.tab")
diff --git a/topGene_step3_generate_html.py b/topGene_step3_generate_html.py
index dfcd6fe..6755adb 100755
--- a/topGene_step3_generate_html.py
+++ b/topGene_step3_generate_html.py
@@ -1,14 +1,21 @@
import re
+import sys
## generate the html page for the top genes
## put gene names and alias in a dictionary
+#ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted_absCnt_absCnt_sorted.txt
+if (len(sys.argv) != 2):
+ print ("please provide the name of a sorted gene abstract count file")
+ sys.exit()
+
geneNames={}
-with open ("./ncbi_gene_symb_syno_name_txid9606_absCnt_sorted_absCnt_sorted.txt","r") as f:
+with open (sys.argv[1],"r") as f:
for line in f:
(genes, count)=line.strip().split("\t")
gene=genes.split("|")
- geneNames[gene[0]]=genes.strip()
+ names=re.sub(r'^.*?\|', "", genes)
+ geneNames[gene[0]]=names.strip().replace("|", "; ")
out=str()
html=str()
@@ -20,8 +27,8 @@ with open("./topGeneAbstractCount.tab" ,"r") as gc:
print (line)
pmid_cnt, symb=line.strip().split()
out+= symb+"\t"+geneNames[symb]+"\n"
- html+="- "+symb+"
\n"
- if cnt==500:
+ html+=" - "+symb+" ("+geneNames[symb]+")
\n"
+ if cnt==200:
break
with open("topGene_symb_alias.txt", "w+") as tg:
diff --git a/topGene_step4_get_pmids_for_all_top_genes.py b/topGene_step4_get_pmids_for_all_top_genes.py
new file mode 100755
index 0000000..9a18836
--- /dev/null
+++ b/topGene_step4_get_pmids_for_all_top_genes.py
@@ -0,0 +1,52 @@
+import os
+import shlex
+
+## save all pmids for the top genes so that I don't have to search for these.
+
+# number of genes whose aliases are combined into one PubMed query
+BATCH_SIZE=5
+
+def getPMID(query):
+    """Return the output of "esearch | efetch -format uid" for query (presumably one PMID per line)."""
+    print (query)
+    # shlex.quote stops the shell from interpreting quotes, $ or backticks inside an alias
+    cmd="esearch -db pubmed -query " + shlex.quote(query) + " | efetch -format uid"
+    return os.popen(cmd).read()
+
+def buildQuery(terms):
+    """Tag every term with [tiab] and join them with OR."""
+    return " OR ".join(term+"[tiab]" for term in terms)
+
+def collectTerms():
+    """Search PubMed for the aliases in topGene_symb_alias.txt, BATCH_SIZE genes per query.
+
+    Each input line is read as "symbol<TAB>alias; alias; ...". The output of every
+    search is written to topGene_all.pmid.
+    """
+    # NOTE(review): only the alias column is searched, never the symbol itself -- confirm this is intended
+    with open("./topGene_symb_alias.txt", "r") as top, open("topGene_all.pmid","w+") as pmids_f:
+        terms=[]
+        cnt=0
+        for one in top:
+            alias=one.partition("\t")[2]
+            aliases=[a.strip() for a in alias.split(";") if a.strip()]
+            if not aliases:
+                # blank line or gene without alias: nothing to search for
+                continue
+            terms+=aliases
+            cnt+=1
+            if cnt==BATCH_SIZE:
+                print ("\n")
+                pmids_f.write(getPMID(buildQuery(terms)))
+                terms=[]
+                cnt=0
+        # the last batch can hold fewer than BATCH_SIZE genes; search it too
+        if terms:
+            print ("\n")
+            pmids_f.write(getPMID(buildQuery(terms)))
+
+collectTerms()
+os.system("sort topGene_all.pmid |uniq > topGene_uniq.pmid" )
+os.system("rm topGene_all.pmid")
+print ("results are in topGene_uniq.pmid")
+
--
cgit v1.2.3