#!/usr/bin/env python3
# Provenance: get_addiction_sentences.py as added by commit
# 35a5303bd958313d92323b4187ae67c950417125 ("initial commit", Hao Chen,
# Mon, 6 May 2019 08:17:25 -0500).
"""Collect PubMed sentences that mention a gene next to addiction keywords.

Usage: get_addiction_sentences.py GENE

Two PubMed searches are run for GENE (addiction terms + drugs, and brain
regions).  Every title/abstract sentence that mentions the gene is tagged
with the drug class or brain region it also mentions, and the result is
written to ``<GENE>_addiction_sentences.tab`` with the tab-separated columns
gene, category, PMID, sentence.
"""
import codecs
import os
import re
import shlex
import sys

# Directory handed to ``fetch-pubmed -path``.
PUBMED_ARCHIVE = "/run/media/hao/PubMed/Archive/"

# "|"-separated keyword lists.  They are used both as regex alternations and,
# with "|" replaced by " OR ", as PubMed query clauses.
addiction_terms = (
    "sensitization|intake|addiction|drug abuse|relapse|self-administered|"
    "self-administration|voluntary|reinstatement|binge|intoxication|"
    "withdrawal|chronic"
)

drugs = (
    "alcohol|alcoholism|smoking|nicotine|tobacco|methamphetamine|amphetamine|"
    "cocaine|opioid|fentanyl|oxycodone|oxycontin|heroin|morphine|marijuana|"
    "cannabinoid|tetrahydrocannabinol|thc"
)

brain_regions = (
    "cortex|accumbens|striatum|amygdala|hippocampus|tegmental|mesolimbic|"
    "infralimbic|prelimbic"
)

# Output label -> regex alternation of the terms that select that label.
# NOTE(review): the "accumbens" pattern does not contain "accumbens" itself,
# so only its abbreviations/sub-regions are tagged -- confirm this is intended.
brain_d = {
    "cortex": "cortex|pfc|vmpfc|il|pl|prelimbic|infralimbic",
    "striatum": "striatum|STR",
    "accumbens": "shell|core|NAcc|acbs|acbc",
    "hippocampus": "hippocampus|hipp|hip|ca1|ca3|dentate|gyrus",
    "amygdala": "amygdala|cea|bla|amy",
    "ventral tegmental": "ventral tegmental|vta",
}

# Not referenced anywhere else in this script.
function = (
    "LTP|LTD|plasticity|regulate|glutamate|GABA|cholinergic|serotoninergic|"
    "synaptic|methylation|transcription|phosphorylation"
)

drugs_d = {
    "alcohol": "alcohol|alcoholism",
    "nicotine": "smoking|nicotine|tobacco",
    "amphetamine": "methamphetamine|amphetamine",
    "cocaine": "cocaine",
    "opioid": "opioid|fentanyl|oxycodone|oxycontin|heroin|morphine",
    "cannabinoid": "marijuana|cannabinoid|Tetrahydrocannabinol|thc",
}


def findWholeWord(w):
    """Return a search function for the regex alternation *w*.

    The returned callable takes a string and returns a match object when any
    alternative of *w* occurs as a whole word (case-insensitively), else
    ``None``.  *w* is interpolated as a regular expression, not escaped.
    """
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search


def filter_gene_sentences(abstracts, gene_name, tokenize):
    """Return ``PMID<TAB>sentence<NEWLINE>`` lines for sentences naming a gene.

    abstracts -- text with one record per line; the first tab-separated
                 field is the PMID, the remaining fields are joined with
                 spaces and treated as the text to split.
    gene_name -- regex (see findWholeWord) a sentence must contain.
    tokenize  -- callable splitting a text into a sequence of sentences.
    """
    mentions_gene = findWholeWord(gene_name)
    lines = []
    for row in abstracts.split("\n"):
        pmid, *fields = row.split("\t")
        for sentence in tokenize(" ".join(fields)):
            # NOTE(review): the original also ran
            # re.sub(r'\b(gene)\b', r'\1', sentence), which replaces each match
            # with itself and so cannot change the sentence; it is omitted.
            # Presumably highlight markup was intended -- confirm.
            if mentions_gene(sentence):
                lines.append(pmid + "\t" + sentence + "\n")
    return "".join(lines)


def getSentences(query, gene_name=None):
    """Run a PubMed search and return the sentences that mention the gene.

    query     -- the search expression.  It is shell-quoted here, so callers
                 pass it raw, without surrounding quote characters.
    gene_name -- gene to look for in each sentence; when omitted, the first
                 command-line argument is used, as the original script did.

    The search is delegated to the external ``esearch``, ``efetch``,
    ``fetch-pubmed`` and ``xtract`` commands, which must be on PATH.  The
    result has the format described in filter_gene_sentences.
    """
    # Imported here, before the (slow) search starts, so that the pure helpers
    # in this module stay usable without NLTK installed.
    from nltk.tokenize import sent_tokenize

    if gene_name is None:
        gene_name = sys.argv[1]

    command = (
        "esearch -db pubmed -query " + shlex.quote(query)
        + " | efetch -format uid"
        + " | fetch-pubmed -path " + shlex.quote(PUBMED_ARCHIVE)
        + " | xtract -pattern PubmedArticle"
        + " -element MedlineCitation/PMID,ArticleTitle,AbstractText"
    )
    with os.popen(command) as pipe:
        abstracts = pipe.read()
    return filter_gene_sentences(abstracts, gene_name, sent_tokenize)


def tag_sentences(sents, gene, categories):
    """Label each line of *sents* with every category whose terms it contains.

    sents      -- newline-separated lines (as returned by getSentences).
    gene       -- value written to the first output column.
    categories -- mapping of label -> regex alternation (e.g. drugs_d).

    Returns one ``gene<TAB>label<TAB>line<NEWLINE>`` row per (line, label)
    hit, ordered by line and then by the mapping's iteration order.
    """
    # Compile every category once instead of once per line.
    matchers = [
        (label, findWholeWord(pattern)) for label, pattern in categories.items()
    ]
    rows = []
    for line in sents.split("\n"):
        for label, matches in matchers:
            if matches(line):
                rows.append(gene + "\t" + label + "\t" + line + "\n")
    return "".join(rows)


def _or_clause(terms):
    """Turn an "a|b|c" keyword list into the query clause "(a OR b OR c)"."""
    return "(" + terms.replace("|", " OR ") + ")"


def gene_addiction(gene):
    """Return rows tagging sentences about *gene* with the drug they mention."""
    query = (
        _or_clause(addiction_terms) + " AND " + _or_clause(drugs)
        + " AND " + gene
    )
    return tag_sentences(getSentences(query, gene), gene, drugs_d)


def gene_brainRegion(gene):
    """Return rows tagging sentences about *gene* with the brain region named."""
    query = _or_clause(brain_regions) + " AND " + gene
    return tag_sentences(getSentences(query, gene), gene, brain_d)


def main(argv=None):
    """Write ``<GENE>_addiction_sentences.tab`` for the gene given in *argv*."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: get_addiction_sentences.py GENE")
    gene = argv[1]
    report = gene_addiction(gene) + gene_brainRegion(gene)
    with codecs.open(
        gene + "_addiction_sentences.tab", "w", encoding='utf8'
    ) as writer:
        writer.write(report)


if __name__ == "__main__":
    main()