diff options
author | hakangunturkun | 2020-09-07 22:35:17 -0500 |
---|---|---|
committer | hakangunturkun | 2020-09-07 22:35:17 -0500 |
commit | a637889f13c2151303998e411dc81f3196a974f6 (patch) | |
tree | f593bb635e759889109504405d2abc3e31af9008 /utility/topGene_step1_cnt_abstracts.py | |
parent | 35179dec9aa1926102ad2ddbbd5b8ad2882be92e (diff) | |
download | genecup-a637889f13c2151303998e411dc81f3196a974f6.tar.gz |
last version
Diffstat (limited to 'utility/topGene_step1_cnt_abstracts.py')
-rwxr-xr-x | utility/topGene_step1_cnt_abstracts.py | 16 |
1 file changed, 11 insertions, 5 deletions
```diff
diff --git a/utility/topGene_step1_cnt_abstracts.py b/utility/topGene_step1_cnt_abstracts.py
index 420c9cf..a99835e 100755
--- a/utility/topGene_step1_cnt_abstracts.py
+++ b/utility/topGene_step1_cnt_abstracts.py
@@ -36,8 +36,10 @@ def saveStopWord(w):
 
 if len(sys.argv)==2:
     input_f=sys.argv[1]
+    mincnt=10
 else:
     input_f="./ncbi_gene_symb_syno_name_txid9606.txt"
+    mincnt=0;
 
 addiction=undic(addiction_d)
 drug=undic(drug_d)
@@ -52,10 +54,14 @@ with open (stopword_f, "r") as swf:
 with open (input_f, "r") as f:
     for line in f:
         do_search=0
+        skip=0
         inputline=line
         line=line.replace("-","\ ")
         # remove the annotated stopword
-        if "'" in line:
+        if re.findall(r"^xx", line):
+            print ("skip this line\n")
+            skip=1
+        elif "'" in line:
             do_search=1
             words=line.split("|")
             line=str()
@@ -72,14 +78,14 @@ with open (input_f, "r") as f:
         # tab is added if there are abstracts counts
         if "\t" in line:
             (gene, count)=line.split("\t")
-            # rerun if count is low, these are less annotated
-            # if int(count)<50:
-            # do_search=1
+            # rerun if count > 20; save time on reruns
+            if int(count)>=20:
+                do_search=1
         else:
             #no count,
             gene=line.strip()
             do_search=1
-        if do_search==1:
+        if do_search==1 and skip==0:
             # remove synonyms with only two letters
             if "|" in gene:
                 synos=gene.split("|")
```