diff options
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/phenotypes_bulk_edit.py | 67 |
1 files changed, 61 insertions, 6 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py
index 02ee2e4..ebb0241 100644
--- a/scripts/phenotypes_bulk_edit.py
+++ b/scripts/phenotypes_bulk_edit.py
@@ -7,6 +7,7 @@ from typing import Iterator
 from functools import reduce
 
 import requests
+from lxml import etree
 from MySQLdb.cursors import DictCursor
 
 from gn_libs import jobs, mysqldb, sqlite3
@@ -96,10 +97,65 @@ def __fetch_publications__(conn, ids):
         return tuple(dict(row) for row in cursor.fetchall())
 
 
+def __pub_date__(pubdate: etree.Element):
+    pubyear = pubdate.find("Year")
+    pubmonth = pubdate.find("Month")
+    pubday = pubdate.find("Day")
+    return {
+        "year": pubyear.text if pubyear is not None else None,
+        "month": pubmonth.text if pubmonth is not None else None,
+        "day": pubday.text if pubday is not None else None
+    }
+
+
+def __journal__(journal: etree.Element) -> dict:
+    volume = journal.find("JournalIssue/Volume")
+    issue = journal.find("JournalIssue/Issue")
+    return {
+        "volume": volume.text if volume is not None else None,
+        "issue": issue.text if issue is not None else None,
+        **__pub_date__(journal.find("JournalIssue/PubDate")),
+        "journal": journal.find("Title").text
+    }
+
+def __author__(author: etree.Element) -> str:
+    return "%s %s" % (
+        author.find("LastName").text,
+        author.find("Initials").text)
+
+
+def __pages__(pagination: etree.Element) -> str:
+    start = pagination.find("StartPage")
+    end = pagination.find("EndPage")
+    return (start.text + (
+        f"-{end.text}" if end is not None else ""
+    )) if start is not None else ""
+
+
+def __abstract__(article: etree.Element) -> str:
+    abstract = article.find("Abstract/AbstractText")
+    return abstract.text if abstract is not None else None
+
+
+def __article__(pubmed_article: etree.Element) -> dict:
+    article = pubmed_article.find("MedlineCitation/Article")
+    return {
+        "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
+        "title": article.find("ArticleTitle").text,
+        **__journal__(article.find("Journal")),
+        "abstract": __abstract__(article),
+        "pages": __pages__(article.find("Pagination")),
+        "authors": ", ".join(__author__(author)
+                             for author in article.findall("AuthorList/Author"))
+    }
+
+
 def __process_pubmed_publication_data__(text):
     """Process the data from PubMed into usable data."""
-    # Process with lxml
-    pass
+    doc = etree.XML(text)
+    articles = doc.xpath("//PubmedArticle")
+    logger.debug("Retrieved %s publications from NCBI", len(articles))
+    return tuple(__article__(article) for article in articles)
 
 
 def __fetch_new_pubmed_ids__(pubmed_ids):
@@ -138,15 +194,14 @@ def __fetch_new_pubmed_ids__(pubmed_ids):
     return tuple()
 
 
-def publications_differences(file_data, db_data, pubmed_ids):
+def publications_differences(file_data, db_data, pubmed_ids) -> dict:
     """Compute differences in the publications."""
     logger.info("Computing differences in publications.")
     db_pubmed_ids = reduce(lambda coll, curr: coll.union(set([curr["PubMed_ID"]])),
                            db_data,
                            set([None]))
-    new_pubmeds = __fetch_new_pubmed_ids__(tuple(
-        pubmed_ids.difference(db_pubmed_ids)))
-    pass
+    new_pubmed_ids = tuple(pubmed_ids.difference(db_pubmed_ids))
+    new_publications = __fetch_new_pubmed_ids__(new_pubmed_ids)
 
 
 def compute_differences(conn, file_contents, pheno_ids, pheno_xref_ids, pubmed_ids) -> tuple[tuple[dict, ...], tuple[dict, ...], tuple[dict, ...]]: