-rw-r--r-- | scripts/phenotypes_bulk_edit.py | 114
-rw-r--r-- | uploader/publications/pubmed.py |  97
2 files changed, 99 insertions, 112 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py
index 07104a5..488805c 100644
--- a/scripts/phenotypes_bulk_edit.py
+++ b/scripts/phenotypes_bulk_edit.py
@@ -12,6 +12,7 @@
 from MySQLdb.cursors import DictCursor
 from gn_libs import jobs, mysqldb, sqlite3
+import uploader.publications.pubmed as pmed

 logging.basicConfig(
     format="%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
@@ -83,117 +84,6 @@ def descriptions_differences(file_data, db_data) -> dict[str, str]:
     return diff


-def __fetch_publications__(conn, ids):
-    """Fetch publication from database by ID."""
-    paramstr = ",".join(["(%s, %s)"] * len(ids))
-    query = (
-        "SELECT "
-        "pxr.PhenotypeId, pxr.Id AS xref_id, pxr.PublicationId, pub.PubMed_ID "
-        "FROM PublishXRef AS pxr INNER JOIN Publication AS pub "
-        "ON pxr.PublicationId=pub.Id "
-        f"WHERE (pxr.PhenotypeId, pxr.Id) IN ({paramstr})")
-    with conn.cursor(cursorclass=DictCursor) as cursor:
-        cursor.execute(query, tuple(item for row in ids for item in row))
-        return tuple(dict(row) for row in cursor.fetchall())
-
-
-def __pub_date__(pubdate: etree.Element):
-    pubyear = pubdate.find("Year")
-    pubmonth = pubdate.find("Month")
-    pubday = pubdate.find("Day")
-    return {
-        "year": pubyear.text if pubyear is not None else None,
-        "month": pubmonth.text if pubmonth is not None else None,
-        "day": pubday.text if pubday is not None else None
-    }
-
-
-def __journal__(journal: etree.Element) -> dict:
-    volume = journal.find("JournalIssue/Volume")
-    issue = journal.find("JournalIssue/Issue")
-    return {
-        "volume": volume.text if volume is not None else None,
-        "issue": issue.text if issue is not None else None,
-        **__pub_date__(journal.find("JournalIssue/PubDate")),
-        "journal": journal.find("Title").text
-    }
-
-def __author__(author: etree.Element) -> str:
-    return "%s %s" % (
-        author.find("LastName").text,
-        author.find("Initials").text)
-
-
-def __pages__(pagination: etree.Element) -> str:
-    start = pagination.find("StartPage")
-    end = pagination.find("EndPage")
-    return (start.text + (
-        f"-{end.text}" if end is not None else ""
-    )) if start is not None else ""
-
-
-def __abstract__(article: etree.Element) -> str:
-    abstract = article.find("Abstract/AbstractText")
-    return abstract.text if abstract is not None else None
-
-
-def __article__(pubmed_article: etree.Element) -> dict:
-    article = pubmed_article.find("MedlineCitation/Article")
-    return {
-        "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
-        "title": article.find("ArticleTitle").text,
-        **__journal__(article.find("Journal")),
-        "abstract": __abstract__(article),
-        "pages": __pages__(article.find("Pagination")),
-        "authors": ", ".join(__author__(author)
-                             for author in article.findall("AuthorList/Author"))
-    }
-
-
-def __process_pubmed_publication_data__(text):
-    """Process the data from PubMed into usable data."""
-    doc = etree.XML(text)
-    articles = doc.xpath("//PubmedArticle")
-    logger.debug("Retrieved %s publications from NCBI", len(articles))
-    return tuple(__article__(article) for article in articles)
-
-
-def __fetch_new_pubmed_ids__(pubmed_ids):
-    """Retrieve data on new publications from NCBI."""
-    # See whether we can retrieve multiple publications in one go
-    # Parse data and save to DB
-    # Return PublicationId(s) for new publication(s).
-    if len(pubmed_ids) == 0:
-        logger.debug("There are no new PubMed IDs to fetch")
-        return tuple()
-
-    logger.info("Fetching publications data for the following PubMed IDs: %s",
-                ", ".join((str(pid) for pid in pubmed_ids)))
-
-    # Should we, perhaps, pass this in from a config variable?
-    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
-    try:
-        response = requests.get(
-            uri,
-            params={
-                "db": "pubmed",
-                "retmode": "xml",
-                "id": ",".join(str(item) for item in pubmed_ids)
-            })
-
-        if response.status_code == 200:
-            return __process_pubmed_publication_data__(response.text)
-
-        logger.error(
-            "Could not fetch the new publication from %s (status code: %s)",
-            uri,
-            response.status_code)
-    except requests.exceptions.ConnectionError:
-        logger.error("Could not find the domain %s", uri)
-
-    return tuple()
-
-
 def __save_new_publications__(conn, publications, pubmed_ids) -> dict:
     if len(publications) > 0:
         with conn.cursor(cursorclass=DictCursor) as cursor:
@@ -238,7 +128,7 @@ def publications_differences(conn, file_data, db_data, pubmed_ids) -> dict:
     }
     new_pubmed_ids = tuple(pubmed_ids.difference(db_pubmed_ids))
     new_publications = __save_new_publications__(
-        conn, __fetch_new_pubmed_ids__(new_pubmed_ids), new_pubmed_ids)
+        conn, pmed.fetch_publications(new_pubmed_ids), new_pubmed_ids)
     new_pubmedid_to_id_map = {
         row["PubMed_ID"]: new_publications.get(
             row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"])
diff --git a/uploader/publications/pubmed.py b/uploader/publications/pubmed.py
new file mode 100644
index 0000000..d984d99
--- /dev/null
+++ b/uploader/publications/pubmed.py
@@ -0,0 +1,97 @@
+"""Module to interact with NCBI's PubMed"""
+
+
+def __pub_date__(pubdate: etree.Element):
+    pubyear = pubdate.find("Year")
+    pubmonth = pubdate.find("Month")
+    pubday = pubdate.find("Day")
+    return {
+        "year": pubyear.text if pubyear is not None else None,
+        "month": pubmonth.text if pubmonth is not None else None,
+        "day": pubday.text if pubday is not None else None
+    }
+
+
+def __journal__(journal: etree.Element) -> dict:
+    volume = journal.find("JournalIssue/Volume")
+    issue = journal.find("JournalIssue/Issue")
+    return {
+        "volume": volume.text if volume is not None else None,
+        "issue": issue.text if issue is not None else None,
+        **__pub_date__(journal.find("JournalIssue/PubDate")),
+        "journal": journal.find("Title").text
+    }
+
+def __author__(author: etree.Element) -> str:
+    return "%s %s" % (
+        author.find("LastName").text,
+        author.find("Initials").text)
+
+
+def __pages__(pagination: etree.Element) -> str:
+    start = pagination.find("StartPage")
+    end = pagination.find("EndPage")
+    return (start.text + (
+        f"-{end.text}" if end is not None else ""
+    )) if start is not None else ""
+
+
+def __abstract__(article: etree.Element) -> str:
+    abstract = article.find("Abstract/AbstractText")
+    return abstract.text if abstract is not None else None
+
+
+def __article__(pubmed_article: etree.Element) -> dict:
+    article = pubmed_article.find("MedlineCitation/Article")
+    return {
+        "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
+        "title": article.find("ArticleTitle").text,
+        **__journal__(article.find("Journal")),
+        "abstract": __abstract__(article),
+        "pages": __pages__(article.find("Pagination")),
+        "authors": ", ".join(__author__(author)
+                             for author in article.findall("AuthorList/Author"))
+    }
+
+
+def __process_pubmed_publication_data__(text) -> tuple[dict, ...]:
+    """Process the data from PubMed into usable data."""
+    doc = etree.XML(text)
+    articles = doc.xpath("//PubmedArticle")
+    logger.debug("Retrieved %s publications from NCBI", len(articles))
+    return tuple(__article__(article) for article in articles)
+
+def fetch_publications(pubmed_ids: tuple[int, ...]) -> tuple[dict, ...]:
+    """Retrieve data on new publications from NCBI."""
+    # See whether we can retrieve multiple publications in one go
+    # Parse data and save to DB
+    # Return PublicationId(s) for new publication(s).
+    if len(pubmed_ids) == 0:
+        logger.debug("There are no new PubMed IDs to fetch")
+        return tuple()
+
+    logger.info("Fetching publications data for the following PubMed IDs: %s",
+                ", ".join((str(pid) for pid in pubmed_ids)))
+
+    # Should we, perhaps, pass this in from a config variable?
+    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    try:
+        response = requests.get(
+            uri,
+            params={
+                "db": "pubmed",
+                "retmode": "xml",
+                "id": ",".join(str(item) for item in pubmed_ids)
+            })
+
+        if response.status_code == 200:
+            return __process_pubmed_publication_data__(response.text)
+
+        logger.error(
+            "Could not fetch the new publication from %s (status code: %s)",
+            uri,
+            response.status_code)
+    except requests.exceptions.ConnectionError:
+        logger.error("Could not find the domain %s", uri)
+
+    return tuple()
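Note on the new module: as shown in the hunk above, uploader/publications/pubmed.py calls etree.XML(), doc.xpath(), requests.get() and a module-level logger, but the file contains no imports, so it would raise NameError the first time it is used. A header along the following lines is presumably needed; this is only a sketch inferred from the calls in the hunk (doc.xpath() implies lxml's etree rather than the standard-library ElementTree):

# Presumed header for uploader/publications/pubmed.py, inferred from usage
# in the hunk above; not part of the commit as shown.
import logging

import requests           # used by fetch_publications()
from lxml import etree     # etree.XML() / .xpath() imply lxml, not xml.etree

# Module-level logger referenced by logger.debug()/logger.info()/logger.error()
logger = logging.getLogger(__name__)

With that in place, pmed.fetch_publications() takes a tuple of PubMed IDs, queries NCBI's efetch endpoint with db=pubmed and retmode=xml, and returns a tuple of dicts with the keys built in __article__(): pubmed_id, title, journal, volume, issue, year, month, day, abstract, pages and authors. It returns an empty tuple when given no IDs, when the response status is not 200, or when the connection fails.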