about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- scripts/phenotypes_bulk_edit.py | 114
-rw-r--r-- uploader/publications/pubmed.py | 97
2 files changed, 99 insertions, 112 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py
index 07104a5..488805c 100644
--- a/scripts/phenotypes_bulk_edit.py
+++ b/scripts/phenotypes_bulk_edit.py
@@ -12,6 +12,7 @@ from MySQLdb.cursors import DictCursor
 
 from gn_libs import jobs, mysqldb, sqlite3
 
+import uploader.publications.pubmed as pmed
 logging.basicConfig(
     format="%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
@@ -83,117 +84,6 @@ def descriptions_differences(file_data, db_data) -> dict[str, str]:
     return diff
 
 
-def __fetch_publications__(conn, ids):
-    """Fetch publication from database by ID."""
-    paramstr = ",".join(["(%s, %s)"] * len(ids))
-    query = (
-        "SELECT "
-        "pxr.PhenotypeId, pxr.Id AS xref_id, pxr.PublicationId, pub.PubMed_ID "
-        "FROM PublishXRef AS pxr INNER JOIN Publication AS pub "
-        "ON pxr.PublicationId=pub.Id "
-        f"WHERE (pxr.PhenotypeId, pxr.Id) IN ({paramstr})")
-    with conn.cursor(cursorclass=DictCursor) as cursor:
-        cursor.execute(query, tuple(item for row in ids for item in row))
-        return tuple(dict(row) for row in cursor.fetchall())
-
-
-def __pub_date__(pubdate: etree.Element):
-    pubyear = pubdate.find("Year")
-    pubmonth = pubdate.find("Month")
-    pubday = pubdate.find("Day")
-    return {
-        "year": pubyear.text if pubyear is not None else None,
-        "month": pubmonth.text if pubmonth is not None else None,
-        "day": pubday.text if pubday is not None else None
-    }
-
-
-def __journal__(journal: etree.Element) -> dict:
-    volume = journal.find("JournalIssue/Volume")
-    issue = journal.find("JournalIssue/Issue")
-    return {
-        "volume": volume.text if volume is not None else None,
-        "issue": issue.text if issue is not None else None,
-        **__pub_date__(journal.find("JournalIssue/PubDate")),
-        "journal": journal.find("Title").text
-    }
-
-def __author__(author: etree.Element) -> str:
-    return "%s %s" % (
-        author.find("LastName").text,
-        author.find("Initials").text)
-
-
-def __pages__(pagination: etree.Element) -> str:
-    start = pagination.find("StartPage")
-    end = pagination.find("EndPage")
-    return (start.text + (
-        f"-{end.text}" if end is not None else ""
-    )) if start is not None else ""
-
-
-def __abstract__(article: etree.Element) -> str:
-    abstract = article.find("Abstract/AbstractText")
-    return abstract.text if abstract is not None else None
-
-
-def __article__(pubmed_article: etree.Element) -> dict:
-    article = pubmed_article.find("MedlineCitation/Article")
-    return {
-        "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
-        "title": article.find("ArticleTitle").text,
-        **__journal__(article.find("Journal")),
-        "abstract": __abstract__(article),
-        "pages": __pages__(article.find("Pagination")),
-        "authors": ", ".join(__author__(author)
-                             for author in article.findall("AuthorList/Author"))
-    }
-
-
-def __process_pubmed_publication_data__(text):
-    """Process the data from PubMed into usable data."""
-    doc = etree.XML(text)
-    articles = doc.xpath("//PubmedArticle")
-    logger.debug("Retrieved %s publications from NCBI", len(articles))
-    return tuple(__article__(article) for article in articles)
-
-
-def __fetch_new_pubmed_ids__(pubmed_ids):
-    """Retrieve data on new publications from NCBI."""
-    # See whether we can retrieve multiple publications in one go
-    # Parse data and save to DB
-    # Return PublicationId(s) for new publication(s).
-    if len(pubmed_ids) == 0:
-        logger.debug("There are no new PubMed IDs to fetch")
-        return tuple()
-
-    logger.info("Fetching publications data for the following PubMed IDs: %s",
-                ", ".join((str(pid) for pid in pubmed_ids)))
-
-    # Should we, perhaps, pass this in from a config variable?
-    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
-    try:
-        response = requests.get(
-            uri,
-            params={
-                "db": "pubmed",
-                "retmode": "xml",
-                "id": ",".join(str(item) for item in pubmed_ids)
-            })
-
-        if response.status_code == 200:
-            return __process_pubmed_publication_data__(response.text)
-
-        logger.error(
-            "Could not fetch the new publication from %s (status code: %s)",
-            uri,
-            response.status_code)
-    except requests.exceptions.ConnectionError:
-        logger.error("Could not find the domain %s", uri)
-
-    return tuple()
-
-
 def __save_new_publications__(conn, publications, pubmed_ids) -> dict:
     if len(publications) > 0:
         with conn.cursor(cursorclass=DictCursor) as cursor:
@@ -238,7 +128,7 @@ def publications_differences(conn, file_data, db_data, pubmed_ids) -> dict:
     }
     new_pubmed_ids = tuple(pubmed_ids.difference(db_pubmed_ids))
     new_publications = __save_new_publications__(
-        conn, __fetch_new_pubmed_ids__(new_pubmed_ids), new_pubmed_ids)
+        conn, pmed.fetch_publications(new_pubmed_ids), new_pubmed_ids)
     new_pubmedid_to_id_map = {
         row["PubMed_ID"]: new_publications.get(
             row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"])
diff --git a/uploader/publications/pubmed.py b/uploader/publications/pubmed.py
new file mode 100644
index 0000000..d984d99
--- /dev/null
+++ b/uploader/publications/pubmed.py
@@ -0,0 +1,141 @@
+"""Module to interact with NCBI's PubMed"""
+import logging
+from typing import Optional
+
+import requests
+from lxml import etree
+
+logger = logging.getLogger(__name__)
+
+# Seconds to wait on NCBI before giving up: without a timeout, `requests`
+# can block indefinitely on an unresponsive server.
+REQUEST_TIMEOUT = 30
+
+
+def __text__(parent: etree.Element, path: str) -> Optional[str]:
+    """Return the text of the first `path` match under `parent`, or None."""
+    found = parent.find(path)
+    return found.text if found is not None else None
+
+
+def __pub_date__(pubdate: etree.Element) -> dict:
+    """Extract year, month and day from a PubDate; absent parts are None."""
+    return {
+        "year": __text__(pubdate, "Year"),
+        "month": __text__(pubdate, "Month"),
+        "day": __text__(pubdate, "Day")
+    }
+
+
+def __journal__(journal: etree.Element) -> dict:
+    """Extract volume, issue, publication date and title from a Journal."""
+    return {
+        "volume": __text__(journal, "JournalIssue/Volume"),
+        "issue": __text__(journal, "JournalIssue/Issue"),
+        **__pub_date__(journal.find("JournalIssue/PubDate")),
+        # Title is optional in the PubMed DTD, so it may come back as None.
+        "journal": __text__(journal, "Title")
+    }
+
+
+def __author__(author: etree.Element) -> str:
+    """Format an Author as "LastName Initials".
+
+    Group authors carry a CollectiveName instead of a LastName, and Initials
+    are optional, so fall back to whatever is present rather than fail on a
+    missing element. Returns "" when no name can be found.
+    """
+    lastname = __text__(author, "LastName")
+    if lastname is None:
+        return __text__(author, "CollectiveName") or ""
+    initials = __text__(author, "Initials")
+    return f"{lastname} {initials}" if initials else lastname
+
+
+def __pages__(pagination: etree.Element) -> str:
+    """Return "start-end", or "start" if there is no end page, else ""."""
+    start = __text__(pagination, "StartPage")
+    if start is None:
+        return ""
+    end = __text__(pagination, "EndPage")
+    return f"{start}-{end}" if end is not None else start
+
+
+def __abstract__(article: etree.Element) -> Optional[str]:
+    """Return the text of the article's first AbstractText, or None."""
+    return __text__(article, "Abstract/AbstractText")
+
+
+def __article__(pubmed_article: etree.Element) -> dict:
+    """Convert a PubmedArticle element into a flat dict of publication data."""
+    article = pubmed_article.find("MedlineCitation/Article")
+    # Pagination is optional in the PubMed DTD (an ELocationID can take its
+    # place), so only parse page numbers when the element exists.
+    pagination = article.find("Pagination")
+    return {
+        "pubmed_id": __text__(pubmed_article, "MedlineCitation/PMID"),
+        "title": __text__(article, "ArticleTitle"),
+        **__journal__(article.find("Journal")),
+        "abstract": __abstract__(article),
+        "pages": __pages__(pagination) if pagination is not None else "",
+        # filter() drops authors for whom no name could be extracted.
+        "authors": ", ".join(filter(None, (
+            __author__(author)
+            for author in article.findall("AuthorList/Author"))))
+    }
+
+
+def __process_pubmed_publication_data__(text) -> tuple[dict, ...]:
+    """Parse an efetch XML document into one dict per PubmedArticle."""
+    doc = etree.XML(text)
+    articles = doc.xpath("//PubmedArticle")
+    logger.debug("Retrieved %s publications from NCBI", len(articles))
+    return tuple(__article__(article) for article in articles)
+
+
+def fetch_publications(pubmed_ids: tuple[int, ...]) -> tuple[dict, ...]:
+    """Retrieve data on the given publications from NCBI.
+
+    All the IDs are requested in a single call to NCBI's efetch endpoint.
+
+    Returns a tuple with one dict per article found in the response, with
+    the keys: pubmed_id, title, volume, issue, year, month, day, journal,
+    abstract, pages and authors. An empty tuple is returned when no IDs are
+    given, or when the request fails, times out, or gets a non-200 status;
+    such failures are logged rather than raised.
+    """
+    if not pubmed_ids:
+        logger.debug("There are no new PubMed IDs to fetch")
+        return tuple()
+
+    logger.info("Fetching publications data for the following PubMed IDs: %s",
+                ", ".join((str(pid) for pid in pubmed_ids)))
+
+    # Should we, perhaps, pass this in from a config variable?
+    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    try:
+        response = requests.get(
+            uri,
+            params={
+                "db": "pubmed",
+                "retmode": "xml",
+                "id": ",".join(str(item) for item in pubmed_ids)
+            },
+            timeout=REQUEST_TIMEOUT)
+    except requests.exceptions.ConnectionError:
+        logger.error("Could not find the domain %s", uri)
+        return tuple()
+    except requests.exceptions.Timeout:
+        logger.error("Timed out waiting for a response from %s", uri)
+        return tuple()
+
+    if response.status_code == 200:
+        # Parse the raw bytes: lxml refuses `str` input that carries an XML
+        # encoding declaration, but always accepts bytes.
+        return __process_pubmed_publication_data__(response.content)
+
+    logger.error(
+        "Could not fetch the new publication from %s (status code: %s)",
+        uri,
+        response.status_code)
+    return tuple()