aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--scripts/phenotypes_bulk_edit.py114
-rw-r--r--uploader/publications/pubmed.py97
2 files changed, 99 insertions, 112 deletions
diff --git a/scripts/phenotypes_bulk_edit.py b/scripts/phenotypes_bulk_edit.py
index 07104a5..488805c 100644
--- a/scripts/phenotypes_bulk_edit.py
+++ b/scripts/phenotypes_bulk_edit.py
@@ -12,6 +12,7 @@ from MySQLdb.cursors import DictCursor
from gn_libs import jobs, mysqldb, sqlite3
+import uploader.publications.pubmed as pmed
logging.basicConfig(
format="%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
@@ -83,117 +84,6 @@ def descriptions_differences(file_data, db_data) -> dict[str, str]:
return diff
-def __fetch_publications__(conn, ids):
- """Fetch publication from database by ID."""
- paramstr = ",".join(["(%s, %s)"] * len(ids))
- query = (
- "SELECT "
- "pxr.PhenotypeId, pxr.Id AS xref_id, pxr.PublicationId, pub.PubMed_ID "
- "FROM PublishXRef AS pxr INNER JOIN Publication AS pub "
- "ON pxr.PublicationId=pub.Id "
- f"WHERE (pxr.PhenotypeId, pxr.Id) IN ({paramstr})")
- with conn.cursor(cursorclass=DictCursor) as cursor:
- cursor.execute(query, tuple(item for row in ids for item in row))
- return tuple(dict(row) for row in cursor.fetchall())
-
-
-def __pub_date__(pubdate: etree.Element):
- pubyear = pubdate.find("Year")
- pubmonth = pubdate.find("Month")
- pubday = pubdate.find("Day")
- return {
- "year": pubyear.text if pubyear is not None else None,
- "month": pubmonth.text if pubmonth is not None else None,
- "day": pubday.text if pubday is not None else None
- }
-
-
-def __journal__(journal: etree.Element) -> dict:
- volume = journal.find("JournalIssue/Volume")
- issue = journal.find("JournalIssue/Issue")
- return {
- "volume": volume.text if volume is not None else None,
- "issue": issue.text if issue is not None else None,
- **__pub_date__(journal.find("JournalIssue/PubDate")),
- "journal": journal.find("Title").text
- }
-
-def __author__(author: etree.Element) -> str:
- return "%s %s" % (
- author.find("LastName").text,
- author.find("Initials").text)
-
-
-def __pages__(pagination: etree.Element) -> str:
- start = pagination.find("StartPage")
- end = pagination.find("EndPage")
- return (start.text + (
- f"-{end.text}" if end is not None else ""
- )) if start is not None else ""
-
-
-def __abstract__(article: etree.Element) -> str:
- abstract = article.find("Abstract/AbstractText")
- return abstract.text if abstract is not None else None
-
-
-def __article__(pubmed_article: etree.Element) -> dict:
- article = pubmed_article.find("MedlineCitation/Article")
- return {
- "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
- "title": article.find("ArticleTitle").text,
- **__journal__(article.find("Journal")),
- "abstract": __abstract__(article),
- "pages": __pages__(article.find("Pagination")),
- "authors": ", ".join(__author__(author)
- for author in article.findall("AuthorList/Author"))
- }
-
-
-def __process_pubmed_publication_data__(text):
- """Process the data from PubMed into usable data."""
- doc = etree.XML(text)
- articles = doc.xpath("//PubmedArticle")
- logger.debug("Retrieved %s publications from NCBI", len(articles))
- return tuple(__article__(article) for article in articles)
-
-
-def __fetch_new_pubmed_ids__(pubmed_ids):
- """Retrieve data on new publications from NCBI."""
- # See whether we can retrieve multiple publications in one go
- # Parse data and save to DB
- # Return PublicationId(s) for new publication(s).
- if len(pubmed_ids) == 0:
- logger.debug("There are no new PubMed IDs to fetch")
- return tuple()
-
- logger.info("Fetching publications data for the following PubMed IDs: %s",
- ", ".join((str(pid) for pid in pubmed_ids)))
-
- # Should we, perhaps, pass this in from a config variable?
- uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
- try:
- response = requests.get(
- uri,
- params={
- "db": "pubmed",
- "retmode": "xml",
- "id": ",".join(str(item) for item in pubmed_ids)
- })
-
- if response.status_code == 200:
- return __process_pubmed_publication_data__(response.text)
-
- logger.error(
- "Could not fetch the new publication from %s (status code: %s)",
- uri,
- response.status_code)
- except requests.exceptions.ConnectionError:
- logger.error("Could not find the domain %s", uri)
-
- return tuple()
-
-
def __save_new_publications__(conn, publications, pubmed_ids) -> dict:
if len(publications) > 0:
with conn.cursor(cursorclass=DictCursor) as cursor:
@@ -238,7 +128,7 @@ def publications_differences(conn, file_data, db_data, pubmed_ids) -> dict:
}
new_pubmed_ids = tuple(pubmed_ids.difference(db_pubmed_ids))
new_publications = __save_new_publications__(
- conn, __fetch_new_pubmed_ids__(new_pubmed_ids), new_pubmed_ids)
+ conn, pmed.fetch_publications(new_pubmed_ids), new_pubmed_ids)
new_pubmedid_to_id_map = {
row["PubMed_ID"]: new_publications.get(
row["PubMed_ID"], pubmedid_to_id_map[f"{row['phenotype_id']}::{row['xref_id']}"])
diff --git a/uploader/publications/pubmed.py b/uploader/publications/pubmed.py
new file mode 100644
index 0000000..d984d99
--- /dev/null
+++ b/uploader/publications/pubmed.py
@@ -0,0 +1,97 @@
+"""Module to interact with NCBI's PubMed"""
+
+
def __pub_date__(pubdate: etree.Element) -> dict:
    """Pull the year/month/day strings out of a PubMed PubDate element.

    Components missing from the XML come back as None rather than raising.
    """
    def _text(tag):
        node = pubdate.find(tag)
        return None if node is None else node.text

    return {"year": _text("Year"),
            "month": _text("Month"),
            "day": _text("Day")}
+
+
def __journal__(journal: etree.Element) -> dict:
    """Extract journal metadata from an Article/Journal element.

    Returns a dict with "volume", "issue", "journal" plus the date keys
    produced by __pub_date__; any element missing from the XML yields None.

    The original dereferenced ``journal.find("Title").text`` and passed the
    PubDate lookup straight to __pub_date__ without guards, raising
    AttributeError on records lacking either element.
    """
    volume = journal.find("JournalIssue/Volume")
    issue = journal.find("JournalIssue/Issue")
    title = journal.find("Title")
    pubdate = journal.find("JournalIssue/PubDate")
    return {
        "volume": volume.text if volume is not None else None,
        "issue": issue.text if issue is not None else None,
        **(__pub_date__(pubdate) if pubdate is not None
           else {"year": None, "month": None, "day": None}),
        "journal": title.text if title is not None else None
    }
+
def __author__(author: etree.Element) -> str:
    """Format an Author element as "LastName Initials".

    Elements missing LastName or Initials (e.g. collective/group authors in
    PubMed data — TODO confirm against real records) are skipped instead of
    raising AttributeError as the original "%s %s" formatting did.
    """
    parts = (author.find("LastName"), author.find("Initials"))
    return " ".join(
        node.text for node in parts if node is not None and node.text)
+
+
def __pages__(pagination: etree.Element) -> str:
    """Render the page range as "start-end", "start" alone, or ""."""
    start = pagination.find("StartPage")
    if start is None:
        return ""
    end = pagination.find("EndPage")
    suffix = f"-{end.text}" if end is not None else ""
    return start.text + suffix
+
+
def __abstract__(article: etree.Element) -> Optional[str]:
    """Return the first AbstractText under Abstract, or None when absent.

    The original annotated the return type as plain ``str`` although the
    no-abstract path explicitly returns None.
    """
    abstract = article.find("Abstract/AbstractText")
    return abstract.text if abstract is not None else None
+
+
def __article__(pubmed_article: etree.Element) -> dict:
    """Flatten a PubmedArticle element into a single publication dict.

    Combines the PMID, title, journal metadata, abstract, page range and a
    comma-separated author list into one flat dict.
    """
    article = pubmed_article.find("MedlineCitation/Article")
    # Pagination is not guaranteed to be present; the original passed None
    # straight into __pages__, which then raised AttributeError.
    pagination = article.find("Pagination")
    return {
        "pubmed_id": pubmed_article.find("MedlineCitation/PMID").text,
        "title": article.find("ArticleTitle").text,
        **__journal__(article.find("Journal")),
        "abstract": __abstract__(article),
        "pages": __pages__(pagination) if pagination is not None else "",
        "authors": ", ".join(__author__(author)
                             for author in article.findall("AuthorList/Author"))
    }
+
+
def __process_pubmed_publication_data__(text) -> tuple[dict, ...]:
    """Parse the raw efetch XML payload into a tuple of publication dicts."""
    articles = etree.XML(text).xpath("//PubmedArticle")
    logger.debug("Retrieved %s publications from NCBI", len(articles))
    return tuple(__article__(node) for node in articles)
+
def fetch_publications(pubmed_ids: tuple[int, ...],
                       timeout: float = 30.0) -> tuple[dict, ...]:
    """Retrieve publication details for *pubmed_ids* from NCBI's efetch.

    Parameters:
        pubmed_ids: PubMed IDs to look up; an empty tuple short-circuits
            without any network traffic.
        timeout: seconds to wait for NCBI before giving up. The original
            request had no timeout and could hang indefinitely.

    Returns a tuple of publication dicts, empty on any failure.
    """
    if len(pubmed_ids) == 0:
        logger.debug("There are no new PubMed IDs to fetch")
        return tuple()

    logger.info("Fetching publications data for the following PubMed IDs: %s",
                ", ".join(str(pid) for pid in pubmed_ids))

    # Should we, perhaps, pass this in from a config variable?
    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    try:
        response = requests.get(
            uri,
            params={
                "db": "pubmed",
                "retmode": "xml",
                # efetch accepts a comma-separated ID list, so all the
                # publications are fetched in a single request.
                "id": ",".join(str(item) for item in pubmed_ids)
            },
            timeout=timeout)

        if response.status_code == 200:
            return __process_pubmed_publication_data__(response.text)

        logger.error(
            "Could not fetch the new publication from %s (status code: %s)",
            uri,
            response.status_code)
    except requests.exceptions.ConnectionError:
        logger.error("Could not find the domain %s", uri)
    except requests.exceptions.Timeout:
        logger.error("Request to %s timed out after %s seconds", uri, timeout)

    return tuple()