about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--gn3/db/rdf/wiki.py113
-rw-r--r--tests/unit/db/rdf/test_wiki.py7
2 files changed, 69 insertions, 51 deletions
diff --git a/gn3/db/rdf/wiki.py b/gn3/db/rdf/wiki.py
index 5e8e02e..2a5924e 100644
--- a/gn3/db/rdf/wiki.py
+++ b/gn3/db/rdf/wiki.py
@@ -9,13 +9,13 @@ NOTE: In the CONSTRUCT queries below, we manually sort the arrays from
    <https://stackoverflow.com/questions/78186393>
    <https://www.w3.org/TR/rdf-sparql-query/#modOrderBy>
 """
-from datetime import datetime
 from string import Template
 from gn3.db.rdf import (
     BASE_CONTEXT,
     RDF_PREFIXES,
     query_frame_and_compact,
     update_rdf,
+    sparql_query,
 )
 
 
@@ -36,20 +36,6 @@ WIKI_CONTEXT = BASE_CONTEXT | {
     "id": "dct:identifier",
 }
 
-RIF_CONTEXT = BASE_CONTEXT | {
-    "dct": "http://purl.org/dc/terms/",
-    "skos": "http://www.w3.org/2004/02/skos/core#",
-    "symbol": "gnt:symbol",
-    "species": "gnt:species",
-    "taxonomic_id": "skos:notation",
-    "gene_id": "gnt:hasGeneId",
-    "pubmed_id": "dct:references",
-    "created": "dct:created",
-    "comment": "rdfs:comment",
-    "version": "dct:hasVersion",
-    "id": "dct:identifier",
-}
-
 
 def __sanitize_result(result: dict) -> dict:
     """Make sure `categories` and `pubmed_ids` are always arrays"""
@@ -262,43 +248,72 @@ $comment_triple}
 
 
 def get_rif_entries_by_symbol(
-        symbol: str, sparql_uri: str, graph: str = "<http://genenetwork.org>"
+    symbol: str, sparql_uri: str, graph: str = "<http://genenetwork.org>"
 ) -> dict:
-    """Fetch NCBI RIF entries by a symbol.  Symbol here is case in-sensitive."""
-    query = Template("""
+    """Fetch NCBI RIF entries for a given symbol (case-insensitive).
+
+This function retrieves NCBI RIF entries using a SPARQL `SELECT` query
+instead of a `CONSTRUCT` to avoid truncation.  The Virtuoso SPARQL
+engine limits query results to 1,048,576 triples per solution, and
+NCBI entries can exceed this limit.  Since there may be more than
+2,000 entries, which could result in the number of triples surpassing
+the limit, `SELECT` is used to ensure complete data retrieval without
+truncation.  See:
+
+<https://community.openlinksw.com/t/sparql-query-limiting-results-to-100000-triples/2131>
+
+    """
+    # XXX: Consider pagination
+    query = Template(
+        """
 $prefix
 
-CONSTRUCT {
-    ?comment gnt:symbol ?symbol ;
-             gnt:species ?species ;
-             dct:references ?pmid ;
-             rdfs:comment ?text ;
-             dct:hasVersion ?version ;
-             dct:created ?created ;
-             gnt:hasGeneId ?gene_id ;
-             skos:notation ?taxonId .
-} FROM $graph WHERE {
-    ?comment rdfs:label ?text_ ;
-             gnt:symbol ?symbol ;
-             rdf:type gnc:NCBIWikiEntry ;
-             gnt:hasGeneId ?gene_id_ ;
-             dct:hasVersion ?version ;
-             dct:references ?pmid_ ;
-             dct:created ?createTime ;
-             gnt:belongsToSpecies ?speciesId .
-    ?speciesId gnt:shortName ?species .
-    FILTER ( LCASE(STR(?symbol)) = LCASE("$symbol") ) .
-    OPTIONAL { ?comment skos:notation ?taxonId_ . } .
-    BIND (STR(?text_) AS ?text) .
-    BIND (xsd:integer(STRAFTER(STR(?taxonId_), STR(taxon:))) AS ?taxonId) .
-    BIND (xsd:integer(STRAFTER(STR(?pmid_), STR(pubmed:))) AS ?pmid) .
+SELECT ?comment ?symbol ?species ?pubmed_id ?version ?created ?gene_id ?taxonomic_id
+FROM $graph WHERE {
+    ?comment_id rdfs:label ?text_ ;
+                gnt:symbol ?symbol ;
+                rdf:type gnc:NCBIWikiEntry ;
+                gnt:hasGeneId ?gene_id_ ;
+                dct:hasVersion ?version ;
+                dct:references ?pmid_ ;
+                dct:created ?createTime ;
+                gnt:belongsToSpecies ?speciesId .
+    ?speciesId rdfs:label ?species .
+    FILTER ( LCASE(?symbol) = LCASE("$symbol") ) .
+    OPTIONAL { ?comment_id skos:notation ?taxonId_ . } .
+    BIND (STR(?text_) AS ?comment) .
+    BIND (xsd:integer(STRAFTER(STR(?taxonId_), STR(taxon:))) AS ?taxonomic_id) .
+    BIND (xsd:integer(STRAFTER(STR(?pmid_), STR(pubmed:))) AS ?pubmed_id) .
     BIND (xsd:integer(STRAFTER(STR(?gene_id_), STR(generif:))) AS ?gene_id) .
     BIND (STR(?createTime) AS ?created) .
-}
-""").substitute(prefix=RDF_PREFIXES, graph=graph, symbol=symbol)
-    results = query_frame_and_compact(query, RIF_CONTEXT, sparql_uri)
-    results["data"] = sorted(
-        results["data"],
-        key=lambda k: (k["species"],
-                       datetime.strptime(k["created"], "%Y-%m-%d %H:%M:%S")))
+} ORDER BY ?species ?createTime
+"""
+    ).substitute(prefix=RDF_PREFIXES, graph=graph, symbol=symbol)
+    results: dict[str, dict|list] = {
+        "@context": {
+            "dct": "http://purl.org/dc/terms/",
+            "gnt": "http://genenetwork.org/term/",
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+            "skos": "http://www.w3.org/2004/02/skos/core#",
+            "symbol": "gnt:symbol",
+            "species": "gnt:species",
+            "taxonomic_id": "skos:notation",
+            "gene_id": "gnt:hasGeneId",
+            "pubmed_id": "dct:references",
+            "created": "dct:created",
+            "comment": "rdfs:comment",
+            "version": "dct:hasVersion",
+        }
+    }
+    data: list[dict[str, int|str]] = []
+    for entry in sparql_query(query=query, endpoint=sparql_uri, format_type="json"):
+        data.append(
+            {
+                key: int(metadata.get("value"))
+                if metadata.get("value").isdigit()
+                else metadata.get("value")
+                for key, metadata in entry.items()
+            }
+        )
+    results["data"] = data
     return results
diff --git a/tests/unit/db/rdf/test_wiki.py b/tests/unit/db/rdf/test_wiki.py
index 7a0dc3a..150130e 100644
--- a/tests/unit/db/rdf/test_wiki.py
+++ b/tests/unit/db/rdf/test_wiki.py
@@ -405,8 +405,11 @@ def test_update_wiki_comment(rdf_setup):  # pylint: disable=W0613,W0621
 def test_get_rif_entries_by_symbol(rdf_setup):  # pylint: disable=W0613,W0621
     """Test fetching NCBI Rif Metadata from RDF"""
     sparql_conf = SPARQL_CONF
-    assert get_rif_entries_by_symbol(
+    entries = get_rif_entries_by_symbol(
         symbol="Lpl",
         sparql_uri=sparql_conf["sparql_endpoint"],
         graph=GRAPH,
-    ) == LPL_RIF_ENTRIES
+    )
+    assert len(LPL_RIF_ENTRIES["data"]) == len(entries["data"])
+    for result, expected in zip(LPL_RIF_ENTRIES["data"], entries["data"]):
+        TestCase().assertDictEqual(result, expected)