about summary refs log tree commit diff
path: root/gn3/db
diff options
context:
space:
mode:
Diffstat (limited to 'gn3/db')
-rw-r--r-- gn3/db/constants.py 152
-rw-r--r-- gn3/db/rdf.py 126
-rw-r--r-- gn3/db/wiki.py 80
3 files changed, 321 insertions, 37 deletions
diff --git a/gn3/db/constants.py b/gn3/db/constants.py
new file mode 100644
index 0000000..45e3bfc
--- /dev/null
+++ b/gn3/db/constants.py
@@ -0,0 +1,152 @@
+"""
+This module contains some constants used in other modules.
+"""
+PREFIXES = {  # prefix label -> namespace IRI; RDF_PREFIXES is rendered from this dict
+    "dcat": "http://www.w3.org/ns/dcat#",
+    "dct": "http://purl.org/dc/terms/",
+    "ex": "http://example.org/stuff/1.0/",
+    "fabio": "http://purl.org/spar/fabio/",
+    "foaf": "http://xmlns.com/foaf/0.1/",
+    "generif": "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=",
+    "genotype": "http://genenetwork.org/genotype/",
+    "gn": "http://genenetwork.org/id/",
+    "gnc": "http://genenetwork.org/category/",
+    "gnt": "http://genenetwork.org/term/",
+    "owl": "http://www.w3.org/2002/07/owl#",
+    "phenotype": "http://genenetwork.org/phenotype/",
+    "prism": "http://prismstandard.org/namespaces/basic/2.0/",
+    "publication": "http://genenetwork.org/publication/",
+    "pubmed": "http://rdf.ncbi.nlm.nih.gov/pubmed/",
+    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+    "skos": "http://www.w3.org/2004/02/skos/core#",
+    "taxon": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=",
+    "up": "http://purl.uniprot.org/core/",
+    "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#",
+    "xsd": "http://www.w3.org/2001/XMLSchema#",
+}
+
+RDF_PREFIXES = "\n".join(  # one "PREFIX label: <iri>" line per PREFIXES entry
+    f"PREFIX {label}: <{iri}>" for label, iri in PREFIXES.items())
+
+BASE_CONTEXT = {  # JSON-LD keyword aliases and prefixes that other contexts extend
+    "data": "@graph",
+    "type": "@type",
+    "gn": "http://genenetwork.org/id/",
+    "gnc": "http://genenetwork.org/category/",
+    "gnt": "http://genenetwork.org/term/",
+    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+}
+
+DATASET_CONTEXT = {  # JSON-LD context: short dataset keys -> prefixed terms, plus the prefixes used
+    "accessRights": "dct:accessRights",
+    "accessionId": "dct:identifier",
+    "acknowledgement": "gnt:hasAcknowledgement",
+    "altLabel": "skos:altLabel",
+    "caseInfo": "gnt:hasCaseInfo",
+    "classifiedUnder": "xkos:classifiedUnder",
+    "contributors": "dct:creator",
+    "contactPoint": "dcat:contactPoint",
+    "created": "dct:created",
+    "dcat": "http://www.w3.org/ns/dcat#",
+    "dct": "http://purl.org/dc/terms/",
+    "description": "dct:description",
+    "ex": "http://example.org/stuff/1.0/",
+    "experimentDesignInfo": "gnt:hasExperimentDesignInfo",
+    "experimentType": "gnt:hasExperimentType",
+    "foaf": "http://xmlns.com/foaf/0.1/",
+    "geoSeriesId": "gnt:hasGeoSeriesId",
+    "gnt": "http://genenetwork.org/term/",
+    "inbredSet": "gnt:belongsToGroup",
+    "label": "rdfs:label",
+    "normalization": "gnt:usesNormalization",
+    "platformInfo": "gnt:hasPlatformInfo",
+    "notes": "gnt:hasNotes",
+    "organization": "foaf:Organization",
+    "prefLabel": "skos:prefLabel",
+    "citation": "dct:isReferencedBy",
+    "GoTree": "gnt:hasGOTreeValue",
+    "platform": "gnt:usesPlatform",
+    "processingInfo": "gnt:hasDataProcessingInfo",
+    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+    "skos": "http://www.w3.org/2004/02/skos/core#",
+    "specifics": "gnt:hasContentInfo",
+    "title": "dct:title",
+    "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#",
+    "tissueInfo": "gnt:hasTissueInfo",
+    "tissue": "gnt:hasTissue",
+    "contactWebUrl": "foaf:homepage",
+    "contactName": "foaf:name",
+}
+
+SEARCH_CONTEXT = {  # search-result keys -> ex: terms (the "ex" prefix itself is not defined in this dict)
+    "pages": "ex:pages",
+    "hits": "ex:hits",
+    "result": "ex:result",
+    "results": "ex:items",
+    "resultItem": "ex:resultType",
+    "currentPage": "ex:currentPage",
+}
+
+DATASET_SEARCH_CONTEXT = SEARCH_CONTEXT | {  # search keys plus dataset keys and the prefixes they use
+    "classifiedUnder": "xkos:classifiedUnder",
+    "created": "dct:created",
+    "dct": "http://purl.org/dc/terms/",
+    "ex": "http://example.org/stuff/1.0/",
+    "inbredSet": "ex:belongsToInbredSet",
+    "title": "dct:title",
+    "name": "rdfs:label",
+    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+    "type": "@type",
+    "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#",
+}
+
+PUBLICATION_CONTEXT = {  # JSON-LD context for publication fields
+    "dct": "http://purl.org/dc/terms/",
+    "fabio": "http://purl.org/spar/fabio/",
+    "prism": "http://prismstandard.org/namespaces/basic/2.0/",
+    "xsd": "http://www.w3.org/2001/XMLSchema#",
+    "title": "dct:title",
+    "journal": "fabio:Journal",
+    "volume": "prism:volume",
+    "page": "fabio:page",
+    "creator": "dct:creator",
+    "abstract": "dct:abstract",
+    "year": {  # expanded term definition: the value is typed as xsd:gYear
+        "@id": "fabio:hasPublicationYear",
+        "@type": "xsd:gYear",
+    },
+    "month": {  # expanded term definition: the value is typed as xsd:gMonth
+        "@id": "prism:publicationDate",
+        "@type": "xsd:gMonth"
+    },
+}
+
+PHENOTYPE_CONTEXT = BASE_CONTEXT | PUBLICATION_CONTEXT | {  # on duplicate keys the right-most dict wins
+    "skos": "http://www.w3.org/2004/02/skos/core#",
+    "dcat": "http://www.w3.org/ns/dcat#",
+    "prism": "http://prismstandard.org/namespaces/basic/2.0/",  # same value PUBLICATION_CONTEXT supplies
+    "traitName": "skos:altLabel",
+    "trait": "rdfs:label",
+    "altName": "rdfs:altLabel",  # NOTE(review): RDFS defines no altLabel (SKOS does) - confirm the term
+    "description": "dct:description",
+    "abbreviation": "gnt:abbreviation",
+    "labCode": "gnt:labCode",
+    "submitter": "gnt:submitter",
+    "dataset": "dcat:Distribution",
+    "contributor": "dct:contributor",
+    "mean": "gnt:mean",
+    "locus": "gnt:locus",
+    "lodScore": "gnt:lodScore",
+    "references": "dct:isReferencedBy",
+    "additive": "gnt:additive",
+    "sequence": "gnt:sequence",
+    "prefLabel": "skos:prefLabel",
+    "identifier": "dct:identifier",
+    "chromosome": "gnt:chr",
+    "mb": "gnt:mb",
+    "peakLocation": "gnt:locus",  # NOTE(review): same IRI as "locus" above - confirm both aliases are wanted
+    "species": "gnt:belongsToSpecies",
+    "group": "gnt:belongsToGroup",
+}
diff --git a/gn3/db/rdf.py b/gn3/db/rdf.py
index eb4014a..5a95683 100644
--- a/gn3/db/rdf.py
+++ b/gn3/db/rdf.py
@@ -4,39 +4,12 @@ This module is a collection of functions that handle SPARQL queries.
 
 """
 import json
-
+from string import Template
 from SPARQLWrapper import SPARQLWrapper
 from pyld import jsonld  # type: ignore
-
-
-PREFIXES = {
-    "dcat": "http://www.w3.org/ns/dcat#",
-    "dct": "http://purl.org/dc/terms/",
-    "ex": "http://example.org/stuff/1.0/",
-    "fabio": "http://purl.org/spar/fabio/",
-    "foaf": "http://xmlns.com/foaf/0.1/",
-    "generif": "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=",
-    "genotype": "http://genenetwork.org/genotype/",
-    "gn": "http://genenetwork.org/id/",
-    "gnc": "http://genenetwork.org/category/",
-    "gnt": "http://genenetwork.org/term/",
-    "owl": "http://www.w3.org/2002/07/owl#",
-    "phenotype": "http://genenetwork.org/phenotype/",
-    "prism": "http://prismstandard.org/namespaces/basic/2.0/",
-    "publication": "http://genenetwork.org/publication/",
-    "pubmed": "http://rdf.ncbi.nlm.nih.gov/pubmed/",
-    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
-    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
-    "skos": "http://www.w3.org/2004/02/skos/core#",
-    "taxon": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=",
-    "up": "http://purl.uniprot.org/core/",
-    "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#",
-    "xsd": "http://www.w3.org/2001/XMLSchema#",
-}
-
-
-RDF_PREFIXES = "\n".join([f"PREFIX {key}: <{value}>"
-                          for key, value in PREFIXES.items()])
+from gn3.db.constants import (
+    RDF_PREFIXES, BASE_CONTEXT
+)
 
 
 def sparql_construct_query(query: str, endpoint: str) -> dict:
@@ -51,22 +24,101 @@ def sparql_construct_query(query: str, endpoint: str) -> dict:
 def query_frame_and_compact(query: str, context: dict, endpoint: str) -> dict:
     """Frame and then compact the results given a context"""
     results = sparql_construct_query(query, endpoint)
-    if not results:
-        return {}
     return jsonld.compact(jsonld.frame(results, context), context)
 
 
 def query_and_compact(query: str, context: dict, endpoint: str) -> dict:
     """Compact the results given a context"""
     results = sparql_construct_query(query, endpoint)
-    if not results:
-        return {}
     return jsonld.compact(results, context)
 
 
 def query_and_frame(query: str, context: dict, endpoint: str) -> dict:
     """Frame the results given a context"""
     results = sparql_construct_query(query, endpoint)
-    if not results:
-        return {}
     return jsonld.frame(results, context)
+
+
+def get_wiki_entries_by_symbol(symbol: str, sparql_uri: str) -> dict:
+    """Fetch the latest version of every wiki entry attached to `symbol`.
+
+    `symbol` is compared case-insensitively with rdfs:label; the CONSTRUCT
+    result is returned framed and compacted against a JSON-LD context.
+    """
+    # `symbol` lands inside a single-quoted SPARQL literal: escape it so a
+    # quote or backslash cannot end the literal and alter the query.
+    safe_symbol = symbol.translate(str.maketrans(
+        {"\\": "\\\\", "'": "\\'", "\n": "\\n", "\r": "\\r"}))
+    # The sub-query selects the highest version of each comment identifier.
+    query = Template("""
+$prefix
+
+CONSTRUCT {
+    ?uid rdfs:label ?symbolName;
+         gnt:reason ?reason ;
+         gnt:species ?species ;
+         dct:references ?pmid ;
+         foaf:homepage ?weburl ;
+         rdfs:comment ?comment ;
+         foaf:mbox ?email ;
+         gnt:initial ?usercode ;
+         gnt:belongsToCategory ?category ;
+         gnt:hasVersion ?versionId ;
+         dct:created ?created ;
+         dct:identifier ?identifier .
+} WHERE {
+    ?symbolId rdfs:label ?symbolName .
+    ?uid rdfs:comment ?comment ;
+         gnt:symbol ?symbolId ;
+         rdf:type gnc:GNWikiEntry ;
+         dct:created ?createTime .
+    FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) .
+    {
+        SELECT (MAX(?vers) AS ?max) ?id_ WHERE {
+            ?symbolId rdfs:label ?symbolName .
+            ?uid dct:identifier ?id_ ;
+                 dct:hasVersion ?vers ;
+                 dct:identifier ?id_ ;
+                 gnt:symbol ?symbolId .
+            FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) .
+        }
+    }
+    ?uid dct:hasVersion ?max ;
+         dct:identifier ?id_ .
+    OPTIONAL { ?uid gnt:reason ?reason } .
+    OPTIONAL {
+        ?uid gnt:belongsToSpecies ?speciesId .
+        ?speciesId gnt:shortName ?species .
+    } .
+    OPTIONAL { ?uid dct:references ?pubmedId . } .
+    OPTIONAL { ?uid foaf:homepage ?weburl . } .
+    OPTIONAL { ?uid gnt:initial ?usercode . } .
+    OPTIONAL { ?uid gnt:mbox ?email . } .
+    OPTIONAL { ?uid gnt:belongsToCategory ?category . } .
+    BIND (str(?max) AS ?versionId) .
+    BIND (str(?id_) AS ?identifier) .
+    BIND (str(?pubmedId) AS ?pmid) .
+    BIND (str(?createTime) AS ?created) .
+}
+""").substitute(prefix=RDF_PREFIXES, symbol=safe_symbol)
+    context = BASE_CONTEXT | {
+        "foaf": "http://xmlns.com/foaf/0.1/",
+        "dct": "http://purl.org/dc/terms/",
+        "categories": "gnt:belongsToCategory",
+        "web_url": "foaf:homepage",
+        "version": "gnt:hasVersion",
+        "symbol": "rdfs:label",
+        "reason": "gnt:reason",
+        "species": "gnt:species",
+        "pubmed_id": "dct:references",
+        "email": "foaf:mbox",
+        "initial": "gnt:initial",
+        "comment": "rdfs:comment",
+        "created": "dct:created",
+        "id": "dct:identifier",
+        # This points to the RDF Node which is the unique identifier
+        # for this triplet.  It's constructed using the comment-id and
+        # the comment-versionId
+        "wiki_identifier": "@id",
+    }
+    return query_frame_and_compact(query, context, sparql_uri)
diff --git a/gn3/db/wiki.py b/gn3/db/wiki.py
new file mode 100644
index 0000000..abb1644
--- /dev/null
+++ b/gn3/db/wiki.py
@@ -0,0 +1,80 @@
+"""Helper functions to access wiki entries"""
+
+from typing import Dict, List
+
+from MySQLdb.cursors import DictCursor
+
+
+class MissingDBDataException(Exception):
+    """Raised by the helpers in this module when an expected DB row is missing."""
+
+
+def get_latest_comment(connection, comment_id: str) -> Dict:
+    """Return the newest (highest versionId) GeneRIF row for `comment_id`, with
+    "pubmed_ids" and "categories" lists; raise MissingDBDataException if none."""
+    cursor = connection.cursor(DictCursor)
+    query = """ SELECT versionId AS version, symbol, PubMed_ID AS pubmed_ids, sp.Name AS species,
+        comment, email, weburl, initial, reason
+        FROM `GeneRIF` gr
+        INNER JOIN Species sp USING(SpeciesId)
+        WHERE gr.Id = %s ORDER BY versionId DESC LIMIT 1;
+    """
+    cursor.execute(query, (comment_id,))
+    result = cursor.fetchone()
+    if result is None:  # no GeneRIF row carries this Id
+        raise MissingDBDataException(f"No comment found with comment_id={comment_id}")
+    result["pubmed_ids"] = (result["pubmed_ids"] or "").split()  # tolerate None
+    categories_query = """
+        SELECT grx.GeneRIFId, grx.versionId, gc.Name FROM GeneRIFXRef grx
+                INNER JOIN GeneCategory gc ON grx.GeneCategoryId=gc.Id
+                WHERE GeneRIFId = %s AND versionId=%s;
+    """
+    cursor.execute(categories_query, (comment_id, result["version"]))
+    result["categories"] = [x["Name"] for x in cursor.fetchall()]
+    return result
+
+
+def get_species_id(cursor, species_name: str) -> int:
+    """Return the SpeciesID of the Species row named `species_name`; raise
+    MissingDBDataException unless exactly one row matches."""
+    cursor.execute("SELECT SpeciesID from Species  WHERE Name = %s", (species_name,))
+    rows = cursor.fetchall()
+    if len(rows) == 1:
+        return rows[0][0]
+    raise MissingDBDataException(
+        f"expected 1 species with Name={species_name} but found {len(rows)}!")
+
+
+def get_next_comment_version(cursor, comment_id: int) -> int:
+    """Return the highest stored versionId for `comment_id` plus one; raise
+    MissingDBDataException when MAX(versionId) comes back as None."""
+    max_version_sql = "SELECT MAX(versionId) as version_id from GeneRIF WHERE Id = %s"
+    cursor.execute(max_version_sql, (comment_id,))
+    newest = cursor.fetchone()[0]
+    if newest is not None:
+        return newest + 1
+    raise MissingDBDataException(f"No comment found with comment_id={comment_id}")
+
+
+def get_categories_ids(cursor, categories: List[str]) -> List[int]:
+    """Translate category names into GeneCategory ids (duplicates collapsed)."""
+    known = get_categories(cursor)
+    ids: List[int] = []
+    for name in set(categories):
+        match = known.get(name.strip())
+        if match is None:  # name absent from the get_categories() mapping
+            raise MissingDBDataException(f"Category with Name={name} not found")
+        ids.append(match)
+    return ids
+
+def get_categories(cursor) -> Dict[str, int]:
+    """Return every GeneCategory row as a {Name: Id} mapping."""
+    cursor.execute("SELECT Name, Id from GeneCategory")
+    # dict() consumes the fetched (Name, Id) rows as key/value pairs.
+    return dict(cursor.fetchall())
+
+def get_species(cursor) -> Dict[str, str]:
+    """Return every Species row as a {Name: SpeciesName} mapping."""
+    cursor.execute("SELECT Name, SpeciesName from Species")
+    # dict() consumes the fetched (Name, SpeciesName) rows as key/value pairs.
+    return dict(cursor.fetchall())