diff options
-rw-r--r-- | gn3/api/metadata.py | 2 | ||||
-rw-r--r-- | gn3/api/metadata_api/wiki.py | 24 | ||||
-rw-r--r-- | gn3/db/rdf.py | 124 | ||||
-rw-r--r-- | gn3/db/rdf/__init__.py (renamed from gn3/db/constants.py) | 43 | ||||
-rw-r--r-- | gn3/db/rdf/wiki.py | 168 | ||||
-rw-r--r-- | gn3/db/wiki.py | 17 |
6 files changed, 242 insertions, 136 deletions
diff --git a/gn3/api/metadata.py b/gn3/api/metadata.py index 3f28f5d..6110880 100644 --- a/gn3/api/metadata.py +++ b/gn3/api/metadata.py @@ -15,7 +15,7 @@ from gn3.db.datasets import (retrieve_metadata, get_history) from gn3.db.rdf import (query_frame_and_compact, query_and_compact) -from gn3.db.constants import ( +from gn3.db.rdf import ( RDF_PREFIXES, BASE_CONTEXT, DATASET_CONTEXT, DATASET_SEARCH_CONTEXT, PUBLICATION_CONTEXT, diff --git a/gn3/api/metadata_api/wiki.py b/gn3/api/metadata_api/wiki.py index a4abef6..9ea0d53 100644 --- a/gn3/api/metadata_api/wiki.py +++ b/gn3/api/metadata_api/wiki.py @@ -5,8 +5,9 @@ from typing import Any, Dict from flask import Blueprint, request, jsonify, current_app, make_response from gn3 import db_utils from gn3.db import wiki -from gn3.db.rdf import (query_frame_and_compact, - get_wiki_entries_by_symbol) +from gn3.db.rdf import query_frame_and_compact +from gn3.db.rdf.wiki import (get_wiki_entries_by_symbol, + get_comment_history) wiki_blueprint = Blueprint("wiki", __name__, url_prefix="wiki") @@ -71,7 +72,6 @@ def edit_wiki(comment_id: int): @wiki_blueprint.route("/<string:symbol>", methods=["GET"]) def get_wiki_entries(symbol: str): """Fetch wiki entries""" - content_type = request.headers.get("Content-Type") status_code = 200 response = get_wiki_entries_by_symbol( symbol=symbol, @@ -80,7 +80,7 @@ def get_wiki_entries(symbol: str): if not data: data = {} status_code = 404 - if content_type == "application/ld+json": + if request.headers.get("Accept") == "application/ld+json": payload = make_response(response) payload.headers["Content-Type"] = "application/ld+json" return payload, status_code @@ -117,3 +117,19 @@ def get_species(): species_dict = wiki.get_species(cursor) return jsonify(species_dict) return jsonify(error="Error getting species, most likely due to DB error!"), 500 + + +@wiki_blueprint.route("/<int:comment_id>/history", methods=["GET"]) +def get_history(comment_id): + status_code = 200 + response = get_comment_history(comment_id=comment_id, + sparql_uri=current_app.config["SPARQL_ENDPOINT"]) + data = response.get("data") + if not data: + data = {} + status_code = 404 + if request.headers.get("Accept") == "application/ld+json": + payload = make_response(response) + payload.headers["Content-Type"] = "application/ld+json" + return payload, status_code + return jsonify(data), status_code diff --git a/gn3/db/rdf.py b/gn3/db/rdf.py deleted file mode 100644 index 5a95683..0000000 --- a/gn3/db/rdf.py +++ /dev/null @@ -1,124 +0,0 @@ -"""RDF utilities - -This module is a collection of functions that handle SPARQL queries. - -""" -import json -from string import Template -from SPARQLWrapper import SPARQLWrapper -from pyld import jsonld # type: ignore -from gn3.db.constants import ( - RDF_PREFIXES, BASE_CONTEXT -) - - -def sparql_construct_query(query: str, endpoint: str) -> dict: - """Query virtuoso using a CONSTRUCT query and return a json-ld - dictionary""" - sparql = SPARQLWrapper(endpoint) - sparql.setQuery(query) - results = sparql.queryAndConvert() - return json.loads(results.serialize(format="json-ld")) # type: ignore - - -def query_frame_and_compact(query: str, context: dict, endpoint: str) -> dict: - """Frame and then compact the results given a context""" - results = sparql_construct_query(query, endpoint) - return jsonld.compact(jsonld.frame(results, context), context) - - -def query_and_compact(query: str, context: dict, endpoint: str) -> dict: - """Compact the results given a context""" - results = sparql_construct_query(query, endpoint) - return jsonld.compact(results, context) - - -def query_and_frame(query: str, context: dict, endpoint: str) -> dict: - """Frame the results given a context""" - results = sparql_construct_query(query, endpoint) - return jsonld.frame(results, context) - - -def get_wiki_entries_by_symbol(symbol: str, sparql_uri: str) -> dict: - """Fetch all the Wiki entries using the symbol""" - # This query uses a sub-query to fetch the latest comment by the - # version id. - query = Template(""" -$prefix - -CONSTRUCT { - ?uid rdfs:label ?symbolName; - gnt:reason ?reason ; - gnt:species ?species ; - dct:references ?pmid ; - foaf:homepage ?weburl ; - rdfs:comment ?comment ; - foaf:mbox ?email ; - gnt:initial ?usercode ; - gnt:belongsToCategory ?category ; - gnt:hasVersion ?versionId ; - dct:created ?created ; - dct:identifier ?identifier . -} WHERE { - ?symbolId rdfs:label ?symbolName . - ?uid rdfs:comment ?comment ; - gnt:symbol ?symbolId ; - rdf:type gnc:GNWikiEntry ; - dct:created ?createTime . - FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) . - { - SELECT (MAX(?vers) AS ?max) ?id_ WHERE { - ?symbolId rdfs:label ?symbolName . - ?uid dct:identifier ?id_ ; - dct:hasVersion ?vers ; - dct:identifier ?id_ ; - gnt:symbol ?symbolId . - FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) . - } - } - ?uid dct:hasVersion ?max ; - dct:identifier ?id_ . - OPTIONAL { ?uid gnt:reason ?reason } . - OPTIONAL { - ?uid gnt:belongsToSpecies ?speciesId . - ?speciesId gnt:shortName ?species . - } . - OPTIONAL { ?uid dct:references ?pubmedId . } . - OPTIONAL { ?uid foaf:homepage ?weburl . } . - OPTIONAL { ?uid gnt:initial ?usercode . } . - OPTIONAL { ?uid gnt:mbox ?email . } . - OPTIONAL { ?uid gnt:belongsToCategory ?category . } . - BIND (str(?version) AS ?versionId) . - BIND (str(?id_) AS ?identifier) . - BIND (str(?pubmedId) AS ?pmid) . - BIND (str(?createTime) AS ?created) . -} -""").substitute(prefix=RDF_PREFIXES, symbol=symbol,) - context = BASE_CONTEXT | { - "foaf": "http://xmlns.com/foaf/0.1/", - "dct": "http://purl.org/dc/terms/", - "categories": "gnt:belongsToCategory", - "web_url": "foaf:homepage", - "version": "gnt:hasVersion", - "symbol": "rdfs:label", - "reason": "gnt:reason", - "species": "gnt:species", - "pubmed_id": "dct:references", - "email": "foaf:mbox", - "initial": "gnt:initial", - "comment": "rdfs:comment", - "created": "dct:created", - "id": "dct:identifier", - # This points to the RDF Node which is the unique identifier - # for this triplet. It's constructed using the comment-id and - # the comment-versionId - "wiki_identifier": "@id", - } - results = query_frame_and_compact( - query, context, - sparql_uri - ) - data = results.get("data") - if not data: - return results - return results diff --git a/gn3/db/constants.py b/gn3/db/rdf/__init__.py index 45e3bfc..ffb75e5 100644 --- a/gn3/db/constants.py +++ b/gn3/db/rdf/__init__.py @@ -1,6 +1,15 @@ +"""RDF + +Constants for prefixes and contexts; and wrapper functions around +creating contexts to be used by jsonld when framing and/or compacting. + """ -This module contains some constants used in other modules. -""" +import json + +from SPARQLWrapper import SPARQLWrapper +from pyld import jsonld # type: ignore + + PREFIXES = { "dcat": "http://www.w3.org/ns/dcat#", "dct": "http://purl.org/dc/terms/", @@ -150,3 +159,33 @@ PHENOTYPE_CONTEXT = BASE_CONTEXT | PUBLICATION_CONTEXT | { "species": "gnt:belongsToSpecies", "group": "gnt:belongsToGroup", } + + +def sparql_construct_query(query: str, endpoint: str) -> dict: + """Query virtuoso using a CONSTRUCT query and return a json-ld + dictionary""" + sparql = SPARQLWrapper(endpoint) + sparql.setQuery(query) + results = sparql.queryAndConvert() + return json.loads(results.serialize(format="json-ld")) # type: ignore + + +def query_frame_and_compact(query: str, context: dict, endpoint: str) -> dict: + """Frame and then compact the results given a context""" + results = sparql_construct_query(query, endpoint) + return jsonld.compact( + jsonld.frame(results, context), + context, + options={"graph": True}) + + +def query_and_compact(query: str, context: dict, endpoint: str) -> dict: + """Compact the results given a context""" + results = sparql_construct_query(query, endpoint) + return jsonld.compact(results, context, options={"graph": True}) + + +def query_and_frame(query: str, context: dict, endpoint: str) -> dict: + """Frame the results given a context""" + results = sparql_construct_query(query, endpoint) + return jsonld.frame(results, context) diff --git a/gn3/db/rdf/wiki.py b/gn3/db/rdf/wiki.py new file mode 100644 index 0000000..f7bec47 --- /dev/null +++ b/gn3/db/rdf/wiki.py @@ -0,0 +1,168 @@ +"""Sparql queries to get metadata about WIKI and RIF metadata. + +""" +from string import Template +from gn3.db.rdf import (BASE_CONTEXT, RDF_PREFIXES, + query_frame_and_compact) + + +WIKI_CONTEXT = BASE_CONTEXT | { + "foaf": "http://xmlns.com/foaf/0.1/", + "dct": "http://purl.org/dc/terms/", + "categories": "gnt:belongsToCategory", + "web_url": "foaf:homepage", + "version": "gnt:hasVersion", + "symbol": "rdfs:label", + "reason": "gnt:reason", + "species": "gnt:species", + "pubmed_ids": "dct:references", + "email": "foaf:mbox", + "initial": "gnt:initial", + "comment": "rdfs:comment", + "created": "dct:created", + "id": "dct:identifier", + # This points to the RDF Node which is the unique identifier + # for this triplet. It's constructed using the comment-id and + # the comment-versionId + "wiki_identifier": "@id", +} + + +def get_wiki_entries_by_symbol(symbol: str, sparql_uri: str) -> dict: + """Fetch all the Wiki entries using the symbol""" + # This query uses a sub-query to fetch the latest comment by the + # version id. + query = Template(""" +$prefix + +CONSTRUCT { + ?uid rdfs:label ?symbolName; + gnt:reason ?reason ; + gnt:species ?species ; + dct:references ?pmid ; + foaf:homepage ?weburl ; + rdfs:comment ?comment ; + foaf:mbox ?email ; + gnt:initial ?usercode ; + gnt:belongsToCategory ?category ; + gnt:hasVersion ?versionId ; + dct:created ?created ; + dct:identifier ?identifier . +} WHERE { + ?symbolId rdfs:label ?symbolName . + ?uid rdfs:comment ?comment ; + gnt:symbol ?symbolId ; + rdf:type gnc:GNWikiEntry ; + dct:created ?createTime . + FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) . + { + SELECT (MAX(?vers) AS ?max) ?id_ WHERE { + ?symbolId rdfs:label ?symbolName . + ?uid dct:identifier ?id_ ; + dct:hasVersion ?vers ; + dct:identifier ?id_ ; + gnt:symbol ?symbolId . + FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) . + } + } + ?uid dct:hasVersion ?max ; + dct:identifier ?id_ . + OPTIONAL { ?uid gnt:reason ?reason } . + OPTIONAL { + ?uid gnt:belongsToSpecies ?speciesId . + ?speciesId gnt:shortName ?species . + } . + OPTIONAL { ?uid dct:references ?pubmedId . } . + OPTIONAL { ?uid foaf:homepage ?weburl . } . + OPTIONAL { ?uid gnt:initial ?usercode . } . + OPTIONAL { ?uid foaf:mbox ?email . } . + OPTIONAL { ?uid gnt:belongsToCategory ?category . } . + BIND (str(?version) AS ?versionId) . + BIND (str(?id_) AS ?identifier) . + BIND (str(?pubmedId) AS ?pmid) . + BIND (str(?createTime) AS ?created) . +} +""").substitute(prefix=RDF_PREFIXES, symbol=symbol,) + results = query_frame_and_compact( + query, WIKI_CONTEXT, + sparql_uri + ) + data = results.get("data") + if not data: + return results + return results + + +def get_comment_history(comment_id: int, sparql_uri: str) -> dict: + """Get all the historical data for a given id""" + query = Template(""" +$prefix + +CONSTRUCT { + ?uid rdfs:label ?symbolName ; + gnt:reason ?reason ; + gnt:species ?species ; + dct:references ?pmid ; + foaf:homepage ?weburl ; + rdfs:comment ?comment ; + foaf:mbox ?email ; + gnt:initial ?usercode ; + gnt:belongsToCategory ?category ; + gnt:hasVersion ?versionId ; + dct:created ?created . +} WHERE { + ?symbolId rdfs:label ?symbolName . + ?uid rdf:type gnc:GNWikiEntry ; + rdfs:comment ?comment ; + gnt:symbol ?symbolId ; + dct:created ?createTime ; + dct:hasVersion ?version ; + dct:identifier $comment_id ; + dct:identifier ?id_ . + OPTIONAL { ?uid gnt:reason ?reason_ } . + OPTIONAL { + ?uid gnt:belongsToSpecies ?speciesId . + ?speciesId gnt:shortName ?species_ . + } . + OPTIONAL { ?uid dct:references ?pmid . } . + OPTIONAL { ?uid foaf:homepage ?weburl_ . } . + OPTIONAL { ?uid gnt:initial ?usercode_ . } . + OPTIONAL { ?uid foaf:mbox ?email_ . } . + OPTIONAL { ?uid gnt:belongsToCategory ?category_ . } . + BIND (str(?version) AS ?versionId) . + BIND (str(?createTime) AS ?created) . + BIND (COALESCE(?reason_, "") AS ?reason) . + BIND (COALESCE(?weburl_, "") AS ?weburl) . + BIND (COALESCE(?usercode_, "") AS ?usercode) . + BIND (COALESCE(?email_, "") AS ?email) . + BIND (COALESCE(?species_, "") AS ?species) . + BIND (COALESCE(?category_, "") AS ?category) . +} +""").substitute(prefix=RDF_PREFIXES, comment_id=comment_id) + results = query_frame_and_compact( + query, WIKI_CONTEXT, + sparql_uri + ) + data = results.get("data") + for result in data: + categories = result.get("categories") or [] + if categories and isinstance(categories, str): + result["categories"] = [categories] + pmids = result.get("pubmed_ids") + if pmids and isinstance(pmids, str): + result["pubmed_ids"] = [pmids] + elif pmids: + result["pubmed_ids"] = [int(pmid) for pmid in pmids] + else: + result["pubmed_ids"] = [] + result["version"] = int(result["version"]) + + # We manually sort the array, since the SPARQL engine does not + # provide a guarantee that it will support an ORDER BY clause in a + # CONSTRUCT. Using ORDER BY on a solution sequence for a CONSTRUCT + # or DESCRIBE query has no direct effect because only SELECT + # returns a sequence of results. See: + # <https://stackoverflow.com/questions/78186393> + # <https://www.w3.org/TR/rdf-sparql-query/#modOrderBy> + results["data"] = sorted(data, key=lambda d: d["version"], reverse=True) + return results diff --git a/gn3/db/wiki.py b/gn3/db/wiki.py index abb1644..7ef5e68 100644 --- a/gn3/db/wiki.py +++ b/gn3/db/wiki.py @@ -9,7 +9,7 @@ class MissingDBDataException(Exception): """Error due to DB missing some data""" -def get_latest_comment(connection, comment_id: str) -> int: +def get_latest_comment(connection, comment_id: int) -> int: """ Latest comment is one with the highest versionId """ cursor = connection.cursor(DictCursor) query = """ SELECT versionId AS version, symbol, PubMed_ID AS pubmed_ids, sp.Name AS species, @@ -19,7 +19,7 @@ def get_latest_comment(connection, comment_id: str) -> int: WHERE gr.Id = %s ORDER BY versionId DESC LIMIT 1; """ - cursor.execute(query, (comment_id,)) + cursor.execute(query, (str(comment_id),)) result = cursor.fetchone() result["pubmed_ids"] = [x.strip() for x in result["pubmed_ids"].split()] categories_query = """ @@ -36,7 +36,8 @@ def get_latest_comment(connection, comment_id: str) -> int: def get_species_id(cursor, species_name: str) -> int: """Find species id given species `Name`""" - cursor.execute("SELECT SpeciesID from Species WHERE Name = %s", (species_name,)) + cursor.execute( + "SELECT SpeciesID from Species WHERE Name = %s", (species_name,)) species_ids = cursor.fetchall() if len(species_ids) != 1: raise MissingDBDataException( @@ -52,7 +53,8 @@ def get_next_comment_version(cursor, comment_id: int) -> int: ) latest_version = cursor.fetchone()[0] if latest_version is None: - raise MissingDBDataException(f"No comment found with comment_id={comment_id}") + raise MissingDBDataException( + f"No comment found with comment_id={comment_id}") return latest_version + 1 @@ -63,17 +65,22 @@ def get_categories_ids(cursor, categories: List[str]) -> List[int]: for category in set(categories): cat_id = dict_cats.get(category.strip()) if cat_id is None: - raise MissingDBDataException(f"Category with Name={category} not found") + raise MissingDBDataException( + f"Category with Name={category} not found") category_ids.append(cat_id) return category_ids + def get_categories(cursor) -> Dict[str, int]: + """Get all categories""" cursor.execute("SELECT Name, Id from GeneCategory") raw_categories = cursor.fetchall() dict_cats = dict(raw_categories) return dict_cats + def get_species(cursor) -> Dict[str, str]: + """Get all species""" cursor.execute("SELECT Name, SpeciesName from Species") raw_species = cursor.fetchall() dict_cats = dict(raw_species) |