diff options
Diffstat (limited to 'gn3/db')
| -rw-r--r-- | gn3/db/constants.py | 152 | ||||
| -rw-r--r-- | gn3/db/rdf.py | 126 | ||||
| -rw-r--r-- | gn3/db/wiki.py | 80 |
3 files changed, 321 insertions, 37 deletions
# gn3/db/constants.py
"""
This module contains some constants used in other modules.

It holds the namespace prefixes used to build SPARQL queries and the
JSON-LD contexts used to frame/compact the results of those queries.
"""

# Prefix name -> namespace IRI.  This is the single source of truth for the
# ``PREFIX`` header that is prepended to SPARQL queries (see RDF_PREFIXES).
PREFIXES = {
    "dcat": "http://www.w3.org/ns/dcat#",
    "dct": "http://purl.org/dc/terms/",
    "ex": "http://example.org/stuff/1.0/",
    "fabio": "http://purl.org/spar/fabio/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "generif": "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=",
    "genotype": "http://genenetwork.org/genotype/",
    "gn": "http://genenetwork.org/id/",
    "gnc": "http://genenetwork.org/category/",
    "gnt": "http://genenetwork.org/term/",
    "owl": "http://www.w3.org/2002/07/owl#",
    "phenotype": "http://genenetwork.org/phenotype/",
    "prism": "http://prismstandard.org/namespaces/basic/2.0/",
    "publication": "http://genenetwork.org/publication/",
    "pubmed": "http://rdf.ncbi.nlm.nih.gov/pubmed/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "taxon": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=",
    "up": "http://purl.uniprot.org/core/",
    "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
}

# One "PREFIX name: <iri>" line per entry of PREFIXES, in insertion order.
RDF_PREFIXES = "\n".join(
    f"PREFIX {key}: <{value}>" for key, value in PREFIXES.items()
)

# Context shared by the more specific contexts below.
BASE_CONTEXT = {
    "data": "@graph",
    "type": "@type",
    "gn": "http://genenetwork.org/id/",
    "gnc": "http://genenetwork.org/category/",
    "gnt": "http://genenetwork.org/term/",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    # The namespace IRI must match PREFIXES["rdf"] exactly; a trailing ">"
    # here would stop the prefix from ever matching real rdf: IRIs.
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
}

DATASET_CONTEXT = {
    "accessRights": "dct:accessRights",
    "accessionId": "dct:identifier",
    "acknowledgement": "gnt:hasAcknowledgement",
    "altLabel": "skos:altLabel",
    "caseInfo": "gnt:hasCaseInfo",
    "classifiedUnder": "xkos:classifiedUnder",
    "contributors": "dct:creator",
    "contactPoint": "dcat:contactPoint",
    "created": "dct:created",
    "dcat": "http://www.w3.org/ns/dcat#",
    "dct": "http://purl.org/dc/terms/",
    "description": "dct:description",
    "ex": "http://example.org/stuff/1.0/",
    "experimentDesignInfo": "gnt:hasExperimentDesignInfo",
    "experimentType": "gnt:hasExperimentType",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "geoSeriesId": "gnt:hasGeoSeriesId",
    "gnt": "http://genenetwork.org/term/",
    "inbredSet": "gnt:belongsToGroup",
    "label": "rdfs:label",
    "normalization": "gnt:usesNormalization",
    "platformInfo": "gnt:hasPlatformInfo",
    "notes": "gnt:hasNotes",
    "organization": "foaf:Organization",
    "prefLabel": "skos:prefLabel",
    "citation": "dct:isReferencedBy",
    "GoTree": "gnt:hasGOTreeValue",
    "platform": "gnt:usesPlatform",
    "processingInfo": "gnt:hasDataProcessingInfo",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "specifics": "gnt:hasContentInfo",
    "title": "dct:title",
    "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#",
    "tissueInfo": "gnt:hasTissueInfo",
    "tissue": "gnt:hasTissue",
    "contactWebUrl": "foaf:homepage",
    "contactName": "foaf:name",
}

SEARCH_CONTEXT = {
    "pages": "ex:pages",
    "hits": "ex:hits",
    "result": "ex:result",
    "results": "ex:items",
    "resultItem": "ex:resultType",
    "currentPage": "ex:currentPage",
}

# SEARCH_CONTEXT extended with the dataset terms; on a key clash the
# right-hand operand of ``|`` wins.
DATASET_SEARCH_CONTEXT = SEARCH_CONTEXT | {
    "classifiedUnder": "xkos:classifiedUnder",
    "created": "dct:created",
    "dct": "http://purl.org/dc/terms/",
    "ex": "http://example.org/stuff/1.0/",
    "inbredSet": "ex:belongsToInbredSet",
    "title": "dct:title",
    "name": "rdfs:label",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "type": "@type",
    "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#",
}

PUBLICATION_CONTEXT = {
    "dct": "http://purl.org/dc/terms/",
    "fabio": "http://purl.org/spar/fabio/",
    "prism": "http://prismstandard.org/namespaces/basic/2.0/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "title": "dct:title",
    "journal": "fabio:Journal",
    "volume": "prism:volume",
    "page": "fabio:page",
    "creator": "dct:creator",
    "abstract": "dct:abstract",
    # Expanded term definitions: these two carry an explicit datatype.
    "year": {
        "@id": "fabio:hasPublicationYear",
        "@type": "xsd:gYear",
    },
    "month": {
        "@id": "prism:publicationDate",
        "@type": "xsd:gMonth",
    },
}

PHENOTYPE_CONTEXT = BASE_CONTEXT | PUBLICATION_CONTEXT | {
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "dcat": "http://www.w3.org/ns/dcat#",
    "prism": "http://prismstandard.org/namespaces/basic/2.0/",
    "traitName": "skos:altLabel",
    "trait": "rdfs:label",
    # NOTE(review): the RDFS vocabulary defines no ``altLabel`` term;
    # presumably ``skos:altLabel`` was intended -- confirm against the
    # predicates actually present in the triple store before changing.
    "altName": "rdfs:altLabel",
    "description": "dct:description",
    "abbreviation": "gnt:abbreviation",
    "labCode": "gnt:labCode",
    "submitter": "gnt:submitter",
    "dataset": "dcat:Distribution",
    "contributor": "dct:contributor",
    "mean": "gnt:mean",
    "locus": "gnt:locus",
    "lodScore": "gnt:lodScore",
    "references": "dct:isReferencedBy",
    "additive": "gnt:additive",
    "sequence": "gnt:sequence",
    "prefLabel": "skos:prefLabel",
    "identifier": "dct:identifier",
    "chromosome": "gnt:chr",
    "mb": "gnt:mb",
    # NOTE(review): "locus" above maps to the same IRI (gnt:locus), so two
    # terms compete for it when compacting -- verify which one is intended.
    "peakLocation": "gnt:locus",
    "species": "gnt:belongsToSpecies",
    "group": "gnt:belongsToGroup",
}
# ---------------------------------------------------------------------------
# gn3/db/rdf.py
# This module is a collection of functions that handle SPARQL queries.
# ---------------------------------------------------------------------------
import json
from string import Template
from SPARQLWrapper import SPARQLWrapper
from pyld import jsonld  # type: ignore
from gn3.db.constants import (
    RDF_PREFIXES, BASE_CONTEXT
)

# NOTE: ``sparql_construct_query(query: str, endpoint: str) -> dict`` is
# defined in this module between the two hunks shown in the patch; its body
# is not part of this excerpt and is therefore not reproduced here.


def query_frame_and_compact(query: str, context: dict, endpoint: str) -> dict:
    """Frame and then compact the results given a context"""
    results = sparql_construct_query(query, endpoint)
    return jsonld.compact(jsonld.frame(results, context), context)


def query_and_compact(query: str, context: dict, endpoint: str) -> dict:
    """Compact the results given a context"""
    results = sparql_construct_query(query, endpoint)
    return jsonld.compact(results, context)


def query_and_frame(query: str, context: dict, endpoint: str) -> dict:
    """Frame the results given a context"""
    results = sparql_construct_query(query, endpoint)
    return jsonld.frame(results, context)


# Characters that must be backslash-escaped inside a quoted SPARQL string
# literal (SPARQL 1.1 "ECHAR" production).
_SPARQL_LITERAL_ESCAPES = str.maketrans({
    "\\": "\\\\",
    "'": "\\'",
    '"': '\\"',
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
})


def _escape_sparql_literal(text: str) -> str:
    """Escape `text` so it can be embedded in a quoted SPARQL literal."""
    return text.translate(_SPARQL_LITERAL_ESCAPES)


def get_wiki_entries_by_symbol(symbol: str, sparql_uri: str) -> dict:
    """Fetch all the Wiki entries using the symbol

    `symbol` is matched case-insensitively against the symbol label.
    `sparql_uri` is the SPARQL endpoint to query.  Returns the framed and
    compacted JSON-LD document produced by `query_frame_and_compact`.
    """
    # This query uses a sub-query to fetch the latest comment by the
    # version id.  The sub-query groups by identifier so that MAX() yields
    # the highest version *per comment*.
    #
    # NOTE(review): the CONSTRUCT template emits foaf:mbox while the WHERE
    # clause reads gnt:mbox -- confirm which predicate the store really uses.
    query = Template("""
$prefix

CONSTRUCT {
    ?uid rdfs:label ?symbolName;
         gnt:reason ?reason ;
         gnt:species ?species ;
         dct:references ?pmid ;
         foaf:homepage ?weburl ;
         rdfs:comment ?comment ;
         foaf:mbox ?email ;
         gnt:initial ?usercode ;
         gnt:belongsToCategory ?category ;
         gnt:hasVersion ?versionId ;
         dct:created ?created ;
         dct:identifier ?identifier .
} WHERE {
    ?symbolId rdfs:label ?symbolName .
    ?uid rdfs:comment ?comment ;
         gnt:symbol ?symbolId ;
         rdf:type gnc:GNWikiEntry ;
         dct:created ?createTime .
    FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) .
    {
        SELECT (MAX(?vers) AS ?max) ?id_ WHERE {
            ?symbolId rdfs:label ?symbolName .
            ?uid dct:identifier ?id_ ;
                 dct:hasVersion ?vers ;
                 gnt:symbol ?symbolId .
            FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) .
        }
        GROUP BY ?id_
    }
    ?uid dct:hasVersion ?max ;
         dct:identifier ?id_ .
    OPTIONAL { ?uid gnt:reason ?reason } .
    OPTIONAL {
        ?uid gnt:belongsToSpecies ?speciesId .
        ?speciesId gnt:shortName ?species .
    } .
    OPTIONAL { ?uid dct:references ?pubmedId . } .
    OPTIONAL { ?uid foaf:homepage ?weburl . } .
    OPTIONAL { ?uid gnt:initial ?usercode . } .
    OPTIONAL { ?uid gnt:mbox ?email . } .
    OPTIONAL { ?uid gnt:belongsToCategory ?category . } .
    BIND (str(?max) AS ?versionId) .
    BIND (str(?id_) AS ?identifier) .
    BIND (str(?pubmedId) AS ?pmid) .
    BIND (str(?createTime) AS ?created) .
}
""").substitute(
        prefix=RDF_PREFIXES,
        # The symbol ends up inside a quoted literal: escape it so that a
        # quote or backslash in the input cannot break out of the literal.
        symbol=_escape_sparql_literal(symbol),
    )
    context = BASE_CONTEXT | {
        "foaf": "http://xmlns.com/foaf/0.1/",
        "dct": "http://purl.org/dc/terms/",
        "categories": "gnt:belongsToCategory",
        "web_url": "foaf:homepage",
        "version": "gnt:hasVersion",
        "symbol": "rdfs:label",
        "reason": "gnt:reason",
        "species": "gnt:species",
        "pubmed_id": "dct:references",
        "email": "foaf:mbox",
        "initial": "gnt:initial",
        "comment": "rdfs:comment",
        "created": "dct:created",
        "id": "dct:identifier",
        # This points to the RDF Node which is the unique identifier
        # for this triplet. It's constructed using the comment-id and
        # the comment-versionId
        "wiki_identifier": "@id",
    }
    return query_frame_and_compact(query, context, sparql_uri)


# ---------------------------------------------------------------------------
# gn3/db/wiki.py (new file)
# Helper functions to access wiki entries
# ---------------------------------------------------------------------------
from typing import Dict, List

from MySQLdb.cursors import DictCursor


class MissingDBDataException(Exception):
    """Error due to DB missing some data"""


def get_latest_comment(connection, comment_id: str) -> dict:
    """Fetch the latest version of a comment.

    Latest comment is one with the highest versionId.  The returned mapping
    holds the selected GeneRIF columns, with `pubmed_ids` split into a list
    and a `categories` list of category names added.

    Raises `MissingDBDataException` when no comment has `comment_id`.
    """
    query = """ SELECT versionId AS version, symbol, PubMed_ID AS pubmed_ids, sp.Name AS species,
        comment, email, weburl, initial, reason
        FROM `GeneRIF` gr
		INNER JOIN Species sp USING(SpeciesId)
		WHERE gr.Id = %s
		ORDER BY versionId DESC LIMIT 1;
    """
    categories_query = """
        SELECT grx.GeneRIFId, grx.versionId, gc.Name FROM GeneRIFXRef grx
                INNER JOIN GeneCategory gc ON grx.GeneCategoryId=gc.Id
                WHERE GeneRIFId = %s AND versionId=%s;
    """
    cursor = connection.cursor(DictCursor)
    try:
        cursor.execute(query, (comment_id,))
        result = cursor.fetchone()
        if result is None:
            # Same failure mode, and same exception, as the other helpers
            # in this module -- rather than a TypeError on `None[...]`.
            raise MissingDBDataException(
                f"No comment found with comment_id={comment_id}")
        # PubMed_ID may be NULL; str.split() already strips each item.
        result["pubmed_ids"] = (result["pubmed_ids"] or "").split()
        cursor.execute(categories_query, (comment_id, result["version"]))
        result["categories"] = [row["Name"] for row in cursor.fetchall()]
        return result
    finally:
        cursor.close()


def get_species_id(cursor, species_name: str) -> int:
    """Find species id given species `Name`

    Raises `MissingDBDataException` unless exactly one species matches.
    Rows are indexed by position, so `cursor` is assumed to return
    tuple-like rows (not a DictCursor) -- TODO confirm against callers.
    """
    cursor.execute("SELECT SpeciesID from Species  WHERE Name = %s", (species_name,))
    species_ids = cursor.fetchall()
    if len(species_ids) != 1:
        raise MissingDBDataException(
            f"expected 1 species with Name={species_name} but found {len(species_ids)}!"
        )
    return species_ids[0][0]


def get_next_comment_version(cursor, comment_id: int) -> int:
    """Find the version to add, usually latest_version + 1

    Raises `MissingDBDataException` when no comment has `comment_id`
    (MAX() over no rows yields NULL).
    """
    cursor.execute(
        "SELECT MAX(versionId) as version_id from GeneRIF WHERE Id = %s", (comment_id,)
    )
    latest_version = cursor.fetchone()[0]
    if latest_version is None:
        raise MissingDBDataException(f"No comment found with comment_id={comment_id}")
    return latest_version + 1


def get_categories_ids(cursor, categories: List[str]) -> List[int]:
    """Get the categories_ids from a list of category strings

    Names are stripped before being de-duplicated, and the ids are returned
    in first-seen order of `categories`, so the result is deterministic.

    Raises `MissingDBDataException` for a name that is not a known category.
    """
    dict_cats = get_categories(cursor)
    category_ids = []
    # dict.fromkeys() de-duplicates while preserving order (a set does not).
    for category in dict.fromkeys(name.strip() for name in categories):
        cat_id = dict_cats.get(category)
        if cat_id is None:
            raise MissingDBDataException(f"Category with Name={category} not found")
        category_ids.append(cat_id)
    return category_ids


def get_categories(cursor) -> Dict[str, int]:
    """Return a mapping of category `Name` to `Id` from GeneCategory"""
    cursor.execute("SELECT Name, Id from GeneCategory")
    # Each row is a (Name, Id) pair, which dict() consumes directly.
    return dict(cursor.fetchall())


def get_species(cursor) -> Dict[str, str]:
    """Return a mapping of species `Name` to `SpeciesName` from Species"""
    cursor.execute("SELECT Name, SpeciesName from Species")
    # Each row is a (Name, SpeciesName) pair.
    return dict(cursor.fetchall())
