-rw-r--r-- | gn3/api/correlation.py | 2
-rw-r--r-- | gn3/api/heatmaps.py | 2
-rw-r--r-- | gn3/api/menu.py | 2
-rw-r--r-- | gn3/api/metadata.py | 349
-rw-r--r-- | gn3/api/metadata_api/wiki.py | 119
-rw-r--r-- | gn3/api/rqtl.py | 4
-rw-r--r-- | gn3/api/search.py | 25
-rw-r--r-- | gn3/case_attributes.py | 96
-rw-r--r-- | gn3/db/constants.py | 152
-rw-r--r-- | gn3/db/rdf.py | 126
-rw-r--r-- | gn3/db/wiki.py | 80
-rw-r--r-- | gn3/db_utils.py | 7
-rw-r--r-- | gn3/errors.py | 42
-rw-r--r-- | gn3/oauth2/__init__.py | 1
-rw-r--r-- | gn3/oauth2/authorisation.py | 34
-rw-r--r-- | gn3/oauth2/errors.py | 8
-rw-r--r-- | gn3/oauth2/jwks.py | 36
-rw-r--r-- | gn3/settings.py | 4
-rwxr-xr-x | scripts/index-genenetwork | 251
-rw-r--r-- | scripts/rqtl_wrapper.R | 16
-rwxr-xr-x | scripts/update_rif_table.py | 167
21 files changed, 1043 insertions, 480 deletions
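Among the changes below, gn3/api/metadata_api/wiki.py introduces a wiki sub-API (registered under the existing metadata blueprint with the "wiki" URL prefix) for fetching and editing GeneRIF/wiki comments. The following is a minimal client-side sketch of exercising those endpoints; the base URL and the "/api/metadata" mount point are assumptions not visible in this diff, and the category and species values are placeholders that must match rows in the GeneCategory and Species tables, as validated by gn3/db/wiki.py.

```python
import requests

BASE = "http://localhost:8080/api/metadata"  # hypothetical GN3 instance and mount point

# GET wiki/<symbol>: returns plain JSON, or JSON-LD when the Content-Type header asks for it.
resp = requests.get(f"{BASE}/wiki/Shh",
                    headers={"Content-Type": "application/ld+json"})
print(resp.status_code, resp.json())

# POST wiki/<comment_id>/edit: adds a new version of an existing comment.
# Field names mirror what edit_wiki() reads from the request payload.
payload = {
    "symbol": "Shh",
    "comment": "Updated annotation text",
    "email": "editor@example.org",          # hypothetical submitter
    "reason": "typo fix",
    "species": "mouse",                      # must match Species.Name
    "categories": ["Probes and probe sets"], # must match GeneCategory.Name entries
    "pubmed_ids": [12345678],
    "web_url": None,
    "initial": "ed",
}
resp = requests.post(f"{BASE}/wiki/1/edit", json=payload)
print(resp.status_code, resp.json())
```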
diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py index eb4cc7d..c77dd93 100644 --- a/gn3/api/correlation.py +++ b/gn3/api/correlation.py @@ -64,7 +64,7 @@ def compute_lit_corr(species=None, gene_id=None): might be needed for actual computing of the correlation results """ - with database_connection(current_app.config["SQL_URI"]) as conn: + with database_connection(current_app.config["SQL_URI"], logger=current_app.logger) as conn: target_traits_gene_ids = request.get_json() target_trait_gene_list = list(target_traits_gene_ids.items()) diff --git a/gn3/api/heatmaps.py b/gn3/api/heatmaps.py index 632c54a..172d555 100644 --- a/gn3/api/heatmaps.py +++ b/gn3/api/heatmaps.py @@ -24,7 +24,7 @@ def clustered_heatmaps(): return jsonify({ "message": "You need to provide at least two trait names." }), 400 - with database_connection(current_app.config["SQL_URI"]) as conn: + with database_connection(current_app.config["SQL_URI"], logger=current_app.logger) as conn: def parse_trait_fullname(trait): name_parts = trait.split(":") return f"{name_parts[1]}::{name_parts[0]}" diff --git a/gn3/api/menu.py b/gn3/api/menu.py index 58b761e..377ac6b 100644 --- a/gn3/api/menu.py +++ b/gn3/api/menu.py @@ -10,5 +10,5 @@ menu = Blueprint("menu", __name__) @menu.route("/generate/json") def generate_json(): """Get the menu in the JSON format""" - with database_connection(current_app.config["SQL_URI"]) as conn: + with database_connection(current_app.config["SQL_URI"], logger=current_app.logger) as conn: return jsonify(gen_dropdown_json(conn)) diff --git a/gn3/api/metadata.py b/gn3/api/metadata.py index 91dc115..3f28f5d 100644 --- a/gn3/api/metadata.py +++ b/gn3/api/metadata.py @@ -5,7 +5,6 @@ from string import Template from pathlib import Path from authlib.jose import jwt - from flask import Blueprint from flask import request from flask import current_app @@ -14,135 +13,20 @@ from gn3.auth.authorisation.errors import AuthorisationError from gn3.db.datasets import (retrieve_metadata, save_metadata, get_history) -from gn3.db.rdf import RDF_PREFIXES from gn3.db.rdf import (query_frame_and_compact, - query_and_compact, - query_and_frame) - - -BASE_CONTEXT = { - "data": "@graph", - "id": "@id", - "type": "@type", - "gnc": "http://genenetwork.org/category/", - "gnt": "http://genenetwork.org/term/", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#>", -} - -DATASET_CONTEXT = { - "accessRights": "dct:accessRights", - "accessionId": "dct:identifier", - "acknowledgement": "gnt:hasAcknowledgement", - "altLabel": "skos:altLabel", - "caseInfo": "gnt:hasCaseInfo", - "classifiedUnder": "xkos:classifiedUnder", - "contributors": "dct:creator", - "contactPoint": "dcat:contactPoint", - "created": "dct:created", - "dcat": "http://www.w3.org/ns/dcat#", - "dct": "http://purl.org/dc/terms/", - "description": "dct:description", - "ex": "http://example.org/stuff/1.0/", - "experimentDesignInfo": "gnt:hasExperimentDesignInfo", - "experimentType": "gnt:hasExperimentType", - "foaf": "http://xmlns.com/foaf/0.1/", - "geoSeriesId": "gnt:hasGeoSeriesId", - "gnt": "http://genenetwork.org/term/", - "inbredSet": "gnt:belongsToGroup", - "label": "rdfs:label", - "normalization": "gnt:usesNormalization", - "platformInfo": "gnt:hasPlatformInfo", - "notes": "gnt:hasNotes", - "organization": "foaf:Organization", - "prefLabel": "skos:prefLabel", - "citation": "dct:isReferencedBy", - "GoTree": "gnt:hasGOTreeValue", - "platform": "gnt:usesPlatform", - "processingInfo": "gnt:hasDataProcessingInfo", 
- "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "skos": "http://www.w3.org/2004/02/skos/core#", - "specifics": "gnt:hasContentInfo", - "title": "dct:title", - "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#", - "tissueInfo": "gnt:hasTissueInfo", - "tissue": "gnt:hasTissue", - "contactWebUrl": "foaf:homepage", - "contactName": "foaf:name", -} - -SEARCH_CONTEXT = { - "pages": "ex:pages", - "hits": "ex:hits", - "result": "ex:result", - "results": "ex:items", - "resultItem": "ex:resultType", - "currentPage": "ex:currentPage", -} - -DATASET_SEARCH_CONTEXT = SEARCH_CONTEXT | { - "classifiedUnder": "xkos:classifiedUnder", - "created": "dct:created", - "dct": "http://purl.org/dc/terms/", - "ex": "http://example.org/stuff/1.0/", - "inbredSet": "ex:belongsToInbredSet", - "title": "dct:title", - "name": "rdfs:label", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "type": "@type", - "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#", -} + query_and_compact) +from gn3.db.constants import ( + RDF_PREFIXES, BASE_CONTEXT, + DATASET_CONTEXT, + DATASET_SEARCH_CONTEXT, PUBLICATION_CONTEXT, + PHENOTYPE_CONTEXT +) -PUBLICATION_CONTEXT = { - "dct": "http://purl.org/dc/terms/", - "fabio": "http://purl.org/spar/fabio/", - "prism": "http://prismstandard.org/namespaces/basic/2.0/", - "xsd": "http://www.w3.org/2001/XMLSchema#", - "title": "dct:title", - "journal": "fabio:Journal", - "volume": "prism:volume", - "page": "fabio:page", - "creator": "dct:creator", - "abstract": "dct:abstract", - "year": { - "@id": "fabio:hasPublicationYear", - "@type": "xsd:gYear", - }, - "month": { - "@id": "prism:publicationDate", - "@type": "xsd:gMonth" - }, -} +from gn3.api.metadata_api import wiki -PHENOTYPE_CONTEXT = BASE_CONTEXT | PUBLICATION_CONTEXT | { - "skos": "http://www.w3.org/2004/02/skos/core#", - "dcat": "http://www.w3.org/ns/dcat#", - "prism": "http://prismstandard.org/namespaces/basic/2.0/", - "traitName": "skos:altLabel", - "trait": "rdfs:label", - "altName": "rdfs:altLabel", - "description": "dct:description", - "abbreviation": "gnt:abbreviation", - "labCode": "gnt:labCode", - "submitter": "gnt:submitter", - "dataset": "dcat:Distribution", - "contributor": "dct:contributor", - "mean": "gnt:mean", - "locus": "gnt:locus", - "lodScore": "gnt:lodScore", - "references": "dct:isReferencedBy", - "additive": "gnt:additive", - "sequence": "gnt:sequence", - "prefLabel": "skos:prefLabel", - "identifier": "dct:identifier", - "chromosome": "gnt:chr", - "mb": "gnt:mb", - "peakLocation": "gnt:locus", - "species": "gnt:belongsToSpecies", - "group": "gnt:belongsToGroup", -} metadata = Blueprint("metadata", __name__) +metadata.register_blueprint(wiki.wiki_blueprint) @metadata.route("/datasets/<name>", methods=["GET"]) @@ -208,7 +92,7 @@ CONSTRUCT { (Path( current_app.config.get("DATA_DIR") ) / "gn-docs/general/datasets" / - Path(__result.get("id", "")).stem).as_posix() + Path(__result.get("id", "")).stem).as_posix() ) @@ -348,69 +232,6 @@ def edit_dataset(): lambda x: ("Edit successfull", 201) ) -@metadata.route("/datasets/search/<term>", methods=["GET"]) -def search_datasets(term): - """Search datasets""" - args = request.args - page = args.get("page", 0) - page_size = args.get("per-page", 10) - _query = Template(""" -$prefix - -CONSTRUCT { - ex:result rdf:type ex:resultType ; - ex:pages ?pages ; - ex:hits ?hits ; - ex:currentPage $offset ; - ex:items [ - rdfs:label ?label ; - dct:title ?title ; - ex:belongsToInbredSet ?inbredSetName ; - xkos:classifiedUnder ?datasetType ; - ] -} WHERE { -{ - SELECT DISTINCT 
?dataset ?label ?inbredSetName ?datasetType ?title - WHERE { - ?dataset rdf:type dcat:Dataset ; - rdfs:label ?label ; - ?datasetPredicate ?datasetObject ; - xkos:classifiedUnder ?inbredSet . - ?inbredSet ^skos:member gnc:Set ; - rdfs:label ?inbredSetName . - ?datasetObject bif:contains "'$term'" . - OPTIONAL { - ?dataset dct:title ?title . - } . - OPTIONAL { - ?classification ^xkos:classifiedUnder ?dataset ; - ^skos:member gnc:DatasetType ; - ?typePredicate ?typeName ; - skos:prefLabel ?datasetType . - } - } ORDER BY ?dataset LIMIT $limit OFFSET $offset -} - -{ - SELECT (COUNT(DISTINCT ?dataset)/$limit+1 AS ?pages) - (COUNT(DISTINCT ?dataset) AS ?hits) WHERE { - ?dataset rdf:type dcat:Dataset ; - ?p ?o . - ?o bif:contains "'$term'" . - } -} - -} -""").substitute(prefix=RDF_PREFIXES, term=term, limit=page_size, offset=page) - _context = { - "@context": BASE_CONTEXT | DATASET_SEARCH_CONTEXT, - "type": "resultItem", - } - return query_frame_and_compact( - _query, _context, - current_app.config.get("SPARQL_ENDPOINT") - ) - @metadata.route("/publications/<name>", methods=["GET"]) def publications(name): @@ -436,65 +257,6 @@ CONSTRUCT { ) -@metadata.route("/publications/search/<term>", methods=["GET"]) -def search_publications(term): - """Search publications""" - args = request.args - page = args.get("page", 0) - page_size = args.get("per-page", 10) - _query = Template(""" -$prefix - -CONSTRUCT { - ex:result rdf:type ex:resultType ; - ex:totalCount ?totalCount ; - ex:currentPage $offset ; - ex:items [ - rdfs:label ?publication ; - dct:title ?title ; - ] -} WHERE { -{ - SELECT ?publication ?title ?pmid WHERE { - ?pub rdf:type fabio:ResearchPaper ; - ?predicate ?object ; - dct:title ?title . - ?object bif:contains "'$term'" . - BIND( STR(?pub) AS ?publication ) . - } ORDER BY ?title LIMIT $limit OFFSET $offset - } -{ - SELECT (COUNT(*)/$limit+1 AS ?totalCount) WHERE { - ?publication rdf:type fabio:ResearchPaper ; - ?predicate ?object . - ?object bif:contains "'$term'" . - } -} -} -""").substitute(prefix=RDF_PREFIXES, term=term, limit=page_size, offset=page) - _context = { - "@context": BASE_CONTEXT | SEARCH_CONTEXT | { - "dct": "http://purl.org/dc/terms/", - "ex": "http://example.org/stuff/1.0/", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "fabio": "http://purl.org/spar/fabio/", - "title": "dct:title", - "pubmed": "fabio:hasPubMedId", - "currentPage": "ex:currentPage", - "url": "rdfs:label", - }, - "type": "resultItem", - "paper": { - "@type": "fabio:ResearchPaper", - "@container": "@index" - } - } - return query_and_frame( - _query, _context, - current_app.config.get("SPARQL_ENDPOINT") - ) - - @metadata.route("/phenotypes/<name>", methods=["GET"]) @metadata.route("/phenotypes/<group>/<name>", methods=["GET"]) def phenotypes(name, group=None): @@ -630,97 +392,6 @@ CONSTRUCT { ) -@metadata.route("/genewikis/gn/<symbol>", methods=["GET"]) -def get_gn_genewiki_entries(symbol): - """Fetch the GN and NCBI GeneRIF entries""" - args = request.args - page = args.get("page", 0) - page_size = args.get("per-page", 10) - _query = Template(""" -$prefix - -CONSTRUCT { - ?symbol ex:entries [ - rdfs:comment ?comment ; - ex:species ?species_ ; - dct:created ?createTime ; - dct:references ?pmids ; - dct:creator ?creator ; - gnt:belongsToCategory ?categories ; - ] . - ?symbol rdf:type gnc:GNWikiEntry ; - ex:totalCount ?totalCount ; - ex:currentPage $offset . 
-} WHERE { -{ - SELECT ?symbol ?comment - (GROUP_CONCAT(DISTINCT ?speciesName; SEPARATOR='; ') AS ?species_) - ?createTime ?creator - (GROUP_CONCAT(DISTINCT ?pubmed; SEPARATOR='; ') AS ?pmids) - (GROUP_CONCAT(DISTINCT ?category; SEPARATOR='; ') AS ?categories) - WHERE { - ?symbol rdfs:label ?label ; - rdfs:comment _:entry . - ?label bif:contains "'$symbol'" . - _:entry rdf:type gnc:GNWikiEntry ; - rdfs:comment ?comment . - OPTIONAL { - ?species ^xkos:classifiedUnder _:entry ; - ^skos:member gnc:Species ; - skos:prefLabel ?speciesName . - } . - OPTIONAL { _:entry dct:created ?createTime . } . - OPTIONAL { _:entry dct:references ?pubmed . } . - OPTIONAL { - ?investigator foaf:name ?creator ; - ^dct:creator _:entry . - } . - OPTIONAL { _:entry gnt:belongsToCategory ?category . } . - } GROUP BY ?comment ?symbol ?createTime - ?creator ORDER BY ?createTime LIMIT $limit OFFSET $offset -} - -{ - SELECT (COUNT(DISTINCT ?comment)/$limit+1 AS ?totalCount) WHERE { - ?symbol rdfs:comment _:entry ; - rdfs:label ?label . - _:entry rdfs:comment ?comment ; - rdf:type gnc:GNWikiEntry . - ?label bif:contains "'$symbol'" . - } -} -} -""").substitute(prefix=RDF_PREFIXES, symbol=symbol, - limit=page_size, offset=page) - _context = { - "@context": BASE_CONTEXT | { - "ex": "http://example.org/stuff/1.0/", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "gnt": "http://genenetwork.org/term/", - "gnc": "http://genenetwork.org/category/", - "dct": "http://purl.org/dc/terms/", - "xsd": "http://www.w3.org/2001/XMLSchema#", - "entries": "ex:entries", - "comment": "rdfs:comment", - "species": "ex:species", - "category": 'gnt:belongsToCategory', - "author": "dct:creator", - "pubmed": "dct:references", - "currentPage": "ex:currentPage", - "pages": "ex:totalCount", - "created": { - "@id": "dct:created", - "@type": "xsd:datetime" - }, - }, - "type": "gnc:GNWikiEntry" - } - return query_frame_and_compact( - _query, _context, - current_app.config.get("SPARQL_ENDPOINT") - ) - - @metadata.route("/genewikis/ncbi/<symbol>", methods=["GET"]) def get_ncbi_genewiki_entries(symbol): """Fetch the NCBI GeneRIF entries""" diff --git a/gn3/api/metadata_api/wiki.py b/gn3/api/metadata_api/wiki.py new file mode 100644 index 0000000..a4abef6 --- /dev/null +++ b/gn3/api/metadata_api/wiki.py @@ -0,0 +1,119 @@ +"""API for accessing/editting wiki metadata""" + +import datetime +from typing import Any, Dict +from flask import Blueprint, request, jsonify, current_app, make_response +from gn3 import db_utils +from gn3.db import wiki +from gn3.db.rdf import (query_frame_and_compact, + get_wiki_entries_by_symbol) + + +wiki_blueprint = Blueprint("wiki", __name__, url_prefix="wiki") + + +@wiki_blueprint.route("/<int:comment_id>/edit", methods=["POST"]) +def edit_wiki(comment_id: int): + """Edit wiki comment. 
This is achieved by adding another entry with a new VersionId""" + # FIXME: attempt to check and fix for types here with relevant errors + payload: Dict[str, Any] = request.json # type: ignore + pubmed_ids = [str(x) for x in payload.get("pubmed_ids", [])] + + insert_dict = { + "Id": comment_id, + "symbol": payload["symbol"], + "PubMed_ID": " ".join(pubmed_ids), + "comment": payload["comment"], + "email": payload["email"], + "createtime": datetime.datetime.now(datetime.timezone.utc).strftime( + "%Y-%m-%d %H:%M" + ), + "user_ip": request.environ.get("HTTP_X_REAL_IP", request.remote_addr), + "weburl": payload.get("web_url"), + "initial": payload.get("initial"), + "reason": payload["reason"], + } + + insert_query = """ + INSERT INTO GeneRIF (Id, versionId, symbol, PubMed_ID, SpeciesID, comment, + email, createtime, user_ip, weburl, initial, reason) + VALUES (%(Id)s, %(versionId)s, %(symbol)s, %(PubMed_ID)s, %(SpeciesID)s, %(comment)s, %(email)s, %(createtime)s, %(user_ip)s, %(weburl)s, %(initial)s, %(reason)s) + """ + with db_utils.database_connection(current_app.config["SQL_URI"]) as conn: + cursor = conn.cursor() + try: + category_ids = wiki.get_categories_ids( + cursor, payload["categories"]) + species_id = wiki.get_species_id(cursor, payload["species"]) + next_version = wiki.get_next_comment_version(cursor, comment_id) + except wiki.MissingDBDataException as missing_exc: + return jsonify(error=f"Error editting wiki entry, {missing_exc}"), 500 + insert_dict["SpeciesID"] = species_id + insert_dict["versionId"] = next_version + current_app.logger.debug(f"Running query: {insert_query}") + cursor.execute(insert_query, insert_dict) + category_addition_query = """ + INSERT INTO GeneRIFXRef (GeneRIFId, versionId, GeneCategoryId) + VALUES (%s, %s, %s) + """ + + for cat_id in category_ids: + current_app.logger.debug( + f"Running query: {category_addition_query}") + cursor.execute( + category_addition_query, (comment_id, + insert_dict["versionId"], cat_id) + ) + return jsonify({"success": "ok"}) + return jsonify(error="Error editing wiki entry, most likely due to DB error!"), 500 + + +@wiki_blueprint.route("/<string:symbol>", methods=["GET"]) +def get_wiki_entries(symbol: str): + """Fetch wiki entries""" + content_type = request.headers.get("Content-Type") + status_code = 200 + response = get_wiki_entries_by_symbol( + symbol=symbol, + sparql_uri=current_app.config["SPARQL_ENDPOINT"]) + data = response.get("data") + if not data: + data = {} + status_code = 404 + if content_type == "application/ld+json": + payload = make_response(response) + payload.headers["Content-Type"] = "application/ld+json" + return payload, status_code + return jsonify(data), status_code + + +@wiki_blueprint.route("/<int:comment_id>", methods=["GET"]) +def get_wiki(comment_id: int): + """ + Gets latest wiki comments. 
+ + TODO: fetch this from RIF + """ + with db_utils.database_connection(current_app.config["SQL_URI"]) as conn: + return jsonify(wiki.get_latest_comment(conn, comment_id)) + return jsonify(error="Error fetching wiki entry, most likely due to DB error!"), 500 + + +@wiki_blueprint.route("/categories", methods=["GET"]) +def get_categories(): + """ Gets list of supported categories for RIF """ + with db_utils.database_connection(current_app.config["SQL_URI"]) as conn: + cursor = conn.cursor() + categories_dict = wiki.get_categories(cursor) + return jsonify(categories_dict) + return jsonify(error="Error getting categories, most likely due to DB error!"), 500 + + +@wiki_blueprint.route("/species", methods=["GET"]) +def get_species(): + """ Gets list of all species, contains name and SpeciesName """ + with db_utils.database_connection(current_app.config["SQL_URI"]) as conn: + cursor = conn.cursor() + species_dict = wiki.get_species(cursor) + return jsonify(species_dict) + return jsonify(error="Error getting species, most likely due to DB error!"), 500 diff --git a/gn3/api/rqtl.py b/gn3/api/rqtl.py index 70ebe12..ae0110d 100644 --- a/gn3/api/rqtl.py +++ b/gn3/api/rqtl.py @@ -25,11 +25,11 @@ run the rqtl_wrapper script and return the results as JSON raise FileNotFoundError # Split kwargs by those with values and boolean ones that just convert to True/False - kwargs = ["covarstruct", "model", "method", "nperm", "scale", "control_marker"] + kwargs = ["covarstruct", "model", "method", "nperm", "scale", "control"] boolean_kwargs = ["addcovar", "interval", "pstrata", "pairscan"] all_kwargs = kwargs + boolean_kwargs - rqtl_kwargs = {"geno": genofile, "pheno": phenofile} + rqtl_kwargs = {"geno": genofile, "pheno": phenofile, "outdir": current_app.config.get("TMPDIR")} rqtl_bool_kwargs = [] for kwarg in all_kwargs: if kwarg in request.form: diff --git a/gn3/api/search.py b/gn3/api/search.py index c741b15..f696428 100644 --- a/gn3/api/search.py +++ b/gn3/api/search.py @@ -194,23 +194,36 @@ def parse_location_field(species_query: xapian.Query, .maybe(xapian.Query.MatchNothing, make_query)) +def parse_boolean_prefixed_field(prefix: str, query: bytes) -> xapian.Query: + """Parse boolean prefixed field and return a xapian query.""" + # For some reason, xapian does not stem boolean prefixed fields + # when the query starts with a capital letter. We need it to stem + # always. Hence this function. 
+ return xapian.Query(prefix + query.decode("utf-8").lower()) + + # pylint: disable=too-many-locals def parse_query(synteny_files_directory: Path, query: str): """Parse search query using GeneNetwork specific field processors.""" queryparser = xapian.QueryParser() queryparser.set_stemmer(xapian.Stem("en")) - queryparser.set_stemming_strategy(queryparser.STEM_SOME) + queryparser.set_stemming_strategy(queryparser.STEM_ALL_Z) species_prefix = "XS" chromosome_prefix = "XC" queryparser.add_boolean_prefix("author", "A") queryparser.add_boolean_prefix("species", species_prefix) - queryparser.add_boolean_prefix("group", "XG") + queryparser.add_boolean_prefix("group", + FieldProcessor(partial(parse_boolean_prefixed_field, "XG"))) queryparser.add_boolean_prefix("tissue", "XI") queryparser.add_boolean_prefix("dataset", "XDS") queryparser.add_boolean_prefix("symbol", "XY") queryparser.add_boolean_prefix("chr", chromosome_prefix) queryparser.add_boolean_prefix("peakchr", "XPC") queryparser.add_prefix("description", "XD") + queryparser.add_prefix("rif", "XRF") + queryparser.add_prefix("wiki", "XWK") + queryparser.add_prefix("RIF", "XRF") + queryparser.add_prefix("WIKI", "XWK") range_prefixes = ["mean", "peak", "position", "peakmb", "additive", "year"] for i, prefix in enumerate(range_prefixes): # Treat position specially since it needs its own field processor. @@ -263,11 +276,13 @@ def search_results(): if page < 1: abort(404, description="Requested page does not exist") results_per_page = args.get("per_page", default=100, type=int) - maximum_results_per_page = 10000 + maximum_results_per_page = 50000 if results_per_page > maximum_results_per_page: abort(400, description="Requested too many search results") - - query = parse_query(Path(current_app.config["DATA_DIR"]) / "synteny", querystring) + try: + query = parse_query(Path(current_app.config["DATA_DIR"]) / "synteny", querystring) + except xapian.QueryParserError as err: + return jsonify({"error_type": str(err.get_type()), "error": err.get_msg()}), 400 traits = [] # pylint: disable=invalid-name with xapian_database(current_app.config["XAPIAN_DB_PATH"]) as db: diff --git a/gn3/case_attributes.py b/gn3/case_attributes.py index d973b8e..efc82e9 100644 --- a/gn3/case_attributes.py +++ b/gn3/case_attributes.py @@ -26,8 +26,8 @@ from gn3.commands import run_cmd from gn3.db_utils import Connection, database_connection +from gn3.oauth2.authorisation import require_token from gn3.auth.authorisation.errors import AuthorisationError -from gn3.auth.authorisation.oauth2.resource_server import require_oauth caseattr = Blueprint("case-attribute", __name__) @@ -61,8 +61,10 @@ class CAJSONEncoder(json.JSONEncoder): return json.JSONEncoder.default(self, obj) def required_access( - inbredset_id: int, access_levels: tuple[str, ...]) -> Union[ - bool, tuple[str, ...]]: + token: dict, + inbredset_id: int, + access_levels: tuple[str, ...] 
+) -> Union[bool, tuple[str, ...]]: """Check whether the user has the appropriate access""" def __species_id__(conn): with conn.cursor() as cursor: @@ -71,19 +73,21 @@ def required_access( (inbredset_id,)) return cursor.fetchone()[0] try: - with (require_oauth.acquire("profile resource") as the_token, - database_connection(current_app.config["SQL_URI"]) as conn): + with database_connection(current_app.config["SQL_URI"]) as conn: result = requests.get( + # this section fetches the resource ID from the auth server urljoin(current_app.config["AUTH_SERVER_URL"], "auth/resource/inbredset/resource-id" f"/{__species_id__(conn)}/{inbredset_id}")) if result.status_code == 200: resource_id = result.json()["resource-id"] auth = requests.post( + # this section fetches the authorisations/privileges that + # the current user has on the resource we got above urljoin(current_app.config["AUTH_SERVER_URL"], "auth/resource/authorisation"), json={"resource-ids": [resource_id]}, - headers={"Authorization": f"Bearer {the_token.access_token}"}) + headers={"Authorization": f"Bearer {token['access_token']}"}) if auth.status_code == 200: privs = tuple(priv["privilege_id"] for role in auth.json()[resource_id]["roles"] @@ -398,14 +402,15 @@ def __apply_deletions__( params) def __apply_diff__( - conn: Connection, inbredset_id: int, diff_filename, the_diff) -> None: + conn: Connection, auth_token, inbredset_id: int, diff_filename, the_diff) -> None: """ Apply the changes in the diff at `diff_filename` to the data in the database if the user has appropriate privileges. """ - required_access( - inbredset_id, ("system:inbredset:edit-case-attribute", - "system:inbredset:apply-case-attribute-edit")) + required_access(auth_token, + inbredset_id, + ("system:inbredset:edit-case-attribute", + "system:inbredset:apply-case-attribute-edit")) diffs = the_diff["diff"] with conn.cursor(cursorclass=DictCursor) as cursor: # __apply_additions__(cursor, inbredset_id, diffs["Additions"]) @@ -419,6 +424,7 @@ def __apply_diff__( os.rename(diff_filename, new_path) def __reject_diff__(conn: Connection, + auth_token: dict, inbredset_id: int, diff_filename: Path, diff: dict) -> Path: @@ -426,38 +432,45 @@ def __reject_diff__(conn: Connection, Reject the changes in the diff at `diff_filename` to the data in the database if the user has appropriate privileges. 
""" - required_access( - inbredset_id, ("system:inbredset:edit-case-attribute", - "system:inbredset:apply-case-attribute-edit")) + required_access(auth_token, + inbredset_id, + ("system:inbredset:edit-case-attribute", + "system:inbredset:apply-case-attribute-edit")) __save_diff__(conn, diff, EditStatus.rejected) new_path = Path(diff_filename.parent, f"{diff_filename.stem}-rejected{diff_filename.suffix}") os.rename(diff_filename, new_path) return diff_filename @caseattr.route("/<int:inbredset_id>/add", methods=["POST"]) -def add_case_attributes(inbredset_id: int) -> Response: +@require_token +def add_case_attributes(inbredset_id: int, auth_token=None) -> Response: """Add a new case attribute for `InbredSetId`.""" - required_access(inbredset_id, ("system:inbredset:create-case-attribute",)) - with (require_oauth.acquire("profile resource") as the_token, # pylint: disable=[unused-variable] - database_connection(current_app.config["SQL_URI"]) as conn): # pylint: disable=[unused-variable] + required_access( + auth_token, inbredset_id, ("system:inbredset:create-case-attribute",)) + with database_connection(current_app.config["SQL_URI"]) as conn: # pylint: disable=[unused-variable] raise NotImplementedError @caseattr.route("/<int:inbredset_id>/delete", methods=["POST"]) -def delete_case_attributes(inbredset_id: int) -> Response: +@require_token +def delete_case_attributes(inbredset_id: int, auth_token=None) -> Response: """Delete a case attribute from `InbredSetId`.""" - required_access(inbredset_id, ("system:inbredset:delete-case-attribute",)) - with (require_oauth.acquire("profile resource") as the_token, # pylint: disable=[unused-variable] - database_connection(current_app.config["SQL_URI"]) as conn): # pylint: disable=[unused-variable] + required_access( + auth_token, inbredset_id, ("system:inbredset:delete-case-attribute",)) + with database_connection(current_app.config["SQL_URI"]) as conn: # pylint: disable=[unused-variable] raise NotImplementedError @caseattr.route("/<int:inbredset_id>/edit", methods=["POST"]) -def edit_case_attributes(inbredset_id: int) -> Response: - """Edit the case attributes for `InbredSetId` based on data received.""" - with (require_oauth.acquire("profile resource") as the_token, - database_connection(current_app.config["SQL_URI"]) as conn): - required_access(inbredset_id, +@require_token +def edit_case_attributes(inbredset_id: int, auth_token = None) -> Response: + """Edit the case attributes for `InbredSetId` based on data received. 
+ + :inbredset_id: Identifier for the population that the case attribute belongs + :auth_token: A validated JWT from the auth server + """ + with database_connection(current_app.config["SQL_URI"]) as conn: + required_access(auth_token, + inbredset_id, ("system:inbredset:edit-case-attribute",)) - user = the_token.user fieldnames = tuple(["Strain"] + sorted( attr["Name"] for attr in __case_attribute_labels_by_inbred_set__(conn, inbredset_id))) @@ -465,7 +478,7 @@ def edit_case_attributes(inbredset_id: int) -> Response: diff_filename = __queue_diff__( conn, { "inbredset_id": inbredset_id, - "user_id": str(user.user_id), + "user_id": auth_token["jwt"]["sub"], "fieldnames": fieldnames, "diff": __compute_diff__( fieldnames, @@ -488,8 +501,11 @@ def edit_case_attributes(inbredset_id: int) -> Response: return response try: - __apply_diff__( - conn, inbredset_id, diff_filename, __load_diff__(diff_filename)) + __apply_diff__(conn, + auth_token, + inbredset_id, + diff_filename, + __load_diff__(diff_filename)) return jsonify({ "diff-status": "applied", "message": ("The changes to the case-attributes have been " @@ -555,37 +571,45 @@ def list_diffs(inbredset_id: int) -> Response: return resp @caseattr.route("/approve/<path:filename>", methods=["POST"]) -def approve_case_attributes_diff(filename: str) -> Response: +@require_token +def approve_case_attributes_diff(filename: str, auth_token = None) -> Response: """Approve the changes to the case attributes in the diff.""" diff_dir = Path(current_app.config["TMPDIR"], CATTR_DIFFS_DIR) diff_filename = Path(diff_dir, filename) the_diff = __load_diff__(diff_filename) with database_connection(current_app.config["SQL_URI"]) as conn: - __apply_diff__(conn, the_diff["inbredset_id"], diff_filename, the_diff) + __apply_diff__(conn, auth_token, the_diff["inbredset_id"], diff_filename, the_diff) return jsonify({ "message": "Applied the diff successfully.", "diff_filename": diff_filename.name }) @caseattr.route("/reject/<path:filename>", methods=["POST"]) -def reject_case_attributes_diff(filename: str) -> Response: +@require_token +def reject_case_attributes_diff(filename: str, auth_token=None) -> Response: """Reject the changes to the case attributes in the diff.""" diff_dir = Path(current_app.config["TMPDIR"], CATTR_DIFFS_DIR) diff_filename = Path(diff_dir, filename) the_diff = __load_diff__(diff_filename) with database_connection(current_app.config["SQL_URI"]) as conn: - __reject_diff__(conn, the_diff["inbredset_id"], diff_filename, the_diff) + __reject_diff__(conn, + auth_token, + the_diff["inbredset_id"], + diff_filename, + the_diff) return jsonify({ "message": "Rejected diff successfully", "diff_filename": diff_filename.name }) @caseattr.route("/<int:inbredset_id>/diff/<int:diff_id>/view", methods=["GET"]) -def view_diff(inbredset_id: int, diff_id: int) -> Response: +@require_token +def view_diff(inbredset_id: int, diff_id: int, auth_token=None) -> Response: """View a diff.""" with (database_connection(current_app.config["SQL_URI"]) as conn, conn.cursor(cursorclass=DictCursor) as cursor): - required_access(inbredset_id, ("system:inbredset:view-case-attribute",)) + required_access( + auth_token, inbredset_id, ("system:inbredset:view-case-attribute",)) cursor.execute( "SELECT * FROM caseattributes_audit WHERE id=%s", (diff_id,)) diff --git a/gn3/db/constants.py b/gn3/db/constants.py new file mode 100644 index 0000000..45e3bfc --- /dev/null +++ b/gn3/db/constants.py @@ -0,0 +1,152 @@ +""" +This module contains some constants used in other modules. 
+""" +PREFIXES = { + "dcat": "http://www.w3.org/ns/dcat#", + "dct": "http://purl.org/dc/terms/", + "ex": "http://example.org/stuff/1.0/", + "fabio": "http://purl.org/spar/fabio/", + "foaf": "http://xmlns.com/foaf/0.1/", + "generif": "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=", + "genotype": "http://genenetwork.org/genotype/", + "gn": "http://genenetwork.org/id/", + "gnc": "http://genenetwork.org/category/", + "gnt": "http://genenetwork.org/term/", + "owl": "http://www.w3.org/2002/07/owl#", + "phenotype": "http://genenetwork.org/phenotype/", + "prism": "http://prismstandard.org/namespaces/basic/2.0/", + "publication": "http://genenetwork.org/publication/", + "pubmed": "http://rdf.ncbi.nlm.nih.gov/pubmed/", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "skos": "http://www.w3.org/2004/02/skos/core#", + "taxon": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=", + "up": "http://purl.uniprot.org/core/", + "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#", + "xsd": "http://www.w3.org/2001/XMLSchema#", +} + +RDF_PREFIXES = "\n".join([f"PREFIX {key}: <{value}>" + for key, value in PREFIXES.items()]) + +BASE_CONTEXT = { + "data": "@graph", + "type": "@type", + "gn": "http://genenetwork.org/id/", + "gnc": "http://genenetwork.org/category/", + "gnt": "http://genenetwork.org/term/", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#>", +} + +DATASET_CONTEXT = { + "accessRights": "dct:accessRights", + "accessionId": "dct:identifier", + "acknowledgement": "gnt:hasAcknowledgement", + "altLabel": "skos:altLabel", + "caseInfo": "gnt:hasCaseInfo", + "classifiedUnder": "xkos:classifiedUnder", + "contributors": "dct:creator", + "contactPoint": "dcat:contactPoint", + "created": "dct:created", + "dcat": "http://www.w3.org/ns/dcat#", + "dct": "http://purl.org/dc/terms/", + "description": "dct:description", + "ex": "http://example.org/stuff/1.0/", + "experimentDesignInfo": "gnt:hasExperimentDesignInfo", + "experimentType": "gnt:hasExperimentType", + "foaf": "http://xmlns.com/foaf/0.1/", + "geoSeriesId": "gnt:hasGeoSeriesId", + "gnt": "http://genenetwork.org/term/", + "inbredSet": "gnt:belongsToGroup", + "label": "rdfs:label", + "normalization": "gnt:usesNormalization", + "platformInfo": "gnt:hasPlatformInfo", + "notes": "gnt:hasNotes", + "organization": "foaf:Organization", + "prefLabel": "skos:prefLabel", + "citation": "dct:isReferencedBy", + "GoTree": "gnt:hasGOTreeValue", + "platform": "gnt:usesPlatform", + "processingInfo": "gnt:hasDataProcessingInfo", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "skos": "http://www.w3.org/2004/02/skos/core#", + "specifics": "gnt:hasContentInfo", + "title": "dct:title", + "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#", + "tissueInfo": "gnt:hasTissueInfo", + "tissue": "gnt:hasTissue", + "contactWebUrl": "foaf:homepage", + "contactName": "foaf:name", +} + +SEARCH_CONTEXT = { + "pages": "ex:pages", + "hits": "ex:hits", + "result": "ex:result", + "results": "ex:items", + "resultItem": "ex:resultType", + "currentPage": "ex:currentPage", +} + +DATASET_SEARCH_CONTEXT = SEARCH_CONTEXT | { + "classifiedUnder": "xkos:classifiedUnder", + "created": "dct:created", + "dct": "http://purl.org/dc/terms/", + "ex": "http://example.org/stuff/1.0/", + "inbredSet": "ex:belongsToInbredSet", + "title": "dct:title", + "name": "rdfs:label", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "type": "@type", + 
"xkos": "http://rdf-vocabulary.ddialliance.org/xkos#", +} + +PUBLICATION_CONTEXT = { + "dct": "http://purl.org/dc/terms/", + "fabio": "http://purl.org/spar/fabio/", + "prism": "http://prismstandard.org/namespaces/basic/2.0/", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "title": "dct:title", + "journal": "fabio:Journal", + "volume": "prism:volume", + "page": "fabio:page", + "creator": "dct:creator", + "abstract": "dct:abstract", + "year": { + "@id": "fabio:hasPublicationYear", + "@type": "xsd:gYear", + }, + "month": { + "@id": "prism:publicationDate", + "@type": "xsd:gMonth" + }, +} + +PHENOTYPE_CONTEXT = BASE_CONTEXT | PUBLICATION_CONTEXT | { + "skos": "http://www.w3.org/2004/02/skos/core#", + "dcat": "http://www.w3.org/ns/dcat#", + "prism": "http://prismstandard.org/namespaces/basic/2.0/", + "traitName": "skos:altLabel", + "trait": "rdfs:label", + "altName": "rdfs:altLabel", + "description": "dct:description", + "abbreviation": "gnt:abbreviation", + "labCode": "gnt:labCode", + "submitter": "gnt:submitter", + "dataset": "dcat:Distribution", + "contributor": "dct:contributor", + "mean": "gnt:mean", + "locus": "gnt:locus", + "lodScore": "gnt:lodScore", + "references": "dct:isReferencedBy", + "additive": "gnt:additive", + "sequence": "gnt:sequence", + "prefLabel": "skos:prefLabel", + "identifier": "dct:identifier", + "chromosome": "gnt:chr", + "mb": "gnt:mb", + "peakLocation": "gnt:locus", + "species": "gnt:belongsToSpecies", + "group": "gnt:belongsToGroup", +} diff --git a/gn3/db/rdf.py b/gn3/db/rdf.py index eb4014a..5a95683 100644 --- a/gn3/db/rdf.py +++ b/gn3/db/rdf.py @@ -4,39 +4,12 @@ This module is a collection of functions that handle SPARQL queries. """ import json - +from string import Template from SPARQLWrapper import SPARQLWrapper from pyld import jsonld # type: ignore - - -PREFIXES = { - "dcat": "http://www.w3.org/ns/dcat#", - "dct": "http://purl.org/dc/terms/", - "ex": "http://example.org/stuff/1.0/", - "fabio": "http://purl.org/spar/fabio/", - "foaf": "http://xmlns.com/foaf/0.1/", - "generif": "http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=", - "genotype": "http://genenetwork.org/genotype/", - "gn": "http://genenetwork.org/id/", - "gnc": "http://genenetwork.org/category/", - "gnt": "http://genenetwork.org/term/", - "owl": "http://www.w3.org/2002/07/owl#", - "phenotype": "http://genenetwork.org/phenotype/", - "prism": "http://prismstandard.org/namespaces/basic/2.0/", - "publication": "http://genenetwork.org/publication/", - "pubmed": "http://rdf.ncbi.nlm.nih.gov/pubmed/", - "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "skos": "http://www.w3.org/2004/02/skos/core#", - "taxon": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=", - "up": "http://purl.uniprot.org/core/", - "xkos": "http://rdf-vocabulary.ddialliance.org/xkos#", - "xsd": "http://www.w3.org/2001/XMLSchema#", -} - - -RDF_PREFIXES = "\n".join([f"PREFIX {key}: <{value}>" - for key, value in PREFIXES.items()]) +from gn3.db.constants import ( + RDF_PREFIXES, BASE_CONTEXT +) def sparql_construct_query(query: str, endpoint: str) -> dict: @@ -51,22 +24,101 @@ def sparql_construct_query(query: str, endpoint: str) -> dict: def query_frame_and_compact(query: str, context: dict, endpoint: str) -> dict: """Frame and then compact the results given a context""" results = sparql_construct_query(query, endpoint) - if not results: - return {} return jsonld.compact(jsonld.frame(results, context), context) def 
query_and_compact(query: str, context: dict, endpoint: str) -> dict: """Compact the results given a context""" results = sparql_construct_query(query, endpoint) - if not results: - return {} return jsonld.compact(results, context) def query_and_frame(query: str, context: dict, endpoint: str) -> dict: """Frame the results given a context""" results = sparql_construct_query(query, endpoint) - if not results: - return {} return jsonld.frame(results, context) + + +def get_wiki_entries_by_symbol(symbol: str, sparql_uri: str) -> dict: + """Fetch all the Wiki entries using the symbol""" + # This query uses a sub-query to fetch the latest comment by the + # version id. + query = Template(""" +$prefix + +CONSTRUCT { + ?uid rdfs:label ?symbolName; + gnt:reason ?reason ; + gnt:species ?species ; + dct:references ?pmid ; + foaf:homepage ?weburl ; + rdfs:comment ?comment ; + foaf:mbox ?email ; + gnt:initial ?usercode ; + gnt:belongsToCategory ?category ; + gnt:hasVersion ?versionId ; + dct:created ?created ; + dct:identifier ?identifier . +} WHERE { + ?symbolId rdfs:label ?symbolName . + ?uid rdfs:comment ?comment ; + gnt:symbol ?symbolId ; + rdf:type gnc:GNWikiEntry ; + dct:created ?createTime . + FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) . + { + SELECT (MAX(?vers) AS ?max) ?id_ WHERE { + ?symbolId rdfs:label ?symbolName . + ?uid dct:identifier ?id_ ; + dct:hasVersion ?vers ; + dct:identifier ?id_ ; + gnt:symbol ?symbolId . + FILTER ( LCASE(?symbolName) = LCASE('$symbol') ) . + } + } + ?uid dct:hasVersion ?max ; + dct:identifier ?id_ . + OPTIONAL { ?uid gnt:reason ?reason } . + OPTIONAL { + ?uid gnt:belongsToSpecies ?speciesId . + ?speciesId gnt:shortName ?species . + } . + OPTIONAL { ?uid dct:references ?pubmedId . } . + OPTIONAL { ?uid foaf:homepage ?weburl . } . + OPTIONAL { ?uid gnt:initial ?usercode . } . + OPTIONAL { ?uid gnt:mbox ?email . } . + OPTIONAL { ?uid gnt:belongsToCategory ?category . } . + BIND (str(?version) AS ?versionId) . + BIND (str(?id_) AS ?identifier) . + BIND (str(?pubmedId) AS ?pmid) . + BIND (str(?createTime) AS ?created) . +} +""").substitute(prefix=RDF_PREFIXES, symbol=symbol,) + context = BASE_CONTEXT | { + "foaf": "http://xmlns.com/foaf/0.1/", + "dct": "http://purl.org/dc/terms/", + "categories": "gnt:belongsToCategory", + "web_url": "foaf:homepage", + "version": "gnt:hasVersion", + "symbol": "rdfs:label", + "reason": "gnt:reason", + "species": "gnt:species", + "pubmed_id": "dct:references", + "email": "foaf:mbox", + "initial": "gnt:initial", + "comment": "rdfs:comment", + "created": "dct:created", + "id": "dct:identifier", + # This points to the RDF Node which is the unique identifier + # for this triplet. 
It's constructed using the comment-id and + # the comment-versionId + "wiki_identifier": "@id", + } + results = query_frame_and_compact( + query, context, + sparql_uri + ) + data = results.get("data") + if not data: + return results + return results diff --git a/gn3/db/wiki.py b/gn3/db/wiki.py new file mode 100644 index 0000000..abb1644 --- /dev/null +++ b/gn3/db/wiki.py @@ -0,0 +1,80 @@ +"""Helper functions to access wiki entries""" + +from typing import Dict, List + +from MySQLdb.cursors import DictCursor + + +class MissingDBDataException(Exception): + """Error due to DB missing some data""" + + +def get_latest_comment(connection, comment_id: str) -> int: + """ Latest comment is one with the highest versionId """ + cursor = connection.cursor(DictCursor) + query = """ SELECT versionId AS version, symbol, PubMed_ID AS pubmed_ids, sp.Name AS species, + comment, email, weburl, initial, reason + FROM `GeneRIF` gr + INNER JOIN Species sp USING(SpeciesId) + WHERE gr.Id = %s + ORDER BY versionId DESC LIMIT 1; + """ + cursor.execute(query, (comment_id,)) + result = cursor.fetchone() + result["pubmed_ids"] = [x.strip() for x in result["pubmed_ids"].split()] + categories_query = """ + SELECT grx.GeneRIFId, grx.versionId, gc.Name FROM GeneRIFXRef grx + INNER JOIN GeneCategory gc ON grx.GeneCategoryId=gc.Id + WHERE GeneRIFId = %s AND versionId=%s; + """ + + cursor.execute(categories_query, (comment_id, result["version"])) + categories = cursor.fetchall() + result["categories"] = [x["Name"] for x in categories] + return result + + +def get_species_id(cursor, species_name: str) -> int: + """Find species id given species `Name`""" + cursor.execute("SELECT SpeciesID from Species WHERE Name = %s", (species_name,)) + species_ids = cursor.fetchall() + if len(species_ids) != 1: + raise MissingDBDataException( + f"expected 1 species with Name={species_name} but found {len(species_ids)}!" 
+ ) + return species_ids[0][0] + + +def get_next_comment_version(cursor, comment_id: int) -> int: + """Find the version to add, usually latest_version + 1""" + cursor.execute( + "SELECT MAX(versionId) as version_id from GeneRIF WHERE Id = %s", (comment_id,) + ) + latest_version = cursor.fetchone()[0] + if latest_version is None: + raise MissingDBDataException(f"No comment found with comment_id={comment_id}") + return latest_version + 1 + + +def get_categories_ids(cursor, categories: List[str]) -> List[int]: + """Get the categories_ids from a list of category strings""" + dict_cats = get_categories(cursor) + category_ids = [] + for category in set(categories): + cat_id = dict_cats.get(category.strip()) + if cat_id is None: + raise MissingDBDataException(f"Category with Name={category} not found") + category_ids.append(cat_id) + return category_ids + +def get_categories(cursor) -> Dict[str, int]: + cursor.execute("SELECT Name, Id from GeneCategory") + raw_categories = cursor.fetchall() + dict_cats = dict(raw_categories) + return dict_cats + +def get_species(cursor) -> Dict[str, str]: + cursor.execute("SELECT Name, SpeciesName from Species") + raw_species = cursor.fetchall() + dict_cats = dict(raw_species) + return dict_cats diff --git a/gn3/db_utils.py b/gn3/db_utils.py index e4dc81f..0d9bd0a 100644 --- a/gn3/db_utils.py +++ b/gn3/db_utils.py @@ -1,11 +1,15 @@ """module contains all db related stuff""" import contextlib +import logging from typing import Any, Iterator, Protocol, Tuple from urllib.parse import urlparse import MySQLdb as mdb import xapian +LOGGER = logging.getLogger(__file__) + + def parse_db_url(sql_uri: str) -> Tuple: """function to parse SQL_URI env variable note:there\ is a default value for SQL_URI so a tuple result is\ @@ -24,7 +28,7 @@ class Connection(Protocol): @contextlib.contextmanager -def database_connection(sql_uri) -> Iterator[Connection]: +def database_connection(sql_uri: str, logger: logging.Logger = LOGGER) -> Iterator[Connection]: """Connect to MySQL database.""" host, user, passwd, db_name, port = parse_db_url(sql_uri) connection = mdb.connect(db=db_name, @@ -35,6 +39,7 @@ def database_connection(sql_uri) -> Iterator[Connection]: try: yield connection except mdb.Error as _mbde: + logger.error("DB error encountered", exc_info=True) connection.rollback() finally: connection.commit() diff --git a/gn3/errors.py b/gn3/errors.py index c53604f..ec7a554 100644 --- a/gn3/errors.py +++ b/gn3/errors.py @@ -15,6 +15,7 @@ from werkzeug.exceptions import NotFound from authlib.oauth2.rfc6749.errors import OAuth2Error from flask import Flask, jsonify, Response, current_app +from gn3.oauth2 import errors as oautherrors from gn3.auth.authorisation.errors import AuthorisationError from gn3.llms.errors import LLMError @@ -28,6 +29,7 @@ def add_trace(exc: Exception, jsonmsg: dict) -> dict: def page_not_found(pnf): """Generic 404 handler.""" + current_app.logger.error("Handling 404 errors", exc_info=True) return jsonify(add_trace(pnf, { "error": pnf.name, "error_description": pnf.description @@ -36,6 +38,7 @@ def page_not_found(pnf): def internal_server_error(pnf): """Generic 404 handler.""" + current_app.logger.error("Handling internal server errors", exc_info=True) return jsonify(add_trace(pnf, { "error": pnf.name, "error_description": pnf.description @@ -44,15 +47,16 @@ def internal_server_error(pnf): def url_server_error(pnf): """Handler for an exception with a url connection.""" + current_app.logger.error("Handling url server errors", exc_info=True) return 
jsonify(add_trace(pnf, { "error": f"URLLib Error no: {pnf.reason.errno}", "error_description": pnf.reason.strerror, - })) + })), 500 def handle_authorisation_error(exc: AuthorisationError): """Handle AuthorisationError if not handled anywhere else.""" - current_app.logger.error(exc) + current_app.logger.error("Handling external auth errors", exc_info=True) return jsonify(add_trace(exc, { "error": type(exc).__name__, "error_description": " :: ".join(exc.args) @@ -61,7 +65,7 @@ def handle_authorisation_error(exc: AuthorisationError): def handle_oauth2_errors(exc: OAuth2Error): """Handle OAuth2Error if not handled anywhere else.""" - current_app.logger.error(exc) + current_app.logger.error("Handling external oauth2 errors", exc_info=True) return jsonify(add_trace(exc, { "error": exc.error, "error_description": exc.description, @@ -70,7 +74,7 @@ def handle_oauth2_errors(exc: OAuth2Error): def handle_sqlite3_errors(exc: OperationalError): """Handle sqlite3 errors if not handled anywhere else.""" - current_app.logger.error(exc) + current_app.logger.error("Handling sqlite3 errors", exc_info=True) return jsonify({ "error": "DatabaseError", "error_description": exc.args[0], @@ -78,24 +82,23 @@ def handle_sqlite3_errors(exc: OperationalError): def handle_sparql_errors(exc): - """Handle sqlite3 errors if not handled anywhere else.""" - current_app.logger.error(exc) - __code = { - EndPointInternalError: 500, - EndPointNotFound: 400, - QueryBadFormed: 400, - Unauthorized: 401, - URITooLong: 414, + """Handle sparql/virtuoso errors if not handled anywhere else.""" + current_app.logger.error("Handling sparql errors", exc_info=True) + code = { + "EndPointInternalError": 500, + "EndPointNotFound": 404, + "QueryBadFormed": 400, + "Unauthorized": 401, + "URITooLong": 414, } return jsonify({ "error": exc.msg, - "error_description": str(exc), - }), __code.get(exc) + }), code.get(exc.__class__.__name__) def handle_generic(exc: Exception) -> Response: """Handle generic exception.""" - current_app.logger.error(exc) + current_app.logger.error("Handling generic errors", exc_info=True) resp = jsonify({ "error": type(exc).__name__, "error_description": ( @@ -106,6 +109,15 @@ def handle_generic(exc: Exception) -> Response: return resp +def handle_local_authorisation_errors(exc: oautherrors.AuthorisationError): + """Handle errors relating to authorisation that are raised locally.""" + current_app.logger.error("Handling local auth errors", exc_info=True) + return jsonify(add_trace(exc, { + "error": type(exc).__name__, + "error_description": " ".join(exc.args) + })), 400 + + def handle_llm_error(exc: Exception) -> Response: """ Handle llm erros if not handled anywhere else. 
""" current_app.logger.error(exc) diff --git a/gn3/oauth2/__init__.py b/gn3/oauth2/__init__.py new file mode 100644 index 0000000..8001d34 --- /dev/null +++ b/gn3/oauth2/__init__.py @@ -0,0 +1 @@ +"""Package to handle OAuth2 authorisation and other issues.""" diff --git a/gn3/oauth2/authorisation.py b/gn3/oauth2/authorisation.py new file mode 100644 index 0000000..b2dd1ae --- /dev/null +++ b/gn3/oauth2/authorisation.py @@ -0,0 +1,34 @@ +"""Handle authorisation with auth server.""" +from functools import wraps + +from flask import request, jsonify, current_app as app + +from gn3.oauth2 import jwks +from gn3.oauth2.errors import TokenValidationError + + +def require_token(func): + """Check for and verify bearer token.""" + @wraps(func) + def __auth__(*args, **kwargs): + try: + bearer = request.headers.get("Authorization", "") + if bearer.startswith("Bearer"): + # validate token and return it + _extra, token = [item.strip() for item in bearer.split(" ")] + _jwt = jwks.validate_token( + token, + jwks.fetch_jwks(app.config["AUTH_SERVER_URL"], + "auth/public-jwks")) + return func(*args, **{**kwargs, "auth_token": {"access_token": token, "jwt": _jwt}}) + error_message = "We expected a bearer token but did not get one." + except TokenValidationError as _tve: + app.logger.debug("Token validation failed.", exc_info=True) + error_message = "The token was found to be invalid." + + return jsonify({ + "error": "TokenValidationError", + "description": error_message + }), 400 + + return __auth__ diff --git a/gn3/oauth2/errors.py b/gn3/oauth2/errors.py new file mode 100644 index 0000000..f8cfd2c --- /dev/null +++ b/gn3/oauth2/errors.py @@ -0,0 +1,8 @@ +"""List of possible errors.""" + +class AuthorisationError(Exception): + """Top-level error class dealing with generic authorisation errors.""" + + +class TokenValidationError(AuthorisationError): + """Class to indicate that token validation failed.""" diff --git a/gn3/oauth2/jwks.py b/gn3/oauth2/jwks.py new file mode 100644 index 0000000..8798a3f --- /dev/null +++ b/gn3/oauth2/jwks.py @@ -0,0 +1,36 @@ +"""Utilities dealing with JSON Web Keys (JWK)""" +from urllib.parse import urljoin + +import requests +from flask import current_app as app +from authlib.jose.errors import BadSignatureError +from authlib.jose import KeySet, JsonWebKey, JsonWebToken + +from gn3.oauth2.errors import TokenValidationError + + +def fetch_jwks(authserveruri: str, path: str = "auth/public-jwks") -> KeySet: + """Fetch the JWKs from a particular URI""" + try: + response = requests.get(urljoin(authserveruri, path)) + if response.status_code == 200: + return KeySet([ + JsonWebKey.import_key(key) for key in response.json()["jwks"]]) + # XXXX: TODO: Catch specific exception we need. 
+ # pylint: disable=W0703 + except Exception as _exc: + app.logger.debug("There was an error fetching the JSON Web Keys.", + exc_info=True) + + return KeySet([]) + + +def validate_token(token: str, keys: KeySet) -> dict: + """Validate the token against the given keys.""" + for key in keys.keys: + try: + return JsonWebToken(["RS256"]).decode(token, key=key) + except BadSignatureError as _bse: + pass + + raise TokenValidationError("No key was found for validation.") diff --git a/gn3/settings.py b/gn3/settings.py index acf3619..1e794ff 100644 --- a/gn3/settings.py +++ b/gn3/settings.py @@ -85,7 +85,7 @@ ROUND_TO = 10 MULTIPROCESSOR_PROCS = 6 # Number of processes to spawn -AUTH_SERVER_URL = "" +AUTH_SERVER_URL = "https://auth.genenetwork.org" AUTH_MIGRATIONS = "migrations/auth" AUTH_DB = os.environ.get( "AUTH_DB", f"{os.environ.get('HOME')}/genenetwork/gn3_files/db/auth.db") @@ -93,8 +93,6 @@ OAUTH2_SCOPE = ( "profile", "group", "role", "resource", "user", "masquerade", "introspect") -GNQA_DB = os.environ.get( - "GNQA_DB", f"{os.environ.get('HOME')}/tmp/gnqa.db") try: # *** SECURITY CONCERN *** diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork index 1f649cf..2779abc 100755 --- a/scripts/index-genenetwork +++ b/scripts/index-genenetwork @@ -8,21 +8,26 @@ xapian index. This xapian index is later used in providing search through the web interface. """ - -from collections import deque, namedtuple +from dataclasses import dataclass +from collections import deque, namedtuple, Counter import contextlib +import time +import datetime from functools import partial import itertools import json import logging -from multiprocessing import Lock, Process +from multiprocessing import Lock, Manager, Process, managers import os import pathlib import resource +import re import shutil import sys +import hashlib import tempfile -from typing import Callable, Generator, Iterable, List +from typing import Callable, Dict, Generator, Hashable, Iterable, List +from SPARQLWrapper import SPARQLWrapper, JSON import MySQLdb import click @@ -33,7 +38,10 @@ import xapian from gn3.db_utils import database_connection from gn3.monads import query_sql -DOCUMENTS_PER_CHUNK = 100000 +DOCUMENTS_PER_CHUNK = 100_000 +# Running the script in prod consumers ~1GB per process when handling 100_000 Documents per chunk. +# To prevent running out of RAM, we set this as the upper bound for total concurrent processes +PROCESS_COUNT_LIMIT = 67 SQLQuery = namedtuple("SQLQuery", ["fields", "tables", "where", "offset", "limit"], @@ -122,6 +130,38 @@ phenotypes_query = SQLQuery( SQLTableClause("LEFT JOIN", "Geno", "PublishXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id")]) +WIKI_CACHE_QUERY = """ +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> +PREFIX gnt: <http://genenetwork.org/term/> +PREFIX gnc: <http://genenetwork.org/category/> + +SELECT ?symbolName ?speciesName GROUP_CONCAT(DISTINCT ?comment ; separator=\"\\n\") AS ?comment WHERE { + ?symbol rdfs:comment _:node ; + rdfs:label ?symbolName . +_:node rdf:type gnc:GNWikiEntry ; + gnt:belongsToSpecies ?species ; + rdfs:comment ?comment . +?species gnt:shortName ?speciesName . 
+} GROUP BY ?speciesName ?symbolName +""" + +RIF_CACHE_QUERY = """ +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> +PREFIX gnt: <http://genenetwork.org/term/> +PREFIX gnc: <http://genenetwork.org/category/> + +SELECT ?symbolName ?speciesName GROUP_CONCAT(DISTINCT ?comment ; separator=\"\\n\") AS ?comment WHERE { + ?symbol rdfs:comment _:node ; + rdfs:label ?symbolName . +_:node rdf:type gnc:NCBIWikiEntry ; + gnt:belongsToSpecies ?species ; + rdfs:comment ?comment . +?species gnt:shortName ?speciesName . +} GROUP BY ?speciesName ?symbolName +""" + def serialize_sql(query: SQLQuery) -> str: """Serialize SQLQuery object to a string.""" @@ -168,6 +208,48 @@ def locked_xapian_writable_database(path: pathlib.Path) -> xapian.WritableDataba db.close() +def build_rdf_cache(sparql_uri: str, query: str, remove_common_words: bool = False): + cache = {} + sparql = SPARQLWrapper(sparql_uri) + sparql.setReturnFormat(JSON) + sparql.setQuery(query) + results = sparql.queryAndConvert() + if not isinstance(results, dict): + raise TypeError(f"Expected results to be a dict but found {type(results)}") + bindings = results["results"]["bindings"] + count: Counter[str] = Counter() + words_regex = re.compile(r"\w+") + for entry in bindings : + x = (entry["speciesName"]["value"], entry["symbolName"]["value"],) + value = entry["comment"]["value"] + value = " ".join(words_regex.findall(value)) # remove punctuation + cache[x] = value + count.update(Counter(value.lower().strip().split())) + + if not remove_common_words: + return cache + + words_to_drop = set() + for word, cnt in count.most_common(1000): + if len(word) < 4 or cnt > 3000: + words_to_drop.add(word) + smaller_cache = {} + for entry, value in cache.items(): + new_value = set(word for word in value.lower().split() if word not in words_to_drop) + smaller_cache[entry] = " ".join(new_value) + return smaller_cache + + +def md5hash_ttl_dir(ttl_dir: pathlib.Path) -> str: + if not ttl_dir.exists(): + return "-1" + ttl_hash = hashlib.new("md5") + for ttl_file in ttl_dir.glob("*.ttl"): + with open(ttl_file, encoding="utf-8") as f_: + ttl_hash.update(f_.read().encode()) + return ttl_hash.hexdigest() + + # pylint: disable=invalid-name def write_document(db: xapian.WritableDatabase, identifier: str, doctype: str, doc: xapian.Document) -> None: @@ -181,15 +263,23 @@ def write_document(db: xapian.WritableDatabase, identifier: str, termgenerator = xapian.TermGenerator() termgenerator.set_stemmer(xapian.Stem("en")) +termgenerator.set_stopper_strategy(xapian.TermGenerator.STOP_ALL) +termgenerator.set_stopper(xapian.SimpleStopper()) def index_text(text: str) -> None: """Index text and increase term position.""" termgenerator.index_text(text) termgenerator.increase_termpos() -# pylint: disable=unnecessary-lambda -index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text) +@curry(3) +def index_from_dictionary(keys: Hashable, prefix: str, dictionary: dict): + entry = dictionary.get(keys) + if not entry: + return + termgenerator.index_text_without_positions(entry, 0, prefix) + +index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text) index_authors = lambda authors: termgenerator.index_text(authors, 0, "A") index_species = lambda species: termgenerator.index_text_without_positions(species, 0, "XS") index_group = lambda group: termgenerator.index_text_without_positions(group, 0, "XG") @@ -206,10 +296,17 @@ add_peakmb = lambda doc, peakmb: 
doc.add_value(3, xapian.sortable_serialise(peak add_additive = lambda doc, additive: doc.add_value(4, xapian.sortable_serialise(additive)) add_year = lambda doc, year: doc.add_value(5, xapian.sortable_serialise(float(year))) + + + # When a child process is forked, it inherits a copy of the memory of # its parent. We use this to pass data retrieved from SQL from parent # to child. Specifically, we use this global variable. -data: Iterable +# This is copy-on-write so make sure child processes don't modify this data +mysql_data: Iterable +rif_cache: Iterable +wiki_cache: Iterable + # We use this lock to ensure that only one process writes its Xapian # index to disk at a time. xapian_lock = Lock() @@ -217,7 +314,7 @@ xapian_lock = Lock() def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None: """Index genes data into a Xapian index.""" with locked_xapian_writable_database(xapian_build_directory / f"genes-{chunk_index:04d}") as db: - for trait in data: + for trait in mysql_data: # pylint: disable=cell-var-from-loop doc = xapian.Document() termgenerator.set_document(doc) @@ -230,7 +327,7 @@ def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None: trait["additive"].bind(partial(add_additive, doc)) # Index free text. - for key in ["description", "tissue", "dataset_fullname"]: + for key in ["description", "tissue", "dataset"]: trait[key].bind(index_text) trait.pop("probe_target_description").bind(index_text) for key in ["name", "symbol", "species", "group"]: @@ -242,11 +339,23 @@ def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None: trait["species"].bind(index_species) trait["group"].bind(index_group) trait["tissue"].bind(index_tissue) - trait["dataset_fullname"].bind(index_dataset) + trait["dataset"].bind(index_dataset) trait["symbol"].bind(index_symbol) trait["chr"].bind(index_chr) trait["geno_chr"].bind(index_peakchr) + Maybe.apply(index_from_dictionary).to_arguments( + Just((trait["species"].value, trait["symbol"].value)), + Just("XRF"), + Just(rif_cache) + ) + + Maybe.apply(index_from_dictionary).to_arguments( + Just((trait["species"].value, trait["symbol"].value)), + Just("XWK"), + Just(wiki_cache) + ) + doc.set_data(json.dumps(trait.data)) (Maybe.apply(curry(2, lambda name, dataset: f"{name}:{dataset}")) .to_arguments(trait["name"], trait["dataset"]) @@ -257,7 +366,8 @@ def index_phenotypes(xapian_build_directory: pathlib.Path, chunk_index: int) -> """Index phenotypes data into a Xapian index.""" with locked_xapian_writable_database( xapian_build_directory / f"phenotypes-{chunk_index:04d}") as db: - for trait in data: + + for trait in mysql_data: # pylint: disable=cell-var-from-loop doc = xapian.Document() termgenerator.set_document(doc) @@ -270,7 +380,7 @@ def index_phenotypes(xapian_build_directory: pathlib.Path, chunk_index: int) -> trait["year"].bind(partial(add_year, doc)) # Index free text. - for key in ["description", "authors", "dataset_fullname"]: + for key in ["description", "authors", "dataset"]: trait[key].bind(index_text) for key in ["Abstract", "Title"]: trait.pop(key).bind(index_text) @@ -284,7 +394,7 @@ def index_phenotypes(xapian_build_directory: pathlib.Path, chunk_index: int) -> trait["group"].bind(index_group) trait["authors"].bind(index_authors) trait["geno_chr"].bind(index_peakchr) - trait["dataset_fullname"].bind(index_dataset) + trait["dataset"].bind(index_dataset) # Convert name from integer to string. 
trait["name"] = trait["name"].map(str) @@ -320,12 +430,16 @@ def worker_queue(number_of_workers: int = os.cpu_count() or 1) -> Generator: process.join() -def index_query(index_function: Callable, query: SQLQuery, - xapian_build_directory: pathlib.Path, sql_uri: str, start: int = 0) -> None: +def index_query(index_function: Callable[[pathlib.Path, int], None], query: SQLQuery, + xapian_build_directory: pathlib.Path, sql_uri: str, + sparql_uri: str, start: int = 0) -> None: """Run SQL query, and index its results for Xapian.""" i = start + default_no_of_workers = os.cpu_count() or 1 + no_of_workers = min(default_no_of_workers, PROCESS_COUNT_LIMIT) + try: - with worker_queue() as spawn_worker: + with worker_queue(no_of_workers) as spawn_worker: with database_connection(sql_uri) as conn: for chunk in group(query_sql(conn, serialize_sql( # KLUDGE: MariaDB does not allow an offset @@ -335,9 +449,8 @@ def index_query(index_function: Callable, query: SQLQuery, offset=start*DOCUMENTS_PER_CHUNK)), server_side=True), DOCUMENTS_PER_CHUNK): - # pylint: disable=global-statement - global data - data = chunk + global mysql_data + mysql_data = chunk spawn_worker(index_function, (xapian_build_directory, i)) logging.debug("Spawned worker process on chunk %s", i) i += 1 @@ -347,7 +460,7 @@ def index_query(index_function: Callable, query: SQLQuery, except MySQLdb._exceptions.OperationalError: logging.warning("Reopening connection to recovering from SQL operational error", exc_info=True) - index_query(index_function, query, xapian_build_directory, sql_uri, i) + index_query(index_function, query, xapian_build_directory, sql_uri, sparql_uri, i) @contextlib.contextmanager @@ -357,12 +470,33 @@ def temporary_directory(prefix: str, parent_directory: str) -> Generator: yield pathlib.Path(tmpdirname) +def parallel_xapian_compact(combined_index: pathlib.Path, indices: List[pathlib.Path]) -> None: + # We found that compacting 50 files of ~600MB has decent performance + no_of_workers = 20 + file_groupings = 50 + with temporary_directory("parallel_combine", str(combined_index)) as parallel_combine: + parallel_combine.mkdir(parents=True, exist_ok=True) + with worker_queue(no_of_workers) as spawn_worker: + i = 0 + while i < len(indices): + end_index = (i + file_groupings) + files = indices[i:end_index] + last_item_idx = i + len(files) + spawn_worker(xapian_compact, (parallel_combine / f"{i}_{last_item_idx}", files)) + logging.debug("Spawned worker to compact files from %s to %s", i, last_item_idx) + i = end_index + logging.debug("Completed parallel xapian compacts") + xapian_compact(combined_index, list(parallel_combine.iterdir())) + + def xapian_compact(combined_index: pathlib.Path, indices: List[pathlib.Path]) -> None: """Compact and combine several Xapian indices.""" # xapian-compact opens all indices simultaneously. So, raise the limit on # the number of open files. 
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) resource.setrlimit(resource.RLIMIT_NOFILE, (max(soft, min(10*len(indices), hard)), hard)) + combined_index.mkdir(parents=True, exist_ok=True) + start = time.monotonic() db = xapian.Database() try: for index in indices: @@ -370,32 +504,73 @@ def xapian_compact(combined_index: pathlib.Path, indices: List[pathlib.Path]) -> db.compact(str(combined_index), xapian.DBCOMPACT_MULTIPASS | xapian.Compactor.FULLER) finally: db.close() + logging.debug("Removing databases that were compacted into %s", combined_index.name) + for folder in indices: + shutil.rmtree(folder) + logging.debug("Completed xapian-compact %s; handled %s files in %s minutes", combined_index.name, len(indices), (time.monotonic() - start) / 60) + + +@click.command(help="Verify checksums and return True when the data has been changed.") +@click.argument("xapian_directory") +@click.argument("sql_uri") +@click.argument("sparql_uri") +def is_data_modified(xapian_directory: str, + sql_uri: str, + sparql_uri: str) -> None: + dir_ = pathlib.Path(xapian_directory) + with locked_xapian_writable_database(dir_) as db, database_connection(sql_uri) as conn: + checksums = "-1" + if db.get_metadata('tables'): + checksums = " ".join([ + str(result["Checksum"].value) + for result in query_sql( + conn, + f"CHECKSUM TABLE {', '.join(db.get_metadata('tables').decode().split())}") + ]) + # Return a zero exit status code when the data has changed; + # otherwise exit with a 1 exit status code. + generif = pathlib.Path("/var/lib/data/") + if (db.get_metadata("generif-checksum").decode() == md5hash_ttl_dir(generif) and + db.get_metadata("checksums").decode() == checksums): + sys.exit(1) + sys.exit(0) @click.command(help="Index GeneNetwork data and build Xapian search index in XAPIAN_DIRECTORY.") @click.argument("xapian_directory") @click.argument("sql_uri") +@click.argument("sparql_uri") # pylint: disable=missing-function-docstring -def main(xapian_directory: str, sql_uri: str) -> None: +def create_xapian_index(xapian_directory: str, sql_uri: str, + sparql_uri: str) -> None: logging.basicConfig(level=os.environ.get("LOGLEVEL", "DEBUG"), - format='%(relativeCreated)s: %(levelname)s: %(message)s') + format='%(asctime)s %(levelname)s: %(message)s', + datefmt='%Y-%m-%d %H:%M:%S %Z') + if not pathlib.Path(xapian_directory).exists(): + pathlib.Path(xapian_directory).mkdir() # Ensure no other build process is running. 
- if pathlib.Path(xapian_directory).exists(): - logging.error("Build directory %s already exists; " + if any(pathlib.Path(xapian_directory).iterdir()): + logging.error("Build directory %s has build files; " "perhaps another build process is running.", xapian_directory) sys.exit(1) - pathlib.Path(xapian_directory).mkdir() + start_time = time.perf_counter() with temporary_directory("combined", xapian_directory) as combined_index: with temporary_directory("build", xapian_directory) as xapian_build_directory: + global rif_cache + global wiki_cache + logging.info("Building wiki cache") + wiki_cache = build_rdf_cache(sparql_uri, WIKI_CACHE_QUERY, remove_common_words=True) + logging.info("Building rif cache") + rif_cache = build_rdf_cache(sparql_uri, RIF_CACHE_QUERY, remove_common_words=True) logging.info("Indexing genes") - index_query(index_genes, genes_query, xapian_build_directory, sql_uri) + index_query(index_genes, genes_query, xapian_build_directory, sql_uri, sparql_uri) logging.info("Indexing phenotypes") - index_query(index_phenotypes, phenotypes_query, xapian_build_directory, sql_uri) + index_query(index_phenotypes, phenotypes_query, xapian_build_directory, sql_uri, sparql_uri) logging.info("Combining and compacting indices") - xapian_compact(combined_index, list(xapian_build_directory.iterdir())) + parallel_xapian_compact(combined_index, list(xapian_build_directory.iterdir())) logging.info("Writing table checksums into index") with locked_xapian_writable_database(combined_index) as db: # Build a (deduplicated) set of all tables referenced in @@ -409,11 +584,27 @@ def main(xapian_directory: str, sql_uri: str) -> None: ] db.set_metadata("tables", " ".join(tables)) db.set_metadata("checksums", " ".join(checksums)) + logging.info("Writing generif checksums into index") + db.set_metadata( + "generif-checksum", + md5hash_ttl_dir(pathlib.Path("/var/lib/data/")).encode()) for child in combined_index.iterdir(): shutil.move(child, xapian_directory) logging.info("Index built") + end_time = time.perf_counter() + index_time = datetime.timedelta(seconds=end_time - start_time) + logging.info(f"Time to Index: {index_time}") + + +@click.group() +def cli(): + pass + + +cli.add_command(is_data_modified) +cli.add_command(create_xapian_index) if __name__ == "__main__": # pylint: disable=no-value-for-parameter - main() + cli() diff --git a/scripts/rqtl_wrapper.R b/scripts/rqtl_wrapper.R index ea2c345..2ac8faa 100644 --- a/scripts/rqtl_wrapper.R +++ b/scripts/rqtl_wrapper.R @@ -3,8 +3,6 @@ library(qtl) library(stringi) library(stringr) -tmp_dir = Sys.getenv("TMPDIR") - option_list = list( make_option(c("-g", "--geno"), type="character", help=".geno file containing a dataset's genotypes"), make_option(c("-p", "--pheno"), type="character", help="File containing two columns - sample names and values"), @@ -18,7 +16,7 @@ option_list = list( make_option(c("--pstrata"), action="store_true", default=NULL, help="Use permutation strata (stored as final column/vector in phenotype input file)"), make_option(c("-s", "--scale"), type="character", default="mb", help="Mapping scale - Megabases (Mb) or Centimorgans (cM)"), make_option(c("--control"), type="character", default=NULL, help="Name of marker (contained in genotype file) to be used as a control"), - make_option(c("-o", "--outdir"), type="character", default=file.path(tmp_dir, "output"), help="Directory in which to write result file"), + make_option(c("-o", "--outdir"), type="character", default=NULL, help="Directory in which to write result file"), 
make_option(c("-f", "--filename"), type="character", default=NULL, help="Name to use for result file"), make_option(c("-v", "--verbose"), action="store_true", default=NULL, help="Show extra information") ); @@ -58,7 +56,7 @@ geno_file = opt$geno pheno_file = opt$pheno # Generate randomized filename for cross object -cross_file = file.path(tmp_dir, "cross", paste(stri_rand_strings(1, 8), ".cross", sep = "")) +cross_file = file.path(opt$outdir, "cross", paste(stri_rand_strings(1, 8), ".cross", sep = "")) trim <- function( x ) { gsub("(^[[:space:]]+|[[:space:]]+$)", "", x) } @@ -258,9 +256,9 @@ if (!is.null(opt$pairscan)) { # Calculate permutations if (opt$nperm > 0) { if (!is.null(opt$filename)){ - perm_out_file = file.path(opt$outdir, paste("PERM_", opt$filename, sep = "" )) + perm_out_file = file.path(opt$outdir, "output", paste("PERM_", opt$filename, sep = "" )) } else { - perm_out_file = file.path(opt$outdir, paste(pheno_name, "_PERM_", stri_rand_strings(1, 8), sep = "")) + perm_out_file = file.path(opt$outdir, "output", paste(pheno_name, "_PERM_", stri_rand_strings(1, 8), sep = "")) } if (!is.null(opt$addcovar) || !is.null(opt$control)){ @@ -284,9 +282,9 @@ if (opt$nperm > 0) { } if (!is.null(opt$filename)){ - out_file = file.path(opt$outdir, opt$filename) + out_file = file.path(opt$outdir, "output", opt$filename) } else { - out_file = file.path(opt$outdir, paste(pheno_name, "_", stri_rand_strings(1, 8), sep = "")) + out_file = file.path(opt$outdir, "output", paste(pheno_name, "_", stri_rand_strings(1, 8), sep = "")) } if (!is.null(opt$addcovar) || !is.null(opt$control)){ @@ -299,7 +297,7 @@ if (!is.null(opt$addcovar) || !is.null(opt$control)){ verbose_print('Writing results to CSV file\n') if (!is.null(opt$pairscan)) { - map_out_file = file.path(opt$outdir, paste("MAP_", opt$filename, sep = "" )) + map_out_file = file.path(opt$outdir, "output", paste("MAP_", opt$filename, sep = "" )) write.csv(qtl_results[1], out_file) write.csv(qtl_results[2], map_out_file) } else { diff --git a/scripts/update_rif_table.py b/scripts/update_rif_table.py new file mode 100755 index 0000000..24edf3d --- /dev/null +++ b/scripts/update_rif_table.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +""" +Script responsible for updating the GeneRIF_BASIC table +""" + +import argparse +import csv +import datetime +import gzip +import logging +import pathlib +import os +from tempfile import TemporaryDirectory +from typing import Dict, Generator + +import requests +from MySQLdb.cursors import DictCursor + +from gn3.db_utils import database_connection + +TAX_IDS = {"10090": 1, "9606": 4, "10116": 2, "3702": 3} + +GENE_INFO_URL = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz" +GENERIFS_BASIC_URL = "https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz" + +VERSION_ID = 5 + + +INSERT_QUERY = """ INSERT INTO GeneRIF_BASIC +(SpeciesId, GeneId, symbol, PubMed_Id, createtime, comment, TaxID, VersionId) +VALUES (%s, %s, %s, %s, %s, %s, %s, %s) +""" + + +def download_file(url: str, dest: pathlib.Path): + """Saves the contents of url in dest""" + with requests.get(url, stream=True) as resp: + resp.raise_for_status() + with open(dest, "wb") as downloaded_file: + for chunk in resp.iter_content(chunk_size=8192): + downloaded_file.write(chunk) + + +def read_tsv_file(fname: pathlib.Path) -> Generator: + """Load tsv file from NCBI""" + with gzip.open(fname, mode="rt") as gz_file: + reader = csv.DictReader(gz_file, delimiter="\t", quoting=csv.QUOTE_NONE) + yield from reader + + +def parse_gene_info_from_ncbi(fname: 
pathlib.Path) -> Dict[str, str]:
+    """Parse gene_info into geneid: symbol pairs"""
+    genedict: Dict[str, str] = {}
+    for row in read_tsv_file(fname):
+        if row["#tax_id"] not in TAX_IDS:
+            continue
+        gene_id, symbol = row["GeneID"], row["Symbol"]
+        genedict[gene_id] = symbol
+    return genedict
+
+
+def build_already_exists_cache(conn) -> dict:
+    """
+    Build a cache of all (GeneId, SpeciesId, createtime, PubMed_ID) combinations.
+    Helps prevent duplicate inserts.
+    """
+    cache = {}
+    query = """SELECT
+        COUNT(*) AS cnt, GeneId, SpeciesId, createtime, PubMed_ID
+        FROM GeneRIF_BASIC
+        GROUP BY GeneId, SpeciesId, createtime, PubMed_Id"""
+
+    with conn.cursor(DictCursor) as cursor:
+        cursor.execute(query)
+        while row := cursor.fetchone():
+            key = (
+                str(row["GeneId"]),
+                str(row["SpeciesId"]),
+                row["createtime"],
+                str(row["PubMed_ID"]),
+            )
+            cache[key] = row["cnt"]
+    return cache
+
+
+def should_add_rif_row(row: dict, exists_cache: dict) -> bool:
+    """Check whether a RIF row can be added; prevents duplicate-key errors from MySQL."""
+    species_id = str(TAX_IDS[row["#Tax ID"]])
+    insert_date = datetime.datetime.fromisoformat(row["last update timestamp"])
+    search_key = (
+        row["Gene ID"],
+        species_id,
+        insert_date,
+        row["PubMed ID (PMID) list"],
+    )
+    if search_key not in exists_cache:
+        exists_cache[search_key] = 1
+        return True
+    return False
+
+
+def update_rif(sqluri: str):
+    """Update the GeneRIF_BASIC table."""
+    with TemporaryDirectory() as _tmpdir:
+        tmpdir = pathlib.Path(_tmpdir)
+        gene_info_path = tmpdir / "gene_info.gz"
+        logging.debug("Fetching gene_info data from: %s", GENE_INFO_URL)
+        download_file(GENE_INFO_URL, gene_info_path)
+
+        logging.debug("Fetching gene_rif_basics data from: %s", GENERIFS_BASIC_URL)
+        generif_basics_path = tmpdir / "generif_basics.gz"
+        download_file(
+            GENERIFS_BASIC_URL,
+            generif_basics_path,
+        )
+
+        logging.debug("Parsing gene_info data")
+        genedict = parse_gene_info_from_ncbi(gene_info_path)
+        with database_connection(sql_uri=sqluri) as con:
+            exists_cache = build_already_exists_cache(con)
+            cursor = con.cursor()
+            skipped_if_exists, added = 0, 0
+            for row in read_tsv_file(generif_basics_path):
+                if row["#Tax ID"] not in TAX_IDS:
+                    continue
+                if not should_add_rif_row(row, exists_cache):
+                    skipped_if_exists += 1
+                    continue
+                species_id = TAX_IDS[row["#Tax ID"]]
+                symbol = genedict.get(row["Gene ID"], "")
+                insert_values = (
+                    species_id,  # SpeciesId
+                    row["Gene ID"],  # GeneId
+                    symbol,  # symbol
+                    row["PubMed ID (PMID) list"],  # PubMed_ID
+                    row["last update timestamp"],  # createtime
+                    row["GeneRIF text"],  # comment
+                    row["#Tax ID"],  # TaxID
+                    VERSION_ID,  # VersionId
+                )
+                cursor.execute(INSERT_QUERY, insert_values)
+                added += 1
+                if added % 40_000 == 0:
+                    logging.debug("Added %s rows to the GeneRIF_BASIC table so far", added)
+            logging.info(
+                "GeneRIF_BASIC table updated. Added %s rows. Skipped %s rows because "
+                "they already exist. In case of error, you can use VersionId=%s to "
+                "find rows inserted by this script.", added, skipped_if_exists,
+                VERSION_ID
+            )
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=os.environ.get("LOGLEVEL", "DEBUG"),
+        format="%(asctime)s %(levelname)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S %Z",
+    )
+    parser = argparse.ArgumentParser(description="Update the GeneRIF_BASIC table")
+    parser.add_argument(
+        "--sql-uri",
+        required=True,
+        help="MySQL URI in the form mysql://user:password@localhost/gn2",
+    )
+    args = parser.parse_args()
+    update_rif(args.sql_uri)
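
Note on the new indexing commands: scripts/index-genenetwork now exposes two click subcommands instead of a single entry point. is_data_modified compares the table and RDF checksums stored in the existing index against the live data and exits with status 0 when the data has changed (1 otherwise), while create_xapian_index performs the actual rebuild and refuses to run in a directory that already contains build files. The sketch below shows one way the two could be chained from a periodic wrapper; it is illustrative only and assumes click's default command naming (underscores becoming hyphens) plus placeholder directory and URI values, none of which are part of this patch.

#!/usr/bin/env python3
"""Illustrative wrapper: rebuild the search index only when the data changed.

Assumptions (not part of the patch): subcommand names follow click's default
naming ("is-data-modified", "create-xapian-index"); paths and URIs below are
placeholders for a real deployment.
"""
import subprocess
import sys

SCRIPT = "./scripts/index-genenetwork"
XAPIAN_DIR = "/export/data/xapian"                        # placeholder
SQL_URI = "mysql://user:password@localhost/db_webqtl"     # placeholder
SPARQL_URI = "http://localhost:8890/sparql"               # placeholder

# is-data-modified exits 0 when the stored checksums no longer match the
# live data (i.e. a rebuild is needed), and 1 when nothing has changed.
check = subprocess.run(
    [SCRIPT, "is-data-modified", XAPIAN_DIR, SQL_URI, SPARQL_URI],
    check=False,
)
if check.returncode != 0:
    print("Data unchanged; skipping re-index.")
    sys.exit(0)

# create-xapian-index exits early if the target directory already contains
# build files, so point it at a fresh directory for the rebuild.
subprocess.run(
    [SCRIPT, "create-xapian-index", "/export/data/xapian-build", SQL_URI, SPARQL_URI],
    check=True,
)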