diff options
author | Munyoki Kilyungi | 2022-12-01 15:32:42 +0300 |
---|---|---|
committer | BonfaceKilz | 2022-12-06 11:28:28 +0300 |
commit | c75ed75c6a5987eed6672be924870f86282ed6d6 (patch) | |
tree | 9d91afcbef1ea9c2ec6557b86b5187f5093374ab /gn3 | |
parent | 09f4d9c4bec6e3395296a69ce718dea30240ed54 (diff) | |
download | genenetwork3-c75ed75c6a5987eed6672be924870f86282ed6d6.tar.gz |
Use a dataset's name to fetch its metadata from RDF
* gn3/api/metadata.py (jsonify_dataset_metadata): Rewrite metadata
end-point to use a dataset's name instead of its accession_id.
* gn3/db/rdf.py (get_dataset_metadata): Replace accession_id with
name. Use one single RDF query instead of multiple queries.
Diffstat (limited to 'gn3')
-rw-r--r-- | gn3/api/metadata.py | 6 | ||||
-rw-r--r-- | gn3/db/rdf.py | 158 |
2 files changed, 61 insertions, 103 deletions
diff --git a/gn3/api/metadata.py b/gn3/api/metadata.py index 35e4067..8f6127f 100644 --- a/gn3/api/metadata.py +++ b/gn3/api/metadata.py @@ -13,14 +13,14 @@ from gn3.db.rdf import get_dataset_metadata metadata = Blueprint("metadata", __name__) -@metadata.route("/dataset/<accession_id>", methods=["GET"]) -def jsonify_dataset_metadata(accession_id): +@metadata.route("/dataset/<name>", methods=["GET"]) +def jsonify_dataset_metadata(name): """Fetch a dataset's metadata given it's ACCESSION_ID""" try: return jsonify( get_dataset_metadata( SPARQLWrapper(current_app.config.get("SPARQL_ENDPOINT")), - accession_id, + name, ).data ) # The virtuoso server is misconfigured or it isn't running at all diff --git a/gn3/db/rdf.py b/gn3/db/rdf.py index 7ed80b1..b690d5a 100644 --- a/gn3/db/rdf.py +++ b/gn3/db/rdf.py @@ -24,112 +24,70 @@ def sparql_query( def get_dataset_metadata( - sparql_conn: SPARQLWrapper, accession_id: str + sparql_conn: SPARQLWrapper, name: str ) -> MonadicDict: - """Return info about dataset with ACCESSION_ID.""" - # Check accession_id to protect against query injection. - # TODO: This function doesn't yet return the names of the actual dataset - # files. - pattern = re.compile(r"GN\d+", re.ASCII) - if not pattern.fullmatch(accession_id): - return MonadicDict() - # KLUDGE: We split the SPARQL query because virtuoso is very slow on a - # single large query. 
- queries = [ - """ + """Return info about dataset with a given NAME""" + __metadata_query = """ PREFIX gn: <http://genenetwork.org/> -SELECT ?name ?dataset_group ?status ?title ?geo_series + +SELECT ?accession_id ?dataset_group ?status ?title ?geo_series ?specifics ?summary ?about_tissue +?about_platform ?about_data_processing ?notes ?experiment_design ?contributors ?citation ?acknowledgement +?platform_name ?tissue_name ?normalization_name ?species_name ?inbred_set_name +?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage WHERE { - ?dataset gn:accessionId "$accession_id" ; - rdf:type gn:dataset . - OPTIONAL { ?dataset gn:name ?name } . + ?dataset gn:accessionId ?accession_id ; + rdf:type gn:dataset ; + gn:name "$name" . + OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing } . + OPTIONAL { ?dataset gn:aboutPlatform ?about_platform } . + OPTIONAL { ?dataset gn:aboutTissue ?about_tissue } . + OPTIONAL { ?dataset gn:acknowledgement ?acknowledgement } . + OPTIONAL { ?dataset gn:citation ?citation } . + OPTIONAL { ?dataset gn:contributors ?contributors } . OPTIONAL { ?dataset gn:datasetGroup ?dataset_group } . - # FIXME: gn:datasetStatus should not be optional. But, some records don't - # have it. OPTIONAL { ?dataset gn:datasetStatus ?status } . - OPTIONAL { ?dataset gn:title ?title } . + OPTIONAL { ?dataset gn:experimentDesign ?experiment_design } . OPTIONAL { ?dataset gn:geoSeries ?geo_series } . -} -""", - """ -PREFIX gn: <http://genenetwork.org/> -SELECT ?platform_name ?normalization_name ?species_name ?inbred_set_name ?tissue_name -WHERE { - ?dataset gn:accessionId "$accession_id" ; - rdf:type gn:dataset ; - gn:normalization / gn:name ?normalization_name ; - gn:datasetOfSpecies / gn:menuName ?species_name ; - gn:datasetOfInbredSet / gn:name ?inbred_set_name . - OPTIONAL { ?dataset gn:datasetOfTissue / gn:name ?tissue_name } . - OPTIONAL { ?dataset gn:datasetOfPlatform / gn:name ?platform_name } . 
-} -""", - """ -PREFIX gn: <http://genenetwork.org/> -SELECT ?specifics ?summary ?about_cases ?about_tissue ?about_platform - ?about_data_processing ?notes ?experiment_design ?contributors - ?citation ?acknowledgment -WHERE { - ?dataset gn:accessionId "$accession_id" ; - rdf:type gn:dataset . - OPTIONAL { ?dataset gn:specifics ?specifics . } - OPTIONAL { ?dataset gn:summary ?summary . } - OPTIONAL { ?dataset gn:aboutCases ?about_cases . } - OPTIONAL { ?dataset gn:aboutTissue ?about_tissue . } - OPTIONAL { ?dataset gn:aboutPlatform ?about_platform . } - OPTIONAL { ?dataset gn:aboutDataProcessing ?about_data_processing . } - OPTIONAL { ?dataset gn:notes ?notes . } - OPTIONAL { ?dataset gn:experimentDesign ?experiment_design . } - OPTIONAL { ?dataset gn:contributors ?contributors . } - OPTIONAL { ?dataset gn:citation ?citation . } - OPTIONAL { ?dataset gn:acknowledgment ?acknowledgment . } -} -""", - ] - result: MonadicDict = MonadicDict( - { - "accession_id": accession_id, - } - ) - query_result: MonadicDict = MonadicDict() - for query in queries: - if not ( - # Expecting only one result - sparql_result := sparql_query( - sparql_conn, - Template(query).substitute(accession_id=accession_id) - )[0] - ): - return MonadicDict() - query_result |= sparql_result - for key, value in query_result.items(): - result[key] = value.bind(lambda x: Just(x["value"])) - - investigator_query_result = sparql_query( - sparql_conn, - Template( - """ -PREFIX gn: <http://genenetwork.org/> -SELECT ?name ?address ?city ?state ?zip ?phone ?email ?country ?homepage -WHERE { - ?dataset gn:accessionId "$accession_id" ; - rdf:type gn:dataset ; - gn:datasetOfInvestigator ?investigator . - OPTIONAL { ?investigator foaf:name ?name . } - OPTIONAL { ?investigator gn:address ?address . } - OPTIONAL { ?investigator gn:city ?city . } - OPTIONAL { ?investigator gn:state ?state . } - OPTIONAL { ?investigator gn:zipCode ?zip . } - OPTIONAL { ?investigator foaf:phone ?phone . 
} - OPTIONAL { ?investigator foaf:mbox ?email . } - OPTIONAL { ?investigator gn:country ?country . } - OPTIONAL { ?investigator foaf:homepage ?homepage . } + OPTIONAL { ?dataset gn:notes ?notes } . + OPTIONAL { ?dataset gn:specifics ?specifics } . + OPTIONAL { ?dataset gn:summary ?summary } . + OPTIONAL { ?dataset gn:title ?title } . + OPTIONAL { + ?dataset gn:normalization ?normalization . + ?normalization gn:name ?normalization_name . + } . + OPTIONAL { + ?dataset gn:datasetOfPlatform ?platform . + ?platform gn:name ?platform_name . + } . + OPTIONAL { + ?dataset gn:datasetOfTissue ?tissue . + ?tissue gn:name ?tissue_name . + } . + OPTIONAL { + ?dataset gn:datasetOfSpecies ?species ; + gn:datasetOfInbredSet ?inbred_set . + ?species gn:name ?species_name . + ?inbred_set gn:name ?inbred_set_name . + } . + OPTIONAL { + ?dataset gn:datasetOfInvestigator ?investigator . + OPTIONAL { ?investigator foaf:name ?name . } + OPTIONAL { ?investigator gn:address ?address . } + OPTIONAL { ?investigator gn:city ?city . } + OPTIONAL { ?investigator gn:state ?state . } + OPTIONAL { ?investigator gn:zipCode ?zip . } + OPTIONAL { ?investigator foaf:phone ?phone . } + OPTIONAL { ?investigator foaf:mbox ?email . } + OPTIONAL { ?investigator gn:country ?country . } + OPTIONAL { ?investigator foaf:homepage ?homepage . } + } } """ - ).substitute(accession_id=accession_id) - )[0] - result["investigators"] = Just({ - key: value.bind(lambda a: a["value"]) - for key, value in investigator_query_result.items() - }) + result: MonadicDict = MonadicDict() + for key, value in sparql_query( + sparql_conn, + Template(__metadata_query).substitute(name=name) + )[0].items(): + result[key] = value.bind(lambda x: Just(x["value"])) return result |