diff options
author | Munyoki Kilyungi | 2024-07-03 14:30:53 +0300 |
---|---|---|
committer | BonfaceKilz | 2024-07-03 16:53:31 +0300 |
commit | 814db45bd13abe2f2da74b2c4228449e7e885736 (patch) | |
tree | cb1eb2197d36ed426e4d4ae4c43e3881c686f209 /scripts | |
parent | ed7814a6c44c99dd5eb5c5a92ec22f20342f4e22 (diff) | |
download | genenetwork3-814db45bd13abe2f2da74b2c4228449e7e885736.tar.gz |
Refactor how the generif md5 sum is calculated and stored in XAPIAN.
* scripts/index-genenetwork (hash_generif_graph): Compute the generif
checksum directly from the contents of the generif file.
(is_data_modified): Update how generif-checksums are verified.
(create_xapian_index): Update how generif-checksums are stored in
XAPIAN.
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/index-genenetwork | 44 |
1 file changed, 13 insertions, 31 deletions
```diff
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index 1775297..441b8b2 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -24,6 +24,7 @@ import resource
 import re
 import shutil
 import sys
+import hashlib
 import tempfile
 from typing import Callable, Dict, Generator, Hashable, Iterable, List
 from SPARQLWrapper import SPARQLWrapper, JSON
@@ -239,35 +240,10 @@ def build_rdf_cache(sparql_uri: str, query: str, remove_common_words: bool = Fal
     return smaller_cache
 
 
-def hash_generif_graph(sparql_uri: str):
-    sparql = SPARQLWrapper(sparql_uri)
-    sparql.setReturnFormat(JSON)
-    query = """
-PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-PREFIX gnt: <http://genenetwork.org/term/>
-PREFIX gnc: <http://genenetwork.org/category/>
-
-SELECT SHA256(GROUP_CONCAT(?entries ; separator=\"\\n\")) AS ?hash WHERE {
-   {{
-     SELECT ?type CONCAT(?symbolName, ",", ?speciesName, \"\\n\",GROUP_CONCAT(?comment ; separator=\"\\n\")) AS ?entries WHERE {
-    ?symbol rdfs:comment _:node ;
-            rdfs:label ?symbolName .
-_:node rdf:type gnc:GNWikiEntry ;
-       rdf:type ?type ;
-       gnt:belongsToSpecies ?species ;
-       rdfs:comment ?comment .
-?species gnt:shortName ?speciesName .
-} GROUP BY ?speciesName ?symbolName ?type
-   }}
-   } GROUP BY ?type
-"""
-    sparql.setQuery(query)
-    results = sparql.queryAndConvert()
-    if not isinstance(results, dict):
-        raise TypeError(f"Expected results to be a dict but found {type(results)}")
-    bindings = results["results"]["bindings"]
-    return bindings[0]["hash"]["value"]
+def hash_generif_graph(generif_file: str):
+    with open(generif_file, encoding="utf-8") as f_:
+        data = f_.read()
+        return hashlib.md5(data.encode()).hexdigest()
 
 
 # pylint: disable=invalid-name
@@ -549,7 +525,11 @@ def is_data_modified(xapian_directory: str,
             ])
         # Return a zero exit status code when the data has changed;
         # otherwise exit with a 1 exit status code.
-        if (db.get_metadata("generif-checksum").decode() == hash_generif_graph(sparql_uri) and
+        generif = pathlib.Path("/var/lib/data/generif-metadata.ttl")
+        generif_checksum = "-1"
+        if generif.exists():
+            generif_checksum = hash_generif_graph(generif)
+        if (db.get_metadata("generif-checksum").decode() == generif_checksum and
             db.get_metadata("checksums").decode() == checksums):
             sys.exit(1)
         sys.exit(0)
@@ -604,7 +584,9 @@ def create_xapian_index(xapian_directory: str, sql_uri: str,
             db.set_metadata("tables", " ".join(tables))
             db.set_metadata("checksums", " ".join(checksums))
             logging.info("Writing generif checksums into index")
-            db.set_metadata("generif-checksum", hash_generif_graph(sparql_uri).encode())
+            generif = pathlib.Path("/var/lib/data/generif-metadata.ttl")
+            if generif.exists():
+                db.set_metadata("generif-checksum", hash_generif_graph(generif).encode())
         for child in combined_index.iterdir():
             shutil.move(child, xapian_directory)
     logging.info("Index built")
```