From 814db45bd13abe2f2da74b2c4228449e7e885736 Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Wed, 3 Jul 2024 14:30:53 +0300 Subject: Refactor how the generif md5 sum is calculated and stored in XAPIAN. * scripts/index-genenetwork (hash_generif_graph): Build the generif checksum by directly building it from the file. (is_data_modified): Update how generif-checksums are verified. (create_xapian_index): Update how generif-checksums are stored in XAPIAN. Signed-off-by: Munyoki Kilyungi --- scripts/index-genenetwork | 44 +++++++++++++------------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) (limited to 'scripts') diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork index 1775297..441b8b2 100755 --- a/scripts/index-genenetwork +++ b/scripts/index-genenetwork @@ -24,6 +24,7 @@ import resource import re import shutil import sys +import hashlib import tempfile from typing import Callable, Dict, Generator, Hashable, Iterable, List from SPARQLWrapper import SPARQLWrapper, JSON @@ -239,35 +240,10 @@ def build_rdf_cache(sparql_uri: str, query: str, remove_common_words: bool = Fal return smaller_cache -def hash_generif_graph(sparql_uri: str): - sparql = SPARQLWrapper(sparql_uri) - sparql.setReturnFormat(JSON) - query = """ -PREFIX rdf: -PREFIX rdfs: -PREFIX gnt: -PREFIX gnc: - -SELECT SHA256(GROUP_CONCAT(?entries ; separator=\"\\n\")) AS ?hash WHERE { - {{ - SELECT ?type CONCAT(?symbolName, ",", ?speciesName, \"\\n\",GROUP_CONCAT(?comment ; separator=\"\\n\")) AS ?entries WHERE { - ?symbol rdfs:comment _:node ; - rdfs:label ?symbolName . -_:node rdf:type gnc:GNWikiEntry ; - rdf:type ?type ; - gnt:belongsToSpecies ?species ; - rdfs:comment ?comment . -?species gnt:shortName ?speciesName . -} GROUP BY ?speciesName ?symbolName ?type - }} - } GROUP BY ?type -""" - sparql.setQuery(query) - results = sparql.queryAndConvert() - if not isinstance(results, dict): - raise TypeError(f"Expected results to be a dict but found {type(results)}") - bindings = results["results"]["bindings"] - return bindings[0]["hash"]["value"] +def hash_generif_graph(generif_file: str): + with open(generif_file, encoding="utf-8") as f_: + data = f_.read() + return hashlib.md5(data.encode()).hexdigest() # pylint: disable=invalid-name @@ -549,7 +525,11 @@ def is_data_modified(xapian_directory: str, ]) # Return a zero exit status code when the data has changed; # otherwise exit with a 1 exit status code. - if (db.get_metadata("generif-checksum").decode() == hash_generif_graph(sparql_uri) and + generif = pathlib.Path("/var/lib/data/generif-metadata.ttl") + generif_checksum = "-1" + if generif.exists(): + generif_checksum = hash_generif_graph(generif) + if (db.get_metadata("generif-checksum").decode() == generif_checksum and db.get_metadata("checksums").decode() == checksums): sys.exit(1) sys.exit(0) @@ -604,7 +584,9 @@ def create_xapian_index(xapian_directory: str, sql_uri: str, db.set_metadata("tables", " ".join(tables)) db.set_metadata("checksums", " ".join(checksums)) logging.info("Writing generif checksums into index") - db.set_metadata("generif-checksum", hash_generif_graph(sparql_uri).encode()) + generif = pathlib.Path("/var/lib/data/generif-metadata.ttl") + if generif.exists(): + db.set_metadata("generif-checksum", hash_generif_graph(generif).encode()) for child in combined_index.iterdir(): shutil.move(child, xapian_directory) logging.info("Index built") -- cgit v1.2.3