From 4fd5710dc04b51c0953a2063d9e934dddc13ad6c Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Wed, 3 Jul 2024 15:15:37 +0300 Subject: Generate a checksum for all the ttl files. * scripts/index-genenetwork (hash_generif_graph): Rename to hash_rdf_graph. Generate a checksum of all the turtle files inside the ttl directory that's the basis for the GN virtuoso graph. (create_xapian_index): Rename hash_generif_graph -> hash_rdf_graph. Signed-off-by: Munyoki Kilyungi --- scripts/index-genenetwork | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'scripts') diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork index 219858c..de3edb0 100755 --- a/scripts/index-genenetwork +++ b/scripts/index-genenetwork @@ -240,10 +240,12 @@ def build_rdf_cache(sparql_uri: str, query: str, remove_common_words: bool = Fal return smaller_cache -def hash_generif_graph(generif_file: pathlib.Path) -> str: - with open(generif_file, encoding="utf-8") as f_: - data = f_.read() - return hashlib.md5(data.encode()).hexdigest() +def hash_rdf_graph(ttl_dir: pathlib.Path) -> str: + ttl_hash = hashlib.new("md5") + for ttl_file in ttl_dir.glob("*.ttl"): + with open(ttl_file, encoding="utf-8") as f_: + ttl_hash.update(f_.read().encode()) + return ttl_hash.hexdigest() # pylint: disable=invalid-name @@ -528,7 +530,7 @@ def is_data_modified(xapian_directory: str, generif = pathlib.Path("/var/lib/data/generif-metadata.ttl") generif_checksum = "-1" if generif.exists(): - generif_checksum = hash_generif_graph(generif) + generif_checksum = hash_rdf_graph(generif) if (db.get_metadata("generif-checksum").decode() == generif_checksum and db.get_metadata("checksums").decode() == checksums): sys.exit(1) @@ -586,7 +588,7 @@ def create_xapian_index(xapian_directory: str, sql_uri: str, logging.info("Writing generif checksums into index") generif = pathlib.Path("/var/lib/data/generif-metadata.ttl") if generif.exists(): - db.set_metadata("generif-checksum", hash_generif_graph(generif).encode()) + db.set_metadata("generif-checksum", hash_rdf_graph(generif).encode()) for child in combined_index.iterdir(): shutil.move(child, xapian_directory) logging.info("Index built") -- cgit v1.2.3