aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
author Munyoki Kilyungi 2024-07-03 14:30:53 +0300
committer BonfaceKilz 2024-07-03 16:53:31 +0300
commit 814db45bd13abe2f2da74b2c4228449e7e885736 (patch)
tree cb1eb2197d36ed426e4d4ae4c43e3881c686f209 /scripts
parent ed7814a6c44c99dd5eb5c5a92ec22f20342f4e22 (diff)
download genenetwork3-814db45bd13abe2f2da74b2c4228449e7e885736.tar.gz
Refactor how the generif md5 sum is calculated and stored in XAPIAN.
* scripts/index-genenetwork (hash_generif_graph): Compute the generif checksum directly from the contents of the generif file. (is_data_modified): Update how generif-checksums are verified. (create_xapian_index): Update how generif-checksums are stored in XAPIAN.
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/index-genenetwork 44
1 files changed, 13 insertions, 31 deletions
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index 1775297..441b8b2 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -24,6 +24,7 @@ import resource
import re
import shutil
import sys
+import hashlib
import tempfile
from typing import Callable, Dict, Generator, Hashable, Iterable, List
from SPARQLWrapper import SPARQLWrapper, JSON
@@ -239,35 +240,10 @@ def build_rdf_cache(sparql_uri: str, query: str, remove_common_words: bool = Fal
return smaller_cache
-def hash_generif_graph(sparql_uri: str):
- sparql = SPARQLWrapper(sparql_uri)
- sparql.setReturnFormat(JSON)
- query = """
-PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-PREFIX gnt: <http://genenetwork.org/term/>
-PREFIX gnc: <http://genenetwork.org/category/>
-
-SELECT SHA256(GROUP_CONCAT(?entries ; separator=\"\\n\")) AS ?hash WHERE {
- {{
- SELECT ?type CONCAT(?symbolName, ",", ?speciesName, \"\\n\",GROUP_CONCAT(?comment ; separator=\"\\n\")) AS ?entries WHERE {
- ?symbol rdfs:comment _:node ;
- rdfs:label ?symbolName .
-_:node rdf:type gnc:GNWikiEntry ;
- rdf:type ?type ;
- gnt:belongsToSpecies ?species ;
- rdfs:comment ?comment .
-?species gnt:shortName ?speciesName .
-} GROUP BY ?speciesName ?symbolName ?type
- }}
- } GROUP BY ?type
-"""
- sparql.setQuery(query)
- results = sparql.queryAndConvert()
- if not isinstance(results, dict):
- raise TypeError(f"Expected results to be a dict but found {type(results)}")
- bindings = results["results"]["bindings"]
- return bindings[0]["hash"]["value"]
+def hash_generif_graph(generif_file: str):
+ with open(generif_file, encoding="utf-8") as f_:
+ data = f_.read()
+ return hashlib.md5(data.encode()).hexdigest()
# pylint: disable=invalid-name
@@ -549,7 +525,11 @@ def is_data_modified(xapian_directory: str,
])
# Return a zero exit status code when the data has changed;
# otherwise exit with a 1 exit status code.
- if (db.get_metadata("generif-checksum").decode() == hash_generif_graph(sparql_uri) and
+ generif = pathlib.Path("/var/lib/data/generif-metadata.ttl")
+ generif_checksum = "-1"
+ if generif.exists():
+ generif_checksum = hash_generif_graph(generif)
+ if (db.get_metadata("generif-checksum").decode() == generif_checksum and
db.get_metadata("checksums").decode() == checksums):
sys.exit(1)
sys.exit(0)
@@ -604,7 +584,9 @@ def create_xapian_index(xapian_directory: str, sql_uri: str,
db.set_metadata("tables", " ".join(tables))
db.set_metadata("checksums", " ".join(checksums))
logging.info("Writing generif checksums into index")
- db.set_metadata("generif-checksum", hash_generif_graph(sparql_uri).encode())
+ generif = pathlib.Path("/var/lib/data/generif-metadata.ttl")
+ if generif.exists():
+ db.set_metadata("generif-checksum", hash_generif_graph(generif).encode())
for child in combined_index.iterdir():
shutil.move(child, xapian_directory)
logging.info("Index built")