about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/index-genenetwork 44
1 files changed, 13 insertions, 31 deletions
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index 1775297..441b8b2 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -24,6 +24,7 @@ import resource
 import re
 import shutil
 import sys
+import hashlib
 import tempfile
 from typing import Callable, Dict, Generator, Hashable, Iterable, List
 from SPARQLWrapper import SPARQLWrapper, JSON
@@ -239,35 +240,10 @@ def build_rdf_cache(sparql_uri: str, query: str, remove_common_words: bool = Fal
     return smaller_cache
 
 
-def hash_generif_graph(sparql_uri: str):
-    sparql = SPARQLWrapper(sparql_uri)
-    sparql.setReturnFormat(JSON)
-    query = """
-PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
-PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-PREFIX gnt: <http://genenetwork.org/term/>
-PREFIX gnc: <http://genenetwork.org/category/>
-
-SELECT SHA256(GROUP_CONCAT(?entries ; separator=\"\\n\")) AS ?hash WHERE {
-   {{
-     SELECT ?type CONCAT(?symbolName, ",", ?speciesName, \"\\n\",GROUP_CONCAT(?comment ; separator=\"\\n\")) AS ?entries WHERE {
-    ?symbol rdfs:comment _:node ;
-            rdfs:label ?symbolName .
-_:node rdf:type gnc:GNWikiEntry ;
-       rdf:type ?type ;
-       gnt:belongsToSpecies ?species ;
-       rdfs:comment ?comment .
-?species gnt:shortName ?speciesName .
-} GROUP BY ?speciesName ?symbolName ?type
-   }}
-   } GROUP BY ?type
-"""
-    sparql.setQuery(query)
-    results = sparql.queryAndConvert()
-    if not isinstance(results, dict):
-        raise TypeError(f"Expected results to be a dict but found {type(results)}")
-    bindings = results["results"]["bindings"]
-    return bindings[0]["hash"]["value"]
+def hash_generif_graph(generif_file: str):
+    """Return the MD5 hex digest of the raw bytes stored in generif_file."""
+    with open(generif_file, "rb") as f_:  # binary: no decode/newline translation
+        return hashlib.md5(f_.read()).hexdigest()
 
 
 # pylint: disable=invalid-name
@@ -549,7 +525,11 @@ def is_data_modified(xapian_directory: str,
             ])
         # Return a zero exit status code when the data has changed;
         # otherwise exit with a 1 exit status code.
-        if (db.get_metadata("generif-checksum").decode() == hash_generif_graph(sparql_uri) and
+        generif = pathlib.Path("/var/lib/data/generif-metadata.ttl")
+        generif_checksum = "-1"
+        if generif.exists():
+            generif_checksum = hash_generif_graph(generif)
+        if (db.get_metadata("generif-checksum").decode() == generif_checksum and
             db.get_metadata("checksums").decode() == checksums):
             sys.exit(1)
         sys.exit(0)
@@ -604,7 +584,9 @@ def create_xapian_index(xapian_directory: str, sql_uri: str,
                 db.set_metadata("tables", " ".join(tables))
                 db.set_metadata("checksums", " ".join(checksums))
                 logging.info("Writing generif checksums into index")
-                db.set_metadata("generif-checksum", hash_generif_graph(sparql_uri).encode())
+                generif = pathlib.Path("/var/lib/data/generif-metadata.ttl")
+                if generif.exists():
+                    db.set_metadata("generif-checksum", hash_generif_graph(generif).encode())
         for child in combined_index.iterdir():
             shutil.move(child, xapian_directory)
     logging.info("Index built")