Use a global cache to store GeneRIF metadata.

This global cache has 3,528 entries, and it is not expected to grow significantly. Since child processes inherit the parent’s memory, we can pass the global cache to them, reducing fetch times from 0.001s to 0.00001s. This significantly boosts performance when indexing the entire database and enriching results with RDF metadata.

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
author: Munyoki Kilyungi 2024-06-01 22:18:31 +0300
committer: BonfaceKilz 2024-06-01 23:57:00 +0300
commit: f35dd4a230f0dc316e5b097d8cfbf350d8d440e5 (patch)
tree: e53feb8f4be8c80f67e14a9ec905211feb099b20
parent: 69cb03484eea2c7011ac4c838a448b02307a4b55 (diff)
download: genenetwork3-f35dd4a230f0dc316e5b097d8cfbf350d8d440e5.tar.gz
1 files changed, 32 insertions, 26 deletions
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index e7390fa..8769689 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -8,15 +8,13 @@ xapian index. This xapian index is later used in providing search
 through the web interface.
 
 """
-from string import Template
 from collections import deque, namedtuple
 import contextlib
-from functools import partial, lru_cache
+from functools import partial
 import itertools
 import json
 import logging
 from multiprocessing import Lock, Process
-from string import Template
 import os
 import pathlib
 import resource
@@ -170,36 +168,34 @@ def locked_xapian_writable_database(path: pathlib.Path) -> xapian.WritableDataba
         db.close()
 
 
-@curry(2)
-@lru_cache(maxsize=1_000)
-def get_rif_metadata(symbol, species):
+
+def build_rif_cache():
+    cache = {}
     sparql = SPARQLWrapper(
         "http://localhost:8982/sparql"
     )
     sparql.setReturnFormat(JSON)
-    query = Template("""
+    query = """
 PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 PREFIX gnt: <http://genenetwork.org/term/>
-PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
 PREFIX gnc: <http://genenetwork.org/category/>
-PREFIX gn: <http://genenetwork.org/id/>
-PREFIX dct: <http://purl.org/dc/terms/>
-PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
 
-SELECT DISTINCT ?comment WHERE {
+SELECT ?symbolName ?speciesName GROUP_CONCAT(?comment ; separator=\"\\n\") AS ?comment WHERE {
     ?symbol rdfs:comment _:node ;
-            rdfs:label '$symbol' .
+            rdfs:label ?symbolName .
 _:node rdf:type gnc:GNWikiEntry ;
        gnt:belongsToSpecies ?species ;
        rdfs:comment ?comment .
-?species gnt:shortName '$species' .
-}
-""")
-    sparql.setQuery(query.substitute(symbol=symbol,
-                                     species=species))
+?species gnt:shortName ?speciesName .
+} GROUP BY ?speciesName ?symbolName
+"""
+    sparql.setQuery(query)
     results = sparql.queryAndConvert()["results"]["bindings"]
-    return results
+    for entry in results:
+        x = (entry["speciesName"]["value"], entry["symbolName"]["value"],)
+        cache[x] = entry["comment"]["value"]
+    return cache
 
 
 # pylint: disable=invalid-name
@@ -221,12 +217,16 @@ def index_text(text: str) -> None:
     termgenerator.index_text(text)
     termgenerator.increase_termpos()
 
-# pylint: disable=unnecessary-lambda
-index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text)
-def index_rif_comments(entries):
-    for entry in entries:
-        termgenerator.index_text(entry["comment"]["value"], 0, "XRF")
 
+@curry(2)
+def index_rif_comments(species, symbol):
+    key = (species, symbol,)
+    entry = rdfcache.get(key)
+    if entry:
+        termgenerator.index_text(entry, 0, "XRF")
+
+
+index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text)
 index_authors = lambda authors: termgenerator.index_text(authors, 0, "A")
 index_species = lambda species: termgenerator.index_text_without_positions(species, 0, "XS")
 index_group = lambda group: termgenerator.index_text_without_positions(group, 0, "XG")
@@ -247,6 +247,7 @@ add_year = lambda doc, year: doc.add_value(5, xapian.sortable_serialise(float(ye
 # its parent. We use this to pass data retrieved from SQL from parent
 # to child. Specifically, we use this global variable.
 data: Iterable
+rdfcache: Iterable
 # We use this lock to ensure that only one process writes its Xapian
 # index to disk at a time.
 xapian_lock = Lock()
@@ -284,8 +285,11 @@ def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None:
             trait["chr"].bind(index_chr)
             trait["geno_chr"].bind(index_peakchr)
 
-            # Index generif entries
-            Maybe.apply(get_rif_metadata).to_arguments(trait["symbol"], trait["species"]).bind(index_rif_comments)
+            Maybe.apply(
+                index_rif_comments
+            ).to_arguments(
+                trait["species"], trait["symbol"]
+            )
 
             doc.set_data(json.dumps(trait.data))
             (Maybe.apply(curry(2, lambda name, dataset: f"{name}:{dataset}"))
@@ -366,6 +370,8 @@ def index_query(index_function: Callable, query: SQLQuery,
     i = start
     try:
         with worker_queue() as spawn_worker:
+            global rdfcache
+            rdfcache = build_rif_cache()
             with database_connection(sql_uri) as conn:
                 for chunk in group(query_sql(conn, serialize_sql(
                         # KLUDGE: MariaDB does not allow an offset
author	Munyoki Kilyungi	2024-06-01 22:18:31 +0300
committer	BonfaceKilz	2024-06-01 23:57:00 +0300
commit	f35dd4a230f0dc316e5b097d8cfbf350d8d440e5 (patch)
tree	e53feb8f4be8c80f67e14a9ec905211feb099b20
parent	69cb03484eea2c7011ac4c838a448b02307a4b55 (diff)
download	genenetwork3-f35dd4a230f0dc316e5b097d8cfbf350d8d440e5.tar.gz