From f35dd4a230f0dc316e5b097d8cfbf350d8d440e5 Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Sat, 1 Jun 2024 22:18:31 +0300 Subject: Use global cache to store generif metadata. This global cache has 3,528 entries and there's no expectation for it to grow significantly. Since child processes inherit the parent’s memory, we can pass the global cache to them, reducing fetch times from 0.001s to 0.00001s, significantly boosting performance when indexing the entire database and enriching results with RDF metadata. Signed-off-by: Munyoki Kilyungi --- scripts/index-genenetwork | 58 ++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork index e7390fa..8769689 100755 --- a/scripts/index-genenetwork +++ b/scripts/index-genenetwork @@ -8,15 +8,13 @@ xapian index. This xapian index is later used in providing search through the web interface. """ -from string import Template from collections import deque, namedtuple import contextlib -from functools import partial, lru_cache +from functools import partial import itertools import json import logging from multiprocessing import Lock, Process -from string import Template import os import pathlib import resource @@ -170,36 +168,34 @@ def locked_xapian_writable_database(path: pathlib.Path) -> xapian.WritableDataba db.close() -@curry(2) -@lru_cache(maxsize=1_000) -def get_rif_metadata(symbol, species): + +def build_rif_cache(): + cache = {} sparql = SPARQLWrapper( "http://localhost:8982/sparql" ) sparql.setReturnFormat(JSON) - query = Template(""" + query = """ PREFIX rdf: PREFIX rdfs: PREFIX gnt: -PREFIX skos: PREFIX gnc: -PREFIX gn: -PREFIX dct: -PREFIX xkos: -SELECT DISTINCT ?comment WHERE { +SELECT ?symbolName ?speciesName GROUP_CONCAT(?comment ; separator=\"\\n\") AS ?comment WHERE { ?symbol rdfs:comment _:node ; - rdfs:label '$symbol' . + rdfs:label ?symbolName . 
_:node rdf:type gnc:GNWikiEntry ; gnt:belongsToSpecies ?species ; rdfs:comment ?comment . -?species gnt:shortName '$species' . -} -""") - sparql.setQuery(query.substitute(symbol=symbol, - species=species)) +?species gnt:shortName ?speciesName . +} GROUP BY ?speciesName ?symbolName +""" + sparql.setQuery(query) results = sparql.queryAndConvert()["results"]["bindings"] - return results + for entry in results: + x = (entry["speciesName"]["value"], entry["symbolName"]["value"],) + cache[x] = entry["comment"]["value"] + return cache # pylint: disable=invalid-name @@ -221,12 +217,16 @@ def index_text(text: str) -> None: termgenerator.index_text(text) termgenerator.increase_termpos() -# pylint: disable=unnecessary-lambda -index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text) -def index_rif_comments(entries): - for entry in entries: - termgenerator.index_text(entry["comment"]["value"], 0, "XRF") +@curry(2) +def index_rif_comments(species, symbol): + key = (species, symbol,) + entry = rdfcache.get(key) + if entry: + termgenerator.index_text(entry, 0, "XRF") + + +index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text) index_authors = lambda authors: termgenerator.index_text(authors, 0, "A") index_species = lambda species: termgenerator.index_text_without_positions(species, 0, "XS") index_group = lambda group: termgenerator.index_text_without_positions(group, 0, "XG") @@ -247,6 +247,7 @@ add_year = lambda doc, year: doc.add_value(5, xapian.sortable_serialise(float(ye # its parent. We use this to pass data retrieved from SQL from parent # to child. Specifically, we use this global variable. data: Iterable +rdfcache: Iterable # We use this lock to ensure that only one process writes its Xapian # index to disk at a time. 
xapian_lock = Lock() @@ -284,8 +285,11 @@ def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None: trait["chr"].bind(index_chr) trait["geno_chr"].bind(index_peakchr) - # Index generif entries - Maybe.apply(get_rif_metadata).to_arguments(trait["symbol"], trait["species"]).bind(index_rif_comments) + Maybe.apply( + index_rif_comments + ).to_arguments( + trait["species"], trait["symbol"] + ) doc.set_data(json.dumps(trait.data)) (Maybe.apply(curry(2, lambda name, dataset: f"{name}:{dataset}")) @@ -366,6 +370,8 @@ def index_query(index_function: Callable, query: SQLQuery, i = start try: with worker_queue() as spawn_worker: + global rdfcache + rdfcache = build_rif_cache() with database_connection(sql_uri) as conn: for chunk in group(query_sql(conn, serialize_sql( # KLUDGE: MariaDB does not allow an offset -- cgit v1.2.3