diff options
author | Munyoki Kilyungi | 2024-06-01 22:18:31 +0300 |
---|---|---|
committer | BonfaceKilz | 2024-06-01 23:57:00 +0300 |
commit | f35dd4a230f0dc316e5b097d8cfbf350d8d440e5 (patch) | |
tree | e53feb8f4be8c80f67e14a9ec905211feb099b20 | |
parent | 69cb03484eea2c7011ac4c838a448b02307a4b55 (diff) | |
download | genenetwork3-f35dd4a230f0dc316e5b097d8cfbf350d8d440e5.tar.gz |
Use global cache to store generif metadata.
This global cache has 3,528 entries and there's no expectation for it
to grow significantly. Since child processes inherit the parent’s
memory, we can pass the global cache to them, reducing fetch times
from 0.001s to 0.00001s, significantly boosting performance when
indexing the entire database and enriching results with RDF metadata.
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x | scripts/index-genenetwork | 58 |
1 files changed, 32 insertions, 26 deletions
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork index e7390fa..8769689 100755 --- a/scripts/index-genenetwork +++ b/scripts/index-genenetwork @@ -8,15 +8,13 @@ xapian index. This xapian index is later used in providing search through the web interface. """ -from string import Template from collections import deque, namedtuple import contextlib -from functools import partial, lru_cache +from functools import partial import itertools import json import logging from multiprocessing import Lock, Process -from string import Template import os import pathlib import resource @@ -170,36 +168,34 @@ def locked_xapian_writable_database(path: pathlib.Path) -> xapian.WritableDataba db.close() -@curry(2) -@lru_cache(maxsize=1_000) -def get_rif_metadata(symbol, species): + +def build_rif_cache(): + cache = {} sparql = SPARQLWrapper( "http://localhost:8982/sparql" ) sparql.setReturnFormat(JSON) - query = Template(""" + query = """ PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX gnt: <http://genenetwork.org/term/> -PREFIX skos: <http://www.w3.org/2004/02/skos/core#> PREFIX gnc: <http://genenetwork.org/category/> -PREFIX gn: <http://genenetwork.org/id/> -PREFIX dct: <http://purl.org/dc/terms/> -PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#> -SELECT DISTINCT ?comment WHERE { +SELECT ?symbolName ?speciesName GROUP_CONCAT(?comment ; separator=\"\\n\") AS ?comment WHERE { ?symbol rdfs:comment _:node ; - rdfs:label '$symbol' . + rdfs:label ?symbolName . _:node rdf:type gnc:GNWikiEntry ; gnt:belongsToSpecies ?species ; rdfs:comment ?comment . -?species gnt:shortName '$species' . -} -""") - sparql.setQuery(query.substitute(symbol=symbol, - species=species)) +?species gnt:shortName ?speciesName . 
+} GROUP BY ?speciesName ?symbolName +""" + sparql.setQuery(query) results = sparql.queryAndConvert()["results"]["bindings"] - return results + for entry in results: + x = (entry["speciesName"]["value"], entry["symbolName"]["value"],) + cache[x] = entry["comment"]["value"] + return cache # pylint: disable=invalid-name @@ -221,12 +217,16 @@ def index_text(text: str) -> None: termgenerator.index_text(text) termgenerator.increase_termpos() -# pylint: disable=unnecessary-lambda -index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text) -def index_rif_comments(entries): - for entry in entries: - termgenerator.index_text(entry["comment"]["value"], 0, "XRF") +@curry(2) +def index_rif_comments(species, symbol): + key = (species, symbol,) + entry = rdfcache.get(key) + if entry: + termgenerator.index_text(entry, 0, "XRF") + + +index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text) index_authors = lambda authors: termgenerator.index_text(authors, 0, "A") index_species = lambda species: termgenerator.index_text_without_positions(species, 0, "XS") index_group = lambda group: termgenerator.index_text_without_positions(group, 0, "XG") @@ -247,6 +247,7 @@ add_year = lambda doc, year: doc.add_value(5, xapian.sortable_serialise(float(ye # its parent. We use this to pass data retrieved from SQL from parent # to child. Specifically, we use this global variable. data: Iterable +rdfcache: Iterable # We use this lock to ensure that only one process writes its Xapian # index to disk at a time. 
xapian_lock = Lock() @@ -284,8 +285,11 @@ def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None: trait["chr"].bind(index_chr) trait["geno_chr"].bind(index_peakchr) - # Index generif entries - Maybe.apply(get_rif_metadata).to_arguments(trait["symbol"], trait["species"]).bind(index_rif_comments) + Maybe.apply( + index_rif_comments + ).to_arguments( + trait["species"], trait["symbol"] + ) doc.set_data(json.dumps(trait.data)) (Maybe.apply(curry(2, lambda name, dataset: f"{name}:{dataset}")) @@ -366,6 +370,8 @@ def index_query(index_function: Callable, query: SQLQuery, i = start try: with worker_queue() as spawn_worker: + global rdfcache + rdfcache = build_rif_cache() with database_connection(sql_uri) as conn: for chunk in group(query_sql(conn, serialize_sql( # KLUDGE: MariaDB does not allow an offset |