aboutsummaryrefslogtreecommitdiff
path: root/scripts/index-genenetwork
diff options
context:
space:
mode:
authorMunyoki Kilyungi2024-06-01 22:18:31 +0300
committerBonfaceKilz2024-06-01 23:57:00 +0300
commitf35dd4a230f0dc316e5b097d8cfbf350d8d440e5 (patch)
treee53feb8f4be8c80f67e14a9ec905211feb099b20 /scripts/index-genenetwork
parent69cb03484eea2c7011ac4c838a448b02307a4b55 (diff)
downloadgenenetwork3-f35dd4a230f0dc316e5b097d8cfbf350d8d440e5.tar.gz
Use global cache to store generif metadata.
This global cache has 3,528 entries and there's no expectation for it to grow significantly. Since child processes inherit the parent’s memory, we can pass the global cache to them, reducing fetch times from 0.001s to 0.00001s, significantly boosting performance when indexing the entire database and enriching results with RDF metadata. Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
Diffstat (limited to 'scripts/index-genenetwork')
-rwxr-xr-xscripts/index-genenetwork58
1 files changed, 32 insertions, 26 deletions
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index e7390fa..8769689 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -8,15 +8,13 @@ xapian index. This xapian index is later used in providing search
through the web interface.
"""
-from string import Template
from collections import deque, namedtuple
import contextlib
-from functools import partial, lru_cache
+from functools import partial
import itertools
import json
import logging
from multiprocessing import Lock, Process
-from string import Template
import os
import pathlib
import resource
@@ -170,36 +168,34 @@ def locked_xapian_writable_database(path: pathlib.Path) -> xapian.WritableDataba
db.close()
-@curry(2)
-@lru_cache(maxsize=1_000)
-def get_rif_metadata(symbol, species):
+
+def build_rif_cache():
+ cache = {}
sparql = SPARQLWrapper(
"http://localhost:8982/sparql"
)
sparql.setReturnFormat(JSON)
- query = Template("""
+ query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX gnt: <http://genenetwork.org/term/>
-PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX gnc: <http://genenetwork.org/category/>
-PREFIX gn: <http://genenetwork.org/id/>
-PREFIX dct: <http://purl.org/dc/terms/>
-PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
-SELECT DISTINCT ?comment WHERE {
+SELECT ?symbolName ?speciesName GROUP_CONCAT(?comment ; separator=\"\\n\") AS ?comment WHERE {
?symbol rdfs:comment _:node ;
- rdfs:label '$symbol' .
+ rdfs:label ?symbolName .
_:node rdf:type gnc:GNWikiEntry ;
gnt:belongsToSpecies ?species ;
rdfs:comment ?comment .
-?species gnt:shortName '$species' .
-}
-""")
- sparql.setQuery(query.substitute(symbol=symbol,
- species=species))
+?species gnt:shortName ?speciesName .
+} GROUP BY ?speciesName ?symbolName
+"""
+ sparql.setQuery(query)
results = sparql.queryAndConvert()["results"]["bindings"]
- return results
+ for entry in results:
+ x = (entry["speciesName"]["value"], entry["symbolName"]["value"],)
+ cache[x] = entry["comment"]["value"]
+ return cache
# pylint: disable=invalid-name
@@ -221,12 +217,16 @@ def index_text(text: str) -> None:
termgenerator.index_text(text)
termgenerator.increase_termpos()
-# pylint: disable=unnecessary-lambda
-index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text)
-def index_rif_comments(entries):
- for entry in entries:
- termgenerator.index_text(entry["comment"]["value"], 0, "XRF")
+@curry(2)
+def index_rif_comments(species, symbol):
+ key = (species, symbol,)
+ entry = rdfcache.get(key)
+ if entry:
+ termgenerator.index_text(entry, 0, "XRF")
+
+
+index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text)
index_authors = lambda authors: termgenerator.index_text(authors, 0, "A")
index_species = lambda species: termgenerator.index_text_without_positions(species, 0, "XS")
index_group = lambda group: termgenerator.index_text_without_positions(group, 0, "XG")
@@ -247,6 +247,7 @@ add_year = lambda doc, year: doc.add_value(5, xapian.sortable_serialise(float(ye
# its parent. We use this to pass data retrieved from SQL from parent
# to child. Specifically, we use this global variable.
data: Iterable
+rdfcache: Iterable
# We use this lock to ensure that only one process writes its Xapian
# index to disk at a time.
xapian_lock = Lock()
@@ -284,8 +285,11 @@ def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None:
trait["chr"].bind(index_chr)
trait["geno_chr"].bind(index_peakchr)
- # Index generif entries
- Maybe.apply(get_rif_metadata).to_arguments(trait["symbol"], trait["species"]).bind(index_rif_comments)
+ Maybe.apply(
+ index_rif_comments
+ ).to_arguments(
+ trait["species"], trait["symbol"]
+ )
doc.set_data(json.dumps(trait.data))
(Maybe.apply(curry(2, lambda name, dataset: f"{name}:{dataset}"))
@@ -366,6 +370,8 @@ def index_query(index_function: Callable, query: SQLQuery,
i = start
try:
with worker_queue() as spawn_worker:
+ global rdfcache
+ rdfcache = build_rif_cache()
with database_connection(sql_uri) as conn:
for chunk in group(query_sql(conn, serialize_sql(
# KLUDGE: MariaDB does not allow an offset