about summary refs log tree commit diff
path: root/scripts
diff options
context:
space:
mode:
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/index-genenetwork 58
1 files changed, 32 insertions, 26 deletions
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index e7390fa..8769689 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -8,15 +8,13 @@ xapian index. This xapian index is later used in providing search
through the web interface.
"""
-from string import Template
from collections import deque, namedtuple
import contextlib
-from functools import partial, lru_cache
+from functools import partial
import itertools
import json
import logging
from multiprocessing import Lock, Process
-from string import Template
import os
import pathlib
import resource
@@ -170,36 +168,34 @@ def locked_xapian_writable_database(path: pathlib.Path) -> xapian.WritableDataba
db.close()
-@curry(2)
-@lru_cache(maxsize=1_000)
-def get_rif_metadata(symbol, species):
+
+def build_rif_cache():
+ cache = {}
sparql = SPARQLWrapper(
"http://localhost:8982/sparql"
)
sparql.setReturnFormat(JSON)
- query = Template("""
+ query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX gnt: <http://genenetwork.org/term/>
-PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX gnc: <http://genenetwork.org/category/>
-PREFIX gn: <http://genenetwork.org/id/>
-PREFIX dct: <http://purl.org/dc/terms/>
-PREFIX xkos: <http://rdf-vocabulary.ddialliance.org/xkos#>
-SELECT DISTINCT ?comment WHERE {
+SELECT ?symbolName ?speciesName GROUP_CONCAT(?comment ; separator=\"\\n\") AS ?comment WHERE {
?symbol rdfs:comment _:node ;
- rdfs:label '$symbol' .
+ rdfs:label ?symbolName .
_:node rdf:type gnc:GNWikiEntry ;
gnt:belongsToSpecies ?species ;
rdfs:comment ?comment .
-?species gnt:shortName '$species' .
-}
-""")
- sparql.setQuery(query.substitute(symbol=symbol,
- species=species))
+?species gnt:shortName ?speciesName .
+} GROUP BY ?speciesName ?symbolName
+"""
+ sparql.setQuery(query)
results = sparql.queryAndConvert()["results"]["bindings"]
- return results
+ for entry in results:
+ x = (entry["speciesName"]["value"], entry["symbolName"]["value"],)
+ cache[x] = entry["comment"]["value"]
+ return cache
# pylint: disable=invalid-name
@@ -221,12 +217,16 @@ def index_text(text: str) -> None:
termgenerator.index_text(text)
termgenerator.increase_termpos()
-# pylint: disable=unnecessary-lambda
-index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text)
-def index_rif_comments(entries):
- for entry in entries:
- termgenerator.index_text(entry["comment"]["value"], 0, "XRF")
+@curry(2)
+def index_rif_comments(species, symbol):
+ key = (species, symbol,)
+ entry = rdfcache.get(key)
+ if entry:
+ termgenerator.index_text(entry, 0, "XRF")
+
+
+index_text_without_positions = lambda text: termgenerator.index_text_without_positions(text)
index_authors = lambda authors: termgenerator.index_text(authors, 0, "A")
index_species = lambda species: termgenerator.index_text_without_positions(species, 0, "XS")
index_group = lambda group: termgenerator.index_text_without_positions(group, 0, "XG")
@@ -247,6 +247,7 @@ add_year = lambda doc, year: doc.add_value(5, xapian.sortable_serialise(float(ye
# its parent. We use this to pass data retrieved from SQL from parent
# to child. Specifically, we use this global variable.
data: Iterable
+rdfcache: Iterable
# We use this lock to ensure that only one process writes its Xapian
# index to disk at a time.
xapian_lock = Lock()
@@ -284,8 +285,11 @@ def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None:
trait["chr"].bind(index_chr)
trait["geno_chr"].bind(index_peakchr)
- # Index generif entries
- Maybe.apply(get_rif_metadata).to_arguments(trait["symbol"], trait["species"]).bind(index_rif_comments)
+ Maybe.apply(
+ index_rif_comments
+ ).to_arguments(
+ trait["species"], trait["symbol"]
+ )
doc.set_data(json.dumps(trait.data))
(Maybe.apply(curry(2, lambda name, dataset: f"{name}:{dataset}"))
@@ -366,6 +370,8 @@ def index_query(index_function: Callable, query: SQLQuery,
i = start
try:
with worker_queue() as spawn_worker:
+ global rdfcache
+ rdfcache = build_rif_cache()
with database_connection(sql_uri) as conn:
for chunk in group(query_sql(conn, serialize_sql(
# KLUDGE: MariaDB does not allow an offset