#! /usr/bin/env python3

# Extracted from the following patch:
#   From 06da0390a1de5d0aa8eb6d7a0ed3120e350f8a0b Mon Sep 17 00:00:00 2001
#   From: Arun Isaac
#   Date: Tue, 18 Oct 2022 14:40:47 +0530
#   Subject: Add xapian indexing script.
#
#   * scripts/index-genenetwork: New file.
#   * setup.py (install_requires): Add click, pymonad and xapian-bindings.
#   (scripts): Add scripts/index-genenetwork.

# pylint: disable=invalid-name

"""This script must be run each time the database is updated. It runs
queries against the SQL database, indexes the results and builds a
xapian index. This xapian index is later used in providing search
through the web interface.

"""

from collections import deque, namedtuple
import contextlib
from functools import partial
import itertools
import json
import logging
from multiprocessing import Lock, Process
import os
import pathlib
import resource
import shutil
import tempfile

import MySQLdb
import click
from pymonad.maybe import Just, Maybe, Nothing
from pymonad.tools import curry
import xapian

from gn3.db_utils import database_connection
from gn3.monads import query_sql

# Number of SQL rows handed to each indexing worker process.
DOCUMENTS_PER_CHUNK = 100_000

# Lightweight description of a SELECT statement. namedtuple applies
# `defaults` to the right-most fields, so: where=Nothing, offset=0,
# limit=Nothing. `fields` and `tables` are always required.
SQLQuery = namedtuple(
    "SQLQuery",
    ("fields", "tables", "where", "offset", "limit"),
    defaults=(Nothing, 0, Nothing),
)

# FIXME: Some Max LRS values in the DB are wrongly listed as 0.000,
# but shouldn't be displayed. Make them NULLs in the database.
# Query selecting every public, non-confidential ProbeSet (gene
# expression) trait together with its dataset, species, group, tissue
# and peak-locus position.
genes_query = SQLQuery(
    fields=[
        "ProbeSet.Name AS name",
        "ProbeSet.Symbol AS symbol",
        "ProbeSet.description AS description",
        "ProbeSet.Chr AS chr",
        "ProbeSet.Mb as mb",
        "ProbeSet.alias AS alias",
        "ProbeSet.GenbankId AS genbankid",
        "ProbeSet.UniGeneId AS unigeneid",
        "ProbeSet.Probe_Target_Description AS probe_target_description",
        "ProbeSetFreeze.Name AS dataset",
        "ProbeSetFreeze.FullName AS dataset_fullname",
        "Species.Name AS species",
        "InbredSet.Name AS `group`",
        "Tissue.Name AS tissue",
        "ProbeSetXRef.Mean AS mean",
        "ProbeSetXRef.LRS AS lrs",
        "ProbeSetXRef.additive AS additive",
        "Geno.Chr AS geno_chr",
        "Geno.Mb as geno_mb",
    ],
    tables=[
        "Species",
        "INNER JOIN InbredSet ON InbredSet.SpeciesId = Species.Id",
        "INNER JOIN ProbeFreeze ON ProbeFreeze.InbredSetId = InbredSet.Id",
        "INNER JOIN Tissue ON ProbeFreeze.TissueId = Tissue.Id",
        "INNER JOIN ProbeSetFreeze ON ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id",
        "INNER JOIN ProbeSetXRef ON ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id",
        "INNER JOIN ProbeSet ON ProbeSet.Id = ProbeSetXRef.ProbeSetId",
        """LEFT JOIN Geno ON ProbeSetXRef.Locus = Geno.Name
                     AND Geno.SpeciesId = Species.Id""",
    ],
    where=Just("ProbeSetFreeze.confidentiality < 1 AND ProbeSetFreeze.public > 0"),
)

# FIXME: Some years are blank strings or strings that contain text
# other than the year. These should be fixed in the database and the
# year field must be made an integer.
# Query selecting every published phenotype trait together with its
# publication, dataset, species, group and peak-locus position. It has
# no WHERE clause, so the SQLQuery default (Nothing) applies.
phenotypes_query = SQLQuery(
    ["Species.Name AS species",
     "InbredSet.Name AS `group`",
     "PublishFreeze.Name AS dataset",
     "PublishFreeze.FullName AS dataset_fullname",
     "PublishXRef.Id AS name",
     """COALESCE(Phenotype.Post_publication_abbreviation,
                 Phenotype.Pre_publication_abbreviation)
     AS abbreviation""",
     """COALESCE(Phenotype.Post_publication_description,
                 Phenotype.Pre_publication_description)
     AS description""",
     "Phenotype.Lab_code",
     "Publication.Abstract",
     "Publication.Title",
     "Publication.Authors AS authors",
     """IF(CONVERT(Publication.Year, UNSIGNED)=0,
          NULL, CONVERT(Publication.Year, UNSIGNED)) AS year""",
     "Publication.PubMed_ID AS pubmed_id",
     "PublishXRef.LRS as lrs",
     "PublishXRef.additive",
     "InbredSet.InbredSetCode AS inbredsetcode",
     "PublishXRef.mean",
     "Geno.Chr as geno_chr",
     "Geno.Mb as geno_mb"],
    ["Species",
     "INNER JOIN InbredSet ON InbredSet.SpeciesId = Species.Id",
     "INNER JOIN PublishFreeze ON PublishFreeze.InbredSetId = InbredSet.Id",
     "INNER JOIN PublishXRef ON PublishXRef.InbredSetId = InbredSet.Id",
     "INNER JOIN Phenotype ON PublishXRef.PhenotypeId = Phenotype.Id",
     "INNER JOIN Publication ON PublishXRef.PublicationId = Publication.Id",
     "LEFT JOIN Geno ON PublishXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id"])


def serialize_sql(query):
    """Serialize SQLQuery object to a string.

    The clauses are emitted in the order SELECT ... FROM ..., then
    WHERE (if query.where holds a value), LIMIT (if query.limit holds
    a value) and OFFSET (if query.offset is non-zero).
    """
    clauses = [f"SELECT {', '.join(query.fields)} FROM {' '.join(query.tables)}"]
    # where and limit are Maybe values; bind runs the callback only
    # when a value is present, so Nothing contributes no clause.
    query.where.bind(lambda where: clauses.append(f" WHERE {where}"))
    query.limit.bind(lambda limit: clauses.append(f" LIMIT {limit}"))
    if query.offset != 0:
        clauses.append(f" OFFSET {query.offset}")
    return "".join(clauses)


@contextlib.contextmanager
def locked_xapian_writable_database(path):
    """Open xapian database at path for writing, inside a transaction.

    The database is yielded with a transaction already begun. If the
    managed block raises, the transaction is cancelled and the
    exception propagates. Otherwise the transaction is committed while
    holding the module-level xapian_lock, so that only one process
    flushes its index to disk at a time. This avoids I/O contention
    between processes. The database is always closed on exit.
    """
    # pylint: disable-next=invalid-name
    db = xapian.WritableDatabase(str(path))
    try:
        # begin_transaction sits inside the try so that the handle is
        # closed even if starting the transaction fails.
        db.begin_transaction()
        try:
            yield db
        except Exception:
            db.cancel_transaction()
            raise
        with xapian_lock:
            db.commit_transaction()
    finally:
        db.close()


# pylint: disable=invalid-name
def write_document(db, identifier, doctype, doc):
    """Write document into xapian database.

    The document is stored under the unique id term built from doctype
    and the lower-cased identifier, replacing any document already
    stored under that term.
    """
    # We use the XT and Q prefixes to indicate the type and idterm
    # respectively.
    idterm = f"Q{doctype}:{identifier.lower()}"
    doc.add_boolean_term(f"XT{doctype}")
    doc.add_boolean_term(idterm)
    db.replace_document(idterm, doc)


# Shared term generator. The indexing functions below write into
# whichever document was last passed to termgenerator.set_document.
termgenerator = xapian.TermGenerator()
termgenerator.set_stemmer(xapian.Stem("en"))


def index_text(text):
    """Index text and increase term position."""
    termgenerator.index_text(text)
    termgenerator.increase_termpos()


def index_text_without_positions(text):
    """Index text without recording term positions."""
    termgenerator.index_text_without_positions(text)


def index_authors(authors):
    """Index authors, with positions, under the "A" prefix."""
    termgenerator.index_text(authors, 0, "A")


def index_species(species):
    """Index species, without positions, under the "XS" prefix."""
    termgenerator.index_text_without_positions(species, 0, "XS")


def index_group(group):
    """Index group, without positions, under the "XG" prefix."""
    termgenerator.index_text_without_positions(group, 0, "XG")


def index_tissue(tissue):
    """Index tissue, with positions, under the "XI" prefix."""
    termgenerator.index_text(tissue, 0, "XI")


def index_dataset(dataset):
    """Index dataset, with positions, under the "XDS" prefix."""
    termgenerator.index_text(dataset, 0, "XDS")


def index_symbol(symbol):
    """Index symbol, without positions, under the "XY" prefix."""
    termgenerator.index_text_without_positions(symbol, 0, "XY")


# pylint: disable-next=redefined-builtin
def index_chr(chr):
    """Index chromosome, without positions, under the "XC" prefix."""
    termgenerator.index_text_without_positions(chr, 0, "XC")


def index_peakchr(peakchr):
    """Index peak chromosome, without positions, under the "XPC" prefix."""
    termgenerator.index_text_without_positions(peakchr, 0, "XPC")


def add_mean(doc, mean):
    """Store mean in value slot 0 of doc."""
    doc.add_value(0, xapian.sortable_serialise(mean))


def add_peak(doc, peak):
    """Store peak in value slot 1 of doc."""
    doc.add_value(1, xapian.sortable_serialise(peak))


def add_mb(doc, mb):
    """Store mb in value slot 2 of doc."""
    doc.add_value(2, xapian.sortable_serialise(mb))


def add_peakmb(doc, peakmb):
    """Store peakmb in value slot 3 of doc."""
    doc.add_value(3, xapian.sortable_serialise(peakmb))


def add_additive(doc, additive):
    """Store additive in value slot 4 of doc."""
    doc.add_value(4, xapian.sortable_serialise(additive))


def add_year(doc, year):
    """Store year, converted to float, in value slot 5 of doc."""
    doc.add_value(5, xapian.sortable_serialise(float(year)))


# When a child process is forked, it inherits a copy of the memory of
# its parent. We use this to pass data retrieved from SQL from parent
# to child. Specifically, we use this global variable.
# NOTE(review): this relies on the "fork" multiprocessing start method;
# with "spawn" the child would presumably see data as None -- confirm
# on the deployment platform.
data = None
# We use this lock to ensure that only one process writes its Xapian
# index to disk at a time.
xapian_lock = Lock()


def index_genes(xapian_build_directory, chunk_index):
    """Index genes data into a Xapian index.

    Reads the rows to index from the module-level data variable and
    writes them to the genes-NNNN database under
    xapian_build_directory, where NNNN is the zero-padded chunk_index.
    Each row is presumably a mapping of column name to Maybe value, as
    produced by query_sql -- absent values are skipped by bind.
    """
    with locked_xapian_writable_database(xapian_build_directory / f"genes-{chunk_index:04d}") as db:
        for trait in data:
            # pylint: disable=cell-var-from-loop
            doc = xapian.Document()
            termgenerator.set_document(doc)

            # Add values.
            trait["mean"].bind(partial(add_mean, doc))
            trait["lrs"].bind(partial(add_peak, doc))
            trait["mb"].bind(partial(add_mb, doc))
            trait["geno_mb"].bind(partial(add_peakmb, doc))
            trait["additive"].bind(partial(add_additive, doc))

            # Index free text. Fields that are popped are indexed but
            # left out of the JSON stored as the document data.
            for key in ["description", "tissue", "dataset_fullname"]:
                trait[key].bind(index_text)
            trait.pop("probe_target_description").bind(index_text)
            for key in ["name", "symbol", "species", "group"]:
                trait[key].bind(index_text_without_positions)
            for key in ["alias", "genbankid", "unigeneid"]:
                trait.pop(key).bind(index_text_without_positions)

            # Index text with prefixes.
            trait["species"].bind(index_species)
            trait["group"].bind(index_group)
            trait["tissue"].bind(index_tissue)
            trait["dataset_fullname"].bind(index_dataset)
            trait["symbol"].bind(index_symbol)
            trait["chr"].bind(index_chr)
            trait["geno_chr"].bind(index_peakchr)

            doc.set_data(json.dumps(trait.data))
            # The document is only written when both name and dataset
            # are present.
            (Maybe.apply(curry(2, lambda name, dataset: f"{name}:{dataset}"))
             .to_arguments(trait["name"], trait["dataset"])
             .bind(lambda idterm: write_document(db, idterm, "gene", doc)))


def index_phenotypes(xapian_build_directory, chunk_index):
    """Index phenotypes data into a Xapian index.

    Reads the rows to index from the module-level data variable and
    writes them to the phenotypes-NNNN database under
    xapian_build_directory, where NNNN is the zero-padded chunk_index.
    Each row is presumably a mapping of column name to Maybe value, as
    produced by query_sql -- absent values are skipped by bind.
    """
    with locked_xapian_writable_database(
            xapian_build_directory / f"phenotypes-{chunk_index:04d}") as db:
        for trait in data:
            # pylint: disable=cell-var-from-loop
            doc = xapian.Document()
            termgenerator.set_document(doc)

            # Add values.
            trait["mean"].bind(partial(add_mean, doc))
            trait["lrs"].bind(partial(add_peak, doc))
            trait["geno_mb"].bind(partial(add_peakmb, doc))
            trait["additive"].bind(partial(add_additive, doc))
            trait["year"].bind(partial(add_year, doc))

            # Index free text. Fields that are popped are indexed but
            # left out of the JSON stored as the document data.
            for key in ["description", "authors", "dataset_fullname"]:
                trait[key].bind(index_text)
            for key in ["Abstract", "Title"]:
                trait.pop(key).bind(index_text)
            for key in ["species", "group", "inbredsetcode"]:
                trait[key].bind(index_text_without_positions)
            for key in ["abbreviation", "Lab_code"]:
                trait.pop(key).bind(index_text_without_positions)

            # Index text with prefixes.
            trait["species"].bind(index_species)
            trait["group"].bind(index_group)
            trait["authors"].bind(index_authors)
            trait["geno_chr"].bind(index_peakchr)
            trait["dataset_fullname"].bind(index_dataset)

            # Convert name from integer to string.
            trait["name"] = trait["name"].map(str)
            # Split comma-separated authors into a list.
            trait["authors"] = trait["authors"].map(
                lambda s: [author.strip() for author in s.split(",")])

            doc.set_data(json.dumps(trait.data))
            # The document is only written when both name and dataset
            # are present.
            (Maybe.apply(curry(2, lambda name, dataset: f"{name}:{dataset}"))
             .to_arguments(trait["name"], trait["dataset"])
             .bind(lambda idterm: write_document(db, idterm, "phenotype", doc)))


def group(generator, chunk_size):
    """Group elements of generator into chunks.

    Returns an iterator of tuples holding up to chunk_size consecutive
    elements each. Iteration stops at the first empty chunk.
    """
    # Take a single iterator up front. Without this, a re-iterable
    # argument such as a list would make islice restart from the
    # beginning on every call and never produce the empty sentinel.
    iterator = iter(generator)
    return iter(lambda: tuple(itertools.islice(iterator, chunk_size)), ())


@contextlib.contextmanager
def worker_queue(number_of_workers=os.cpu_count() or 1):
    """Manage a pool of worker processes returning a function to spawn them.

    The yielded spawn(target, args) function starts a new process. If
    number_of_workers processes are already outstanding, it first
    waits for the oldest one. On exit, whether normal or through an
    exception, all outstanding processes are joined, so no worker is
    still writing once the context is left. Workers that exit with a
    non-zero code are reported through logging.error.
    """
    # os.cpu_count() may return None; always allow at least one worker.
    max_workers = number_of_workers or 1
    processes = deque()

    def reap(process):
        process.join()
        if process.exitcode != 0:
            logging.error("Worker process %s exited with code %s",
                          process.pid, process.exitcode)

    def spawn(target, args):
        if len(processes) >= max_workers:
            reap(processes.popleft())
        process = Process(target=target, args=args)
        process.start()
        processes.append(process)

    try:
        yield spawn
    finally:
        while processes:
            reap(processes.popleft())


def index_query(index_function, query, xapian_build_directory, start=0):
    """Run SQL query, and index its results for Xapian.

    The result rows are split into chunks of DOCUMENTS_PER_CHUNK. For
    each chunk, a worker process running index_function is spawned
    with (xapian_build_directory, chunk number) as arguments. start is
    the number of the first chunk to fetch and is used to resume after
    a dropped connection.
    """
    # pylint: disable=global-statement
    global data
    i = start
    try:
        with worker_queue() as spawn_worker, database_connection() as conn:
            # KLUDGE: MariaDB does not allow an offset without a
            # limit. So, set limit to a "high" value.
            paged_query = query._replace(limit=Just(2**64 - 1),
                                         offset=start*DOCUMENTS_PER_CHUNK)
            rows = query_sql(conn, serialize_sql(paged_query), server_side=True)
            for chunk in group(rows, DOCUMENTS_PER_CHUNK):
                # The forked worker reads the chunk from this global.
                data = chunk
                spawn_worker(index_function, (xapian_build_directory, i))
                logging.debug("Spawned worker process on chunk %s", i)
                i += 1
    # In the event of an operational error, open a new connection and
    # resume indexing.
    # NOTE(review): resuming by OFFSET assumes the server returns rows
    # in the same order on every run; the queries carry no ORDER BY --
    # confirm.
    except MySQLdb.OperationalError:
        logging.warning("Reopening connection to recover from SQL operational error",
                        exc_info=True)
        index_query(index_function, query, xapian_build_directory, i)


@contextlib.contextmanager
def temporary_directory(prefix, parent_directory):
    """Create temporary directory returning it as a PosixPath.

    The directory is created inside parent_directory and is removed
    when the context exits.
    """
    with tempfile.TemporaryDirectory(prefix=prefix, dir=parent_directory) as tmpdirname:
        yield pathlib.Path(tmpdirname)


def xapian_compact(combined_index, indices):
    """Compact and combine several Xapian indices.

    indices is a list of paths to the indices to combine; the result
    is written to combined_index.
    """
    # xapian-compact opens all indices simultaneously. So, raise the limit on
    # the number of open files.
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    wanted = 10*len(indices)
    # RLIM_INFINITY may be represented as -1, so it cannot take part
    # in min/max arithmetic: treat "unlimited" explicitly.
    if hard != resource.RLIM_INFINITY:
        wanted = min(wanted, hard)
    # Only ever raise the soft limit; never lower it.
    if soft != resource.RLIM_INFINITY and wanted > soft:
        resource.setrlimit(resource.RLIMIT_NOFILE, (wanted, hard))
    db = xapian.Database()
    try:
        for index in indices:
            db.add_database(xapian.Database(str(index)))
        db.compact(str(combined_index), xapian.DBCOMPACT_MULTIPASS | xapian.Compactor.FULLER)
    finally:
        db.close()


@click.command(help="Index GeneNetwork data and build Xapian search index in XAPIAN_DIRECTORY.")
@click.argument("xapian_directory")
def main(xapian_directory):
    """Build the combined Xapian index inside xapian_directory.

    Per-chunk indices are built in a temporary "build" directory,
    compacted into a temporary "combined" directory, and the combined
    files are finally moved into xapian_directory itself.
    """
    logging.basicConfig(level=os.environ.get("LOGLEVEL", "DEBUG"),
                        format='%(relativeCreated)s: %(levelname)s: %(message)s')
    pathlib.Path(xapian_directory).mkdir(exist_ok=True)
    with temporary_directory("combined", xapian_directory) as combined_index:
        with temporary_directory("build", xapian_directory) as xapian_build_directory:
            logging.info("Indexing genes")
            index_query(index_genes, genes_query, xapian_build_directory)
            logging.info("Indexing phenotypes")
            index_query(index_phenotypes, phenotypes_query, xapian_build_directory)
            logging.info("Combining and compacting indices")
            xapian_compact(combined_index, list(xapian_build_directory.iterdir()))
        for child in combined_index.iterdir():
            # Pass the source as str: on Python < 3.9, shutil.move fails
            # for a Path source when the destination is a directory.
            shutil.move(str(child), xapian_directory)
    logging.info("Index built")


if __name__ == "__main__":
    # pylint: disable=no-value-for-parameter
    main()