author | Arun Isaac | 2023-02-13 17:33:27 +0000
---|---|---
committer | Arun Isaac | 2023-02-13 17:36:09 +0000
commit | 152d0fb36b76b7c68d8202ba3665617bad3e684b (patch) |
tree | 7c523a30c078579d0aee7d145468019e708d4c81 /scripts |
parent | a93b9599d6e342f8fd588022ca14336465f7ff7c (diff) |
download | genenetwork3-152d0fb36b76b7c68d8202ba3665617bad3e684b.tar.gz |
scripts: Type hint xapian indexing script.
* scripts/index-genenetwork: Import Callable, Generator, Iterable and List
from typing. Type hint all functions.
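As a minimal sketch of what the change looks like in practice, here is the annotation style applied to the script's `group()` helper (the signature, docstring and body are taken verbatim from the diff below; the final `print()` call is only an assumed usage example, not part of the script):

```python
# Annotation style this commit applies: import the needed names from
# typing, then annotate parameters and return values.
import itertools
from typing import Iterable


def group(generator: Iterable, chunk_size: int) -> Iterable:
    """Group elements of generator into chunks."""
    return iter(lambda: tuple(itertools.islice(generator, chunk_size)), ())


# Assumed usage example: chunk a 7-element iterator into groups of 3.
print(list(group(iter(range(7)), 3)))  # [(0, 1, 2), (3, 4, 5), (6,)]
```

Passing an iterator (rather than, say, a bare `range`) matters here, since `itertools.islice` must consume from a shared position for the `iter(..., ())` sentinel to ever be reached.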
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/index-genenetwork | 29 |
1 file changed, 15 insertions, 14 deletions
```diff
diff --git a/scripts/index-genenetwork b/scripts/index-genenetwork
index 5d231ad..7600f90 100755
--- a/scripts/index-genenetwork
+++ b/scripts/index-genenetwork
@@ -21,6 +21,7 @@ import pathlib
 import resource
 import shutil
 import tempfile
+from typing import Callable, Generator, Iterable, List
 
 import MySQLdb
 import click
@@ -107,7 +108,7 @@ phenotypes_query = SQLQuery(
     "LEFT JOIN Geno ON PublishXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id"])
 
 
-def serialize_sql(query):
+def serialize_sql(query: SQLQuery) -> str:
     """Serialize SQLQuery object to a string."""
     sql = f"SELECT {', '.join(query.fields)} FROM {' '.join(query.tables)}"
     def append_to_sql(appendee):
@@ -122,7 +123,7 @@ def serialize_sql(query):
 
 
 @contextlib.contextmanager
-def locked_xapian_writable_database(path):
+def locked_xapian_writable_database(path: pathlib.Path) -> xapian.WritableDatabase:
     """Open xapian database for writing.
 
     When a process is writing to a xapian database opened by this
@@ -148,7 +149,7 @@ def locked_xapian_writable_database(path):
 
 
 # pylint: disable=invalid-name
-def write_document(db, identifier, doctype, doc):
+def write_document(db: xapian.WritableDatabase, identifier: str, doctype: str, doc: xapian.Document) -> None:
     """Write document into xapian database."""
     # We use the XT and Q prefixes to indicate the type and idterm
     # respectively.
@@ -160,7 +161,7 @@ def write_document(db, identifier, doctype, doc):
 termgenerator = xapian.TermGenerator()
 termgenerator.set_stemmer(xapian.Stem("en"))
 
-def index_text(text):
+def index_text(text: str) -> None:
     """Index text and increase term position."""
     termgenerator.index_text(text)
     termgenerator.increase_termpos()
@@ -187,12 +188,12 @@ add_year = lambda doc, year: doc.add_value(5, xapian.sortable_serialise(float(ye
 # When a child process is forked, it inherits a copy of the memory of
 # its parent. We use this to pass data retrieved from SQL from parent
 # to child. Specifically, we use this global variable.
-data = None
+data: Iterable
 # We use this lock to ensure that only one process writes its Xapian
 # index to disk at a time.
 xapian_lock = Lock()
 
-def index_genes(xapian_build_directory, chunk_index):
+def index_genes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None:
     """Index genes data into a Xapian index."""
     with locked_xapian_writable_database(xapian_build_directory / f"genes-{chunk_index:04d}") as db:
         for trait in data:
@@ -231,7 +232,7 @@ def index_genes(xapian_build_directory, chunk_index):
             .bind(lambda idterm: write_document(db, idterm, "gene", doc)))
 
 
-def index_phenotypes(xapian_build_directory, chunk_index):
+def index_phenotypes(xapian_build_directory: pathlib.Path, chunk_index: int) -> None:
     """Index phenotypes data into a Xapian index."""
     with locked_xapian_writable_database(
             xapian_build_directory / f"phenotypes-{chunk_index:04d}") as db:
@@ -276,15 +277,15 @@ def index_phenotypes(xapian_build_directory, chunk_index):
             .bind(lambda idterm: write_document(db, idterm, "phenotype", doc)))
 
 
-def group(generator, chunk_size):
+def group(generator: Iterable, chunk_size: int) -> Iterable:
     """Group elements of generator into chunks."""
     return iter(lambda: tuple(itertools.islice(generator, chunk_size)), ())
 
 
 @contextlib.contextmanager
-def worker_queue(number_of_workers=os.cpu_count()):
+def worker_queue(number_of_workers: int = os.cpu_count()) -> Generator:
     """Manage a pool of worker processes returning a function to spawn them."""
-    processes = deque()
+    processes: deque = deque()
 
     def spawn(target, args):
         if len(processes) == number_of_workers:
@@ -298,7 +299,7 @@ def worker_queue(number_of_workers=os.cpu_count()):
         process.join()
 
 
-def index_query(index_function, query, xapian_build_directory, start=0):
+def index_query(index_function: Callable, query: SQLQuery, xapian_build_directory: pathlib.Path, start: int = 0) -> None:
     """Run SQL query, and index its results for Xapian."""
     i = start
     try:
@@ -328,13 +329,13 @@ def index_query(index_function, query, xapian_build_directory, start=0):
 
 
 @contextlib.contextmanager
-def temporary_directory(prefix, parent_directory):
+def temporary_directory(prefix: str, parent_directory: str) -> Generator:
     """Create temporary directory returning it as a PosixPath."""
     with tempfile.TemporaryDirectory(prefix=prefix, dir=parent_directory) as tmpdirname:
         yield pathlib.Path(tmpdirname)
 
 
-def xapian_compact(combined_index, indices):
+def xapian_compact(combined_index: pathlib.Path, indices: List[pathlib.Path]) -> None:
     """Compact and combine several Xapian indices."""
     # xapian-compact opens all indices simultaneously. So, raise the limit on
     # the number of open files.
@@ -352,7 +353,7 @@ def xapian_compact(combined_index, indices):
 @click.command(help="Index GeneNetwork data and build Xapian search index in XAPIAN_DIRECTORY.")
 @click.argument("xapian_directory")
 # pylint: disable=missing-function-docstring
-def main(xapian_directory):
+def main(xapian_directory: str) -> None:
     logging.basicConfig(level=os.environ.get("LOGLEVEL", "DEBUG"),
                         format='%(relativeCreated)s: %(levelname)s: %(message)s')
     pathlib.Path(xapian_directory).mkdir(exist_ok=True)
```
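One detail worth noting from the hunks above: `worker_queue()` and `temporary_directory()` are annotated `-> Generator` because a function decorated with `@contextlib.contextmanager` is written as a generator that yields exactly once. The sketch below restates `temporary_directory()` from the diff as a standalone snippet; the `"index-"` prefix and `/tmp` parent directory in the usage lines are assumed values, not taken from the script:

```python
# temporary_directory() as annotated in the diff: the undecorated
# function is a generator (it yields once), hence the "-> Generator" hint.
import contextlib
import pathlib
import tempfile
from typing import Generator


@contextlib.contextmanager
def temporary_directory(prefix: str, parent_directory: str) -> Generator:
    """Create temporary directory returning it as a PosixPath."""
    with tempfile.TemporaryDirectory(prefix=prefix, dir=parent_directory) as tmpdirname:
        yield pathlib.Path(tmpdirname)


# Assumed usage: prefix and parent directory are illustrative only.
with temporary_directory("index-", "/tmp") as tmpdir:
    print(tmpdir, tmpdir.is_dir())  # e.g. /tmp/index-abc123 True
```

A more precise annotation would be `Generator[pathlib.Path, None, None]`, but the commit keeps the bare `Generator`.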