diff options
Diffstat (limited to 'wqflask/scripts/index.py')
-rw-r--r-- | wqflask/scripts/index.py | 190 |
1 files changed, 0 insertions, 190 deletions
diff --git a/wqflask/scripts/index.py b/wqflask/scripts/index.py deleted file mode 100644 index 14e93dd1..00000000 --- a/wqflask/scripts/index.py +++ /dev/null @@ -1,190 +0,0 @@ -"""This script must be run each time the database is updated. It runs -queries against the SQL database, indexes the results and builds a -xapian index. This xapian index is later used in providing search -through the web interface. - -""" - -from functools import partial -import json -import xapian - -from utility.monads import sql_query_mdict -from wqflask.database import database_connection, xapian_writable_database - - -def index_text(termgenerator, text): - """Index text and increase term position.""" - termgenerator.index_text(text) - termgenerator.increase_termpos() - - -# pylint: disable=invalid-name -def write_document(db, idterm, doctype, doc): - """Write document into xapian database.""" - # We use the XT prefix to indicate the type. - doc.add_boolean_term(f"XT{doctype}") - doc.add_boolean_term(idterm) - db.replace_document(idterm, doc) - - -# pylint: disable=missing-function-docstring -def main(): - termgenerator = xapian.TermGenerator() - termgenerator.set_stemmer(xapian.Stem("en")) - - indexer = partial(index_text, termgenerator) - - authors_indexer = lambda text: termgenerator.index_text(text, 1, "A") - species_indexer = lambda text: termgenerator.index_text(text, 1, "XS") - group_indexer = lambda text: termgenerator.index_text(text, 1, "XG") - tissue_indexer = lambda text: termgenerator.index_text(text, 1, "XI") - description_indexer = lambda text: termgenerator.index_text(text, 1, "XD") - dataset_indexer = lambda text: termgenerator.index_text(text, 1, "XDS") - symbol_indexer = lambda text: termgenerator.index_text(text, 1, "XY") - chr_indexer = lambda text: termgenerator.index_text(text, 0, "XC") - peakchr_indexer = lambda text: termgenerator.index_text(text, 0, "XPC") - - mean_adder = lambda mean: doc.add_value(0, xapian.sortable_serialise(mean)) - peak_adder = lambda peak: doc.add_value(1, xapian.sortable_serialise(peak)) - mb_adder = lambda mb: doc.add_value(2, xapian.sortable_serialise(mb)) - peakmb_adder = lambda peakmb: doc.add_value(3, xapian.sortable_serialise(peakmb)) - additive_adder = lambda additive: doc.add_value(4, xapian.sortable_serialise(additive)) - year_adder = lambda year: doc.add_value(5, xapian.sortable_serialise(float(year))) - - # FIXME: Some Max LRS values in the DB are wrongly listed as - # 0.000, but shouldn't be displayed. Make them NULLs in the - # database. - # pylint: disable=invalid-name - with xapian_writable_database() as db: - with database_connection() as conn: - for trait in sql_query_mdict(conn, """ - SELECT ProbeSet.Name AS name, - ProbeSet.Symbol AS symbol, - ProbeSet.description AS description, - ProbeSet.Chr AS chr, - ProbeSet.Mb AS mb, - ProbeSet.alias AS alias, - ProbeSet.GenbankId AS genbankid, - ProbeSet.UniGeneId AS unigeneid, - ProbeSet.Probe_Target_Description AS probe_target_description, - ProbeSetFreeze.Name AS dataset, - ProbeSetFreeze.FullName AS dataset_fullname, - ProbeSetFreeze.Id AS dataset_id, - Species.Name AS species, - InbredSet.Name AS `group`, - Tissue.Name AS tissue, - ProbeSetXRef.Mean AS mean, - ProbeSetXRef.LRS AS lrs, - ProbeSetXRef.additive AS additive, - Geno.Chr as geno_chr, - Geno.Mb as geno_mb - FROM Species - INNER JOIN InbredSet ON InbredSet.SpeciesId = Species.Id - INNER JOIN ProbeFreeze ON ProbeFreeze.InbredSetId = InbredSet.Id - INNER JOIN Tissue ON ProbeFreeze.TissueId = Tissue.Id - INNER JOIN ProbeSetFreeze ON ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id - INNER JOIN ProbeSetXRef ON ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id - INNER JOIN ProbeSet ON ProbeSet.Id = ProbeSetXRef.ProbeSetId - LEFT JOIN Geno ON ProbeSetXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id - WHERE ProbeSetFreeze.confidentiality < 1 AND ProbeSetFreeze.public > 0 - """): - doc = xapian.Document() - termgenerator.set_document(doc) - - # Add values. - trait["mean"].bind(mean_adder) - trait["lrs"].bind(peak_adder) - trait["mb"].bind(mb_adder) - trait["geno_mb"].bind(peakmb_adder) - trait["additive"].bind(additive_adder) - - # Index text. - trait["name"].bind(indexer) - trait["description"].bind(indexer) - trait["symbol"].bind(indexer) - trait.pop("alias").bind(indexer) - trait.pop("genbankid").bind(indexer) - trait.pop("unigeneid").bind(indexer) - trait.pop("probe_target_description").bind(indexer) - trait["species"].bind(species_indexer) - trait["group"].bind(group_indexer) - trait["tissue"].bind(tissue_indexer) - trait["description"].bind(description_indexer) - trait["dataset"].bind(dataset_indexer) - trait["symbol"].bind(symbol_indexer) - trait["chr"].bind(chr_indexer) - trait["geno_chr"].bind(peakchr_indexer) - - doc.set_data(json.dumps(trait.data)) - write_document(db, trait["name"].bind(lambda name: f"Q{name}"), "gene", doc) - - with database_connection() as conn: - # FIXME: Some years are blank strings or strings that - # contain text other than the year. These should be fixed - # in the database and the year field must be made an integer. - for i, trait in enumerate(sql_query_mdict(conn, """ - SELECT Species.Name AS species, - InbredSet.Name AS `group`, - PublishFreeze.Name AS dataset, - PublishFreeze.FullName AS dataset_fullname, - PublishXRef.Id AS name, - COALESCE(Phenotype.Post_publication_abbreviation, Phenotype.Pre_publication_abbreviation) AS abbreviation, - COALESCE(Phenotype.Post_publication_description, Phenotype.Pre_publication_description) AS description, - Phenotype.Lab_code, - Publication.Abstract, - Publication.Title, - Publication.Authors AS authors, - IF(CONVERT(Publication.Year, UNSIGNED)=0, NULL, CONVERT(Publication.Year, UNSIGNED)) AS year, - Publication.PubMed_ID AS pubmed_id, - PublishXRef.LRS as lrs, - PublishXRef.additive, - InbredSet.InbredSetCode AS inbredsetcode, - PublishXRef.mean, - PublishFreeze.Id AS dataset_id, - Geno.Chr as geno_chr, - Geno.Mb as geno_mb - FROM Species - INNER JOIN InbredSet ON InbredSet.SpeciesId = Species.Id - INNER JOIN PublishFreeze ON PublishFreeze.InbredSetId = InbredSet.Id - INNER JOIN PublishXRef ON PublishXRef.InbredSetId = InbredSet.Id - INNER JOIN Phenotype ON PublishXRef.PhenotypeId = Phenotype.Id - INNER JOIN Publication ON PublishXRef.PublicationId = Publication.Id - LEFT JOIN Geno ON PublishXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id - """)): - doc = xapian.Document() - termgenerator.set_document(doc) - - # Add values. - trait["mean"].bind(mean_adder) - trait["lrs"].bind(peak_adder) - trait["geno_mb"].bind(peakmb_adder) - trait["additive"].bind(additive_adder) - trait["year"].bind(year_adder) - - # Index text. - trait.pop("abbreviation").bind(indexer) - trait["description"].bind(indexer) - trait.pop("Lab_code").bind(indexer) - trait.pop("Abstract").bind(indexer) - trait.pop("Title").bind(indexer) - trait["authors"].bind(indexer) - trait["inbredsetcode"].bind(indexer) - trait["species"].bind(species_indexer) - trait["group"].bind(group_indexer) - trait["description"].bind(description_indexer) - trait["authors"].bind(authors_indexer) - trait["geno_chr"].bind(peakchr_indexer) - - # Convert name from integer to string. - trait["name"] = trait["name"].map(str) - # Split comma-separated authors into a list. - trait["authors"] = trait["authors"].map( - lambda s: [author.strip() for author in s.split(",")]) - - doc.set_data(json.dumps(trait.data)) - write_document(db, f"Q{i}", "phenotype", doc) - - -if __name__ == "__main__": - main() |