aboutsummaryrefslogtreecommitdiff
path: root/wqflask/scripts/index.py
diff options
context:
space:
mode:
authorArun Isaac2022-10-28 13:30:58 +0530
committerArun Isaac2022-10-28 13:30:58 +0530
commit38e7b62c2e94fe4ae1dbaa91349b7b36792685ac (patch)
treed7a0a56f3c8deac2290978bd0f56824f6a989d1d /wqflask/scripts/index.py
parent5933321ff4077900a8d4ee7fa1588d8a4e52d0e5 (diff)
downloadgenenetwork2-38e7b62c2e94fe4ae1dbaa91349b7b36792685ac.tar.gz
Remove xapian indexing script.
The xapian indexing script has been moved to genenetwork3. All further development will happen there. * wqflask/scripts/index.py: Delete file.
Diffstat (limited to 'wqflask/scripts/index.py')
-rw-r--r--wqflask/scripts/index.py190
1 files changed, 0 insertions, 190 deletions
diff --git a/wqflask/scripts/index.py b/wqflask/scripts/index.py
deleted file mode 100644
index 14e93dd1..00000000
--- a/wqflask/scripts/index.py
+++ /dev/null
@@ -1,190 +0,0 @@
-"""This script must be run each time the database is updated. It runs
-queries against the SQL database, indexes the results and builds a
-xapian index. This xapian index is later used in providing search
-through the web interface.
-
-"""
-
-from functools import partial
-import json
-import xapian
-
-from utility.monads import sql_query_mdict
-from wqflask.database import database_connection, xapian_writable_database
-
-
-def index_text(termgenerator, text):
- """Index text and increase term position."""
- termgenerator.index_text(text)
- termgenerator.increase_termpos()
-
-
-# pylint: disable=invalid-name
-def write_document(db, idterm, doctype, doc):
- """Write document into xapian database."""
- # We use the XT prefix to indicate the type.
- doc.add_boolean_term(f"XT{doctype}")
- doc.add_boolean_term(idterm)
- db.replace_document(idterm, doc)
-
-
-# pylint: disable=missing-function-docstring
-def main():
- termgenerator = xapian.TermGenerator()
- termgenerator.set_stemmer(xapian.Stem("en"))
-
- indexer = partial(index_text, termgenerator)
-
- authors_indexer = lambda text: termgenerator.index_text(text, 1, "A")
- species_indexer = lambda text: termgenerator.index_text(text, 1, "XS")
- group_indexer = lambda text: termgenerator.index_text(text, 1, "XG")
- tissue_indexer = lambda text: termgenerator.index_text(text, 1, "XI")
- description_indexer = lambda text: termgenerator.index_text(text, 1, "XD")
- dataset_indexer = lambda text: termgenerator.index_text(text, 1, "XDS")
- symbol_indexer = lambda text: termgenerator.index_text(text, 1, "XY")
- chr_indexer = lambda text: termgenerator.index_text(text, 0, "XC")
- peakchr_indexer = lambda text: termgenerator.index_text(text, 0, "XPC")
-
- mean_adder = lambda mean: doc.add_value(0, xapian.sortable_serialise(mean))
- peak_adder = lambda peak: doc.add_value(1, xapian.sortable_serialise(peak))
- mb_adder = lambda mb: doc.add_value(2, xapian.sortable_serialise(mb))
- peakmb_adder = lambda peakmb: doc.add_value(3, xapian.sortable_serialise(peakmb))
- additive_adder = lambda additive: doc.add_value(4, xapian.sortable_serialise(additive))
- year_adder = lambda year: doc.add_value(5, xapian.sortable_serialise(float(year)))
-
- # FIXME: Some Max LRS values in the DB are wrongly listed as
- # 0.000, but shouldn't be displayed. Make them NULLs in the
- # database.
- # pylint: disable=invalid-name
- with xapian_writable_database() as db:
- with database_connection() as conn:
- for trait in sql_query_mdict(conn, """
- SELECT ProbeSet.Name AS name,
- ProbeSet.Symbol AS symbol,
- ProbeSet.description AS description,
- ProbeSet.Chr AS chr,
- ProbeSet.Mb AS mb,
- ProbeSet.alias AS alias,
- ProbeSet.GenbankId AS genbankid,
- ProbeSet.UniGeneId AS unigeneid,
- ProbeSet.Probe_Target_Description AS probe_target_description,
- ProbeSetFreeze.Name AS dataset,
- ProbeSetFreeze.FullName AS dataset_fullname,
- ProbeSetFreeze.Id AS dataset_id,
- Species.Name AS species,
- InbredSet.Name AS `group`,
- Tissue.Name AS tissue,
- ProbeSetXRef.Mean AS mean,
- ProbeSetXRef.LRS AS lrs,
- ProbeSetXRef.additive AS additive,
- Geno.Chr as geno_chr,
- Geno.Mb as geno_mb
- FROM Species
- INNER JOIN InbredSet ON InbredSet.SpeciesId = Species.Id
- INNER JOIN ProbeFreeze ON ProbeFreeze.InbredSetId = InbredSet.Id
- INNER JOIN Tissue ON ProbeFreeze.TissueId = Tissue.Id
- INNER JOIN ProbeSetFreeze ON ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id
- INNER JOIN ProbeSetXRef ON ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
- INNER JOIN ProbeSet ON ProbeSet.Id = ProbeSetXRef.ProbeSetId
- LEFT JOIN Geno ON ProbeSetXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id
- WHERE ProbeSetFreeze.confidentiality < 1 AND ProbeSetFreeze.public > 0
- """):
- doc = xapian.Document()
- termgenerator.set_document(doc)
-
- # Add values.
- trait["mean"].bind(mean_adder)
- trait["lrs"].bind(peak_adder)
- trait["mb"].bind(mb_adder)
- trait["geno_mb"].bind(peakmb_adder)
- trait["additive"].bind(additive_adder)
-
- # Index text.
- trait["name"].bind(indexer)
- trait["description"].bind(indexer)
- trait["symbol"].bind(indexer)
- trait.pop("alias").bind(indexer)
- trait.pop("genbankid").bind(indexer)
- trait.pop("unigeneid").bind(indexer)
- trait.pop("probe_target_description").bind(indexer)
- trait["species"].bind(species_indexer)
- trait["group"].bind(group_indexer)
- trait["tissue"].bind(tissue_indexer)
- trait["description"].bind(description_indexer)
- trait["dataset"].bind(dataset_indexer)
- trait["symbol"].bind(symbol_indexer)
- trait["chr"].bind(chr_indexer)
- trait["geno_chr"].bind(peakchr_indexer)
-
- doc.set_data(json.dumps(trait.data))
- write_document(db, trait["name"].bind(lambda name: f"Q{name}"), "gene", doc)
-
- with database_connection() as conn:
- # FIXME: Some years are blank strings or strings that
- # contain text other than the year. These should be fixed
- # in the database and the year field must be made an integer.
- for i, trait in enumerate(sql_query_mdict(conn, """
- SELECT Species.Name AS species,
- InbredSet.Name AS `group`,
- PublishFreeze.Name AS dataset,
- PublishFreeze.FullName AS dataset_fullname,
- PublishXRef.Id AS name,
- COALESCE(Phenotype.Post_publication_abbreviation, Phenotype.Pre_publication_abbreviation) AS abbreviation,
- COALESCE(Phenotype.Post_publication_description, Phenotype.Pre_publication_description) AS description,
- Phenotype.Lab_code,
- Publication.Abstract,
- Publication.Title,
- Publication.Authors AS authors,
- IF(CONVERT(Publication.Year, UNSIGNED)=0, NULL, CONVERT(Publication.Year, UNSIGNED)) AS year,
- Publication.PubMed_ID AS pubmed_id,
- PublishXRef.LRS as lrs,
- PublishXRef.additive,
- InbredSet.InbredSetCode AS inbredsetcode,
- PublishXRef.mean,
- PublishFreeze.Id AS dataset_id,
- Geno.Chr as geno_chr,
- Geno.Mb as geno_mb
- FROM Species
- INNER JOIN InbredSet ON InbredSet.SpeciesId = Species.Id
- INNER JOIN PublishFreeze ON PublishFreeze.InbredSetId = InbredSet.Id
- INNER JOIN PublishXRef ON PublishXRef.InbredSetId = InbredSet.Id
- INNER JOIN Phenotype ON PublishXRef.PhenotypeId = Phenotype.Id
- INNER JOIN Publication ON PublishXRef.PublicationId = Publication.Id
- LEFT JOIN Geno ON PublishXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id
- """)):
- doc = xapian.Document()
- termgenerator.set_document(doc)
-
- # Add values.
- trait["mean"].bind(mean_adder)
- trait["lrs"].bind(peak_adder)
- trait["geno_mb"].bind(peakmb_adder)
- trait["additive"].bind(additive_adder)
- trait["year"].bind(year_adder)
-
- # Index text.
- trait.pop("abbreviation").bind(indexer)
- trait["description"].bind(indexer)
- trait.pop("Lab_code").bind(indexer)
- trait.pop("Abstract").bind(indexer)
- trait.pop("Title").bind(indexer)
- trait["authors"].bind(indexer)
- trait["inbredsetcode"].bind(indexer)
- trait["species"].bind(species_indexer)
- trait["group"].bind(group_indexer)
- trait["description"].bind(description_indexer)
- trait["authors"].bind(authors_indexer)
- trait["geno_chr"].bind(peakchr_indexer)
-
- # Convert name from integer to string.
- trait["name"] = trait["name"].map(str)
- # Split comma-separated authors into a list.
- trait["authors"] = trait["authors"].map(
- lambda s: [author.strip() for author in s.split(",")])
-
- doc.set_data(json.dumps(trait.data))
- write_document(db, f"Q{i}", "phenotype", doc)
-
-
-if __name__ == "__main__":
- main()