Add indexing script for global search.

* wqflask/scripts/index.py: New file.
author: Arun Isaac 2022-09-21 13:41:15 +0530
committer: Arun Isaac 2022-09-26 13:39:45 +0530
commit: 73327a58707d7321e2d4c616da442e598ed13c57 (patch)
tree: 5c138264bd4ee5bf5d210dbbaefcb449dad312ab /wqflask
parent: 66a6593daf4580f3056c96ccfc658b556ea619fc (diff)
download: genenetwork2-73327a58707d7321e2d4c616da442e598ed13c57.tar.gz
1 files changed, 147 insertions, 0 deletions
diff --git a/wqflask/scripts/index.py b/wqflask/scripts/index.py
new file mode 100644
index 00000000..fe2b819c
--- /dev/null
+++ b/wqflask/scripts/index.py
@@ -0,0 +1,147 @@
+"""This script must be run each time the database is updated. It runs
+queries against the SQL database, indexes the results and builds a
+xapian index. This xapian index is later used in providing search
+through the web interface.
+
+"""
+
+from functools import partial
+import json
+import xapian
+
+from utility.monads import MonadicDictCursor
+from wqflask.database import database_connection, xapian_writable_database
+
+
+def index_text(termgenerator, text):
+    """Index text and increase term position."""
+    termgenerator.index_text(text)
+    termgenerator.increase_termpos()
+
+
+# pylint: disable=missing-function-docstring
+def main():
+    with database_connection() as conn, conn.cursor(MonadicDictCursor) as cursor:
+        # FIXME: Some Max LRS values in the DB are wrongly listed as
+        # 0.000, but shouldn't be displayed. Make them NULLs in the
+        # database.
+        cursor.execute("""
+        SELECT ProbeSet.Name AS name,
+               ProbeSet.Symbol AS symbol,
+               ProbeSet.description AS description,
+               ProbeSet.Chr AS chr,
+               ProbeSet.Mb AS mb,
+               ProbeSet.alias AS alias,
+               ProbeSet.GenbankId AS genbankid,
+               ProbeSet.UniGeneId AS unigeneid,
+               ProbeSet.Probe_Target_Description AS probe_target_description,
+               ProbeSetFreeze.Name AS dataset,
+               ProbeSetFreeze.FullName AS dataset_fullname,
+               ProbeSetFreeze.Id AS dataset_id,
+               Species.Name AS species,
+               InbredSet.Name AS `group`,
+               Tissue.Name AS tissue,
+               ProbeSetXRef.Mean AS mean,
+               ProbeSetXRef.LRS AS lrs,
+               ProbeSetXRef.additive AS additive,
+               Geno.Chr as geno_chr,
+               Geno.Mb as geno_mb
+        FROM Species
+             INNER JOIN InbredSet ON InbredSet.SpeciesId = Species.Id
+             INNER JOIN ProbeFreeze ON ProbeFreeze.InbredSetId = InbredSet.Id
+             INNER JOIN Tissue ON ProbeFreeze.TissueId = Tissue.Id
+             INNER JOIN ProbeSetFreeze ON ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id
+             INNER JOIN ProbeSetXRef ON ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
+             INNER JOIN ProbeSet ON ProbeSet.Id = ProbeSetXRef.ProbeSetId
+             LEFT JOIN Geno ON ProbeSetXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id
+        WHERE ProbeSetFreeze.confidentiality < 1 AND ProbeSetFreeze.public > 0
+        """)
+        termgenerator = xapian.TermGenerator()
+        termgenerator.set_stemmer(xapian.Stem("en"))
+        indexer = partial(index_text, termgenerator)
+
+        # pylint: disable=invalid-name
+        with xapian_writable_database() as db:
+            for trait in cursor.fetchall():
+                doc = xapian.Document()
+                termgenerator.set_document(doc)
+
+                # Index text.
+                trait["name"].bind(indexer)
+                trait["description"].bind(indexer)
+                trait["symbol"].bind(indexer)
+                trait.pop("alias").bind(indexer)
+                trait.pop("genbankid").bind(indexer)
+                trait.pop("unigeneid").bind(indexer)
+                trait.pop("probe_target_description").bind(indexer)
+
+                # Identify document as type "gene". We use the XT
+                # prefix to indicate the type.
+                doc.add_boolean_term("XTgene")
+                doc.set_data(json.dumps(trait.data))
+                # Write document into xapian database.
+                idterm = trait["name"].bind(lambda name: "Q" + name)
+                doc.add_boolean_term(idterm)
+                db.replace_document(idterm, doc)
+
+        cursor.execute("""
+        SELECT Species.Name AS species,
+               InbredSet.Name AS `group`,
+               PublishFreeze.Name AS dataset,
+               PublishFreeze.FullName AS dataset_fullname,
+               PublishXRef.Id AS name,
+               COALESCE(Phenotype.Post_publication_abbreviation, Phenotype.Pre_publication_abbreviation) AS abbreviation,
+               COALESCE(Phenotype.Post_publication_description, Phenotype.Pre_publication_description) AS description,
+               Phenotype.Lab_code,
+               Publication.Abstract,
+               Publication.Title,
+               Publication.Authors AS authors,
+               Publication.Year AS year,
+               Publication.PubMed_ID AS pubmed_id,
+               PublishXRef.LRS as lrs,
+               PublishXRef.additive,
+               InbredSet.InbredSetCode AS inbredsetcode,
+               PublishXRef.mean,
+               PublishFreeze.Id AS dataset_id,
+               Geno.Chr as geno_chr,
+               Geno.Mb as geno_mb
+        FROM Species
+             INNER JOIN InbredSet ON InbredSet.SpeciesId = Species.Id
+             INNER JOIN PublishFreeze ON PublishFreeze.InbredSetId = InbredSet.Id
+             INNER JOIN PublishXRef ON PublishXRef.InbredSetId = InbredSet.Id
+             INNER JOIN Phenotype ON PublishXRef.PhenotypeId = Phenotype.Id
+             INNER JOIN Publication ON PublishXRef.PublicationId = Publication.Id
+             LEFT JOIN Geno ON PublishXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id
+        """)
+        with xapian_writable_database() as db:
+            for i, trait in enumerate(cursor.fetchall()):
+                doc = xapian.Document()
+                termgenerator.set_document(doc)
+
+                # Index text.
+                trait.pop("abbreviation").bind(indexer)
+                trait["description"].bind(indexer)
+                trait.pop("Lab_code").bind(indexer)
+                trait.pop("Abstract").bind(indexer)
+                trait.pop("Title").bind(indexer)
+                trait["authors"].bind(indexer)
+                trait["inbredsetcode"].bind(indexer)
+
+                # Convert name from integer to string.
+                trait["name"] = trait["name"].map(str)
+                # Split comma-separated authors into a list.
+                trait["authors"] = trait["authors"].map(
+                    lambda s: [author.strip() for author in s.split(",")])
+
+                # Identify document as type "phenotype". We use the XT
+                # prefix to indicate the type.
+                doc.add_boolean_term("XTphenotype")
+                # Write document into xapian database.
+                doc.set_data(json.dumps(trait.data))
+                idterm = f"Q{i}"
+                doc.add_boolean_term(idterm)
+                db.replace_document(idterm, doc)
+
+
+if __name__ == "__main__":
+    main()
author	Arun Isaac	2022-09-21 13:41:15 +0530
committer	Arun Isaac	2022-09-26 13:39:45 +0530
commit	73327a58707d7321e2d4c616da442e598ed13c57 (patch)
tree	5c138264bd4ee5bf5d210dbbaefcb449dad312ab /wqflask
parent	66a6593daf4580f3056c96ccfc658b556ea619fc (diff)
download	genenetwork2-73327a58707d7321e2d4c616da442e598ed13c57.tar.gz