about summary refs log tree commit diff
diff options
context:
space:
mode:
authorMunyoki Kilyungi2023-02-03 12:58:13 +0300
committerBonfaceKilz2023-02-03 13:08:35 +0300
commitc7d242233403594d7a5422344a49a16c2c738222 (patch)
tree216ce9923cc103e05790cf6d3ae4d304fb3a6018
parentceaefbd3022044f13ba2d763598460b83bfa19ee (diff)
downloadgenenetwork2-c7d242233403594d7a5422344a49a16c2c738222.tar.gz
Add perf scripts that compare fetching a trait using GN API vs LMDB
* scripts/performance/(README.org, timeit_gn2.py, timeit_lmdb.py): New
files.

Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rw-r--r--scripts/performance/README.org41
-rw-r--r--scripts/performance/timeit_gn2.py54
-rw-r--r--scripts/performance/timeit_lmdb.py36
3 files changed, 131 insertions, 0 deletions
diff --git a/scripts/performance/README.org b/scripts/performance/README.org
new file mode 100644
index 00000000..f7fb392f
--- /dev/null
+++ b/scripts/performance/README.org
@@ -0,0 +1,41 @@
+* Introduction
+
+This directory contains scripts that are used to instrument the performance of fetching a trait using the GN2 API vs using LMDB.  Assuming you have a script called ,runcmd:
+
+#+name: ,runcmd
+#+begin_src sh
+#!/bin/sh
+
+env GN2_PROFILE=$HOME/opt/genenetwork2 \
+    TMPDIR=$HOME/tmp SERVER_PORT=5004 \
+    WEBSERVER_MODE=DEV \
+    GENENETWORK_FILES=$HOME/data/genotype_files/ \
+    SPARQL_ENDPOINT=http://localhost:8892/sparql \
+    SQL_URI="mysql://root:root@localhost:3306/db_webqtl_s" \
+    GN_PROXY_URL=http://localhost:8080 \
+    GN_SERVER_URL=http://localhost:8083/api \
+    GN3_LOCAL_URL=http://localhost:8083 \
+    GN_LOCAL_URL=http://localhost:8083 \
+    $HOME/projects/oqo-genenetwork2/bin/genenetwork2 \
+    $HOME/projects/oqo-genenetwork2/etc/default_settings.py \
+    -cli "$@"
+#+end_src
+
+To run the script 10 times, execute:
+
+: ,runcmd testing python $HOME/projects/oqo-genenetwork2/testing/scripts/performance/timeit_gn2.py 10
+
+: ,runcmd testing python $HOME/projects/oqo-genenetwork2/testing/scripts/performance/timeit_lmdb.py 10
+
+Here are some rudimentary results:
+
+Assuming you have already dumped "HLCPublish/10001" - which contains 476 strains - somewhere in LMDB, the time it takes to fetch "HLCPublish/10001" N times is:
+
+| Number |      gn2 (seconds) |      lmdb (seconds) |  gn2/lmdb |
+|--------+--------------------+---------------------+-----------|
+|     10 | 0.5971280680023483 | 0.04270002100020065 | 13.984257 |
+|     50 | 3.6268229950001114 | 0.15371317300014198 | 23.594744 |
+|    100 |  5.885073402001581 |  0.3161755159999302 | 18.613312 |
+|  1_000 |   60.6393681030022 |   3.107457533998968 | 19.514142 |
+| 10_000 |  723.0237347940019 |  27.541215700002795 | 26.252426 |
+#+TBLFM: $4=$2/$3
diff --git a/scripts/performance/timeit_gn2.py b/scripts/performance/timeit_gn2.py
new file mode 100644
index 00000000..1c64e22c
--- /dev/null
+++ b/scripts/performance/timeit_gn2.py
@@ -0,0 +1,54 @@
+import sys
+import timeit
+
+
+print(timeit.timeit(
+"""
+class UserSessionSimulator():
+    def __init__(self, user_id):
+        self._user_id = user_id
+
+    @property
+    def user_id(self):
+        return self._user_id
+
+
+def dump_sample_data(dataset_name, trait_id):
+    with database_connection() as conn, conn.cursor() as cursor:
+        sample_data = {"headers": ["Name", "Value", "SE"], "data": []}
+
+        with app.app_context():
+            g.user_session = UserSessionSimulator(None)
+            data = show_trait.ShowTrait(
+                cursor, user_id=None,
+                kw={
+                    "trait_id": trait_id,
+                    "dataset": dataset_name
+                }
+            )
+            attributes = data.js_data.get("attributes")
+            for id_ in attributes:
+                sample_data["headers"].append(attributes[id_].name)
+            for sample in data.js_data.get("sample_lists")[0]:
+                sample_data["data"].append(
+                    [
+                        sample.name,
+                        sample.value or 'x',
+                        sample.variance or 'x',
+                        *[str(sample.extra_attributes.get(str(key), "x"))
+                          for key in attributes],
+                    ])
+            return sample_data
+
+print(dump_sample_data("HLCPublish", "10001"))
+""",
+    setup="""
+# Required Evils!
+from flask import g
+from wqflask import app
+
+from wqflask.database import database_connection
+from wqflask.show_trait import show_trait
+""",
+    number=int(sys.argv[1])
+))
diff --git a/scripts/performance/timeit_lmdb.py b/scripts/performance/timeit_lmdb.py
new file mode 100644
index 00000000..17514667
--- /dev/null
+++ b/scripts/performance/timeit_lmdb.py
@@ -0,0 +1,36 @@
+import sys
+import timeit
+
+print(timeit.timeit(
+"""
+PATH = "/home/munyoki/tmp/dataset/HLCPublish/10001/"
+env = lmdb.open(PATH)
+
+BLOB_HASH_DIGEST = 32
+
+# def index_matrix(row_pointers):
+
+with env.begin(write=False) as txn:
+    current_hash = txn.get(b"current")
+    matrix_hash = txn.get(current_hash + b":matrix")
+    row_pointers = txn.get(matrix_hash +
+                           b":row-pointers")
+    nrows, = struct.unpack("<Q",
+                           txn.get(matrix_hash + b":nrows"))
+    metadata = txn.get(matrix_hash + b":metadata")
+    sample_data = []
+    for i in range(0, (nrows-1)*32, 32):
+        sample_data.append(
+            json.loads(txn.get(row_pointers[i:i+32]).decode())
+        )
+    print(sample_data)
+    print(metadata.decode())
+""",
+    setup="""
+import struct
+import array
+import json
+import lmdb
+""",
+    number=int(sys.argv[1])
+))