aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-02-03 12:58:13 +0300
committer BonfaceKilz 2023-02-03 13:08:35 +0300
commit c7d242233403594d7a5422344a49a16c2c738222 (patch)
tree 216ce9923cc103e05790cf6d3ae4d304fb3a6018
parent ceaefbd3022044f13ba2d763598460b83bfa19ee (diff)
download genenetwork2-c7d242233403594d7a5422344a49a16c2c738222.tar.gz
Add perf scripts that compare fetching a trait using GN API vs LMDB
* scripts/performance/(README.org, timeit_gn2.py, timeit_lmdb.py): New files. Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rw-r--r-- scripts/performance/README.org 41
-rw-r--r-- scripts/performance/timeit_gn2.py 54
-rw-r--r-- scripts/performance/timeit_lmdb.py 36
3 files changed, 131 insertions, 0 deletions
diff --git a/scripts/performance/README.org b/scripts/performance/README.org
new file mode 100644
index 00000000..f7fb392f
--- /dev/null
+++ b/scripts/performance/README.org
@@ -0,0 +1,41 @@
+* Introduction
+
+This directory contains scripts that measure the performance of fetching a trait using the GN2 API vs. using LMDB. The examples below assume you have a wrapper script called ,runcmd:
+
+#+name: ,runcmd
+#+begin_src sh
+#!/bin/sh
+
+env GN2_PROFILE=$HOME/opt/genenetwork2 \
+ TMPDIR=$HOME/tmp SERVER_PORT=5004 \
+ WEBSERVER_MODE=DEV \
+ GENENETWORK_FILES=$HOME/data/genotype_files/ \
+ SPARQL_ENDPOINT=http://localhost:8892/sparql\
+ SQL_URI="mysql://root:root@localhost:3306/db_webqtl_s"\
+ GN_PROXY_URL=http://localhost:8080 \
+ GN_SERVER_URL=http://localhost:8083/api \
+ GN3_LOCAL_URL=http://localhost:8083 \
+ GN_LOCAL_URL=http://localhost:8083 \
+ $HOME/projects/oqo-genenetwork2/bin/genenetwork2 \
+ $HOME/projects/oqo-genenetwork2/etc/default_settings.py\
+ -cli $*
+#+end_src
+
+To time 10 fetches with each script, execute:
+
+: ,runcmd testing python $HOME/projects/oqo-genenetwork2/testing/scripts/performance/timeit_gn2.py 10
+
+: ,runcmd testing python $HOME/projects/oqo-genenetwork2/testing/scripts/performance/timeit_lmdb.py 10
+
+Here are some rudimentary results:
+
+Assuming you have already dumped "HLCPublish/10001" - which contains 476 strains - into an LMDB environment (timeit_lmdb.py reads it from the hard-coded PATH at the top of its timed statement), the time it takes to fetch "HLCPublish/10001" N times is:
+
+| Number | gn2 (seconds) | lmdb (seconds) | gn2/lmdb |
+|--------+--------------------+---------------------+-----------|
+| 10 | 0.5971280680023483 | 0.04270002100020065 | 13.984257 |
+| 50 | 3.6268229950001114 | 0.15371317300014198 | 23.594744 |
+| 100 | 5.885073402001581 | 0.3161755159999302 | 18.613312 |
+| 1_000 | 60.6393681030022 | 3.107457533998968 | 19.514142 |
+| 10_000 | 723.0237347940019 | 27.541215700002795 | 26.252426 |
+#+TBLFM: $4=$2/$3
diff --git a/scripts/performance/timeit_gn2.py b/scripts/performance/timeit_gn2.py
new file mode 100644
index 00000000..1c64e22c
--- /dev/null
+++ b/scripts/performance/timeit_gn2.py
@@ -0,0 +1,54 @@
+# Benchmark: time repeated fetches of one trait's sample data by constructing
+# show_trait.ShowTrait with a database cursor inside a Flask app context.
+#
+# Usage: python timeit_gn2.py N
+# N (sys.argv[1]) is how many times the timed statement is executed; the
+# value returned by timeit.timeit for those N executions is printed.
+import sys
+import timeit
+
+
+print(timeit.timeit(
+# Timed statement.  Everything in this string runs on every iteration: the
+# class and function definitions, the call to dump_sample_data for dataset
+# "HLCPublish" / trait "10001", and the print of its result.
+"""
+class UserSessionSimulator():
+ def __init__(self, user_id):
+ self._user_id = user_id
+
+ @property
+ def user_id(self):
+ return self._user_id
+
+
+def dump_sample_data(dataset_name, trait_id):
+ with database_connection() as conn, conn.cursor() as cursor:
+ sample_data = {"headers": ["Name", "Value", "SE"], "data": []}
+
+ with app.app_context():
+ g.user_session = UserSessionSimulator(None)
+ data = show_trait.ShowTrait(
+ cursor, user_id=None,
+ kw={
+ "trait_id": trait_id,
+ "dataset": dataset_name
+ }
+ )
+ attributes = data.js_data.get("attributes")
+ for id_ in attributes:
+ sample_data["headers"].append(attributes[id_].name)
+ for sample in data.js_data.get("sample_lists")[0]:
+ sample_data["data"].append(
+ [
+ sample.name,
+ sample.value or 'x',
+ sample.variance or 'x',
+ *[str(sample.extra_attributes.get(str(key), "x"))
+ for key in attributes],
+ ])
+ return sample_data
+
+print(dump_sample_data("HLCPublish", "10001"))
+""",
+# Setup statement: timeit executes it once before timing starts, so the
+# project imports below are not part of the measured time.
+ setup="""
+# Required Evils!
+from flask import g
+from wqflask import app
+
+from wqflask.database import database_connection
+from wqflask.show_trait import show_trait
+""",
+# Number of executions of the timed statement, taken from the command line.
+ number=int(sys.argv[1])
+))
diff --git a/scripts/performance/timeit_lmdb.py b/scripts/performance/timeit_lmdb.py
new file mode 100644
index 00000000..17514667
--- /dev/null
+++ b/scripts/performance/timeit_lmdb.py
@@ -0,0 +1,36 @@
+import sys
+import timeit
+
+print(timeit.timeit(
+"""
+PATH = "/home/munyoki/tmp/dataset/HLCPublish/10001/"
+env = lmdb.open(PATH)
+
+BLOB_HASH_DIGEST = 32
+
+# def index_matrix(row_pointers):
+
+with env.begin(write=False) as txn:
+ current_hash = txn.get(b"current")
+ matrix_hash = txn.get(current_hash + b":matrix")
+ row_pointers = txn.get(matrix_hash +
+ b":row-pointers")
+ nrows, = struct.unpack("<Q",
+ txn.get(matrix_hash + b":nrows"))
+ metadata = txn.get(matrix_hash + b":metadata")
+ sample_data = []
+ for i in range(0, (nrows-1)*32, 32):
+ sample_data.append(
+ json.loads(txn.get(row_pointers[i:i+32]).decode())
+ )
+ print(sample_data)
+ print(metadata.decode())
+""",
+ setup="""
+import struct
+import array
+import json
+import lmdb
+""",
+ number=int(sys.argv[1])
+))