diff options
author | Munyoki Kilyungi | 2023-02-03 12:58:13 +0300 |
---|---|---|
committer | BonfaceKilz | 2023-02-03 13:08:35 +0300 |
commit | c7d242233403594d7a5422344a49a16c2c738222 (patch) | |
tree | 216ce9923cc103e05790cf6d3ae4d304fb3a6018 | |
parent | ceaefbd3022044f13ba2d763598460b83bfa19ee (diff) | |
download | genenetwork2-c7d242233403594d7a5422344a49a16c2c738222.tar.gz |
Add perf scripts that compare fetching a trait using GN API vs LMDB
* scripts/performance/(README.org, timeit_gn2.py, timeit_lmdb.py): New
files.
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rw-r--r-- | scripts/performance/README.org | 41 | ||||
-rw-r--r-- | scripts/performance/timeit_gn2.py | 54 | ||||
-rw-r--r-- | scripts/performance/timeit_lmdb.py | 36 |
3 files changed, 131 insertions, 0 deletions
diff --git a/scripts/performance/README.org b/scripts/performance/README.org new file mode 100644 index 00000000..f7fb392f --- /dev/null +++ b/scripts/performance/README.org @@ -0,0 +1,41 @@ +* Introduction + +This directory contains scripts that are used to instrument the performance of fetching a trait using the GN2 API vs using LMDB. Assuming you have a script called ,runcmd: + +#+name: ,runcmd +#+begin_src sh +#!/bin/sh + +env GN2_PROFILE=$HOME/opt/genenetwork2 \ + TMPDIR=$HOME/tmp SERVER_PORT=5004 \ + WEBSERVER_MODE=DEV \ + GENENETWORK_FILES=$HOME/data/genotype_files/ \ + SPARQL_ENDPOINT=http://localhost:8892/sparql\ + SQL_URI="mysql://root:root@localhost:3306/db_webqtl_s"\ + GN_PROXY_URL=http://localhost:8080 \ + GN_SERVER_URL=http://localhost:8083/api \ + GN3_LOCAL_URL=http://localhost:8083 \ + GN_LOCAL_URL=http://localhost:8083 \ + $HOME/projects/oqo-genenetwork2/bin/genenetwork2 \ + $HOME/projects/oqo-genenetwork2/etc/default_settings.py\ + -cli $* +#+end_src + +To run the script 10 times, execute: + +: ,runcmd testing python $HOME/projects/oqo-genenetwork2/testing/scripts/performance/timeit_gn2.py 10 + +: ,runcmd testing python $HOME/projects/oqo-genenetwork2/testing/scripts/performance/timeit_lmdb.py 10 + +Here are some rudimentary results: + +Assuming you have already dumped "HLCPublish/10001" - which contains 476 strains - somewhere in LMDB, the time it takes to fetch "HLCPublish/10001" N times is: + +| Number | gn2 (seconds) | lmdb (seconds) | gn2/lmdb | +|--------+--------------------+---------------------+-----------| +| 10 | 0.5971280680023483 | 0.04270002100020065 | 13.984257 | +| 50 | 3.6268229950001114 | 0.15371317300014198 | 23.594744 | +| 100 | 5.885073402001581 | 0.3161755159999302 | 18.613312 | +| 1_000 | 60.6393681030022 | 3.107457533998968 | 19.514142 | +| 10_000 | 723.0237347940019 | 27.541215700002795 | 26.252426 | +#+TBLFM: $4=$2/$3 diff --git a/scripts/performance/timeit_gn2.py b/scripts/performance/timeit_gn2.py new file mode 100644 
index 00000000..1c64e22c --- /dev/null +++ b/scripts/performance/timeit_gn2.py @@ -0,0 +1,54 @@ +import sys +import timeit + + +print(timeit.timeit( +""" +class UserSessionSimulator(): + def __init__(self, user_id): + self._user_id = user_id + + @property + def user_id(self): + return self._user_id + + +def dump_sample_data(dataset_name, trait_id): + with database_connection() as conn, conn.cursor() as cursor: + sample_data = {"headers": ["Name", "Value", "SE"], "data": []} + + with app.app_context(): + g.user_session = UserSessionSimulator(None) + data = show_trait.ShowTrait( + cursor, user_id=None, + kw={ + "trait_id": trait_id, + "dataset": dataset_name + } + ) + attributes = data.js_data.get("attributes") + for id_ in attributes: + sample_data["headers"].append(attributes[id_].name) + for sample in data.js_data.get("sample_lists")[0]: + sample_data["data"].append( + [ + sample.name, + sample.value or 'x', + sample.variance or 'x', + *[str(sample.extra_attributes.get(str(key), "x")) + for key in attributes], + ]) + return sample_data + +print(dump_sample_data("HLCPublish", "10001")) +""", + setup=""" +# Required Evils! 
+from flask import g +from wqflask import app + +from wqflask.database import database_connection +from wqflask.show_trait import show_trait +""", + number=int(sys.argv[1]) +)) diff --git a/scripts/performance/timeit_lmdb.py b/scripts/performance/timeit_lmdb.py new file mode 100644 index 00000000..17514667 --- /dev/null +++ b/scripts/performance/timeit_lmdb.py @@ -0,0 +1,36 @@ +import sys +import timeit + +print(timeit.timeit( +""" +PATH = "/home/munyoki/tmp/dataset/HLCPublish/10001/" +env = lmdb.open(PATH) + +BLOB_HASH_DIGEST = 32 + +# def index_matrix(row_pointers): + +with env.begin(write=False) as txn: + current_hash = txn.get(b"current") + matrix_hash = txn.get(current_hash + b":matrix") + row_pointers = txn.get(matrix_hash + + b":row-pointers") + nrows, = struct.unpack("<Q", + txn.get(matrix_hash + b":nrows")) + metadata = txn.get(matrix_hash + b":metadata") + sample_data = [] + for i in range(0, (nrows-1)*32, 32): + sample_data.append( + json.loads(txn.get(row_pointers[i:i+32]).decode()) + ) + print(sample_data) + print(metadata.decode()) +""", + setup=""" +import struct +import array +import json +import lmdb +""", + number=int(sys.argv[1]) +)) |