diff options
author | Arun Isaac | 2022-06-17 14:07:26 +0530 |
---|---|---|
committer | Arun Isaac | 2022-06-17 14:07:26 +0530 |
commit | b480e1e4f98bfab66168811201eb541f965be554 (patch) | |
tree | fc7c03620cc23f61bd48db2249612c2ace6e830f /gn3/genodb.py | |
parent | c2a6570a3dc2a9ae092b6a8273e7c6c0904dc1e1 (diff) | |
download | genenetwork3-b480e1e4f98bfab66168811201eb541f965be554.tar.gz |
gn3: genodb: Read optimized storage for the current matrix.
The genotype database now stores the current version of the matrix alone in a
read-optimized form, while storing the older versions of the matrix in a more
compressed form. We are only interested in the current version of the
matrix, so always use the read-optimized storage.
* gn3/genodb.py (Matrix)[row_pointers, column_pointers]: Delete fields.
[array, transpose]: New fields.
* gn3/genodb.py (matrix, row, column): Read from read-optimized storage.
(vector_ref): Delete function.
Diffstat (limited to 'gn3/genodb.py')
-rw-r--r-- | gn3/genodb.py | 22 |
1 files changed, 9 insertions, 13 deletions
diff --git a/gn3/genodb.py b/gn3/genodb.py
index 8827223..ba668d2 100644
--- a/gn3/genodb.py
+++ b/gn3/genodb.py
@@ -27,7 +27,7 @@ import numpy as np
 # pylint: disable=invalid-name,redefined-builtin
 
 GenotypeDatabase = namedtuple('GenotypeDatabase', 'txn hash_length')
-Matrix = namedtuple('Matrix', 'db nrows ncols row_pointers column_pointers')
+Matrix = namedtuple('Matrix', 'db nrows ncols array transpose')
 
 @contextmanager
 def open(path):
@@ -48,26 +48,22 @@ def get_metadata(db, hash, metadata):
 
 def matrix(db):
     '''Get current matrix from genotype database.'''
-    hash = get(db, b'current')[0:db.hash_length]
+    hash = get(db, b'versions')[0:db.hash_length]
+    read_optimized_blob = get(db, get(db, b'current'))
     nrows = int.from_bytes(get_metadata(db, hash, 'nrows'), byteorder='little')
     ncols = int.from_bytes(get_metadata(db, hash, 'ncols'), byteorder='little')
-    row_column_pointers = get(db, hash)
     return Matrix(db, nrows, ncols,
-                  row_column_pointers[0 : nrows*db.hash_length],
-                  row_column_pointers[nrows*db.hash_length :])
-
-def vector_ref(db, index, pointers):
-    '''Get vector from byte array of pointers.'''
-    start = index * db.hash_length
-    end = start + db.hash_length
-    return np.frombuffer(get(db, pointers[start:end]), dtype=np.uint8)
+                  np.reshape(np.frombuffer(read_optimized_blob[0 : nrows*ncols], dtype=np.uint8),
+                             (nrows, ncols)),
+                  np.reshape(np.frombuffer(read_optimized_blob[nrows*ncols :], dtype=np.uint8),
+                             (nrows, ncols)))
 
 def row(matrix, index):
     '''Get row of matrix.'''
     # pylint: disable=redefined-outer-name
-    return vector_ref(matrix.db, index, matrix.row_pointers)
+    return matrix.array[index,:]
 
 def column(matrix, index):
     '''Get column of matrix.'''
     # pylint: disable=redefined-outer-name
-    return vector_ref(matrix.db, index, matrix.column_pointers)
+    return matrix.transpose[index,:]