From b480e1e4f98bfab66168811201eb541f965be554 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Fri, 17 Jun 2022 14:07:26 +0530 Subject: gn3: genodb: Read optimized storage for the current matrix. The genotype database now stores the current version of the matrix alone in a read-optimized form, while storing the older versions of the matrix in a more compressed form. We are only interested in the current version of the matrix. So, always use the read optimized storage. * gn3/genodb.py (Matrix)[row_pointers, column_pointers]: Delete fields. [array, transpose]: New fields. * gn3/genodb.py (matrix, row, column): Read from read-optimized storage. (vector_ref): Delete function. --- gn3/genodb.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/gn3/genodb.py b/gn3/genodb.py index 8827223..ba668d2 100644 --- a/gn3/genodb.py +++ b/gn3/genodb.py @@ -27,7 +27,7 @@ import numpy as np # pylint: disable=invalid-name,redefined-builtin GenotypeDatabase = namedtuple('GenotypeDatabase', 'txn hash_length') -Matrix = namedtuple('Matrix', 'db nrows ncols row_pointers column_pointers') +Matrix = namedtuple('Matrix', 'db nrows ncols array transpose') @contextmanager def open(path): @@ -48,26 +48,22 @@ def get_metadata(db, hash, metadata): def matrix(db): '''Get current matrix from genotype database.''' - hash = get(db, b'current')[0:db.hash_length] + hash = get(db, b'versions')[0:db.hash_length] + read_optimized_blob = get(db, get(db, b'current')) nrows = int.from_bytes(get_metadata(db, hash, 'nrows'), byteorder='little') ncols = int.from_bytes(get_metadata(db, hash, 'ncols'), byteorder='little') - row_column_pointers = get(db, hash) return Matrix(db, nrows, ncols, - row_column_pointers[0 : nrows*db.hash_length], - row_column_pointers[nrows*db.hash_length :]) - -def vector_ref(db, index, pointers): - '''Get vector from byte array of pointers.''' - start = index * db.hash_length - end = start + db.hash_length - return np.frombuffer(get(db, 
pointers[start:end]), dtype=np.uint8) + np.reshape(np.frombuffer(read_optimized_blob[0 : nrows*ncols], dtype=np.uint8), + (nrows, ncols)), + np.reshape(np.frombuffer(read_optimized_blob[nrows*ncols :], dtype=np.uint8), + (ncols, nrows))) def row(matrix, index): '''Get row of matrix.''' # pylint: disable=redefined-outer-name - return vector_ref(matrix.db, index, matrix.row_pointers) + return matrix.array[index,:] def column(matrix, index): '''Get column of matrix.''' # pylint: disable=redefined-outer-name - return vector_ref(matrix.db, index, matrix.column_pointers) + return matrix.transpose[index,:] -- cgit v1.2.3