diff options
-rw-r--r-- | gn3/genodb.py | 107 |
1 files changed, 71 insertions, 36 deletions
diff --git a/gn3/genodb.py b/gn3/genodb.py index c2decb8..2a29098 100644 --- a/gn3/genodb.py +++ b/gn3/genodb.py @@ -1,39 +1,74 @@ +'''Genotype database reader + +This module is a tiny Python library to read a GeneNetwork genotype +database. It exports the following functions. + +* open - Open a genotype database +* matrix - Get current matrix +* row - Get row of matrix +* column - Get column of matrix + +Here is a typical invocation to read row 17 and column 13 from a genotype +database at `/tmp/bxd`. + +from gn3 import genodb + +with genodb.open('/tmp/bxd') as db: + matrix = genodb.matrix(db) + print(genodb.row(matrix, 17)) + print(genodb.column(matrix, 13)) + +''' + +from collections import namedtuple +from contextlib import contextmanager import lmdb import numpy as np -class GenotypeDatabase: - def __init__(self, path): - self.env = lmdb.open(path, readonly=True, create=False) - self.txn = self.env.begin() - # 32 bytes in a SHA256 hash - self.hash_length = 32 - def __enter__(self): - return self - def __exit__(self, type, value, traceback): - self.txn.abort() - self.env.close() - def get(self, hash): - return self.txn.get(hash) - def get_metadata(self, hash, metadata): - return self.txn.get(hash + b':' + metadata.encode()) - def matrix(self): - hash = self.get(b'current')[0:self.hash_length] - return Matrix(self, hash) - -class Matrix(): - def __init__(self, db, hash): - self.nrows = int.from_bytes(db.get_metadata(hash, 'nrows'), byteorder='little') - self.ncols = int.from_bytes(db.get_metadata(hash, 'ncols'), byteorder='little') - row_column_pointers = db.get(hash) - self.row_pointers = row_column_pointers[0 : self.nrows*db.hash_length] - self.column_pointers = row_column_pointers[self.nrows*db.hash_length :] - self.db = db - def __vector(self, index, pointers): - start = index * self.db.hash_length - end = start + self.db.hash_length - return np.frombuffer(self.db.get(pointers[start:end]), - dtype=np.uint8) - def row(self, index): - return self.__vector(index, self.row_pointers) - def column(self, index): - return self.__vector(index, self.column_pointers) +# pylint: disable=invalid-name,redefined-builtin + +GenotypeDatabase = namedtuple('GenotypeDatabase', 'txn hash_length') +Matrix = namedtuple('Matrix', 'db nrows ncols row_pointers column_pointers') + +@contextmanager +def open(path): + '''Open genotype database.''' + env = lmdb.open(path, readonly=True, create=False) + txn = env.begin() + yield GenotypeDatabase(txn, 32) # 32 bytes in a SHA256 hash + txn.abort() + env.close() + +def get(db, key): + '''Get value associated with key in genotype database.''' + return db.txn.get(key) + +def get_metadata(db, hash, metadata): + '''Get metadata associated with hash in genotype database.''' + return db.txn.get(hash + b':' + metadata.encode()) + +def matrix(db): + '''Get current matrix from genotype database.''' + hash = get(db, b'current')[0:db.hash_length] + nrows = int.from_bytes(get_metadata(db, hash, 'nrows'), byteorder='little') + ncols = int.from_bytes(get_metadata(db, hash, 'ncols'), byteorder='little') + row_column_pointers = get(db, hash) + return Matrix(db, nrows, ncols, + row_column_pointers[0 : nrows*db.hash_length], + row_column_pointers[nrows*db.hash_length :]) + +def vector_ref(db, index, pointers): + '''Get vector from byte array of pointers.''' + start = index * db.hash_length + end = start + db.hash_length + return np.frombuffer(get(db, pointers[start:end]), dtype=np.uint8) + +def row(matrix, index): + '''Get row of matrix.''' + # pylint: disable=redefined-outer-name + return vector_ref(matrix.db, index, matrix.row_pointers) + +def column(matrix, index): + '''Get column of matrix.''' + # pylint: disable=redefined-outer-name + return vector_ref(matrix.db, index, matrix.column_pointers) |