aboutsummaryrefslogtreecommitdiff
path: root/gn3/genodb.py
blob: c2decb837d14fc48b0eb6f53418729e0650d2ab8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import lmdb
import numpy as np

class GenotypeDatabase:
    def __init__(self, path):
        self.env = lmdb.open(path, readonly=True, create=False)
        self.txn = self.env.begin()
        # 32 bytes in a SHA256 hash
        self.hash_length = 32
    def __enter__(self):
        return self
    def __exit__(self, type, value, traceback):
        self.txn.abort()
        self.env.close()
    def get(self, hash):
        return self.txn.get(hash)
    def get_metadata(self, hash, metadata):
        return self.txn.get(hash + b':' + metadata.encode())
    def matrix(self):
        hash = self.get(b'current')[0:self.hash_length]
        return Matrix(self, hash)

class Matrix():
    def __init__(self, db, hash):
        self.nrows = int.from_bytes(db.get_metadata(hash, 'nrows'), byteorder='little')
        self.ncols = int.from_bytes(db.get_metadata(hash, 'ncols'), byteorder='little')
        row_column_pointers = db.get(hash)
        self.row_pointers = row_column_pointers[0 : self.nrows*db.hash_length]
        self.column_pointers = row_column_pointers[self.nrows*db.hash_length :]
        self.db = db
    def __vector(self, index, pointers):
        start = index * self.db.hash_length
        end = start + self.db.hash_length
        return np.frombuffer(self.db.get(pointers[start:end]),
                             dtype=np.uint8)
    def row(self, index):
        return self.__vector(index, self.row_pointers)
    def column(self, index):
        return self.__vector(index, self.column_pointers)