Diffstat (limited to 'gn3/db')
| -rw-r--r-- | gn3/db/case_attributes.py | 487 |
| -rw-r--r-- | gn3/db/correlations.py    |   4 |
| -rw-r--r-- | gn3/db/datasets.py        |  15 |
| -rw-r--r-- | gn3/db/menu.py            |  15 |
| -rw-r--r-- | gn3/db/probesets.py       |  61 |
| -rw-r--r-- | gn3/db/rdf/wiki.py        | 158 |
| -rw-r--r-- | gn3/db/sample_data.py     | 228 |
| -rw-r--r-- | gn3/db/traits.py          |   3 |
| -rw-r--r-- | gn3/db/wiki.py            |  32 |
9 files changed, 792 insertions, 211 deletions
diff --git a/gn3/db/case_attributes.py b/gn3/db/case_attributes.py
index bb15248..a55f2d8 100644
--- a/gn3/db/case_attributes.py
+++ b/gn3/db/case_attributes.py
@@ -1,126 +1,389 @@
 """Module that contains functions for editing case-attribute data"""
-from typing import Any, Optional, Tuple
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass
+from enum import Enum, auto
 import json
-import MySQLdb
-
-
-def get_case_attributes(conn) -> Optional[Tuple]:
-    """Get all the case attributes from the database."""
-    with conn.cursor() as cursor:
-        cursor.execute("SELECT Id, Name, Description FROM CaseAttribute")
-        return cursor.fetchall()
-
-
-def get_unreviewed_diffs(conn: Any) -> Optional[tuple]:
-    """Fetch all case attributes in GN"""
-    with conn.cursor() as cursor:
-        cursor.execute(
-            "SELECT id, editor, json_diff_data FROM "
-            "caseattributes_audit WHERE status = 'review'"
-        )
-        return cursor.fetchall()
-
-
-def insert_case_attribute_audit(
-        conn: Any, status: str, author: str, data: str
-) -> int:
-    """Update the case_attribute_audit table"""
-    rowcount = 0
-    try:
-        with conn.cursor() as cursor:
-            cursor.execute(
-                "INSERT INTO caseattributes_audit "
-                "(status, editor, json_diff_data) "
-                "VALUES (%s, %s, %s)",
-                (status, author, data,),
+import pickle
+import lmdb
+
+
+class EditStatus(Enum):
+    """Enumeration for the status of the edits."""
+    review = auto()  # pylint: disable=[invalid-name]
+    approved = auto()  # pylint: disable=[invalid-name]
+    rejected = auto()  # pylint: disable=[invalid-name]
+
+    def __str__(self):
+        """Print out human-readable form."""
+        return self.name
+
+
+@dataclass
+class CaseAttributeEdit:
+    """Represents an edit operation for case attributes in the database.
+
+    Attributes:
+    - inbredset_id (int): The ID of the inbred set associated with
+      the edit.
+    - status (EditStatus): The status of this edit.
+    - user_id (str): The ID of the user performing the edit.
+    - changes (dict): A dictionary containing the changes to be
+      applied to the case attributes.
+
+    """
+    inbredset_id: int
+    status: EditStatus
+    user_id: str
+    changes: dict
+
+
+def queue_edit(cursor, directory: Path, edit: CaseAttributeEdit) -> Optional[int]:
+    """Queues a case attribute edit for review by inserting it into
+    the audit table and storing its review ID in an LMDB database.
+
+    Args:
+        cursor: A database cursor for executing SQL queries.
+        directory (Path): The base directory path for the LMDB database.
+        edit (CaseAttributeEdit): A dataclass containing the edit details,
+            including inbredset_id, status, user_id, and changes.
+
+    Returns:
+        int: The id of the audit record created for this edit.
+    """
+    cursor.execute(
+        "INSERT INTO "
+        "caseattributes_audit(status, editor, json_diff_data) "
+        "VALUES (%s, %s, %s) "
+        "ON DUPLICATE KEY UPDATE status=%s",
+        (str(edit.status), edit.user_id,
+         json.dumps(edit.changes), str(EditStatus.review),))
+    directory.mkdir(parents=True, exist_ok=True)
+    env = lmdb.open(directory.as_posix(), map_size=8_000_000)  # 8 MB
+    with env.begin(write=True) as txn:
+        review_ids = set()
+        if reviews := txn.get(b"review"):
+            review_ids = pickle.loads(reviews)
+        _id = cursor.lastrowid
+        review_ids.add(_id)
+        txn.put(b"review", pickle.dumps(review_ids))
+    return _id
+
+
+def __fetch_case_attrs_changes__(cursor, change_ids: tuple) -> list:
+    """Fetches case attribute change records from the audit table for
+    given change IDs.
+
+    Retrieves records from the `caseattributes_audit` table for the
+    specified `change_ids`, including the editor, JSON diff data, and
+    timestamp.  The JSON diff data is deserialized into a Python
+    dictionary for each record.  Results are ordered by timestamp in
+    descending order (most recent first).
+
+    Args:
+        cursor: A MySQLdb cursor for executing SQL queries.
+        change_ids (tuple): A tuple of integers representing the IDs
+            of changes to fetch.
+
+    Returns:
+        list: A list of dictionaries, each containing the `editor`,
+        `json_diff_data` (as a deserialized dictionary), and
+        `time_stamp` for the matching change IDs.  Returns an empty
+        list if no records are found.
+
+    Notes:
+        - The function assumes `change_ids` is a tuple of valid
+          integers.
+        - The `json_diff_data` column in `caseattributes_audit` is
+          expected to contain valid JSON strings, which are
+          deserialized into dictionaries.
+        - The query uses parameterized placeholders to prevent SQL
+          injection.
+        - This is an internal helper function (indicated by double
+          underscores) used by other functions like `get_changes`.
+
+    Raises:
+        json.JSONDecodeError: If any `json_diff_data` value cannot be
+            deserialized.
+        TypeError: If `change_ids` contains non-integer values,
+            potentially causing a database error.
+
+    """
+    if not change_ids:
+        return []
+    placeholders = ','.join(['%s'] * len(change_ids))
+    cursor.execute(
+        "SELECT editor, json_diff_data, time_stamp "
+        f"FROM caseattributes_audit WHERE id IN ({placeholders}) "
+        "ORDER BY time_stamp DESC",
+        change_ids
+    )
+    results = cursor.fetchall()
+    for el in results:
+        el["json_diff_data"] = json.loads(el["json_diff_data"])
+    return results
+
+
+def view_change(cursor, change_id: int) -> dict:
+    """Queries the `caseattributes_audit` table to fetch the
+    `json_diff_data` column for the given `change_id`.  The JSON data
+    is deserialized into a Python dictionary and returned.  If no
+    record is found or the `json_diff_data` is None, an empty
+    dictionary is returned.
+
+    Args:
+        cursor: A MySQLdb cursor for executing SQL queries.
+        change_id (int): The ID of the change to retrieve from the
+            `caseattributes_audit` table.
+
+    Returns:
+        dict: The deserialized JSON diff data as a dictionary if the
+        record exists and contains valid JSON; otherwise, an empty
+        dictionary.
+
+    Raises:
+        json.JSONDecodeError: If the `json_diff_data` cannot be
+            deserialized due to invalid JSON.
+
+    """
+    cursor.execute(
+        "SELECT json_diff_data "
+        "FROM caseattributes_audit "
+        "WHERE id = %s",
+        (change_id,)
+    )
+    result = cursor.fetchone()
+    if result and result[0]:
+        return json.loads(result[0])
+    return {}
+
+
+def get_changes(cursor, change_type: EditStatus, directory: Path) -> dict:
+    """Retrieves case attribute changes from the lmdb data in
+    `directory`, categorized by review status.
+
+    Fetches change IDs from an LMDB database and fills the "data" key
+    based on the given EditStatus.
+
+    Args:
+        - cursor: A MySQLdb cursor for executing SQL queries.
+        - change_type (EditStatus): The status of changes to retrieve
+          ('review', 'approved', or 'rejected').
+        - directory (Path): The base directory path for the LMDB
+          database.
+
+    Returns:
+        dict: A dictionary with two keys:
+        - 'count': A dictionary with counts of 'reviews',
+          'approvals' and 'rejections'.
+        - 'data': contains the json diff data of the modified data.
+
+    Raises:
+        json.JSONDecodeError: If any `json_diff_data` in the audit
+            table cannot be deserialized by
+            `__fetch_case_attrs_changes__`.
+        TypeError: If LMDB data cannot be deserialized.  Also raised
+            when an invalid change_type is used.
+
+    """
+    review_ids, approved_ids, rejected_ids = set(), set(), set()
+    directory.mkdir(parents=True, exist_ok=True)
+    env = lmdb.open(directory.as_posix(), map_size=8_000_000)  # 8 MB
+    with env.begin(write=False) as txn:
+        if reviews := txn.get(b"review"):
+            review_ids = pickle.loads(reviews)
+        if approvals := txn.get(b"approved"):
+            approved_ids = pickle.loads(approvals)
+        if rejections := txn.get(b"rejected"):
+            rejected_ids = pickle.loads(rejections)
+    changes = {}
+    match change_type:
+        case EditStatus.review:
+            changes = dict(
+                zip(review_ids,
+                    __fetch_case_attrs_changes__(cursor, tuple(review_ids)))
             )
-            rowcount = cursor.rowcount
-    except Exception as _e:
-        raise MySQLdb.Error(_e) from _e
-    return rowcount
-
-
-def reject_case_attribute(conn: Any, case_attr_audit_id: int) -> int:
-    """Given the id of the json_diff in the case_attribute_audit table, reject
-    it"""
-    rowcount = 0
-    try:
-        with conn.cursor() as cursor:
-            cursor.execute(
-                "UPDATE caseattributes_audit SET "
-                "status = 'rejected' WHERE id = %s",
-                (case_attr_audit_id,),
+        case EditStatus.approved:
+            changes = dict(
+                zip(approved_ids,
+                    __fetch_case_attrs_changes__(cursor, tuple(approved_ids)))
             )
-            rowcount = cursor.rowcount
-    except Exception as _e:
-        raise MySQLdb.Error(_e) from _e
-    return rowcount
+        case EditStatus.rejected:
+            changes = dict(zip(rejected_ids,
+                               __fetch_case_attrs_changes__(cursor, tuple(rejected_ids))))
+        case _:
+            raise TypeError
+    return {
+        "change-type": str(change_type),
+        "count": {
+            "reviews": len(review_ids),
+            "approvals": len(approved_ids),
+            "rejections": len(rejected_ids)
+        },
+        "data": changes
+    }
+
+
+# pylint: disable=[too-many-locals, too-many-branches]
+def apply_change(cursor, change_type: EditStatus, change_id: int, directory: Path) -> bool:
+    """Applies or rejects a case attribute change and updates its
+    status in the audit table and LMDB.
+
+    Processes a change identified by `change_id` based on the
+    specified `change_type` (approved or rejected).  For approved
+    changes, applies modifications to the `CaseAttributeXRefNew`
+    table using bulk inserts and updates the audit status.  For
+    rejected changes, updates the audit status only.  Manages change
+    IDs in LMDB by moving them from the 'review' set to either
+    'approved' or 'rejected' sets.  Returns False if the `change_id`
+    is not in the review set.
+
+    Args:
+        cursor: A MySQLdb cursor for executing SQL queries.
+        change_type (EditStatus): The action to perform, either
+            `EditStatus.approved` or `EditStatus.rejected`.
+        change_id (int): The ID of the change to process,
+            corresponding to a record in `caseattributes_audit`.
+        directory (Path): The base directory path for the LMDB
+            database.
+
+    Returns:
+        bool: True if the change was successfully applied or
+        rejected, False if `change_id` is not found in the LMDB
+        'review' set.
+
+    Notes:
+        - Opens an LMDB environment in the specified `directory` with
+          a map size of 8 MB.
+        - For `EditStatus.approved`, fetches `json_diff_data` from
+          `caseattributes_audit`, extracts modifications, and
+          performs bulk inserts into `CaseAttributeXRefNew` with `ON
+          DUPLICATE KEY UPDATE`.
+        - For `EditStatus.rejected`, updates the
+          `caseattributes_audit` status without modifying case
+          attributes.
+        - Uses bulk `SELECT` queries to fetch `StrainId` and
+          `CaseAttributeId` values efficiently.
+        - Assumes `CaseAttributeXRefNew` has a unique key on
+          `(InbredSetId, StrainId, CaseAttributeId)` for `ON
+          DUPLICATE KEY UPDATE`.
+        - The `json_diff_data` is expected to contain valid JSON with
+          an `inbredset_id` and `Modifications.Current` structure.
 
-
-def approve_case_attribute(conn: Any, case_attr_audit_id: int) -> int:
-    """Given the id of the json_diff in the case_attribute_audit table,
-    approve it
+    Raises:
+        ValueError: If `change_type` is neither `EditStatus.approved`
+            nor `EditStatus.rejected`.
+        json.JSONDecodeError: If `json_diff_data` cannot be
+            deserialized for approved changes.
+        TypeError: If `cursor.fetchone()` returns None for
+            `json_diff_data` or if `strain_id` or `caseattr_id` are
+            missing during bulk insert preparation.
+        pickle.UnpicklingError: If LMDB data (e.g., 'review' or
+            'approved' sets) cannot be deserialized.
     """
-    rowcount = 0
-    try:
-        with conn.cursor() as cursor:
-            cursor.execute(
-                "SELECT json_diff_data FROM caseattributes_audit "
-                "WHERE id = %s",
-                (case_attr_audit_id,),
-            )
-            diff_data = cursor.fetchone()
-            if diff_data:
-                diff_data = json.loads(diff_data[0])
-                # Insert (Most Important)
-                if diff_data.get("Insert"):
-                    data = diff_data.get("Insert")
+    review_ids, approved_ids, rejected_ids = set(), set(), set()
+    directory.mkdir(parents=True, exist_ok=True)
+    env = lmdb.open(directory.as_posix(), map_size=8_000_000)  # 8 MB
+    with env.begin(write=True) as txn:
+        if reviews := txn.get(b"review"):
+            review_ids = pickle.loads(reviews)
+        if change_id not in review_ids:
+            return False
+        match change_type:
+            case EditStatus.rejected:
+                cursor.execute(
+                    "UPDATE caseattributes_audit "
+                    "SET status = %s "
+                    "WHERE id = %s",
+                    (str(change_type), change_id))
+                if rejections := txn.get(b"rejected"):
+                    rejected_ids = pickle.loads(rejections)
+                rejected_ids.add(change_id)
+                review_ids.discard(change_id)
+                txn.put(b"review", pickle.dumps(review_ids))
+                txn.put(b"rejected", pickle.dumps(rejected_ids))
+                return True
+            case EditStatus.approved:
+                cursor.execute(
+                    "SELECT json_diff_data "
+                    "FROM caseattributes_audit WHERE "
+                    "id = %s",
+                    (change_id,)
+                )
+                result = cursor.fetchone()
+                if result is None:
+                    return False
+                json_diff_data = result[0]
+                json_diff_data = json.loads(json_diff_data)
+                inbredset_id = json_diff_data.get("inbredset_id")
+                modifications = json_diff_data.get(
+                    "Modifications", {}).get("Current", {})
+                strains = tuple(modifications.keys())
+                case_attrs = set()
+                for data in modifications.values():
+                    case_attrs.update(data.keys())
+
+                # Bulk fetch strain ids
+                strain_id_map = {}
+                if strains:
                     cursor.execute(
-                        "INSERT INTO CaseAttribute "
-                        "(Name, Description) VALUES "
-                        "(%s, %s)",
-                        (
-                            data.get("name").strip(),
-                            data.get("description").strip(),
-                        ),
+                        "SELECT Name, Id FROM Strain WHERE Name IN "
+                        f"({', '.join(['%s'] * len(strains))})",
+                        strains
                     )
-                # Delete
-                elif diff_data.get("Deletion"):
-                    data = diff_data.get("Deletion")
+                    for name, strain_id in cursor.fetchall():
+                        strain_id_map[name] = strain_id
+
+                # Bulk fetch case attr ids
+                caseattr_id_map = {}
+                if case_attrs:
                     cursor.execute(
-                        "DELETE FROM CaseAttribute WHERE Id = %s",
-                        (data.get("id"),),
+                        "SELECT Name, CaseAttributeId FROM CaseAttribute "
+                        "WHERE InbredSetId = %s AND Name IN "
+                        f"({', '.join(['%s'] * len(case_attrs))})",
+                        (inbredset_id, *case_attrs)
                     )
-                # Modification
-                elif diff_data.get("Modification"):
-                    data = diff_data.get("Modification")
-                    if desc_ := data.get("description"):
-                        cursor.execute(
-                            "UPDATE CaseAttribute SET "
-                            "Description = %s WHERE Id = %s",
-                            (
-                                desc_.get("Current"),
-                                diff_data.get("id"),
-                            ),
-                        )
-                    if name_ := data.get("name"):
-                        cursor.execute(
-                            "UPDATE CaseAttribute SET "
-                            "Name = %s WHERE Id = %s",
-                            (
-                                name_.get("Current"),
-                                diff_data.get("id"),
-                            ),
-                        )
-            if cursor.rowcount:
-                cursor.execute(
-                    "UPDATE caseattributes_audit SET "
-                    "status = 'approved' WHERE id = %s",
-                    (case_attr_audit_id,),
+                    for name, caseattr_id in cursor.fetchall():
+                        caseattr_id_map[name] = caseattr_id
+
+                # Bulk insert data
+                insert_data = []
+                for strain, data in modifications.items():
+                    strain_id = strain_id_map.get(strain)
+                    for case_attr, value in data.items():
+                        insert_data.append({
+                            "inbredset_id": inbredset_id,
+                            "strain_id": strain_id,
+                            "caseattr_id": caseattr_id_map.get(case_attr),
+                            "value": value,
+                        })
+                if insert_data:
+                    cursor.executemany(
+                        "INSERT INTO CaseAttributeXRefNew "
+                        "(InbredSetId, StrainId, CaseAttributeId, Value) "
+                        "VALUES (%(inbredset_id)s, %(strain_id)s, %(caseattr_id)s, %(value)s) "
+                        "ON DUPLICATE KEY UPDATE Value = VALUES(Value)",
+                        insert_data
                     )
-            rowcount = cursor.rowcount
-    except Exception as _e:
-        raise MySQLdb.Error(_e) from _e
-    return rowcount
+
+                # Update LMDB and audit table
+                cursor.execute(
+                    "UPDATE caseattributes_audit "
+                    "SET status = %s "
+                    "WHERE id = %s",
+                    (str(change_type), change_id))
+                if approvals := txn.get(b"approved"):
+                    approved_ids = pickle.loads(approvals)
+                approved_ids.add(change_id)
+                review_ids.discard(change_id)
+                txn.put(b"review", pickle.dumps(review_ids))
+                txn.put(b"approved", pickle.dumps(approved_ids))
+                return True
+            case _:
+                raise ValueError
diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py
index aec8eac..5d6cfb3 100644
--- a/gn3/db/correlations.py
+++ b/gn3/db/correlations.py
@@ -328,7 +328,7 @@ def build_temporary_tissue_correlations_table(
     return temp_table_name
 
 
-def fetch_tissue_correlations(# pylint: disable=R0913
+def fetch_tissue_correlations(# pylint: disable=[R0913, too-many-arguments, too-many-positional-arguments]
         dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str,
         return_number: int, conn: Any) -> dict:
     """
@@ -529,7 +529,7 @@ def __build_query__(
             f"ORDER BY {db_type}.Id"),
         1)
 
 
-# pylint: disable=too-many-arguments
+# pylint: disable=[too-many-arguments, too-many-positional-arguments]
 def __fetch_data__(
         conn, sample_ids: tuple, db_name: str, db_type: str, method: str,
         temp_table: Optional[str]) -> Tuple[Tuple[Any], int]:
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index f3b4f9f..fea207b 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -79,6 +79,21 @@ def retrieve_mrna_group_name(connection: Any, probeset_id: int, dataset_name: st
         return res[0]
     return None
 
+def retrieve_group_id(connection: Any, group_name: str):
+    """
+    Given the group name, retrieve the group ID
+    """
+    query = (
+        "SELECT iset.Id "
+        "FROM InbredSet AS iset "
+        "WHERE iset.Name = %(group_name)s")
+    with connection.cursor() as cursor:
+        cursor.execute(query, {"group_name": group_name})
+        res = cursor.fetchone()
+        if res:
+            return res[0]
+    return None
+
 def retrieve_phenotype_group_name(connection: Any, dataset_id: int):
     """
     Given the dataset id (PublishFreeze.Id in the database), retrieve the name
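The case-attributes changes above replace the old per-row approve/reject helpers with an LMDB-backed review queue. A minimal sketch of that workflow follows; the connection settings and the LMDB path are assumptions for illustration, not values from this commit:

from pathlib import Path

import MySQLdb
import MySQLdb.cursors

from gn3.db.case_attributes import (
    CaseAttributeEdit, EditStatus, apply_change, get_changes, queue_edit)

conn = MySQLdb.connect(db="db_webqtl", user="webqtl")  # assumed credentials
lmdb_dir = Path("/tmp/case-attribute-edits")  # assumed location

# Queue an edit for review; the returned id indexes both the audit
# table row and the LMDB "review" set.
with conn.cursor() as cursor:
    edit = CaseAttributeEdit(
        inbredset_id=1,
        status=EditStatus.review,
        user_id="editor@genenetwork.org",
        changes={"inbredset_id": 1,
                 "Modifications": {"Current": {"BXD1": {"Epoch": "1"}}}})
    change_id = queue_edit(cursor, lmdb_dir, edit)

# __fetch_case_attrs_changes__ indexes rows by column name, so a
# DictCursor is needed when listing pending changes.
with conn.cursor(MySQLdb.cursors.DictCursor) as cursor:
    pending = get_changes(cursor, EditStatus.review, lmdb_dir)

# Approve the queued edit, then persist the SQL side of the change.
with conn.cursor() as cursor:
    applied = apply_change(cursor, EditStatus.approved, change_id, lmdb_dir)
conn.commit()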
diff --git a/gn3/db/menu.py b/gn3/db/menu.py
index 8dccabf..34dedde 100644
--- a/gn3/db/menu.py
+++ b/gn3/db/menu.py
@@ -1,10 +1,12 @@
 """Menu generation code for the data in the dropdowns in the index page."""
-
+import logging
 from typing import Tuple
 from functools import reduce
 
 from gn3.db.species import get_all_species
 
+logger = logging.getLogger(__name__)
+
 def gen_dropdown_json(conn):
     """
     Generates and outputs (as json file) the data for the main dropdown menus on
@@ -14,10 +16,12 @@ def gen_dropdown_json(conn):
     groups = get_groups(conn, tuple(row[0] for row in species))
     types = get_types(conn, groups)
     datasets = get_datasets(conn, types)
-    return dict(species=species,
-                groups=groups,
-                types=types,
-                datasets=datasets)
+    return {
+        "species": species,
+        "groups": groups,
+        "types": types,
+        "datasets": datasets
+    }
 
 def get_groups(conn, species_names: Tuple[str, ...]):
     """Build groups list"""
@@ -35,6 +39,7 @@ def get_groups(conn, species_names: Tuple[str, ...]):
                  "IFNULL(InbredSet.Family, InbredSet.FullName) ASC, "
                  "InbredSet.FullName ASC, "
                  "InbredSet.MenuOrderId ASC")
+        logger.debug("'get_groups' QUERY: %s, %s", query, species_names)
         cursor.execute(query, tuple(species_names))
         results = cursor.fetchall()
diff --git a/gn3/db/probesets.py b/gn3/db/probesets.py
index 910f05b..e725add 100644
--- a/gn3/db/probesets.py
+++ b/gn3/db/probesets.py
@@ -8,6 +8,9 @@ from gn3.db_utils import Connection as DBConnection
 
 from .query_tools import mapping_to_query_columns
 
+
+# pylint: disable = line-too-long
+
 @dataclass(frozen=True)
 class Probeset:  # pylint: disable=[too-many-instance-attributes]
     """Data Type that represents a Probeset"""
@@ -40,40 +43,42 @@ class Probeset:  # pylint: disable=[too-many-instance-attributes]
 # Mapping from the Phenotype dataclass to the actual column names in the
 # database
 probeset_mapping = {
-    "id_": "Id",
-    "name": "Name",
-    "symbol": "symbol",
-    "description": "description",
-    "probe_target_description": "Probe_Target_Description",
-    "chr_": "Chr",
-    "mb": "Mb",
-    "alias": "alias",
-    "geneid": "GeneId",
-    "homologeneid": "HomoloGeneID",
-    "unigeneid": "UniGeneId",
-    "omim": "OMIM",
-    "refseq_transcriptid": "RefSeq_TranscriptId",
-    "blatseq": "BlatSeq",
-    "targetseq": "TargetSeq",
-    "strand_probe": "Strand_Probe",
-    "probe_set_target_region": "Probe_set_target_region",
-    "probe_set_specificity": "Probe_set_specificity",
-    "probe_set_blat_score": "Probe_set_BLAT_score",
-    "probe_set_blat_mb_start": "Probe_set_Blat_Mb_start",
-    "probe_set_blat_mb_end": "Probe_set_Blat_Mb_end",
-    "probe_set_strand": "Probe_set_strand",
-    "probe_set_note_by_rw": "Probe_set_Note_by_RW",
-    "flag": "flag"
+    "id_": "ProbeSet.Id",
+    "name": "ProbeSet.Name",
+    "symbol": "ProbeSet.symbol",
+    "description": "ProbeSet.description",
+    "probe_target_description": "ProbeSet.Probe_Target_Description",
+    "chr_": "ProbeSet.Chr",
+    "mb": "ProbeSet.Mb",
+    "alias": "ProbeSet.alias",
+    "geneid": "ProbeSet.GeneId",
+    "homologeneid": "ProbeSet.HomoloGeneID",
+    "unigeneid": "ProbeSet.UniGeneId",
+    "omim": "ProbeSet.OMIM",
+    "refseq_transcriptid": "ProbeSet.RefSeq_TranscriptId",
+    "blatseq": "ProbeSet.BlatSeq",
+    "targetseq": "ProbeSet.TargetSeq",
+    "strand_probe": "ProbeSet.Strand_Probe",
+    "probe_set_target_region": "ProbeSet.Probe_set_target_region",
+    "probe_set_specificity": "ProbeSet.Probe_set_specificity",
+    "probe_set_blat_score": "ProbeSet.Probe_set_BLAT_score",
+    "probe_set_blat_mb_start": "ProbeSet.Probe_set_Blat_Mb_start",
+    "probe_set_blat_mb_end": "ProbeSet.Probe_set_Blat_Mb_end",
+    "probe_set_strand": "ProbeSet.Probe_set_strand",
+    "probe_set_note_by_rw": "ProbeSet.Probe_set_Note_by_RW",
+    "flag": "ProbeSet.flag"
 }
 
-def fetch_probeset_metadata_by_name(conn: DBConnection, name: str) -> dict:
+def fetch_probeset_metadata_by_name(conn: DBConnection, trait_name: str, dataset_name: str) -> dict:
     """Fetch a ProbeSet's metadata by its `name`."""
     with conn.cursor(cursorclass=DictCursor) as cursor:
         cols = ", ".join(mapping_to_query_columns(probeset_mapping))
         cursor.execute((f"SELECT {cols} "
-                        "FROM ProbeSet "
-                        "WHERE Name = %(name)s"),
-                       {"name": name})
+                        "FROM ProbeSetFreeze "
+                        "INNER JOIN ProbeSetXRef ON ProbeSetXRef.`ProbeSetFreezeId` = ProbeSetFreeze.`Id` "
+                        "INNER JOIN ProbeSet ON ProbeSet.`Id` = ProbeSetXRef.`ProbeSetId` "
+                        "WHERE ProbeSet.Name = %(trait_name)s AND ProbeSetFreeze.Name = %(ds_name)s"),
+                       {"trait_name": trait_name, "ds_name": dataset_name})
         return cursor.fetchone()
 
 def update_probeset(conn, probeset_id, data:dict) -> int:
diff --git a/gn3/db/rdf/wiki.py b/gn3/db/rdf/wiki.py
index b2b301a..dd8d204 100644
--- a/gn3/db/rdf/wiki.py
+++ b/gn3/db/rdf/wiki.py
@@ -15,6 +15,7 @@ from gn3.db.rdf import (
     RDF_PREFIXES,
     query_frame_and_compact,
     update_rdf,
+    sparql_query,
 )
 
 
@@ -41,6 +42,10 @@ def __sanitize_result(result: dict) -> dict:
     if not result:
         return {}
     categories = result.get("categories")
+    if (version := result.get("version")) and isinstance(version, str):
+        result["version"] = int(version)
+    if (wiki_id := result.get("id")) and isinstance(wiki_id, str):
+        result["id"] = int(wiki_id)
     if isinstance(categories, str):
         result["categories"] = [categories] if categories else []
     result["categories"] = sorted(result["categories"])
@@ -79,7 +84,7 @@ CONSTRUCT {
              gnt:belongsToCategory ?category ;
              gnt:hasVersion ?max ;
              dct:created ?created ;
-             dct:identifier ?id_ .
+             dct:identifier ?id .
 } FROM $graph WHERE {
     ?comment rdfs:label ?text_ ;
              gnt:symbol ?symbol ;
@@ -88,12 +93,12 @@ CONSTRUCT {
              dct:created ?createTime .
     FILTER ( LCASE(STR(?symbol)) = LCASE("$symbol") ) .
     {
-        SELECT (MAX(?vers) AS ?max) ?id_ WHERE {
+        SELECT (MAX(?vers) AS ?max_) ?id_ WHERE {
             ?comment dct:identifier ?id_ ;
                      dct:hasVersion ?vers .
         }
     }
-    ?comment dct:hasVersion ?max .
+    ?comment dct:hasVersion ?max_ .
     OPTIONAL { ?comment gnt:reason ?reason_ } .
     OPTIONAL {
         ?comment gnt:belongsToSpecies ?speciesId .
@@ -106,6 +111,8 @@ CONSTRUCT {
     OPTIONAL { ?comment gnt:belongsToCategory ?category_ } .
     BIND (str(?createTime) AS ?created) .
     BIND (str(?text_) AS ?text) .
+    BIND (str(?max_) AS ?max) .
+    BIND (str(?id_) AS ?id) .
     BIND (STR(COALESCE(?pmid_, "")) AS ?pmid) .
     BIND (COALESCE(?reason_, "") AS ?reason) .
     BIND (STR(COALESCE(?weburl_, "")) AS ?weburl) .
@@ -154,7 +161,7 @@ CONSTRUCT {
              rdfs:label ?text_ ;
              gnt:symbol ?symbol ;
              dct:created ?createTime ;
-             dct:hasVersion ?version ;
+             dct:hasVersion ?version_ ;
              dct:identifier $comment_id .
     OPTIONAL { ?comment gnt:reason ?reason_ } .
     OPTIONAL {
@@ -167,6 +174,7 @@ CONSTRUCT {
     OPTIONAL { ?comment foaf:mbox ?email_ . } .
     OPTIONAL { ?comment gnt:belongsToCategory ?category_ . } .
     BIND (str(?text_) AS ?text) .
+    BIND (str(?version_) AS ?version) .
     BIND (str(?createTime) AS ?created) .
     BIND (STR(COALESCE(?pmid_, "")) AS ?pmid) .
     BIND (COALESCE(?reason_, "") AS ?reason) .
@@ -186,38 +194,42 @@ CONSTRUCT {
 
 
 def update_wiki_comment(
-        insert_dict: dict,
-        sparql_user: str,
-        sparql_password: str,
-        sparql_auth_uri: str,
-        graph: str = "<http://genenetwork.org>",
+    insert_dict: dict,
+    sparql_user: str,
+    sparql_password: str,
+    sparql_auth_uri: str,
+    graph: str = "<http://genenetwork.org>",
 ) -> str:
     """Update a wiki comment by inserting a comment with the same
-identifier but an updated version id.
+    identifier but an updated version id.
     """
     name = f"gn:wiki-{insert_dict['Id']}-{insert_dict['versionId']}"
-    comment_triple = Template("""$name rdfs:label '''$comment'''@en ;
+    comment_triple = Template(
+        """$name rdfs:label '''$comment'''@en ;
 rdf:type gnc:GNWikiEntry ;
 gnt:symbol "$symbol" ;
 dct:identifier "$comment_id"^^xsd:integer ;
 dct:hasVersion "$next_version"^^xsd:integer ;
 dct:created "$created"^^xsd:datetime .
-""").substitute(
+"""
+    ).substitute(
         comment=insert_dict["comment"],
-        name=name, symbol=insert_dict['symbol'],
-        comment_id=insert_dict["Id"], next_version=insert_dict["versionId"],
-        created=insert_dict["createtime"])
+        name=name,
+        symbol=insert_dict["symbol"],
+        comment_id=insert_dict["Id"],
+        next_version=insert_dict["versionId"],
+        created=insert_dict["createtime"],
+    )
     using = ""
     if insert_dict["email"]:
         comment_triple += f"{name} foaf:mbox <{insert_dict['email']}> .\n"
     if insert_dict["initial"]:
         comment_triple += f"{name} gnt:initial \"{insert_dict['initial']}\" .\n"
-    if insert_dict["species"]:
+    if insert_dict["species"] and insert_dict["species"].lower() != "no specific species":
         comment_triple += f"{name} gnt:belongsToSpecies ?speciesId .\n"
         using = Template(
-            """ USING $graph WHERE { ?speciesId gnt:shortName "$species" . } """).substitute(
-                graph=graph, species=insert_dict["species"]
-            )
+            """ USING $graph WHERE { ?speciesId gnt:shortName "$species" . } """
+        ).substitute(graph=graph, species=insert_dict["species"])
     if insert_dict["reason"]:
         comment_triple += f"{name} gnt:reason \"{insert_dict['reason']}\" .\n"
     if insert_dict["weburl"]:
@@ -236,10 +248,110 @@ INSERT {
     GRAPH $graph {
 $comment_triple}
 }
 $using
-""").substitute(prefix=RDF_PREFIXES,
-                graph=graph,
-                comment_triple=comment_triple,
-                using=using),
+"""
+    ).substitute(
+        prefix=RDF_PREFIXES, graph=graph, comment_triple=comment_triple, using=using
+    ),
         sparql_user=sparql_user,
         sparql_password=sparql_password,
         sparql_auth_uri=sparql_auth_uri,
     )
+
+
+def get_rif_entries_by_symbol(
+    symbol: str, sparql_uri: str, graph: str = "<http://genenetwork.org>"
+) -> dict:
+    """Fetch NCBI RIF entries for a given symbol (case-insensitive).
+
+This function retrieves NCBI RIF entries using a SPARQL `SELECT` query
+instead of a `CONSTRUCT` to avoid truncation. The Virtuoso SPARQL
+engine limits query results to 1,048,576 triples per solution, and
+NCBI entries can exceed this limit. Since there may be more than
+2,000 entries, which could result in the number of triples surpassing
+the limit, `SELECT` is used to ensure complete data retrieval without
+truncation. See:
+
+<https://community.openlinksw.com/t/sparql-query-limiting-results-to-100000-triples/2131>
+
+    """
+    # XXX: Consider pagination
+    query = Template(
+        """
+$prefix
+
+SELECT ?comment ?symbol ?species ?pubmed_id ?version ?created ?gene_id ?taxonomic_id
+FROM $graph WHERE {
+    ?comment_id rdfs:label ?text_ ;
+                gnt:symbol ?symbol ;
+                rdf:type gnc:NCBIWikiEntry ;
+                gnt:hasGeneId ?gene_id_ ;
+                dct:hasVersion ?version ;
+                dct:references ?pmid_ ;
+                dct:created ?createTime ;
+                gnt:belongsToSpecies ?speciesId .
+    ?speciesId rdfs:label ?species .
+    FILTER ( LCASE(?symbol) = LCASE("$symbol") ) .
+    OPTIONAL { ?comment_id skos:notation ?taxonId_ . } .
+    BIND (STR(?text_) AS ?comment) .
+    BIND (xsd:integer(STRAFTER(STR(?taxonId_), STR(taxon:))) AS ?taxonomic_id) .
+    BIND (xsd:integer(STRAFTER(STR(?pmid_), STR(pubmed:))) AS ?pubmed_id) .
+    BIND (xsd:integer(STRAFTER(STR(?gene_id_), STR(generif:))) AS ?gene_id) .
+    BIND (STR(?createTime) AS ?created) .
+} ORDER BY ?species ?createTime
+"""
+    ).substitute(prefix=RDF_PREFIXES, graph=graph, symbol=symbol)
+    results: dict[str, dict | list] = {
+        "@context": {
+            "dct": "http://purl.org/dc/terms/",
+            "gnt": "http://genenetwork.org/term/",
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+            "skos": "http://www.w3.org/2004/02/skos/core#",
+            "symbol": "gnt:symbol",
+            "species": "gnt:species",
+            "taxonomic_id": "skos:notation",
+            "gene_id": "gnt:hasGeneId",
+            "pubmed_id": "dct:references",
+            "created": "dct:created",
+            "comment": "rdfs:comment",
+            "version": "dct:hasVersion",
+        }
+    }
+    data: list[dict[str, int | str]] = []
+    for entry in sparql_query(query=query, endpoint=sparql_uri, format_type="json"):
+        data.append(
+            {
+                key: int(metadata.get("value"))
+                if metadata.get("value").isdigit()
+                else metadata.get("value")
+                for key, metadata in entry.items()
+            }
+        )
+    results["data"] = data
+    return results
+
+
+def delete_wiki_entries_by_id(
+    wiki_id: int,
+    sparql_user: str,
+    sparql_password: str,
+    sparql_auth_uri: str,
+    graph: str = "<http://genenetwork.org>",
+) -> str:
+    """Delete all wiki entries associated with a given ID."""
+    query = Template(
+        """
+$prefix
+
+DELETE WHERE {
+    GRAPH $graph {
+        ?comment dct:identifier \"$wiki_id\"^^xsd:integer .
+        ?comment ?p ?o .
+    }
+}
+"""
+    ).substitute(prefix=RDF_PREFIXES, graph=graph, wiki_id=wiki_id)
+    return update_rdf(
+        query=query,
+        sparql_user=sparql_user,
+        sparql_password=sparql_password,
+        sparql_auth_uri=sparql_auth_uri,
+    )
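The two new RDF helpers can be exercised as below; the SPARQL endpoint URLs and credentials are placeholders for whatever the deployment configures, not values from this commit:

from gn3.db.rdf.wiki import (
    delete_wiki_entries_by_id, get_rif_entries_by_symbol)

# SELECT-based fetch, so results are not truncated by Virtuoso's
# per-solution triple limit.
entries = get_rif_entries_by_symbol(
    "Shh", sparql_uri="http://localhost:8892/sparql")
for row in entries["data"]:
    print(row["species"], row["pubmed_id"], row["created"])

# Deletion runs against the authenticated endpoint and removes every
# triple attached to the given identifier.
delete_wiki_entries_by_id(
    1234,
    sparql_user="dba",
    sparql_password="dba",  # placeholder credentials
    sparql_auth_uri="http://localhost:8892/sparql-auth")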
diff --git a/gn3/db/sample_data.py b/gn3/db/sample_data.py
index 8db40e3..4e01a3a 100644
--- a/gn3/db/sample_data.py
+++ b/gn3/db/sample_data.py
@@ -59,20 +59,32 @@ def __extract_actions(
     return result
 
 
 def get_mrna_sample_data(
-    conn: Any, probeset_id: str, dataset_name: str
+    conn: Any, probeset_id: int, dataset_name: str, probeset_name: str = None  # type: ignore
 ) -> Dict:
     """Fetch a mRNA Assay (ProbeSet in the DB) trait's sample data and return it as a dict"""
     with conn.cursor() as cursor:
-        cursor.execute("""
-SELECT st.Name, ifnull(psd.value, 'x'), ifnull(psse.error, 'x'), ifnull(ns.count, 'x')
-FROM ProbeSetFreeze psf
-    JOIN ProbeSetXRef psx ON psx.ProbeSetFreezeId = psf.Id
-    JOIN ProbeSet ps ON ps.Id = psx.ProbeSetId
-    JOIN ProbeSetData psd ON psd.Id = psx.DataId
-    JOIN Strain st ON psd.StrainId = st.Id
-    LEFT JOIN ProbeSetSE psse ON psse.DataId = psd.Id AND psse.StrainId = psd.StrainId
-    LEFT JOIN NStrain ns ON ns.DataId = psd.Id AND ns.StrainId = psd.StrainId
-WHERE ps.Id = %s AND psf.Name= %s""", (probeset_id, dataset_name))
+        if probeset_name:
+            cursor.execute("""
+            SELECT st.Name, ifnull(psd.value, 'x'), ifnull(psse.error, 'x'), ifnull(ns.count, 'x')
+            FROM ProbeSetFreeze psf
+                JOIN ProbeSetXRef psx ON psx.ProbeSetFreezeId = psf.Id
+                JOIN ProbeSet ps ON ps.Id = psx.ProbeSetId
+                JOIN ProbeSetData psd ON psd.Id = psx.DataId
+                JOIN Strain st ON psd.StrainId = st.Id
+                LEFT JOIN ProbeSetSE psse ON psse.DataId = psd.Id AND psse.StrainId = psd.StrainId
+                LEFT JOIN NStrain ns ON ns.DataId = psd.Id AND ns.StrainId = psd.StrainId
+            WHERE ps.Name = %s AND psf.Name= %s""", (probeset_name, dataset_name))
+        else:
+            cursor.execute("""
+            SELECT st.Name, ifnull(psd.value, 'x'), ifnull(psse.error, 'x'), ifnull(ns.count, 'x')
+            FROM ProbeSetFreeze psf
+                JOIN ProbeSetXRef psx ON psx.ProbeSetFreezeId = psf.Id
+                JOIN ProbeSet ps ON ps.Id = psx.ProbeSetId
+                JOIN ProbeSetData psd ON psd.Id = psx.DataId
+                JOIN Strain st ON psd.StrainId = st.Id
+                LEFT JOIN ProbeSetSE psse ON psse.DataId = psd.Id AND psse.StrainId = psd.StrainId
+                LEFT JOIN NStrain ns ON ns.DataId = psd.Id AND ns.StrainId = psd.StrainId
+            WHERE ps.Id = %s AND psf.Name= %s""", (probeset_id, dataset_name))
 
     sample_data = {}
     for data in cursor.fetchall():
@@ -118,18 +130,28 @@ WHERE ps.Id = %s AND psf.Name= %s""", (probeset_id, dataset_name))
     return "\n".join(trait_csv)
 
 
 def get_pheno_sample_data(
-    conn: Any, trait_name: int, phenotype_id: int
+    conn: Any, trait_name: int, phenotype_id: int, group_id: int = None  # type: ignore
 ) -> Dict:
     """Fetch a phenotype (Publish in the DB) trait's sample data and return it as a dict"""
     with conn.cursor() as cursor:
-        cursor.execute("""
-SELECT st.Name, ifnull(pd.value, 'x'), ifnull(ps.error, 'x'), ifnull(ns.count, 'x')
-FROM PublishFreeze pf JOIN PublishXRef px ON px.InbredSetId = pf.InbredSetId
-    JOIN PublishData pd ON pd.Id = px.DataId JOIN Strain st ON pd.StrainId = st.Id
-    LEFT JOIN PublishSE ps ON ps.DataId = pd.Id AND ps.StrainId = pd.StrainId
-    LEFT JOIN NStrain ns ON ns.DataId = pd.Id AND ns.StrainId = pd.StrainId
-WHERE px.Id = %s AND px.PhenotypeId = %s
-ORDER BY st.Name""", (trait_name, phenotype_id))
+        if group_id:
+            cursor.execute("""
+            SELECT st.Name, ifnull(ROUND(pd.value, 2), 'x'), ifnull(ROUND(ps.error, 3), 'x'), ifnull(ns.count, 'x')
+            FROM PublishFreeze pf JOIN PublishXRef px ON px.InbredSetId = pf.InbredSetId
+                JOIN PublishData pd ON pd.Id = px.DataId JOIN Strain st ON pd.StrainId = st.Id
+                LEFT JOIN PublishSE ps ON ps.DataId = pd.Id AND ps.StrainId = pd.StrainId
+                LEFT JOIN NStrain ns ON ns.DataId = pd.Id AND ns.StrainId = pd.StrainId
+            WHERE px.Id = %s AND px.InbredSetId = %s
+            ORDER BY st.Name""", (trait_name, group_id))
+        else:
+            cursor.execute("""
+            SELECT st.Name, ifnull(pd.value, 'x'), ifnull(ps.error, 'x'), ifnull(ns.count, 'x')
+            FROM PublishFreeze pf JOIN PublishXRef px ON px.InbredSetId = pf.InbredSetId
+                JOIN PublishData pd ON pd.Id = px.DataId JOIN Strain st ON pd.StrainId = st.Id
+                LEFT JOIN PublishSE ps ON ps.DataId = pd.Id AND ps.StrainId = pd.StrainId
+                LEFT JOIN NStrain ns ON ns.DataId = pd.Id AND ns.StrainId = pd.StrainId
+            WHERE px.Id = %s AND px.PhenotypeId = %s
+            ORDER BY st.Name""", (trait_name, phenotype_id))
 
     sample_data = {}
     for data in cursor.fetchall():
@@ -302,8 +324,8 @@ def update_sample_data(
     if data_type == "mrna":
         strain_id, data_id, inbredset_id = get_mrna_sample_data_ids(
             conn=conn,
-            probeset_id=int(probeset_id),
-            dataset_name=dataset_name,
+            probeset_id=int(probeset_id),  # pylint: disable=[possibly-used-before-assignment]
+            dataset_name=dataset_name,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, original_data),
         )
         none_case_attrs = {
@@ -315,8 +337,8 @@ def update_sample_data(
     else:
         strain_id, data_id, inbredset_id = get_pheno_sample_data_ids(
             conn=conn,
-            publishxref_id=int(trait_name),
-            phenotype_id=phenotype_id,
+            publishxref_id=int(trait_name),  # pylint: disable=[possibly-used-before-assignment]
+            phenotype_id=phenotype_id,  # pylint: disable=[possibly-used-before-assignment]
            strain_name=extract_strain_name(csv_header, original_data),
        )
        none_case_attrs = {
@@ -422,8 +444,8 @@ def delete_sample_data(
     if data_type == "mrna":
         strain_id, data_id, inbredset_id = get_mrna_sample_data_ids(
             conn=conn,
-            probeset_id=int(probeset_id),
-            dataset_name=dataset_name,
+            probeset_id=int(probeset_id),  # pylint: disable=[possibly-used-before-assignment]
+            dataset_name=dataset_name,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, data),
         )
         none_case_attrs: Dict[str, Any] = {
@@ -435,8 +457,8 @@ def delete_sample_data(
     else:
         strain_id, data_id, inbredset_id = get_pheno_sample_data_ids(
             conn=conn,
-            publishxref_id=int(trait_name),
-            phenotype_id=phenotype_id,
+            publishxref_id=int(trait_name),  # pylint: disable=[possibly-used-before-assignment]
+            phenotype_id=phenotype_id,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, data),
         )
         none_case_attrs = {
@@ -528,8 +550,8 @@ def insert_sample_data(
     if data_type == "mrna":
         strain_id, data_id, inbredset_id = get_mrna_sample_data_ids(
             conn=conn,
-            probeset_id=int(probeset_id),
-            dataset_name=dataset_name,
+            probeset_id=int(probeset_id),  # pylint: disable=[possibly-used-before-assignment]
+            dataset_name=dataset_name,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, data),
         )
         none_case_attrs = {
@@ -541,8 +563,8 @@ def insert_sample_data(
     else:
         strain_id, data_id, inbredset_id = get_pheno_sample_data_ids(
             conn=conn,
-            publishxref_id=int(trait_name),
-            phenotype_id=phenotype_id,
+            publishxref_id=int(trait_name),  # pylint: disable=[possibly-used-before-assignment]
+            phenotype_id=phenotype_id,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, data),
         )
         none_case_attrs = {
@@ -584,3 +606,145 @@ def insert_sample_data(
         return count
     except Exception as _e:
         raise MySQLdb.Error(_e) from _e
+
+
+def batch_update_sample_data(
+        conn: Any, diff_data: Dict
+):
+    """Given sample data diffs, execute all relevant update/insert/delete queries"""
+    def __fetch_data_id(conn, db_type, trait_id, dataset_name):
+        with conn.cursor() as cursor:
+            if db_type == "Publish":
+                cursor.execute(
+                    (
+                        f"SELECT {db_type}XRef.DataId "
+                        f"FROM {db_type}XRef, {db_type}Freeze "
+                        f"WHERE {db_type}XRef.InbredSetId = {db_type}Freeze.InbredSetId AND "
+                        f"{db_type}XRef.Id = %s AND "
+                        f"{db_type}Freeze.Name = %s"
+                    ), (trait_id, dataset_name)
+                )
+            elif db_type == "ProbeSet":
+                cursor.execute(
+                    (
+                        f"SELECT {db_type}XRef.DataId "
+                        f"FROM {db_type}XRef, {db_type}, {db_type}Freeze "
+                        f"WHERE {db_type}XRef.InbredSetId = {db_type}Freeze.InbredSetId AND "
+                        f"{db_type}XRef.ProbeSetId = {db_type}.Id AND "
+                        f"{db_type}.Name = %s AND "
+                        f"{db_type}Freeze.Name = %s"
+                    ), (trait_id, dataset_name)
+                )
+            return cursor.fetchone()[0]
+
+    def __fetch_strain_id(conn, strain_name):
+        with conn.cursor() as cursor:
+            cursor.execute(
+                "SELECT Id FROM Strain WHERE Name = %s", (strain_name,)
+            )
+            return cursor.fetchone()[0]
+
+    def __update_query(conn, db_type, data_id, strain_id, diffs):
+        with conn.cursor() as cursor:
+            if 'value' in diffs:
+                cursor.execute(
+                    (
+                        f"UPDATE {db_type}Data "
+                        "SET value = %s "
+                        "WHERE Id = %s AND StrainId = %s"
+                    ), (diffs['value']['Current'], data_id, strain_id)
+                )
+            if 'error' in diffs:
+                cursor.execute(
+                    (
+                        f"UPDATE {db_type}SE "
+                        "SET error = %s "
+                        "WHERE DataId = %s AND StrainId = %s"
+                    ), (diffs['error']['Current'], data_id, strain_id)
+                )
+            if 'n_cases' in diffs:
+                cursor.execute(
+                    (
+                        "UPDATE NStrain "
+                        "SET count = %s "
+                        "WHERE DataId = %s AND StrainId = %s"
+                    ), (diffs['n_cases']['Current'], data_id, strain_id)
+                )
+
+        conn.commit()
+
+    def __insert_query(conn, db_type, data_id, strain_id, diffs):
+        with conn.cursor() as cursor:
+            if 'value' in diffs:
+                cursor.execute(
+                    (
+                        f"INSERT INTO {db_type}Data (Id, StrainId, value) "
+                        "VALUES (%s, %s, %s)"
+                    ), (data_id, strain_id, diffs['value'])
+                )
+            if 'error' in diffs:
+                cursor.execute(
+                    (
+                        f"INSERT INTO {db_type}SE (DataId, StrainId, error) "
+                        "VALUES (%s, %s, %s)"
+                    ), (data_id, strain_id, diffs['error'])
+                )
+            if 'n_cases' in diffs:
+                cursor.execute(
+                    (
+                        "INSERT INTO NStrain (DataId, StrainId, count) "
+                        "VALUES (%s, %s, %s)"
+                    ), (data_id, strain_id, diffs['n_cases'])
+                )
+
+        conn.commit()
+
+    def __delete_query(conn, db_type, data_id, strain_id, diffs):
+        with conn.cursor() as cursor:
+            if 'value' in diffs:
+                cursor.execute(
+                    (
+                        f"DELETE FROM {db_type}Data "
+                        "WHERE Id = %s AND StrainId = %s"
+                    ), (data_id, strain_id)
+                )
+            if 'error' in diffs:
+                cursor.execute(
+                    (
+                        f"DELETE FROM {db_type}SE "
+                        "WHERE DataId = %s AND StrainId = %s"
+                    ), (data_id, strain_id)
+                )
+            if 'n_cases' in diffs:
+                cursor.execute(
+                    (
+                        "DELETE FROM NStrain "
+                        "WHERE DataId = %s AND StrainId = %s"
+                    ), (data_id, strain_id)
+                )
+
+        conn.commit()
+
+    def __update_data(conn, db_type, data_id, diffs, update_type):
+        for strain in diffs:
+            strain_id = __fetch_strain_id(conn, strain)
+            if update_type == "update":
+                __update_query(conn, db_type, data_id, strain_id, diffs[strain])
+            elif update_type == "insert":
+                __insert_query(conn, db_type, data_id, strain_id, diffs[strain])
+            elif update_type == "delete":
+                __delete_query(conn, db_type, data_id, strain_id, diffs[strain])
+
+    for key in diff_data:
+        dataset, trait = key.split(":")
+        if "Publish" in dataset:
+            db_type = "Publish"
+        else:
+            db_type = "ProbeSet"
+
+        data_id = __fetch_data_id(conn, db_type, trait, dataset)
+
+        __update_data(conn, db_type, data_id, diff_data[key]['Modifications'], 'update')
+        __update_data(conn, db_type, data_id, diff_data[key]['Additions'], 'insert')
+        __update_data(conn, db_type, data_id, diff_data[key]['Deletions'], 'delete')
+
+    return diff_data
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index fa13fcc..fbac0da 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -3,7 +3,6 @@ import os
 from functools import reduce
 from typing import Any, Dict, Sequence
 
-from gn3.settings import TMPDIR
 from gn3.chancy import random_string
 from gn3.function_helpers import compose
 from gn3.db.datasets import retrieve_trait_dataset
@@ -690,7 +689,7 @@ def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tupl
     return {}
 
 
-def generate_traits_filename(base_path: str = TMPDIR):
+def generate_traits_filename(base_path: str):
     """Generate a unique filename for use with generated traits files."""
     return (
         f"{os.path.abspath(base_path)}/traits_test_file_{random_string(10)}.txt")
diff --git a/gn3/db/wiki.py b/gn3/db/wiki.py
index 0f46855..e702569 100644
--- a/gn3/db/wiki.py
+++ b/gn3/db/wiki.py
@@ -22,12 +22,20 @@ def _decode_dict(result: dict):
 def get_latest_comment(connection, comment_id: int) -> int:
     """ Latest comment is one with the highest versionId """
     cursor = connection.cursor(DictCursor)
-    query = """ SELECT versionId AS version, symbol, PubMed_ID AS pubmed_ids, sp.Name AS species,
-    comment, email, weburl, initial, reason
-    FROM `GeneRIF` gr
-    INNER JOIN Species sp USING(SpeciesId)
-    WHERE gr.Id = %s
-    ORDER BY versionId DESC LIMIT 1;
+    query = """SELECT versionId AS version,
+       symbol,
+       PubMed_ID AS pubmed_ids,
+       COALESCE(sp.Name, 'no specific species') AS species,
+       comment,
+       email,
+       weburl,
+       initial,
+       reason
+FROM `GeneRIF` gr
+LEFT JOIN Species sp USING(SpeciesId)
+WHERE gr.Id = %s
+ORDER BY versionId DESC
+LIMIT 1;
     """
     cursor.execute(query, (str(comment_id),))
     result = _decode_dict(cursor.fetchone())
@@ -48,6 +56,8 @@ def get_latest_comment(connection, comment_id: int) -> int:
 
 def get_species_id(cursor, species_name: str) -> int:
     """Find species id given species `Name`"""
+    if species_name.lower() == "no specific species":
+        return 0
     cursor.execute(
         "SELECT SpeciesID from Species WHERE Name = %s", (species_name,))
     species_ids = cursor.fetchall()
@@ -70,6 +80,14 @@ def get_next_comment_version(cursor, comment_id: int) -> int:
     return latest_version + 1
 
 
+def get_next_comment_id(cursor) -> int:
+    """Get the next GeneRIF.Id"""
+    cursor.execute(
+        "SELECT MAX(Id) from GeneRIF"
+    )
+    return cursor.fetchone()[0] + 1
+
+
 def get_categories_ids(cursor, categories: List[str]) -> List[int]:
     """Get the categories_ids from a list of category strings"""
     dict_cats = get_categories(cursor)
@@ -93,7 +111,7 @@ def get_categories(cursor) -> Dict[str, int]:
 
 def get_species(cursor) -> Dict[str, str]:
     """Get all species"""
-    cursor.execute("SELECT Name, SpeciesName from Species")
+    cursor.execute("SELECT Name, SpeciesName from Species ORDER BY Species.Id")
     raw_species = cursor.fetchall()
     dict_cats = dict(raw_species)
     return dict_cats
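For reference, batch_update_sample_data walks a diff payload keyed by "<dataset>:<trait>". The shape below is inferred from the queries in the new function; the dataset, trait, and strain names are hypothetical, and the connection settings are assumptions:

import MySQLdb

from gn3.db.sample_data import batch_update_sample_data

conn = MySQLdb.connect(db="db_webqtl", user="webqtl")  # assumed credentials
diff_data = {
    "BXDPublish:10007": {
        # Modifications carry Previous/Current pairs; Current is written.
        "Modifications": {"BXD1": {"value": {"Previous": 9.3, "Current": 9.4}}},
        # Additions carry plain values for the INSERT branch.
        "Additions": {"BXD5": {"value": 10.1, "error": 0.3, "n_cases": 6}},
        # Deletions only need the keys present; the values are ignored.
        "Deletions": {"BXD9": {"value": None}},
    }
}
batch_update_sample_data(conn, diff_data)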