Diffstat (limited to 'gn3/db')
-rw-r--r--  gn3/db/case_attributes.py | 487
-rw-r--r--  gn3/db/correlations.py    |   4
-rw-r--r--  gn3/db/datasets.py        |  15
-rw-r--r--  gn3/db/menu.py            |  15
-rw-r--r--  gn3/db/probesets.py       |  61
-rw-r--r--  gn3/db/rdf/wiki.py        | 158
-rw-r--r--  gn3/db/sample_data.py     | 228
-rw-r--r--  gn3/db/traits.py          |   3
-rw-r--r--  gn3/db/wiki.py            |  32
9 files changed, 792 insertions(+), 211 deletions(-)
diff --git a/gn3/db/case_attributes.py b/gn3/db/case_attributes.py
index bb15248..a55f2d8 100644
--- a/gn3/db/case_attributes.py
+++ b/gn3/db/case_attributes.py
@@ -1,126 +1,389 @@
 """Module that contains functions for editing case-attribute data"""
-from typing import Any, Optional, Tuple
+from pathlib import Path
+from typing import Optional
+from dataclasses import dataclass
+from enum import Enum, auto
 
 import json
-import MySQLdb
-
-
-def get_case_attributes(conn) -> Optional[Tuple]:
-    """Get all the case attributes from the database."""
-    with conn.cursor() as cursor:
-        cursor.execute("SELECT Id, Name, Description FROM CaseAttribute")
-        return cursor.fetchall()
-
-
-def get_unreviewed_diffs(conn: Any) -> Optional[tuple]:
-    """Fetch all case attributes in GN"""
-    with conn.cursor() as cursor:
-        cursor.execute(
-            "SELECT id, editor, json_diff_data FROM "
-            "caseattributes_audit WHERE status = 'review'"
-        )
-        return cursor.fetchall()
-
-
-def insert_case_attribute_audit(
-    conn: Any, status: str, author: str, data: str
-) -> int:
-    """Update the case_attribute_audit table"""
-    rowcount = 0
-    try:
-        with conn.cursor() as cursor:
-            cursor.execute(
-                "INSERT INTO caseattributes_audit "
-                "(status, editor, json_diff_data) "
-                "VALUES (%s, %s, %s)",
-                (status, author, data,),
+import pickle
+import lmdb
+
+
+class EditStatus(Enum):
+    """Enumeration for the status of the edits."""
+    review = auto()   # pylint: disable=[invalid-name]
+    approved = auto()  # pylint: disable=[invalid-name]
+    rejected = auto()  # pylint: disable=[invalid-name]
+
+    def __str__(self):
+        """Print out human-readable form."""
+        return self.name
+
+
+@dataclass
+class CaseAttributeEdit:
+    """Represents an edit operation for case attributes in the database.
+
+    Attributes:
+        - inbredset_id (int): The ID of the inbred set associated
+          with the edit.
+        - status (EditStatus): The status of this edit.
+        - user_id (str): The ID of the user performing the edit.
+        - changes (dict): A dictionary containing the changes to be
+          applied to the case attributes.
+
+    """
+    inbredset_id: int
+    status: EditStatus
+    user_id: str
+    changes: dict
+
+
+def queue_edit(cursor, directory: Path, edit: CaseAttributeEdit) -> Optional[int]:
+    """Queues a case attribute edit for review by inserting it into
+    the audit table and storing its review ID in an LMDB database.
+
+    Args:
+        cursor: A database cursor for executing SQL queries.
+        directory (Path): The base directory path for the LMDB database.
+        edit (CaseAttributeEdit): A dataclass containing the edit details, including
+            inbredset_id, status, user_id, and changes.
+
+    Returns:
+        Optional[int]: The ID of the `caseattributes_audit` row
+            created for this edit.
+    """
+    cursor.execute(
+        "INSERT INTO "
+        "caseattributes_audit(status, editor, json_diff_data) "
+        "VALUES (%s, %s, %s) "
+        "ON DUPLICATE KEY UPDATE status=%s",
+        (str(edit.status), edit.user_id,
+         json.dumps(edit.changes), str(EditStatus.review),))
+    directory.mkdir(parents=True, exist_ok=True)
+    env = lmdb.open(directory.as_posix(), map_size=8_000_000)  # 8 MB
+    with env.begin(write=True) as txn:
+        review_ids = set()
+        if reviews := txn.get(b"review"):
+            review_ids = pickle.loads(reviews)
+        _id = cursor.lastrowid
+        review_ids.add(_id)
+        txn.put(b"review", pickle.dumps(review_ids))
+        return _id
+
+
+def __fetch_case_attrs_changes__(cursor, change_ids: tuple) -> list:
+    """Fetches case attribute change records from the audit table for
+    given change IDs.
+
+    Retrieves records from the `caseattributes_audit` table for the
+    specified `change_ids`, including the editor, JSON diff data, and
+    timestamp. The JSON diff data is deserialized into a Python
+    dictionary for each record. Results are ordered by timestamp in
+    descending order (most recent first).
+
+    Args:
+        cursor: A MySQLdb DictCursor for executing SQL queries.
+        change_ids (tuple): A tuple of integers representing the IDs
+            of changes to fetch.
+
+    Returns:
+        list: A list of dictionaries, each containing the `id`, `editor`,
+            `json_diff_data` (as a deserialized dictionary), and `time_stamp`
+            for the matching change IDs. Returns an empty list if no records
+            are found.
+
+    Notes:
+        - An empty `change_ids` tuple yields an empty list.
+        - The `json_diff_data` column in `caseattributes_audit` is expected to contain valid
+          JSON strings, which are deserialized into dictionaries.
+        - The query uses parameterized placeholders to prevent SQL injection.
+        - This is an internal helper function (indicated by double underscores) used by
+          other functions like `get_changes`.
+
+    Raises:
+        json.JSONDecodeError: If any `json_diff_data` value cannot be deserialized.
+        TypeError: If `change_ids` contains non-integer values, potentially
+            causing a database error.
+
+    """
+    if not change_ids:
+        return []
+    placeholders = ','.join(['%s'] * len(change_ids))
+    cursor.execute(
+        "SELECT editor, json_diff_data, time_stamp "
+        f"FROM caseattributes_audit WHERE id IN ({placeholders}) "
+        "ORDER BY time_stamp DESC",
+        change_ids
+    )
+    results = cursor.fetchall()
+    for el in results:
+        el["json_diff_data"] = json.loads(el["json_diff_data"])
+    return results
+
+
+def view_change(cursor, change_id: int) -> dict:
+    """Queries the `caseattributes_audit` table to fetch the
+    `json_diff_data` column for the given `change_id`. The JSON data
+    is deserialized into a Python dictionary and returned.  If no
+    record is found or the `json_diff_data` is None, an empty
+    dictionary is returned.
+
+    Args:
+        cursor: A MySQLdb cursor for executing SQL queries.
+        change_id (int): The ID of the change to retrieve from the
+            `caseattributes_audit` table.
+
+    Returns:
+        dict: The deserialized JSON diff data as a dictionary if the
+              record exists and contains valid JSON; otherwise, an
+              empty dictionary.
+
+    Raises:
+        json.JSONDecodeError: If the `json_diff_data` cannot be
+            deserialized due to invalid JSON. A missing record is
+            handled by returning an empty dictionary.
+
+    """
+    cursor.execute(
+        "SELECT json_diff_data "
+        "FROM caseattributes_audit "
+        "WHERE id = %s",
+        (change_id,)
+    )
+    result = cursor.fetchone()
+    if result and result[0]:
+        return json.loads(result[0])
+    return {}
+
+
+def get_changes(cursor, change_type: EditStatus, directory: Path) -> dict:
+    """Retrieves case attribute changes for given lmdb data in
+    directory categorized by review status.
+
+    Fetches change IDs from an LMDB database, categorized into the
+    "data" key based on the EditStatus
+
+    Args:
+        - cursor: A MySQLdb cursor for executing SQL queries.
+        - change_type (EditStatus): The status of changes to retrieve
+          ('review', 'approved', or 'rejected').
+        - directory (Path): The base directory path for the LMDB
+          database.
+
+    Returns:
+        dict: A dictionary with three keys:
+            - 'change-type': the string form of `change_type`.
+            - 'count': A dictionary with counts of 'reviews',
+              'approvals' and 'rejections'.
+            - 'data': the json diff data of the matching changes,
+              keyed by change ID.
+
+    Raises:
+        json.JSONDecodeError: If any `json_diff_data` in the audit
+            table cannot be deserialized by
+            `__fetch_case_attrs_changes__`.
+        TypeError: If LMDB data cannot be deserialized, or if an
+            invalid `change_type` is given.
+
+    """
+    directory.mkdir(parents=True, exist_ok=True)
+    review_ids, approved_ids, rejected_ids = set(), set(), set()
+    env = lmdb.open(directory.as_posix(), map_size=8_000_000)  # 8 MB
+    with env.begin(write=False) as txn:
+        if reviews := txn.get(b"review"):
+            review_ids = pickle.loads(reviews)
+        if approvals := txn.get(b"approved"):
+            approved_ids = pickle.loads(approvals)
+        if rejections := txn.get(b"rejected"):
+            rejected_ids = pickle.loads(rejections)
+    changes = {}
+    match change_type:
+        case EditStatus.review:
+            # Key each row by its audit id; zipping the (unordered) id
+            # set against the time-ordered rows would pair them wrongly.
+            changes = dict(
+                (el["id"], el) for el in
+                __fetch_case_attrs_changes__(cursor, tuple(review_ids))
             )
-            rowcount = cursor.rowcount
-    except Exception as _e:
-        raise MySQLdb.Error(_e) from _e
-    return rowcount
-
-
-def reject_case_attribute(conn: Any, case_attr_audit_id: int) -> int:
-    """Given the id of the json_diff in the case_attribute_audit table, reject
-    it"""
-    rowcount = 0
-    try:
-        with conn.cursor() as cursor:
-            cursor.execute(
-                "UPDATE caseattributes_audit SET "
-                "status = 'rejected' WHERE id = %s",
-                (case_attr_audit_id,),
+        case EditStatus.approved:
+            changes = dict(
+                (el["id"], el) for el in
+                __fetch_case_attrs_changes__(cursor, tuple(approved_ids))
             )
-            rowcount = cursor.rowcount
-    except Exception as _e:
-        raise MySQLdb.Error(_e) from _e
-    return rowcount
+        case EditStatus.rejected:
+            changes = dict(
+                (el["id"], el) for el in
+                __fetch_case_attrs_changes__(cursor, tuple(rejected_ids)))
+        case _:
+            raise TypeError(f"Invalid change_type: {change_type}")
+    return {
+        "change-type": str(change_type),
+        "count": {
+            "reviews": len(review_ids),
+            "approvals": len(approved_ids),
+            "rejections": len(rejected_ids)
+        },
+        "data": changes
+    }
+
+
+# pylint: disable=[too-many-locals, too-many-branches]
+def apply_change(cursor, change_type: EditStatus, change_id: int, directory: Path) -> bool:
+    """Applies or rejects a case attribute change and updates its
+    status in the audit table and LMDB.
+
+    Processes a change identified by `change_id` based on the
+    specified `change_type` (approved or rejected). For approved
+    changes, applies modifications to the `CaseAttributeXRefNew` table
+    using bulk inserts and updates the audit status. For rejected
+    changes, updates the audit status only.  Manages change IDs in
+    LMDB by moving them from the 'review' set to either 'approved' or
+    'rejected' sets. Returns False if the `change_id` is not in the
+    review set.
+
+    Args:
+        cursor: A MySQLdb cursor for executing SQL queries.
+        change_type (EditStatus): The action to perform, either
+            `EditStatus.approved` or `EditStatus.rejected`.
+        change_id (int): The ID of the change to process,
+            corresponding to a record in `caseattributes_audit`.
+        directory (Path): The base directory path for the LMDB
+            database.
+
+    Returns:
+        bool: True if the change was successfully applied or rejected,
+            False if `change_id` is not found in the LMDB 'review'
+            set.
 
+    Notes:
+        - Opens an LMDB environment in the specified `directory` with
+          a map size of 8 MB.
+        - For `EditStatus.approved`, fetches `json_diff_data` from
+          `caseattributes_audit`, extracts modifications, and performs
+          bulk inserts into `CaseAttributeXRefNew` with `ON DUPLICATE
+          KEY UPDATE`.
+        - For `EditStatus.rejected`, updates the
+          `caseattributes_audit` status without modifying case
+          attributes.
+        - Uses bulk `SELECT` queries to fetch `StrainId` and
+          `CaseAttributeId` values efficiently.
+        - Assumes `CaseAttributeXRefNew` has a unique key on
+          `(InbredSetId, StrainId, CaseAttributeId)` for `ON DUPLICATE
+          KEY UPDATE`.
+        - The `json_diff_data` is expected to contain valid JSON with
+          an `inbredset_id` and `Modifications.Current` structure.
 
-def approve_case_attribute(conn: Any, case_attr_audit_id: int) -> int:
-    """Given the id of the json_diff in the case_attribute_audit table,
-    approve it
+    Raises:
+        ValueError: If `change_type` is neither `EditStatus.approved`
+            nor `EditStatus.rejected`.
+        json.JSONDecodeError: If `json_diff_data` cannot be
+            deserialized for approved changes.
+        TypeError: If `cursor.fetchone()` returns None for
+            `json_diff_data` or if `strain_id` or `caseattr_id` are
+            missing during bulk insert preparation.
+        pickle.UnpicklingError: If LMDB data (e.g., 'review' or
+            'approved' sets) cannot be deserialized.
 
     """
-    rowcount = 0
-    try:
-        with conn.cursor() as cursor:
-            cursor.execute(
-                "SELECT json_diff_data FROM caseattributes_audit "
-                "WHERE id = %s",
-                (case_attr_audit_id,),
-            )
-            diff_data = cursor.fetchone()
-            if diff_data:
-                diff_data = json.loads(diff_data[0])
-                # Insert (Most Important)
-                if diff_data.get("Insert"):
-                    data = diff_data.get("Insert")
+    review_ids, approved_ids, rejected_ids = set(), set(), set()
+    directory.mkdir(parents=True, exist_ok=True)
+    env = lmdb.open(directory.as_posix(), map_size=8_000_000)  # 8 MB
+    with env.begin(write=True) as txn:
+        if reviews := txn.get(b"review"):
+            review_ids = pickle.loads(reviews)
+        if change_id not in review_ids:
+            return False
+        match change_type:
+            case EditStatus.rejected:
+                cursor.execute(
+                    "UPDATE caseattributes_audit "
+                    "SET status = %s "
+                    "WHERE id = %s",
+                    (str(change_type), change_id))
+                if rejections := txn.get(b"rejected"):
+                    rejected_ids = pickle.loads(rejections)
+                rejected_ids.add(change_id)
+                review_ids.discard(change_id)
+                txn.put(b"review", pickle.dumps(review_ids))
+                txn.put(b"rejected", pickle.dumps(rejected_ids))
+                return True
+            case EditStatus.approved:
+                cursor.execute(
+                    "SELECT json_diff_data "
+                    "FROM caseattributes_audit WHERE "
+                    "id = %s",
+                    (change_id,)
+                )
+                result = cursor.fetchone()
+                if result is None:
+                    return False
+                json_diff_data = result[0]
+                json_diff_data = json.loads(json_diff_data)
+                inbredset_id = json_diff_data.get("inbredset_id")
+                modifications = json_diff_data.get(
+                    "Modifications", {}).get("Current", {})
+                strains = tuple(modifications.keys())
+                case_attrs = set()
+                for data in modifications.values():
+                    case_attrs.update(data.keys())
+
+                # Bulk fetch strain ids
+                strain_id_map = {}
+                if strains:
                     cursor.execute(
-                        "INSERT INTO CaseAttribute "
-                        "(Name, Description) VALUES "
-                        "(%s, %s)",
-                        (
-                            data.get("name").strip(),
-                            data.get("description").strip(),
-                        ),
+                        "SELECT Name, Id FROM Strain WHERE Name IN "
+                        f"({', '.join(['%s'] * len(strains))})",
+                        strains
                     )
-                # Delete
-                elif diff_data.get("Deletion"):
-                    data = diff_data.get("Deletion")
+                    for name, strain_id in cursor.fetchall():
+                        strain_id_map[name] = strain_id
+
+                # Bulk fetch case attr ids
+                caseattr_id_map = {}
+                if case_attrs:
                     cursor.execute(
-                        "DELETE FROM CaseAttribute WHERE Id = %s",
-                        (data.get("id"),),
+                        "SELECT Name, CaseAttributeId FROM CaseAttribute "
+                        "WHERE InbredSetId = %s AND Name IN "
+                        f"({', '.join(['%s'] * len(case_attrs))})",
+                        (inbredset_id, *case_attrs)
                     )
-                # Modification
-                elif diff_data.get("Modification"):
-                    data = diff_data.get("Modification")
-                    if desc_ := data.get("description"):
-                        cursor.execute(
-                            "UPDATE CaseAttribute SET "
-                            "Description = %s WHERE Id = %s",
-                            (
-                                desc_.get("Current"),
-                                diff_data.get("id"),
-                            ),
-                        )
-                    if name_ := data.get("name"):
-                        cursor.execute(
-                            "UPDATE CaseAttribute SET "
-                            "Name = %s WHERE Id = %s",
-                            (
-                                name_.get("Current"),
-                                diff_data.get("id"),
-                            ),
-                        )
-                if cursor.rowcount:
-                    cursor.execute(
-                        "UPDATE caseattributes_audit SET "
-                        "status = 'approved' WHERE id = %s",
-                        (case_attr_audit_id,),
+                    for name, caseattr_id in cursor.fetchall():
+                        caseattr_id_map[name] = caseattr_id
+
+                # Bulk insert data
+                insert_data = []
+                for strain, data in modifications.items():
+                    strain_id = strain_id_map.get(strain)
+                    for case_attr, value in data.items():
+                        insert_data.append({
+                            "inbredset_id": inbredset_id,
+                            "strain_id": strain_id,
+                            "caseattr_id": caseattr_id_map.get(case_attr),
+                            "value": value,
+                        })
+                if insert_data:
+                    cursor.executemany(
+                        "INSERT INTO CaseAttributeXRefNew "
+                        "(InbredSetId, StrainId, CaseAttributeId, Value) "
+                        "VALUES (%(inbredset_id)s, %(strain_id)s, %(caseattr_id)s, %(value)s) "
+                        "ON DUPLICATE KEY UPDATE Value = VALUES(Value)",
+                        insert_data
                     )
-            rowcount = cursor.rowcount
-    except Exception as _e:
-        raise MySQLdb.Error(_e) from _e
-    return rowcount
+
+                # Update LMDB and audit table
+                cursor.execute(
+                    "UPDATE caseattributes_audit "
+                    "SET status = %s "
+                    "WHERE id = %s",
+                    (str(change_type), change_id))
+                if approvals := txn.get(b"approved"):
+                    approved_ids = pickle.loads(approvals)
+                approved_ids.add(change_id)
+                review_ids.discard(change_id)
+                txn.put(b"review", pickle.dumps(review_ids))
+                txn.put(b"approved", pickle.dumps(approved_ids))
+                return True
+            case _:
+                raise ValueError(f"Unsupported change_type: {change_type}")
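
A minimal usage sketch of the new review workflow, assuming an open
MySQLdb connection `conn` and a scratch LMDB directory; the shape of
`changes` is inferred from what `apply_change` reads back out of
`json_diff_data` (an `inbredset_id` plus a `Modifications.Current`
mapping of strain -> {attribute: value}):

    from pathlib import Path
    from MySQLdb.cursors import DictCursor
    from gn3.db.case_attributes import (
        CaseAttributeEdit, EditStatus, apply_change, get_changes, queue_edit)

    lmdb_dir = Path("/tmp/case-attribute-edits")  # hypothetical location
    edit = CaseAttributeEdit(
        inbredset_id=1,
        status=EditStatus.review,
        user_id="editor@example.com",
        changes={
            "inbredset_id": 1,
            "Modifications": {
                "Current": {"BXD1": {"Sex": "F"}},  # strain -> {attr: value}
            },
        },
    )
    with conn.cursor() as cursor:  # plain cursor: apply_change indexes tuples
        change_id = queue_edit(cursor, lmdb_dir, edit)
        apply_change(cursor, EditStatus.approved, change_id, lmdb_dir)
    conn.commit()
    # get_changes reads rows by column name, so it wants a dict cursor:
    with conn.cursor(cursorclass=DictCursor) as cursor:
        approved = get_changes(cursor, EditStatus.approved, lmdb_dir)
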
diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py
index aec8eac..5d6cfb3 100644
--- a/gn3/db/correlations.py
+++ b/gn3/db/correlations.py
@@ -328,7 +328,7 @@ def build_temporary_tissue_correlations_table(
 
     return temp_table_name
 
-def fetch_tissue_correlations(# pylint: disable=R0913
+def fetch_tissue_correlations(# pylint: disable=[too-many-arguments, too-many-positional-arguments]
         dataset: dict, trait_symbol: str, probeset_freeze_id: int, method: str,
         return_number: int, conn: Any) -> dict:
     """
@@ -529,7 +529,7 @@ def __build_query__(
             f"ORDER BY {db_type}.Id"),
         1)
 
-# pylint: disable=too-many-arguments
+# pylint: disable=[too-many-arguments, too-many-positional-arguments]
 def __fetch_data__(
         conn, sample_ids: tuple, db_name: str, db_type: str, method: str,
         temp_table: Optional[str]) -> Tuple[Tuple[Any], int]:
diff --git a/gn3/db/datasets.py b/gn3/db/datasets.py
index f3b4f9f..fea207b 100644
--- a/gn3/db/datasets.py
+++ b/gn3/db/datasets.py
@@ -79,6 +79,21 @@ def retrieve_mrna_group_name(connection: Any, probeset_id: int, dataset_name: st
             return res[0]
         return None
 
+def retrieve_group_id(connection: Any, group_name: str):
+    """
+    Given the group name, retrieve the group ID
+    """
+    query = (
+        "SELECT iset.Id "
+        "FROM InbredSet AS iset "
+        "WHERE iset.Name = %(group_name)s")
+    with connection.cursor() as cursor:
+        cursor.execute(query, {"group_name": group_name})
+        res = cursor.fetchone()
+        if res:
+            return res[0]
+        return None
+
 def retrieve_phenotype_group_name(connection: Any, dataset_id: int):
     """
     Given the dataset id (PublishFreeze.Id in the database), retrieve the name
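
A short sketch of the new helper, assuming an open connection `conn`
and that "BXD" exists in InbredSet:

    from gn3.db.datasets import retrieve_group_id

    group_id = retrieve_group_id(conn, "BXD")
    if group_id is None:
        ...  # unknown group name
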
diff --git a/gn3/db/menu.py b/gn3/db/menu.py
index 8dccabf..34dedde 100644
--- a/gn3/db/menu.py
+++ b/gn3/db/menu.py
@@ -1,10 +1,12 @@
 """Menu generation code for the data in the dropdowns in the index page."""
-
+import logging
 from typing import Tuple
 from functools import reduce
 
 from gn3.db.species import get_all_species
 
+logger = logging.getLogger(__name__)
+
 def gen_dropdown_json(conn):
     """
     Generates and outputs (as json file) the data for the main dropdown menus on
@@ -14,10 +16,12 @@ def gen_dropdown_json(conn):
     groups = get_groups(conn, tuple(row[0] for row in species))
     types = get_types(conn, groups)
     datasets = get_datasets(conn, types)
-    return dict(species=species,
-                groups=groups,
-                types=types,
-                datasets=datasets)
+    return {
+        "species": species,
+        "groups": groups,
+        "types": types,
+        "datasets": datasets
+    }
 
 def get_groups(conn, species_names: Tuple[str, ...]):
     """Build groups list"""
@@ -35,6 +39,7 @@ def get_groups(conn, species_names: Tuple[str, ...]):
             "IFNULL(InbredSet.Family, InbredSet.FullName) ASC, "
             "InbredSet.FullName ASC, "
             "InbredSet.MenuOrderId ASC")
+        logger.debug("'get_groups' QUERY: %s, %s", query, species_names)
         cursor.execute(query, tuple(species_names))
         results = cursor.fetchall()
 
diff --git a/gn3/db/probesets.py b/gn3/db/probesets.py
index 910f05b..e725add 100644
--- a/gn3/db/probesets.py
+++ b/gn3/db/probesets.py
@@ -8,6 +8,9 @@ from gn3.db_utils import Connection as DBConnection
 
 from .query_tools import mapping_to_query_columns
 
+
+# pylint: disable = line-too-long
+
 @dataclass(frozen=True)
 class Probeset: # pylint: disable=[too-many-instance-attributes]
     """Data Type that represents a Probeset"""
@@ -40,40 +43,42 @@ class Probeset: # pylint: disable=[too-many-instance-attributes]
 # Mapping from the Phenotype dataclass to the actual column names in the
 # database
 probeset_mapping = {
-    "id_": "Id",
-    "name": "Name",
-    "symbol": "symbol",
-    "description": "description",
-    "probe_target_description": "Probe_Target_Description",
-    "chr_": "Chr",
-    "mb": "Mb",
-    "alias": "alias",
-    "geneid": "GeneId",
-    "homologeneid": "HomoloGeneID",
-    "unigeneid": "UniGeneId",
-    "omim": "OMIM",
-    "refseq_transcriptid": "RefSeq_TranscriptId",
-    "blatseq": "BlatSeq",
-    "targetseq": "TargetSeq",
-    "strand_probe": "Strand_Probe",
-    "probe_set_target_region": "Probe_set_target_region",
-    "probe_set_specificity": "Probe_set_specificity",
-    "probe_set_blat_score": "Probe_set_BLAT_score",
-    "probe_set_blat_mb_start": "Probe_set_Blat_Mb_start",
-    "probe_set_blat_mb_end": "Probe_set_Blat_Mb_end",
-    "probe_set_strand": "Probe_set_strand",
-    "probe_set_note_by_rw": "Probe_set_Note_by_RW",
-    "flag": "flag"
+    "id_": "ProbeSet.Id",
+    "name": "ProbeSet.Name",
+    "symbol": "ProbeSet.symbol",
+    "description": "ProbeSet.description",
+    "probe_target_description": "ProbeSet.Probe_Target_Description",
+    "chr_": "ProbeSet.Chr",
+    "mb": "ProbeSet.Mb",
+    "alias": "ProbeSet.alias",
+    "geneid": "ProbeSet.GeneId",
+    "homologeneid": "ProbeSet.HomoloGeneID",
+    "unigeneid": "ProbeSet.UniGeneId",
+    "omim": "ProbeSet.OMIM",
+    "refseq_transcriptid": "ProbeSet.RefSeq_TranscriptId",
+    "blatseq": "ProbeSet.BlatSeq",
+    "targetseq": "ProbeSet.TargetSeq",
+    "strand_probe": "ProbeSet.Strand_Probe",
+    "probe_set_target_region": "ProbeSet.Probe_set_target_region",
+    "probe_set_specificity": "ProbeSet.Probe_set_specificity",
+    "probe_set_blat_score": "ProbeSet.Probe_set_BLAT_score",
+    "probe_set_blat_mb_start": "ProbeSet.Probe_set_Blat_Mb_start",
+    "probe_set_blat_mb_end": "ProbeSet.Probe_set_Blat_Mb_end",
+    "probe_set_strand": "ProbeSet.Probe_set_strand",
+    "probe_set_note_by_rw": "ProbeSet.Probe_set_Note_by_RW",
+    "flag": "ProbeSet.flag"
 }
 
-def fetch_probeset_metadata_by_name(conn: DBConnection, name: str) -> dict:
+def fetch_probeset_metadata_by_name(conn: DBConnection, trait_name: str, dataset_name: str) -> dict:
     """Fetch a ProbeSet's metadata by its `name`."""
     with conn.cursor(cursorclass=DictCursor) as cursor:
         cols = ", ".join(mapping_to_query_columns(probeset_mapping))
         cursor.execute((f"SELECT {cols} "
-                        "FROM ProbeSet "
-                        "WHERE Name = %(name)s"),
-                       {"name": name})
+                        "FROM ProbeSetFreeze "
+                        "INNER JOIN ProbeSetXRef ON ProbeSetXRef.`ProbeSetFreezeId` = ProbeSetFreeze.`Id` "
+                        "INNER JOIN ProbeSet ON ProbeSet.`Id` = ProbeSetXRef.`ProbeSetId` "
+                        "WHERE ProbeSet.Name = %(trait_name)s AND ProbeSetFreeze.Name = %(ds_name)s"),
+                       {"trait_name": trait_name, "ds_name": dataset_name})
         return cursor.fetchone()
 
 def update_probeset(conn, probeset_id, data:dict) -> int:
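
Since probeset names are only unique within a ProbeSetFreeze, callers
now pass the dataset name as well. A sketch with hypothetical trait
and dataset names, assuming an open connection `conn`:

    from gn3.db.probesets import fetch_probeset_metadata_by_name

    metadata = fetch_probeset_metadata_by_name(
        conn, trait_name="1427571_at", dataset_name="HC_M2_0606_P")
    symbol = metadata["symbol"] if metadata else None  # dict row or None
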
diff --git a/gn3/db/rdf/wiki.py b/gn3/db/rdf/wiki.py
index b2b301a..dd8d204 100644
--- a/gn3/db/rdf/wiki.py
+++ b/gn3/db/rdf/wiki.py
@@ -15,6 +15,7 @@ from gn3.db.rdf import (
     RDF_PREFIXES,
     query_frame_and_compact,
     update_rdf,
+    sparql_query,
 )
 
 
@@ -41,6 +42,10 @@ def __sanitize_result(result: dict) -> dict:
     if not result:
         return {}
     categories = result.get("categories")
+    if (version := result.get("version")) and isinstance(version, str):
+        result["version"] = int(version)
+    if (wiki_id := result.get("id")) and isinstance(wiki_id, str):
+        result["id"] = int(wiki_id)
     if isinstance(categories, str):
         result["categories"] = [categories] if categories else []
     result["categories"] = sorted(result["categories"])
@@ -79,7 +84,7 @@ CONSTRUCT {
              gnt:belongsToCategory ?category ;
              gnt:hasVersion ?max ;
              dct:created ?created ;
-             dct:identifier ?id_ .
+             dct:identifier ?id .
 } FROM $graph WHERE {
     ?comment rdfs:label ?text_ ;
              gnt:symbol ?symbol ;
@@ -88,12 +93,12 @@ CONSTRUCT {
              dct:created ?createTime .
     FILTER ( LCASE(STR(?symbol)) = LCASE("$symbol") ) .
     {
-        SELECT (MAX(?vers) AS ?max) ?id_ WHERE {
+        SELECT (MAX(?vers) AS ?max_) ?id_ WHERE {
             ?comment dct:identifier ?id_ ;
                      dct:hasVersion ?vers .
         }
     }
-    ?comment dct:hasVersion ?max .
+    ?comment dct:hasVersion ?max_ .
     OPTIONAL { ?comment gnt:reason ?reason_ } .
     OPTIONAL {
         ?comment gnt:belongsToSpecies ?speciesId .
@@ -106,6 +111,8 @@ CONSTRUCT {
     OPTIONAL { ?comment gnt:belongsToCategory ?category_ } .
     BIND (str(?createTime) AS ?created) .
     BIND (str(?text_) AS ?text) .
+    BIND (str(?max_) AS ?max) .
+    BIND (str(?id_) AS ?id) .
     BIND (STR(COALESCE(?pmid_, "")) AS ?pmid) .
     BIND (COALESCE(?reason_, "") AS ?reason) .
     BIND (STR(COALESCE(?weburl_, "")) AS ?weburl) .
@@ -154,7 +161,7 @@ CONSTRUCT {
              rdfs:label ?text_ ;
              gnt:symbol ?symbol ;
              dct:created ?createTime ;
-             dct:hasVersion ?version ;
+             dct:hasVersion ?version_ ;
              dct:identifier $comment_id .
     OPTIONAL { ?comment gnt:reason ?reason_ } .
     OPTIONAL {
@@ -167,6 +174,7 @@ CONSTRUCT {
     OPTIONAL { ?comment foaf:mbox ?email_ . } .
     OPTIONAL { ?comment gnt:belongsToCategory ?category_ . } .
     BIND (str(?text_) AS ?text) .
+    BIND (str(?version_) AS ?version) .
     BIND (str(?createTime) AS ?created) .
     BIND (STR(COALESCE(?pmid_, "")) AS ?pmid) .
     BIND (COALESCE(?reason_, "") AS ?reason) .
@@ -186,38 +194,42 @@ CONSTRUCT {
 
 
 def update_wiki_comment(
-        insert_dict: dict,
-        sparql_user: str,
-        sparql_password: str,
-        sparql_auth_uri: str,
-        graph: str = "<http://genenetwork.org>",
+    insert_dict: dict,
+    sparql_user: str,
+    sparql_password: str,
+    sparql_auth_uri: str,
+    graph: str = "<http://genenetwork.org>",
 ) -> str:
     """Update a wiki comment by inserting a comment with the same
-identifier but an updated version id.
+    identifier but an updated version id.
     """
     name = f"gn:wiki-{insert_dict['Id']}-{insert_dict['versionId']}"
-    comment_triple = Template("""$name rdfs:label '''$comment'''@en ;
+    comment_triple = Template(
+        """$name rdfs:label '''$comment'''@en ;
 rdf:type gnc:GNWikiEntry ;
 gnt:symbol "$symbol" ;
 dct:identifier "$comment_id"^^xsd:integer ;
 dct:hasVersion "$next_version"^^xsd:integer ;
 dct:created "$created"^^xsd:datetime .
-""").substitute(
+"""
+    ).substitute(
         comment=insert_dict["comment"],
-        name=name, symbol=insert_dict['symbol'],
-        comment_id=insert_dict["Id"], next_version=insert_dict["versionId"],
-        created=insert_dict["createtime"])
+        name=name,
+        symbol=insert_dict["symbol"],
+        comment_id=insert_dict["Id"],
+        next_version=insert_dict["versionId"],
+        created=insert_dict["createtime"],
+    )
     using = ""
     if insert_dict["email"]:
         comment_triple += f"{name} foaf:mbox <{insert_dict['email']}> .\n"
     if insert_dict["initial"]:
         comment_triple += f"{name} gnt:initial \"{insert_dict['initial']}\" .\n"
-    if insert_dict["species"]:
+    if insert_dict["species"] and insert_dict["species"].lower() != "no specific species":
         comment_triple += f"{name} gnt:belongsToSpecies ?speciesId .\n"
         using = Template(
-            """ USING $graph WHERE { ?speciesId gnt:shortName "$species" . } """).substitute(
-                graph=graph, species=insert_dict["species"]
-        )
+            """ USING $graph WHERE { ?speciesId gnt:shortName "$species" . } """
+        ).substitute(graph=graph, species=insert_dict["species"])
     if insert_dict["reason"]:
         comment_triple += f"{name} gnt:reason \"{insert_dict['reason']}\" .\n"
     if insert_dict["weburl"]:
@@ -236,10 +248,110 @@ INSERT {
 GRAPH $graph {
 $comment_triple}
 } $using
-""").substitute(prefix=RDF_PREFIXES,
-                graph=graph,
-                comment_triple=comment_triple,
-                using=using),
+"""
+        ).substitute(
+            prefix=RDF_PREFIXES, graph=graph, comment_triple=comment_triple, using=using
+        ),
+        sparql_user=sparql_user,
+        sparql_password=sparql_password,
+        sparql_auth_uri=sparql_auth_uri,
+    )
+
+
+def get_rif_entries_by_symbol(
+    symbol: str, sparql_uri: str, graph: str = "<http://genenetwork.org>"
+) -> dict:
+    """Fetch NCBI RIF entries for a given symbol (case-insensitive).
+
+This function retrieves NCBI RIF entries using a SPARQL `SELECT` query
+instead of a `CONSTRUCT` to avoid truncation.  The Virtuoso SPARQL
+engine limits query results to 1,048,576 triples per solution, and
+NCBI entries can exceed this limit.  Since there may be more than
+2,000 entries, which could result in the number of triples surpassing
+the limit, `SELECT` is used to ensure complete data retrieval without
+truncation.  See:
+
+<https://community.openlinksw.com/t/sparql-query-limiting-results-to-100000-triples/2131>
+
+    """
+    # XXX: Consider pagination
+    query = Template(
+        """
+$prefix
+
+SELECT ?comment ?symbol ?species ?pubmed_id ?version ?created ?gene_id ?taxonomic_id
+FROM $graph WHERE {
+    ?comment_id rdfs:label ?text_ ;
+                gnt:symbol ?symbol ;
+                rdf:type gnc:NCBIWikiEntry ;
+                gnt:hasGeneId ?gene_id_ ;
+                dct:hasVersion ?version ;
+                dct:references ?pmid_ ;
+                dct:created ?createTime ;
+                gnt:belongsToSpecies ?speciesId .
+    ?speciesId rdfs:label ?species .
+    FILTER ( LCASE(?symbol) = LCASE("$symbol") ) .
+    OPTIONAL { ?comment_id skos:notation ?taxonId_ . } .
+    BIND (STR(?text_) AS ?comment) .
+    BIND (xsd:integer(STRAFTER(STR(?taxonId_), STR(taxon:))) AS ?taxonomic_id) .
+    BIND (xsd:integer(STRAFTER(STR(?pmid_), STR(pubmed:))) AS ?pubmed_id) .
+    BIND (xsd:integer(STRAFTER(STR(?gene_id_), STR(generif:))) AS ?gene_id) .
+    BIND (STR(?createTime) AS ?created) .
+} ORDER BY ?species ?createTime
+"""
+    ).substitute(prefix=RDF_PREFIXES, graph=graph, symbol=symbol)
+    results: dict[str, dict | list] = {
+        "@context": {
+            "dct": "http://purl.org/dc/terms/",
+            "gnt": "http://genenetwork.org/term/",
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+            "skos": "http://www.w3.org/2004/02/skos/core#",
+            "symbol": "gnt:symbol",
+            "species": "gnt:species",
+            "taxonomic_id": "skos:notation",
+            "gene_id": "gnt:hasGeneId",
+            "pubmed_id": "dct:references",
+            "created": "dct:created",
+            "comment": "rdfs:comment",
+            "version": "dct:hasVersion",
+        }
+    }
+    data: list[dict[str, int | str]] = []
+    for entry in sparql_query(query=query, endpoint=sparql_uri, format_type="json"):
+        data.append(
+            {
+                key: int(metadata.get("value"))
+                if metadata.get("value").isdigit()
+                else metadata.get("value")
+                for key, metadata in entry.items()
+            }
+        )
+    results["data"] = data
+    return results
+
+
+def delete_wiki_entries_by_id(
+    wiki_id: int,
+    sparql_user: str,
+    sparql_password: str,
+    sparql_auth_uri: str,
+    graph: str = "<http://genenetwork.org>",
+) -> str:
+    """Delete all wiki entries associated with a given ID."""
+    query = Template(
+        """
+$prefix
+
+DELETE WHERE {
+    GRAPH $graph {
+        ?comment dct:identifier \"$wiki_id\"^^xsd:integer .
+        ?comment ?p ?o .
+    }
+}
+"""
+    ).substitute(prefix=RDF_PREFIXES, graph=graph, wiki_id=wiki_id)
+    return update_rdf(
+        query=query,
         sparql_user=sparql_user,
         sparql_password=sparql_password,
         sparql_auth_uri=sparql_auth_uri,
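
A sketch of the two new entry points; the endpoint URLs, credentials
and wiki id below are placeholders for a local Virtuoso instance:

    from gn3.db.rdf.wiki import (
        delete_wiki_entries_by_id, get_rif_entries_by_symbol)

    entries = get_rif_entries_by_symbol(
        "Shh", sparql_uri="http://localhost:8890/sparql")
    for rif in entries["data"]:
        print(rif["species"], rif["pubmed_id"], rif["created"])

    delete_wiki_entries_by_id(
        1234,  # hypothetical wiki id
        sparql_user="dba",
        sparql_password="dba",
        sparql_auth_uri="http://localhost:8890/sparql-auth")
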
diff --git a/gn3/db/sample_data.py b/gn3/db/sample_data.py
index 8db40e3..4e01a3a 100644
--- a/gn3/db/sample_data.py
+++ b/gn3/db/sample_data.py
@@ -59,20 +59,32 @@ def __extract_actions(
     return result
 
 def get_mrna_sample_data(
-    conn: Any, probeset_id: str, dataset_name: str
+    conn: Any, probeset_id: int, dataset_name: str, probeset_name: str = None  # type: ignore
 ) -> Dict:
     """Fetch a mRNA Assay (ProbeSet in the DB) trait's sample data and return it as a dict"""
     with conn.cursor() as cursor:
-        cursor.execute("""
-SELECT st.Name, ifnull(psd.value, 'x'), ifnull(psse.error, 'x'), ifnull(ns.count, 'x')
-FROM ProbeSetFreeze psf
-    JOIN ProbeSetXRef psx ON psx.ProbeSetFreezeId = psf.Id
-    JOIN ProbeSet ps ON ps.Id = psx.ProbeSetId
-    JOIN ProbeSetData psd ON psd.Id = psx.DataId
-    JOIN Strain st ON psd.StrainId = st.Id
-    LEFT JOIN ProbeSetSE psse ON psse.DataId = psd.Id AND psse.StrainId = psd.StrainId
-    LEFT JOIN NStrain ns ON ns.DataId = psd.Id AND ns.StrainId = psd.StrainId
-WHERE ps.Id = %s AND psf.Name= %s""", (probeset_id, dataset_name))
+        if probeset_name:
+            cursor.execute("""
+    SELECT st.Name, ifnull(psd.value, 'x'), ifnull(psse.error, 'x'), ifnull(ns.count, 'x')
+    FROM ProbeSetFreeze psf
+        JOIN ProbeSetXRef psx ON psx.ProbeSetFreezeId = psf.Id
+        JOIN ProbeSet ps ON ps.Id = psx.ProbeSetId
+        JOIN ProbeSetData psd ON psd.Id = psx.DataId
+        JOIN Strain st ON psd.StrainId = st.Id
+        LEFT JOIN ProbeSetSE psse ON psse.DataId = psd.Id AND psse.StrainId = psd.StrainId
+        LEFT JOIN NStrain ns ON ns.DataId = psd.Id AND ns.StrainId = psd.StrainId
+    WHERE ps.Name = %s AND psf.Name= %s""", (probeset_name, dataset_name))
+        else:
+            cursor.execute("""
+    SELECT st.Name, ifnull(psd.value, 'x'), ifnull(psse.error, 'x'), ifnull(ns.count, 'x')
+    FROM ProbeSetFreeze psf
+        JOIN ProbeSetXRef psx ON psx.ProbeSetFreezeId = psf.Id
+        JOIN ProbeSet ps ON ps.Id = psx.ProbeSetId
+        JOIN ProbeSetData psd ON psd.Id = psx.DataId
+        JOIN Strain st ON psd.StrainId = st.Id
+        LEFT JOIN ProbeSetSE psse ON psse.DataId = psd.Id AND psse.StrainId = psd.StrainId
+        LEFT JOIN NStrain ns ON ns.DataId = psd.Id AND ns.StrainId = psd.StrainId
+    WHERE ps.Id = %s AND psf.Name= %s""", (probeset_id, dataset_name))
 
         sample_data = {}
         for data in cursor.fetchall():
@@ -118,18 +130,28 @@ WHERE ps.Id = %s AND psf.Name= %s""", (probeset_id, dataset_name))
         return "\n".join(trait_csv)
 
 def get_pheno_sample_data(
-    conn: Any, trait_name: int, phenotype_id: int
+    conn: Any, trait_name: int, phenotype_id: int, group_id: int = None  # type: ignore
 ) -> Dict:
     """Fetch a phenotype (Publish in the DB) trait's sample data and return it as a dict"""
     with conn.cursor() as cursor:
-        cursor.execute("""
-SELECT st.Name, ifnull(pd.value, 'x'), ifnull(ps.error, 'x'), ifnull(ns.count, 'x')
-FROM PublishFreeze pf JOIN PublishXRef px ON px.InbredSetId = pf.InbredSetId
-     JOIN PublishData pd ON pd.Id = px.DataId JOIN Strain st ON pd.StrainId = st.Id
-     LEFT JOIN PublishSE ps ON ps.DataId = pd.Id AND ps.StrainId = pd.StrainId
-     LEFT JOIN NStrain ns ON ns.DataId = pd.Id AND ns.StrainId = pd.StrainId
-WHERE px.Id = %s AND px.PhenotypeId = %s
-ORDER BY st.Name""", (trait_name, phenotype_id))
+        if group_id:
+            cursor.execute("""
+    SELECT st.Name, ifnull(ROUND(pd.value, 2), 'x'), ifnull(ROUND(ps.error, 3), 'x'), ifnull(ns.count, 'x')
+    FROM PublishFreeze pf JOIN PublishXRef px ON px.InbredSetId = pf.InbredSetId
+        JOIN PublishData pd ON pd.Id = px.DataId JOIN Strain st ON pd.StrainId = st.Id
+        LEFT JOIN PublishSE ps ON ps.DataId = pd.Id AND ps.StrainId = pd.StrainId
+        LEFT JOIN NStrain ns ON ns.DataId = pd.Id AND ns.StrainId = pd.StrainId
+    WHERE px.Id = %s AND px.InbredSetId = %s
+    ORDER BY st.Name""", (trait_name, group_id))
+        else:
+            cursor.execute("""
+    SELECT st.Name, ifnull(pd.value, 'x'), ifnull(ps.error, 'x'), ifnull(ns.count, 'x')
+    FROM PublishFreeze pf JOIN PublishXRef px ON px.InbredSetId = pf.InbredSetId
+        JOIN PublishData pd ON pd.Id = px.DataId JOIN Strain st ON pd.StrainId = st.Id
+        LEFT JOIN PublishSE ps ON ps.DataId = pd.Id AND ps.StrainId = pd.StrainId
+        LEFT JOIN NStrain ns ON ns.DataId = pd.Id AND ns.StrainId = pd.StrainId
+    WHERE px.Id = %s AND px.PhenotypeId = %s
+    ORDER BY st.Name""", (trait_name, phenotype_id))
 
         sample_data = {}
         for data in cursor.fetchall():
@@ -302,8 +324,8 @@ def update_sample_data(
     if data_type == "mrna":
         strain_id, data_id, inbredset_id = get_mrna_sample_data_ids(
             conn=conn,
-            probeset_id=int(probeset_id),
-            dataset_name=dataset_name,
+            probeset_id=int(probeset_id),  # pylint: disable=[possibly-used-before-assignment]
+            dataset_name=dataset_name,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, original_data),
         )
         none_case_attrs = {
@@ -315,8 +337,8 @@ def update_sample_data(
     else:
         strain_id, data_id, inbredset_id = get_pheno_sample_data_ids(
             conn=conn,
-            publishxref_id=int(trait_name),
-            phenotype_id=phenotype_id,
+            publishxref_id=int(trait_name),  # pylint: disable=[possibly-used-before-assignment]
+            phenotype_id=phenotype_id,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, original_data),
         )
         none_case_attrs = {
@@ -422,8 +444,8 @@ def delete_sample_data(
     if data_type == "mrna":
         strain_id, data_id, inbredset_id = get_mrna_sample_data_ids(
             conn=conn,
-            probeset_id=int(probeset_id),
-            dataset_name=dataset_name,
+            probeset_id=int(probeset_id),  # pylint: disable=[possibly-used-before-assignment]
+            dataset_name=dataset_name,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, data),
         )
         none_case_attrs: Dict[str, Any] = {
@@ -435,8 +457,8 @@ def delete_sample_data(
     else:
         strain_id, data_id, inbredset_id = get_pheno_sample_data_ids(
             conn=conn,
-            publishxref_id=int(trait_name),
-            phenotype_id=phenotype_id,
+            publishxref_id=int(trait_name),  # pylint: disable=[possibly-used-before-assignment]
+            phenotype_id=phenotype_id,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, data),
         )
         none_case_attrs = {
@@ -528,8 +550,8 @@ def insert_sample_data(
     if data_type == "mrna":
         strain_id, data_id, inbredset_id = get_mrna_sample_data_ids(
             conn=conn,
-            probeset_id=int(probeset_id),
-            dataset_name=dataset_name,
+            probeset_id=int(probeset_id),  # pylint: disable=[possibly-used-before-assignment]
+            dataset_name=dataset_name,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, data),
         )
         none_case_attrs = {
@@ -541,8 +563,8 @@ def insert_sample_data(
     else:
         strain_id, data_id, inbredset_id = get_pheno_sample_data_ids(
             conn=conn,
-            publishxref_id=int(trait_name),
-            phenotype_id=phenotype_id,
+            publishxref_id=int(trait_name),  # pylint: disable=[possibly-used-before-assignment]
+            phenotype_id=phenotype_id,  # pylint: disable=[possibly-used-before-assignment]
             strain_name=extract_strain_name(csv_header, data),
         )
         none_case_attrs = {
@@ -584,3 +606,145 @@ def insert_sample_data(
         return count
     except Exception as _e:
         raise MySQLdb.Error(_e) from _e
+
+def batch_update_sample_data(
+    conn: Any, diff_data: Dict
+):
+    """Given sample data diffs, execute all relevant update/insert/delete queries"""
+    def __fetch_data_id(conn, db_type, trait_id, dataset_name):
+        with conn.cursor() as cursor:
+            if db_type == "Publish":
+                cursor.execute(
+                    (
+                        f"SELECT {db_type}XRef.DataId "
+                        f"FROM {db_type}XRef, {db_type}Freeze "
+                        f"WHERE {db_type}XRef.InbredSetId = {db_type}Freeze.InbredSetId AND "
+                        f"{db_type}XRef.Id = %s AND "
+                        f"{db_type}Freeze.Name = %s"
+                    ), (trait_id, dataset_name)
+                )
+            elif db_type == "ProbeSet":
+                cursor.execute(
+                    (
+                        f"SELECT {db_type}XRef.DataId "
+                        f"FROM {db_type}XRef, {db_type}, {db_type}Freeze "
+                        f"WHERE {db_type}XRef.InbredSetId = {db_type}Freeze.InbredSetId AND "
+                        f"{db_type}XRef.ProbeSetId = {db_type}.Id AND "
+                        f"{db_type}.Name = %s AND "
+                        f"{db_type}Freeze.Name = %s"
+                    ), (trait_id, dataset_name)
+                )
+            return cursor.fetchone()[0]
+
+    def __fetch_strain_id(conn, strain_name):
+        with conn.cursor() as cursor:
+            cursor.execute(
+                "SELECT Id FROM Strain WHERE Name = %s", (strain_name,)
+            )
+            return cursor.fetchone()[0]
+
+    def __update_query(conn, db_type, data_id, strain_id, diffs):
+        with conn.cursor() as cursor:
+            if 'value' in diffs:
+                cursor.execute(
+                    (
+                        f"UPDATE {db_type}Data "
+                        "SET value = %s "
+                        "WHERE Id = %s AND StrainId = %s"
+                    ), (diffs['value']['Current'], data_id, strain_id)
+                )
+            if 'error' in diffs:
+                cursor.execute(
+                    (
+                        f"UPDATE {db_type}SE "
+                        "SET error = %s "
+                        "WHERE DataId = %s AND StrainId = %s"
+                    ), (diffs['error']['Current'], data_id, strain_id)
+                )
+            if 'n_cases' in diffs:
+                cursor.execute(
+                    (
+                        "UPDATE NStrain "
+                        "SET count = %s "
+                        "WHERE DataId = %s AND StrainId = %s"
+                    ), (diffs['n_cases']['Current'], data_id, strain_id)
+                )
+
+        conn.commit()
+
+    def __insert_query(conn, db_type, data_id, strain_id, diffs):
+        with conn.cursor() as cursor:
+            if 'value' in diffs:
+                cursor.execute(
+                    (
+                        f"INSERT INTO {db_type}Data (Id, StrainId, value)"
+                        "VALUES (%s, %s, %s)"
+                    ), (data_id, strain_id, diffs['value'])
+                )
+            if 'error' in diffs:
+                cursor.execute(
+                    (
+                        f"INSERT INTO {db_type}SE (DataId, StrainId, error)"
+                        "VALUES (%s, %s, %s)"
+                    ), (data_id, strain_id, diffs['error'])
+                )
+            if 'n_cases' in diffs:
+                cursor.execute(
+                    (
+                        "INSERT INTO NStrain (DataId, StrainId, count)"
+                        "VALUES (%s, %s, %s)"
+                    ), (data_id, strain_id, diffs['n_cases'])
+                )
+
+        conn.commit()
+
+    def __delete_query(conn, db_type, data_id, strain_id, diffs):
+        with conn.cursor() as cursor:
+            if 'value' in diffs:
+                cursor.execute(
+                    (
+                        f"DELETE FROM {db_type}Data "
+                        "WHERE Id = %s AND StrainId = %s"
+                    ), (data_id, strain_id)
+                )
+            if 'error' in diffs:
+                cursor.execute(
+                    (
+                        f"DELETE FROM {db_type}SE "
+                        "WHERE DataId = %s AND StrainId = %s"
+                    ), (data_id, strain_id)
+                )
+            if 'n_cases' in diffs:
+                cursor.execute(
+                    (
+                        "DELETE FROM NStrain "
+                        "WHERE DataId = %s AND StrainId = %s"
+                    ), (data_id, strain_id)
+                )
+
+        conn.commit()
+
+    def __update_data(conn, db_type, data_id, diffs, update_type):
+        for strain in diffs:
+            strain_id = __fetch_strain_id(conn, strain)
+            if update_type == "update":
+                __update_query(conn, db_type, data_id, strain_id, diffs[strain])
+            elif update_type == "insert":
+                __insert_query(conn, db_type, data_id, strain_id, diffs[strain])
+            elif update_type == "delete":
+                __delete_query(conn, db_type, data_id, strain_id, diffs[strain])
+
+    for key in diff_data:
+        dataset, trait = key.split(":")
+        if "Publish" in dataset:
+            db_type = "Publish"
+        else:
+            db_type = "ProbeSet"
+
+        data_id = __fetch_data_id(conn, db_type, trait, dataset)
+
+        __update_data(conn, db_type, data_id, diff_data[key]['Modifications'], 'update')
+        __update_data(conn, db_type, data_id, diff_data[key]['Additions'], 'insert')
+        __update_data(conn, db_type, data_id, diff_data[key]['Deletions'], 'delete')
+
+    return diff_data
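
The expected `diff_data` shape, inferred from the helpers above: keys
are "<dataset>:<trait>"; Modifications carry {"Original", "Current"}
pairs, while Additions and Deletions carry plain values. Dataset,
trait and strain names here are hypothetical:

    diff_data = {
        "BXDPublish:10007": {
            "Modifications": {
                "BXD1": {"value": {"Original": 1.2, "Current": 1.3}},
            },
            "Additions": {
                "BXD24": {"value": 5.1, "error": 0.2, "n_cases": 6},
            },
            "Deletions": {
                "BXD5": {"value": 4.2},
            },
        },
    }
    batch_update_sample_data(conn, diff_data)  # `conn`: open connection
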
diff --git a/gn3/db/traits.py b/gn3/db/traits.py
index fa13fcc..fbac0da 100644
--- a/gn3/db/traits.py
+++ b/gn3/db/traits.py
@@ -3,7 +3,6 @@ import os
 from functools import reduce
 from typing import Any, Dict, Sequence
 
-from gn3.settings import TMPDIR
 from gn3.chancy import random_string
 from gn3.function_helpers import compose
 from gn3.db.datasets import retrieve_trait_dataset
@@ -690,7 +689,7 @@ def retrieve_trait_data(trait: dict, conn: Any, samplelist: Sequence[str] = tupl
     return {}
 
 
-def generate_traits_filename(base_path: str = TMPDIR):
+def generate_traits_filename(base_path: str):
     """Generate a unique filename for use with generated traits files."""
     return (
         f"{os.path.abspath(base_path)}/traits_test_file_{random_string(10)}.txt")
diff --git a/gn3/db/wiki.py b/gn3/db/wiki.py
index 0f46855..e702569 100644
--- a/gn3/db/wiki.py
+++ b/gn3/db/wiki.py
@@ -22,12 +22,20 @@ def _decode_dict(result: dict):
 def get_latest_comment(connection, comment_id: int) -> int:
     """ Latest comment is one with the highest versionId """
     cursor = connection.cursor(DictCursor)
-    query = """ SELECT versionId AS version, symbol, PubMed_ID AS pubmed_ids, sp.Name AS species,
-        comment, email, weburl, initial, reason
-        FROM `GeneRIF` gr
-		INNER JOIN Species sp USING(SpeciesId)
-		WHERE gr.Id = %s
-		ORDER BY versionId DESC LIMIT 1;
+    query = """SELECT versionId AS version,
+       symbol,
+       PubMed_ID AS pubmed_ids,
+       COALESCE(sp.Name, 'no specific species') AS species,
+       comment,
+       email,
+       weburl,
+       initial,
+       reason
+FROM `GeneRIF` gr
+LEFT JOIN Species sp USING(SpeciesId)
+WHERE gr.Id = %s
+ORDER BY versionId DESC
+LIMIT 1;
     """
     cursor.execute(query, (str(comment_id),))
     result = _decode_dict(cursor.fetchone())
@@ -48,6 +56,8 @@ def get_latest_comment(connection, comment_id: int) -> int:
 
 def get_species_id(cursor, species_name: str) -> int:
     """Find species id given species `Name`"""
+    if species_name.lower() == "no specific species":
+        return 0
     cursor.execute(
         "SELECT SpeciesID from Species  WHERE Name = %s", (species_name,))
     species_ids = cursor.fetchall()
@@ -70,6 +80,14 @@ def get_next_comment_version(cursor, comment_id: int) -> int:
     return latest_version + 1
 
 
+def get_next_comment_id(cursor) -> int:
+    """Get the next GeneRIF.Id"""
+    cursor.execute(
+        "SELECT MAX(Id) from GeneRIF"
+    )
+    return cursor.fetchone()[0] + 1
+
+
 def get_categories_ids(cursor, categories: List[str]) -> List[int]:
     """Get the categories_ids from a list of category strings"""
     dict_cats = get_categories(cursor)
@@ -93,7 +111,7 @@ def get_categories(cursor) -> Dict[str, int]:
 
 def get_species(cursor) -> Dict[str, str]:
     """Get all species"""
-    cursor.execute("SELECT Name, SpeciesName from Species")
+    cursor.execute("SELECT Name, SpeciesName from Species ORDER BY Species.Id")
     raw_species = cursor.fetchall()
     dict_cats = dict(raw_species)
     return dict_cats
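
A sketch of how the new id helper and the species sentinel fit
together when preparing a fresh comment (plain MySQLdb cursor
assumed):

    next_id = get_next_comment_id(cursor)  # MAX(Id) + 1
    species_id = get_species_id(cursor, "no specific species")  # sentinel -> 0
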