diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/deepdiff/summarize.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-4a52a71956a8d46fcb7294ac71734504bb09bcc2.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/deepdiff/summarize.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/deepdiff/summarize.py | 144 |
1 files changed, 144 insertions, 0 deletions
# diff --git a/.venv/lib/python3.12/site-packages/deepdiff/summarize.py b/.venv/lib/python3.12/site-packages/deepdiff/summarize.py
# new file mode 100644, index 00000000..f911b84c
from typing import Tuple
from deepdiff.helper import JSON, SummaryNodeType
from deepdiff.serialization import json_dumps


def _truncate(s: str, max_len: int) -> str:
    """
    Truncate string s to at most max_len characters.

    Strings that already fit are returned unchanged. When max_len is greater
    than 5, the result is the first (max_len - 5) characters, then '...',
    then the last 2 characters. For smaller limits there is no room for the
    ellipsis, so the result is just the leading max_len characters (an empty
    string when max_len is zero or negative).
    """
    if len(s) <= max_len:
        return s
    if max_len <= 5:
        # Clamp at zero: a negative bound would make the slice count from the
        # end of the string and return far more than the limit allows.
        return s[:max(max_len, 0)]
    return s[:max_len - 5] + "..." + s[-2:]


def calculate_weights(node) -> Tuple[int, tuple]:
    """
    Recursively compute the weight of a JSON-like value.

    Returns a ``(weight, structure)`` pair where ``structure`` is
    ``(SummaryNodeType.<kind>, info)``:

    - dict: ``info`` maps each key to ``(edge_weight, child_weight,
      child_structure)``; the edge weight is ``len(key)``, or 1 for keys that
      have no length.
    - list: ``info`` is a list of ``(edge_weight, child_weight,
      child_structure)`` tuples in the original order; edge weights are 0.
    - leaf: ``info`` is the raw value.

    Leaf weights: ``len`` of a string, the number of characters in ``str()``
    of an int (bool is an int, so it takes this branch too), the number of
    characters of a float rounded to 2 decimals, 1 for None, 0 otherwise.
    """
    if isinstance(node, dict):
        weight = 0
        children_weights = {}
        for k, v in node.items():
            try:
                edge_weight = len(k)
            except TypeError:
                # Keys without a length (e.g. numbers) count as one unit.
                edge_weight = 1
            child_weight, child_structure = calculate_weights(v)
            weight += edge_weight + child_weight
            children_weights[k] = (edge_weight, child_weight, child_structure)
        return weight, (SummaryNodeType.dict, children_weights)

    elif isinstance(node, list):
        weight = 0
        children_weights = []
        for v in node:
            edge_weight = 0  # Index weights are zero
            child_weight, child_structure = calculate_weights(v)
            weight += edge_weight + child_weight
            children_weights.append((edge_weight, child_weight, child_structure))
        return weight, (SummaryNodeType.list, children_weights)

    else:
        if isinstance(node, str):
            node_weight = len(node)
        elif isinstance(node, int):
            node_weight = len(str(node))
        elif isinstance(node, float):
            node_weight = len(str(round(node, 2)))
        elif node is None:
            node_weight = 1
        else:
            node_weight = 0
        return node_weight, (SummaryNodeType.leaf, node)
def shrink_tree_balanced(node_structure, max_weight: int, balance_threshold: float) -> Tuple[JSON, float]:
    """
    Shrink a weighted tree so that its total weight fits within max_weight.

    Args:
        node_structure: the ``(node_type, node_info)`` structure produced by
            ``calculate_weights``.
        max_weight: weight budget available to this node.
        balance_threshold: fraction of ``max_weight`` that any single child
            branch is allowed to consume.

    Returns:
        ``(shrunk_value, weight_used)``. ``(None, 0)`` is returned when
        nothing fits in the budget.

    NOTE: containers skip any child that shrinks to ``None``, so a genuine
    null value is dropped in the same way as a child that did not fit.
    """
    node_type, node_info = node_structure

    if node_type is SummaryNodeType.leaf:
        leaf_value = node_info
        leaf_weight, _ = calculate_weights(leaf_value)
        if leaf_weight <= max_weight:
            return leaf_value, leaf_weight
        if isinstance(leaf_value, str):
            truncated_value = _truncate(leaf_value, max_weight)
            return truncated_value, len(truncated_value)
        if isinstance(leaf_value, (int, float)):
            # Keep the leading characters of the textual form; clamp at zero
            # so a negative budget cannot slice from the end instead.
            truncated_str = str(leaf_value)[:max(max_weight, 0)]
            # Prefer a numeric result; fall back to the raw text when the
            # cut leaves something unparsable (e.g. "" or "-").
            for convert in (int, float):
                try:
                    return convert(truncated_str), len(truncated_str)
                except ValueError:
                    continue
            return truncated_str, len(truncated_str)
        if leaf_value is None:
            return None, 1 if max_weight >= 1 else 0
        # Any other oversized leaf type cannot be truncated.
        return None, 0

    elif node_type is SummaryNodeType.dict:
        shrunk_dict = {}
        total_weight = 0
        # Visit the heaviest branches (edge + subtree) first.
        sorted_children = sorted(node_info.items(), key=lambda x: x[1][0] + x[1][1], reverse=True)

        for k, (edge_w, _, child_struct) in sorted_children:
            allowed_branch_weight = min(max_weight * balance_threshold, max_weight - total_weight)
            if allowed_branch_weight <= edge_w:
                # Not even the key fits in this branch's share of the budget.
                continue

            remaining_weight = int(allowed_branch_weight - edge_w)
            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, remaining_weight, balance_threshold)
            if shrunk_child is not None:
                # Store the key unchanged: edge_w is len(k) (or 1 for keys
                # without a length), so slicing by it could never shorten a
                # key and only raised TypeError for non-sliceable keys.
                shrunk_dict[k] = shrunk_child
                total_weight += edge_w + shrunk_weight

            if total_weight >= max_weight:
                break
        if not shrunk_dict:
            return None, 0

        return shrunk_dict, total_weight

    elif node_type is SummaryNodeType.list:
        shrunk_list = []
        total_weight = 0
        # Heaviest items first; the original list order is not preserved.
        sorted_children = sorted(node_info, key=lambda x: x[0] + x[1], reverse=True)
        for edge_w, _, child_struct in sorted_children:
            allowed_branch_weight = int(min(max_weight * balance_threshold, max_weight - total_weight))
            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, allowed_branch_weight, balance_threshold)
            if shrunk_child is not None:
                shrunk_list.append(shrunk_child)
                total_weight += shrunk_weight
            if total_weight >= max_weight - 1:
                # Budget is (nearly) exhausted: add a marker and stop. The
                # marker's own length is not added to total_weight.
                shrunk_list.append("...")
                break
        if not shrunk_list:
            return None, 0
        return shrunk_list, total_weight
    return None, 0


def greedy_tree_summarization_balanced(json_data: JSON, max_weight: int, balance_threshold=0.6) -> JSON:
    """
    Return json_data unchanged when its weight is within max_weight,
    otherwise a shrunken copy produced by ``shrink_tree_balanced``.
    """
    total_weight, tree_structure = calculate_weights(json_data)
    if total_weight <= max_weight:
        return json_data
    shrunk_tree, _ = shrink_tree_balanced(tree_structure, max_weight, balance_threshold)
    return shrunk_tree


def summarize(data: JSON, max_length: int = 200, balance_threshold: float = 0.6) -> str:
    """
    Serialize a size-bounded summary of data with ``json_dumps``.

    This is deliberately best-effort: if summarizing or serializing raises,
    ``str(data)`` is returned instead of propagating the error.
    """
    try:
        return json_dumps(
            greedy_tree_summarization_balanced(data, max_length, balance_threshold)
        )
    except Exception:
        return str(data)