from typing import Tuple
from deepdiff.helper import JSON, SummaryNodeType
from deepdiff.serialization import json_dumps
def _truncate(s: str, max_len: int) -> str:
"""
Truncate string s to max_len characters.
If possible, keep the first (max_len-5) characters, then '...' then the last 2 characters.
"""
if len(s) <= max_len:
return s
if max_len <= 5:
return s[:max_len]
return s[:max_len - 5] + "..." + s[-2:]
# Weight calculation: recursively measure every node of the JSON tree.
def calculate_weights(node):
    """Recursively compute the weight of ``node`` and a weighted mirror of it.

    Returns a ``(weight, structure)`` pair where ``structure`` is
    ``(SummaryNodeType, info)``:

    * dict  -> ``info`` maps each key to ``(key_weight, value_weight, value_structure)``;
      a key's weight is ``len(key)``, or 1 when the key has no length.
    * list  -> ``info`` is a list of ``(0, item_weight, item_structure)``;
      list indices carry no weight.
    * leaf  -> ``info`` is the original value. Strings weigh ``len(value)``,
      ints the length of their decimal text, floats the length of their text
      after rounding to 2 places, ``None`` weighs 1 and anything else 0.
    """
    if isinstance(node, dict):
        dict_total = 0
        per_key = {}
        for key, value in node.items():
            try:
                key_cost = len(key)
            except TypeError:
                # Keys without a length (e.g. numbers) get a nominal weight.
                key_cost = 1
            value_cost, value_shape = calculate_weights(value)
            per_key[key] = (key_cost, value_cost, value_shape)
            dict_total += key_cost + value_cost
        return dict_total, (SummaryNodeType.dict, per_key)

    if isinstance(node, list):
        # The leading 0 is the edge weight: positions in a list are free.
        per_item = [(0, *calculate_weights(item)) for item in node]
        list_total = sum(item_cost for _, item_cost, _ in per_item)
        return list_total, (SummaryNodeType.list, per_item)

    # Anything that is neither a dict nor a list is treated as a leaf.
    if isinstance(node, str):
        leaf_cost = len(node)
    elif isinstance(node, int):
        leaf_cost = len(str(node))
    elif isinstance(node, float):
        leaf_cost = len(str(round(node, 2)))
    elif node is None:
        leaf_cost = 1
    else:
        leaf_cost = 0
    return leaf_cost, (SummaryNodeType.leaf, node)
# Balanced shrinking (simplified): greedily trim the weighted tree to fit a
# weight budget, visiting the heaviest children first and capping each branch
# at balance_threshold * budget so that one branch cannot use up the whole budget.
def shrink_tree_balanced(node_structure, max_weight: int, balance_threshold: float) -> Tuple[JSON, float]:
    """Shrink a weighted tree so that its total weight fits ``max_weight``.

    Args:
        node_structure: ``(SummaryNodeType, info)`` pair in the shape produced
            by ``calculate_weights``.
        max_weight: Weight budget available to this node.
        balance_threshold: Fraction of ``max_weight`` that any single child of
            a dict or list may be given.

    Returns:
        ``(shrunk_value, weight_used)``. ``(None, 0)`` is returned when nothing
        could be kept for a dict or list, or when the node type is not handled.
    """
    node_type, node_info = node_structure

    if node_type is SummaryNodeType.leaf:
        leaf_value = node_info
        leaf_weight, _ = calculate_weights(leaf_value)
        if leaf_weight <= max_weight:
            return leaf_value, leaf_weight

        # The leaf is too heavy: cut it down according to its type.
        if isinstance(leaf_value, str):
            truncated_value = _truncate(leaf_value, max_weight)
            return truncated_value, len(truncated_value)
        if isinstance(leaf_value, (int, float)):
            truncated_str = str(leaf_value)[:max_weight]
            # Keep the value numeric when the cut text still parses; otherwise
            # fall back to the raw text (e.g. "-", "1e" or the empty string).
            try:
                return int(truncated_str), len(truncated_str)
            except ValueError:
                try:
                    return float(truncated_str), len(truncated_str)
                except ValueError:
                    return truncated_str, len(truncated_str)
        if leaf_value is None:
            return None, 1 if max_weight >= 1 else 0
        return None, 0

    elif node_type is SummaryNodeType.dict:
        shrunk_dict = {}
        total_weight = 0
        # Visit the heaviest entries (key weight + value weight) first.
        sorted_children = sorted(node_info.items(), key=lambda x: x[1][0] + x[1][1], reverse=True)
        for k, (edge_w, _, child_struct) in sorted_children:
            allowed_branch_weight = min(max_weight * balance_threshold, max_weight - total_weight)
            if allowed_branch_weight <= edge_w:
                # Not even the key fits in this branch's share; try the next one.
                continue

            remaining_weight = int(allowed_branch_weight - edge_w)
            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, remaining_weight, balance_threshold)
            # None means "nothing kept"; note that a genuine None leaf is
            # indistinguishable from that and is therefore dropped as well.
            if shrunk_child is not None:
                try:
                    kept_key = k[:edge_w]
                except TypeError:
                    # Non-sliceable keys (e.g. ints) are weighed as 1 by
                    # calculate_weights; keep them whole instead of crashing.
                    kept_key = k
                shrunk_dict[kept_key] = shrunk_child
                total_weight += edge_w + shrunk_weight

            if total_weight >= max_weight:
                break
        if not shrunk_dict:
            return None, 0
        return shrunk_dict, total_weight

    elif node_type is SummaryNodeType.list:
        shrunk_list = []
        total_weight = 0
        # Heaviest items first; the output list follows this order.
        sorted_children = sorted(node_info, key=lambda x: x[0] + x[1], reverse=True)
        for edge_w, _, child_struct in sorted_children:
            allowed_branch_weight = int(min(max_weight * balance_threshold, max_weight - total_weight))
            shrunk_child, shrunk_weight = shrink_tree_balanced(child_struct, allowed_branch_weight, balance_threshold)
            if shrunk_child is not None:
                shrunk_list.append(shrunk_child)
                total_weight += shrunk_weight
            if total_weight >= max_weight - 1:
                # Budget (almost) exhausted: mark the cut and stop. The
                # marker's own length is not added to total_weight.
                shrunk_list.append("...")
                break
        if not shrunk_list:
            return None, 0
        return shrunk_list, total_weight

    return None, 0
def greedy_tree_summarization_balanced(json_data: JSON, max_weight: int, balance_threshold=0.6) -> JSON:
    """Return ``json_data`` unchanged if it fits ``max_weight``, else a shrunk copy.

    The data is weighed with ``calculate_weights``. When the total exceeds the
    budget, the weighted tree is handed to ``shrink_tree_balanced`` and only
    the shrunk value (not its weight) is returned.
    """
    full_weight, weighted_tree = calculate_weights(json_data)
    fits_budget = full_weight <= max_weight
    return (
        json_data
        if fits_budget
        else shrink_tree_balanced(weighted_tree, max_weight, balance_threshold)[0]
    )
def summarize(data: JSON, max_length: int = 200, balance_threshold: float = 0.6) -> str:
    """Produce a size-limited textual summary of ``data``.

    ``data`` is reduced with ``greedy_tree_summarization_balanced`` using
    ``max_length`` as the weight budget, then serialized with ``json_dumps``.
    This is best effort: if either step raises, ``str(data)`` is returned
    instead.
    """
    try:
        condensed = greedy_tree_summarization_balanced(data, max_length, balance_threshold)
        rendered = json_dumps(condensed)
    except Exception:
        rendered = str(data)
    return rendered