import pickle
import sys
import io
import os
import json
import uuid
import logging
import re  # NOQA
import builtins  # NOQA
import datetime  # NOQA
import decimal  # NOQA
import orderly_set  # NOQA
import collections  # NOQA
from copy import deepcopy, copy
from functools import partial
from collections.abc import Mapping
from typing import (
    Callable,
    Optional,
    Union,
    overload,
    Literal,
    Any,
)
from deepdiff.helper import (
    strings,
    get_type,
    TEXT_VIEW,
    np_float32,
    np_float64,
    np_int32,
    np_int64,
    np_ndarray,
    Opcode,
    SetOrdered,
    pydantic_base_model_type,
    PydanticBaseModel,
    NotPresent,
    ipranges,
)
from deepdiff.model import DeltaResult

# orjson is an optional accelerator; when it is missing the name is bound to
# None so callers can simply test its truthiness.
try:
    import orjson
except ImportError:  # pragma: no cover.
    orjson = None

logger = logging.getLogger(__name__)


class UnsupportedFormatErr(TypeError):
    """Raised when a file type/extension is not one of the supported formats."""


# The type of ``None``; it is compared by identity when pickling.
NONE_TYPE = type(None)

CSV_HEADER_MAX_CHUNK_SIZE = 2048  # The chunk needs to be big enough that covers a couple of rows of data.


# Message templates used by this module (and importable by other modules).
MODULE_NOT_FOUND_MSG = 'DeepDiff Delta did not find {} in your modules. Please make sure it is already imported.'
FORBIDDEN_MODULE_MSG = "Module '{}' is forbidden. You need to explicitly pass it by passing a safe_to_import parameter"
DELTA_IGNORE_ORDER_NEEDS_REPETITION_REPORT = 'report_repetition must be set to True when ignore_order is True to create the delta object.'
DELTA_ERROR_WHEN_GROUP_BY = 'Delta can not be made when group_by is used since the structure of data is modified from the original form.'
# Fully qualified "module.name" strings that the restricted unpickler is
# always willing to resolve. Callers can extend (never shrink) this set via
# the ``safe_to_import`` argument of ``pickle_load``.
SAFE_TO_IMPORT = {
    'builtins.range',
    'builtins.complex',
    'builtins.set',
    'builtins.frozenset',
    'builtins.slice',
    'builtins.str',
    'builtins.bytes',
    'builtins.list',
    'builtins.tuple',
    'builtins.int',
    'builtins.float',
    'builtins.dict',
    'builtins.bool',
    'builtins.bin',
    'builtins.None',
    'datetime.datetime',
    'datetime.time',
    'datetime.timedelta',
    'decimal.Decimal',
    'uuid.UUID',
    'orderly_set.sets.OrderedSet',
    'orderly_set.sets.OrderlySet',
    'orderly_set.sets.StableSetEq',
    'deepdiff.helper.SetOrdered',
    'collections.namedtuple',
    'collections.OrderedDict',
    're.Pattern',
    'deepdiff.helper.Opcode',
}


# Maps the type names found under "old_type"/"new_type" in serialized json
# back to Python objects (see JSONDecoder.object_hook).
TYPE_STR_TO_TYPE = {
    'range': range,
    'complex': complex,
    'set': set,
    'frozenset': frozenset,
    'slice': slice,
    'str': str,
    'bytes': bytes,
    'list': list,
    'tuple': tuple,
    'int': int,
    'float': float,
    'dict': dict,
    'bool': bool,
    'bin': bin,
    'None': None,
    'NoneType': None,
    'datetime': datetime.datetime,
    'time': datetime.time,
    'timedelta': datetime.timedelta,
    'Decimal': decimal.Decimal,
    'SetOrdered': SetOrdered,
    'namedtuple': collections.namedtuple,
    'OrderedDict': collections.OrderedDict,
    'Pattern': re.Pattern,
    'iprange': str,
}


class ModuleNotFoundError(ImportError):
    """
    Raised when the module is not found in sys.modules
    """
    pass


class ForbiddenModule(ImportError):
    """
    Raised when a module is not explicitly allowed to be imported
    """
    pass


class SerializationMixin:
    """
    Serialization helpers (json, json pickle, dict, delta dict and pretty text)
    for a diff object. The methods read attributes such as ``tree``, ``view``,
    ``group_by`` and ``ignore_order`` from ``self``, so the class is meant to be
    mixed into an object that provides them.
    """

    def to_json_pickle(self):
        """
        :ref:`to_json_pickle_label`
        Get the json pickle of the diff object. Unless you need all the attributes and functionality of DeepDiff, running to_json() is the safer option than json pickle.

        If jsonpickle is not installed, an error is logged and None is returned.
        """
        try:
            import jsonpickle
            copied = self.copy()  # type: ignore
            return jsonpickle.encode(copied)
        except ImportError:  # pragma: no cover. Json pickle is getting deprecated.
            logger.error('jsonpickle library needs to be installed in order to run to_json_pickle')  # pragma: no cover. Json pickle is getting deprecated.

    @classmethod
    def from_json_pickle(cls, value):
        """
        :ref:`from_json_pickle_label`
        Load DeepDiff object with all the bells and whistles from the json pickle dump.
        Note that json pickle dump comes from to_json_pickle

        If jsonpickle is not installed, an error is logged and None is returned.
        """
        try:
            import jsonpickle
            return jsonpickle.decode(value)
        except ImportError:  # pragma: no cover. Json pickle is getting deprecated.
            logger.error('jsonpickle library needs to be installed in order to run from_json_pickle')  # pragma: no cover. Json pickle is getting deprecated.

    def to_json(self, default_mapping: Optional[dict]=None, force_use_builtin_json=False, **kwargs):
        """
        Dump json of the text view.

        **Parameters**

        default_mapping : dictionary(optional), a dictionary of mapping of different types to json types.

        by default DeepDiff converts certain data types. For example Decimals into floats so they can be exported into json.
        If you have a certain object type that the json serializer can not serialize it, please pass the appropriate type
        conversion through this dictionary.

        force_use_builtin_json: Boolean, default = False
            When True, we use Python's builtin Json library for serialization,
            even if Orjson is installed.


        kwargs: Any other kwargs you pass will be passed on to Python's json.dumps()

        **Example**

        Serialize custom objects
            >>> class A:
            ...     pass
            ...
            >>> class B:
            ...     pass
            ...
            >>> t1 = A()
            >>> t2 = B()
            >>> ddiff = DeepDiff(t1, t2)
            >>> ddiff.to_json()
            TypeError: We do not know how to convert <__main__.A object at 0x10648> of type <class '__main__.A'> for json serialization. Please pass the default_mapping parameter with proper mapping of the object to a basic python type.

            >>> default_mapping = {A: lambda x: 'obj A', B: lambda x: 'obj B'}
            >>> ddiff.to_json(default_mapping=default_mapping)
            '{"type_changes": {"root": {"old_type": "A", "new_type": "B", "old_value": "obj A", "new_value": "obj B"}}}'
        """
        dic = self.to_dict(view_override=TEXT_VIEW)
        return json_dumps(
            dic,
            default_mapping=default_mapping,
            force_use_builtin_json=force_use_builtin_json,
            **kwargs,
        )

    def to_dict(self, view_override: Optional[str]=None) -> dict:
        """
        convert the result to a python dictionary. You can override the view type by passing view_override.

        **Parameters**

        view_override: view type, default=None,
            override the view that was used to generate the diff when converting to the dictionary.
            The options are the text or tree.
        """

        view = view_override if view_override else self.view  # type: ignore
        return dict(self._get_view_results(view))  # type: ignore

    def _to_delta_dict(
        self,
        directed: bool = True,
        report_repetition_required: bool = True,
        always_include_values: bool = False,
    ) -> dict:
        """
        Dump to a dictionary suitable for delta usage.
        Unlike to_dict, this is not dependent on the original view that the user chose to create the diff.

        **Parameters**

        directed : Boolean, default=True, whether to create a directional delta dictionary or a symmetrical

        Note that in the current implementation the symmetrical delta (non-directional) is ONLY used for verifying that
        the delta is being applied to the exact same values as what was used to generate the delta and has
        no other usages.

        If this option is set as True, then the dictionary will not have the "old_value" in the output.
        Otherwise it will have the "old_value". "old_value" is the value of the item in t1.

        If delta = Delta(DeepDiff(t1, t2)) then
        t1 + delta == t2

        Note that it the items in t1 + delta might have slightly different order of items than t2 if ignore_order
        was set to be True in the diff object.

        report_repetition_required : Boolean, default=True
            When True, a ValueError is raised if the diff was made with ignore_order
            but without report_repetition.

        always_include_values : Boolean, default=False
            Passed through to DeltaResult. When True, the iterable opcodes are used
            as they are instead of being rebuilt.

        **Raises**

        ValueError when group_by was used, or when report_repetition is required but missing.
        """
        if self.group_by is not None:  # type: ignore
            raise ValueError(DELTA_ERROR_WHEN_GROUP_BY)

        if directed and not always_include_values:
            # Rebuild every opcode from the fields listed below only; whatever other
            # fields Opcode has are left at their defaults
            # (presumably this drops the old values -- confirm against Opcode).
            _iterable_opcodes = {}  # type: ignore
            for path, op_codes in self._iterable_opcodes.items():  # type: ignore
                _iterable_opcodes[path] = [
                    Opcode(
                        tag=op_code.tag,
                        t1_from_index=op_code.t1_from_index,
                        t1_to_index=op_code.t1_to_index,
                        t2_from_index=op_code.t2_from_index,
                        t2_to_index=op_code.t2_to_index,
                        new_values=op_code.new_values,
                    )
                    for op_code in op_codes
                ]
        else:
            _iterable_opcodes = self._iterable_opcodes  # type: ignore

        result = DeltaResult(
            tree_results=self.tree,  # type: ignore
            ignore_order=self.ignore_order,  # type: ignore
            always_include_values=always_include_values,
            _iterable_opcodes=_iterable_opcodes,
        )
        result.remove_empty_keys()
        if report_repetition_required and self.ignore_order and not self.report_repetition:  # type: ignore
            raise ValueError(DELTA_IGNORE_ORDER_NEEDS_REPETITION_REPORT)
        if directed:
            # A directed delta only needs the new values: strip every "old_value".
            for _, report_value in result.items():
                if isinstance(report_value, Mapping):
                    for value in report_value.values():
                        if isinstance(value, Mapping) and 'old_value' in value:
                            del value['old_value']  # type: ignore
        if self._numpy_paths:  # type: ignore
            # Note that keys that start with '_' are considered internal to DeepDiff
            # and will be omitted when counting distance. (Look inside the distance module.)
            result['_numpy_paths'] = self._numpy_paths  # type: ignore

        if self.iterable_compare_func:  # type: ignore
            result['_iterable_compare_func_was_used'] = True

        return deepcopy(dict(result))

    def pretty(self, prefix: Optional[Union[str, Callable]]=None):
        """
        The pretty human readable string output for the diff object
        regardless of what view was used to generate the diff.

        prefix can be a callable or a string or None.
        A string prefix is put in front of every line. A callable prefix is
        called as prefix(diff=self) for every line and its result is used instead.

        Example:
            >>> t1={1,2,4}
            >>> t2={2,3}
            >>> print(DeepDiff(t1, t2).pretty())
            Item root[3] added to set.
            Item root[4] removed from set.
            Item root[1] removed from set.
        """
        result = []
        if prefix is None:
            prefix = ''
        keys = sorted(self.tree.keys())  # type: ignore # sorting keys to guarantee constant order across python versions.
        for key in keys:
            result.extend(pretty_print_diff(item_key) for item_key in self.tree[key])  # type: ignore

        if callable(prefix):
            return "\n".join(f"{prefix(diff=self)}{r}" for r in result)
        return "\n".join(f"{prefix}{r}" for r in result)


class _RestrictedUnpickler(pickle.Unpickler):
    """
    Unpickler that only resolves globals listed in SAFE_TO_IMPORT plus whatever
    the caller passes through the ``safe_to_import`` keyword argument.
    """

    def __init__(self, *args, **kwargs):
        # safe_to_import may be a single string, a set/frozenset or any other
        # iterable of "module.name" strings. It extends SAFE_TO_IMPORT.
        self.safe_to_import = kwargs.pop('safe_to_import', None)
        if self.safe_to_import:
            if isinstance(self.safe_to_import, strings):
                self.safe_to_import = {self.safe_to_import}
            elif isinstance(self.safe_to_import, (set, frozenset)):
                pass
            else:
                self.safe_to_import = set(self.safe_to_import)
            self.safe_to_import = self.safe_to_import | SAFE_TO_IMPORT
        else:
            self.safe_to_import = SAFE_TO_IMPORT
        super().__init__(*args, **kwargs)

    def find_class(self, module, name):
        """
        Resolve ``module.name`` only when it is explicitly allowed.

        Raises ModuleNotFoundError when the allowed module is not already in
        sys.modules, and ForbiddenModule when the name is not allowed.
        """
        # Only allow safe classes from self.safe_to_import.
        module_dot_class = '{}.{}'.format(module, name)
        if module_dot_class in self.safe_to_import:
            try:
                # Modules are never imported here; they must already be loaded.
                module_obj = sys.modules[module]
            except KeyError:
                raise ModuleNotFoundError(MODULE_NOT_FOUND_MSG.format(module_dot_class)) from None
            return getattr(module_obj, name)
        # Forbid everything else.
        raise ForbiddenModule(FORBIDDEN_MODULE_MSG.format(module_dot_class)) from None

    def persistent_load(self, pid):
        """
        Resolve the persistent id written by _RestrictedPickler.

        "<>" is the only id that pickler produces and stands for type(None).
        Any other id is rejected instead of silently turning into None.
        """
        if pid == "<>":
            return type(None)
        raise pickle.UnpicklingError('unsupported persistent object: {!r}'.format(pid))


class _RestrictedPickler(pickle.Pickler):
    """Pickler that stores type(None) as the persistent id "<>"."""

    def persistent_id(self, obj):
        if obj is NONE_TYPE:  # NOQA
            return "<>"
        return None


def pickle_dump(obj, file_obj=None, protocol=4):
    """
    **pickle_dump**
    Dumps the obj into pickled content.

    **Parameters**

    obj : Any python object

    file_obj : (Optional) A file object to dump the contents into

    protocol : The pickle protocol to use, default=4

    **Returns**

    If file_obj is passed the return value will be None. It will write the object's pickle contents into the file.
    However if no file_obj is passed, then it will return the pickle serialization of the obj in the form of bytes.
    """
    # Compare against None instead of relying on truthiness: a file-like object
    # that happens to be falsy must still be written to.
    file_obj_passed = file_obj is not None
    if file_obj is None:
        file_obj = io.BytesIO()
    _RestrictedPickler(file_obj, protocol=protocol, fix_imports=False).dump(obj)
    if not file_obj_passed:
        return file_obj.getvalue()


def pickle_load(content=None, file_obj=None, safe_to_import=None):
    """
    **pickle_load**
    Load the pickled content. content should be a bytes object.

    **Parameters**

    content : Bytes of pickled object. A str is encoded as utf-8 first.

    file_obj : A file object to load the content from

    safe_to_import : A set of modules that needs to be explicitly allowed to be loaded.
        Example: {'mymodule.MyClass', 'decimal.Decimal'}
        Note that this set will be added to the basic set of modules that are already allowed.
        The set of what is already allowed can be found in deepdiff.serialization.SAFE_TO_IMPORT

    **Returns**

    The unpickled object.

    **Raises**

    ValueError when neither content nor file_obj is passed.

    **Examples**

    Importing
        >>> from deepdiff import DeepDiff, Delta
        >>> from pprint import pprint
    """
    if not content and not file_obj:
        raise ValueError('Please either pass the content or the file_obj to pickle_load.')
    if isinstance(content, str):
        content = content.encode('utf-8')
    if content:
        # content takes precedence over file_obj when both are given.
        file_obj = io.BytesIO(content)
    # NOTE(security): globals are limited to the allow-list by _RestrictedUnpickler.
    return _RestrictedUnpickler(file_obj, safe_to_import=safe_to_import).load()


def _get_pretty_form_text(verbose_level):
    """
    Return the mapping of report type to its human readable message template.
    With verbose_level == 2 the added/removed templates also show the value.
    """
    pretty_form_texts = {
        "type_changes": "Type of {diff_path} changed from {type_t1} to {type_t2} and value changed from {val_t1} to {val_t2}.",
        "values_changed": "Value of {diff_path} changed from {val_t1} to {val_t2}.",
        "dictionary_item_added": "Item {diff_path} added to dictionary.",
        "dictionary_item_removed": "Item {diff_path} removed from dictionary.",
        "iterable_item_added": "Item {diff_path} added to iterable.",
        "iterable_item_removed": "Item {diff_path} removed from iterable.",
        "attribute_added": "Attribute {diff_path} added.",
        "attribute_removed": "Attribute {diff_path} removed.",
        "set_item_added": "Item root[{val_t2}] added to set.",
        "set_item_removed": "Item root[{val_t1}] removed from set.",
        "repetition_change": "Repetition change for item {diff_path}.",
    }
    if verbose_level == 2:
        pretty_form_texts.update(
            {
                "dictionary_item_added": "Item {diff_path} ({val_t2}) added to dictionary.",
                "dictionary_item_removed": "Item {diff_path} ({val_t1}) removed from dictionary.",
                "iterable_item_added": "Item {diff_path} ({val_t2}) added to iterable.",
                "iterable_item_removed": "Item {diff_path} ({val_t1}) removed from iterable.",
                "attribute_added": "Attribute {diff_path} ({val_t2}) added.",
                "attribute_removed": "Attribute {diff_path} ({val_t1}) removed.",
            }
        )
    return pretty_form_texts


def pretty_print_diff(diff):
    """
    Render one diff item as a human readable sentence.

    String values are wrapped in double quotes. A report type without a
    template yields an empty string.
    """
    type_t1 = get_type(diff.t1).__name__
    type_t2 = get_type(diff.t2).__name__

    val_t1 = '"{}"'.format(str(diff.t1)) if type_t1 == "str" else str(diff.t1)
    val_t2 = '"{}"'.format(str(diff.t2)) if type_t2 == "str" else str(diff.t2)

    diff_path = diff.path(root='root')
    return _get_pretty_form_text(diff.verbose_level).get(diff.report_type, "").format(
        diff_path=diff_path,
        type_t1=type_t1,
        type_t2=type_t2,
        val_t1=val_t1,
        val_t2=val_t2)


def load_path_content(path, file_type=None):
    """
    Loads and deserializes the content of the path.

    **Parameters**

    path : The path of the file to read.

    file_type : (Optional) One of json, yaml, yml, toml, pickle, csv or tsv.
        When None, the text after the last dot of the path is used.

    **Raises**

    UnsupportedFormatErr when the file type is not one of the above.
    ImportError when the optional library needed for yaml or toml is missing.
    """

    if file_type is None:
        file_type = path.split('.')[-1]
    if file_type == 'json':
        with open(path, 'r') as the_file:
            content = json_loads(the_file.read())
    elif file_type in {'yaml', 'yml'}:
        try:
            import yaml
        except ImportError:  # pragma: no cover.
            raise ImportError('Pyyaml needs to be installed.') from None  # pragma: no cover.
        with open(path, 'r') as the_file:
            content = yaml.safe_load(the_file)
    elif file_type == 'toml':
        try:
            if sys.version_info >= (3, 11):
                import tomllib as tomli
            else:
                import tomli
        except ImportError:  # pragma: no cover.
            raise ImportError('On python<=3.10 tomli needs to be installed.') from None  # pragma: no cover.
        with open(path, 'rb') as the_file:
            content = tomli.load(the_file)
    elif file_type == 'pickle':
        with open(path, 'rb') as the_file:
            content = the_file.read()
            content = pickle_load(content)
    elif file_type in {'csv', 'tsv'}:
        try:
            import clevercsv  # type: ignore
            content = clevercsv.read_dicts(path)
        except ImportError:  # pragma: no cover.
            import csv
            with open(path, 'r') as the_file:
                content = list(csv.DictReader(the_file))

        # Only report an empty file when it really is empty.
        if not content:
            logger.info(f"NOTE: CSV content was empty in {path}")

        # Everything in csv is string but we try to automatically convert any numbers we find
        for row in content:
            for key, value in row.items():
                if not isinstance(value, str):
                    # Short or long rows can carry None or a list of extra
                    # cells; there is nothing to convert in those.
                    continue
                value = value.strip()
                for type_ in [int, float, complex]:
                    try:
                        value = type_(value)
                    except Exception:
                        pass
                    else:
                        row[key] = value
                        break
    else:
        raise UnsupportedFormatErr(f'Only json, yaml, toml, csv, tsv and pickle are supported.\n'
                                   f' The {file_type} extension is not known.')
    return content


def save_content_to_path(content, path, file_type=None, keep_backup=True):
    """
    Saves and serializes the content of the path.

    The existing file is first renamed to "<path>.bak". If writing fails, the
    backup is moved back in place and the error is re-raised.

    **Parameters**

    content : The object to serialize.

    path : The path of an existing file to overwrite.

    file_type : (Optional) One of json, yaml, yml, toml, pickle, csv or tsv.
        When None, the text after the last dot of the path is used,
        the same way load_path_content does it.

    keep_backup : Boolean, default=True. When False the backup file is removed after a successful save.
    """
    if file_type is None:
        file_type = path.split('.')[-1]

    backup_path = f"{path}.bak"
    os.rename(path, backup_path)

    try:
        _save_content(
            content=content, path=path,
            file_type=file_type, keep_backup=keep_backup)
    except Exception:
        # os.replace overwrites a partially written file on every platform,
        # whereas os.rename fails on Windows when the target exists.
        os.replace(backup_path, path)
        raise
    else:
        if not keep_backup:
            os.remove(backup_path)


def _save_content(content, path, file_type, keep_backup=True):
    """
    Serialize content into path according to file_type.

    Returns whatever ``content`` was last bound to: the json string for json,
    the return value of the dump call for yaml, toml and pickle, and the
    original content for csv/tsv. keep_backup is accepted but not used here.

    Raises UnsupportedFormatErr for an unknown file_type.
    """
    if file_type == 'json':
        with open(path, 'w') as the_file:
            content = json_dumps(content)
            the_file.write(content)  # type: ignore
    elif file_type in {'yaml', 'yml'}:
        try:
            import yaml
        except ImportError:  # pragma: no cover.
            raise ImportError('Pyyaml needs to be installed.') from None  # pragma: no cover.
        with open(path, 'w') as the_file:
            content = yaml.safe_dump(content, stream=the_file)
    elif file_type == 'toml':
        try:
            import tomli_w
        except ImportError:  # pragma: no cover.
            raise ImportError('Tomli-w needs to be installed.') from None  # pragma: no cover.
        with open(path, 'wb') as the_file:
            content = tomli_w.dump(content, the_file)
    elif file_type == 'pickle':
        with open(path, 'wb') as the_file:
            content = pickle_dump(content, file_obj=the_file)
    elif file_type in {'csv', 'tsv'}:
        try:
            import clevercsv  # type: ignore
            dict_writer = clevercsv.DictWriter
        except ImportError:  # pragma: no cover.
            import csv
            dict_writer = csv.DictWriter
        with open(path, 'w', newline='') as csvfile:
            # The header comes from the keys of the first row.
            fieldnames = list(content[0].keys())
            writer = dict_writer(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(content)
    else:
        raise UnsupportedFormatErr('Only json, yaml, toml, csv, tsv and pickle are supported.\n'
                                   f' The {file_type} extension is not known.')
    return content


def _serialize_decimal(value):
    """Return an int when the Decimal has exponent 0, otherwise a float."""
    if value.as_tuple().exponent == 0:
        return int(value)
    else:
        return float(value)


def _serialize_tuple(value):
    """Return the ``_asdict()`` form of a namedtuple; other tuples are returned as is."""
    if hasattr(value, '_asdict'):  # namedtuple
        return value._asdict()
    return value


# type -> converter used as the json "default" hook. Entries are tried in
# insertion order with isinstance, and the first match wins.
JSON_CONVERTOR = {
    decimal.Decimal: _serialize_decimal,
    SetOrdered: list,
    orderly_set.StableSetEq: list,
    set: list,
    type: lambda x: x.__name__,
    bytes: lambda x: x.decode('utf-8'),
    datetime.datetime: lambda x: x.isoformat(),
    uuid.UUID: lambda x: str(x),
    np_float32: float,
    np_float64: float,
    np_int32: int,
    np_int64: int,
    np_ndarray: lambda x: x.tolist(),
    tuple: _serialize_tuple,
    Mapping: dict,
    NotPresent: str,
}

if PydanticBaseModel is not pydantic_base_model_type:
    JSON_CONVERTOR[PydanticBaseModel] = lambda x: x.dict()


def json_convertor_default(default_mapping=None):
    """
    Build the function passed as ``default`` to the json serializer.

    default_mapping : (Optional) dictionary of type to converter. Its entries
        override or extend JSON_CONVERTOR without modifying it.

    The returned function raises TypeError for objects it can not convert.
    """
    if default_mapping:
        _convertor_mapping = JSON_CONVERTOR.copy()
        _convertor_mapping.update(default_mapping)
    else:
        _convertor_mapping = JSON_CONVERTOR

    def _convertor(obj):
        for original_type, convert_to in _convertor_mapping.items():
            if isinstance(obj, original_type):
                return convert_to(obj)
        # This is to handle reverse() which creates a generator of type list_reverseiterator
        if obj.__class__.__name__ == 'list_reverseiterator':
            return list(copy(obj))
        raise TypeError('We do not know how to convert {} of type {} for json serialization. Please pass the default_mapping parameter with proper mapping of the object to a basic python type.'.format(obj, type(obj)))

    return _convertor


class JSONDecoder(json.JSONDecoder):
    """
    json decoder that turns the "old_type"/"new_type" names of a serialized
    diff back into Python types through TYPE_STR_TO_TYPE.
    """

    def __init__(self, *args, **kwargs):
        json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)

    def object_hook(self, obj):  # type: ignore
        # Only dictionaries that carry both keys are touched. Unknown type
        # names are left as the original string.
        if 'old_type' in obj and 'new_type' in obj:
            for type_key in ('old_type', 'new_type'):
                type_str = obj[type_key]
                obj[type_key] = TYPE_STR_TO_TYPE.get(type_str, type_str)

        return obj


@overload
def json_dumps(
    item: Any,
    **kwargs,
) -> str:
    ...


@overload
def json_dumps(
    item: Any,
    default_mapping:Optional[dict],
    force_use_builtin_json: bool,
    return_bytes:Literal[True],
    **kwargs,
) -> bytes:
    ...


@overload
def json_dumps(
    item: Any,
    default_mapping:Optional[dict],
    force_use_builtin_json: bool,
    return_bytes:Literal[False],
    **kwargs,
) -> str:
    ...


def json_dumps(
    item: Any,
    default_mapping:Optional[dict]=None,
    force_use_builtin_json: bool = False,
    return_bytes: bool = False,
    **kwargs,
) -> Union[str, bytes]:
    """
    Dump json with extra details that are not normally json serializable

    parameters
    ----------

    item: The object to serialize.

    default_mapping: (Optional) dictionary of type to converter that overrides
        or extends JSON_CONVERTOR.

    force_use_builtin_json: Boolean, default = False
        When True, we use Python's builtin Json library for serialization,
        even if Orjson is installed.

    return_bytes: Boolean, default = False
        When True the result is returned as utf-8 bytes instead of str.

    kwargs: Passed on to the serializer. With orjson, any truthy ``indent``
        becomes a 2 space indent and ``sort_keys`` raises TypeError.
    """
    if orjson and not force_use_builtin_json:
        indent = kwargs.pop('indent', None)
        kwargs['option'] = orjson.OPT_NON_STR_KEYS | orjson.OPT_SERIALIZE_NUMPY
        if indent:
            kwargs['option'] |= orjson.OPT_INDENT_2
        if 'sort_keys' in kwargs:
            raise TypeError(
                "orjson does not accept the sort_keys parameter. "
                "If you need to pass sort_keys, set force_use_builtin_json=True "
                "to use Python's built-in json library instead of orjson.")
        result = orjson.dumps(
            item,
            default=json_convertor_default(default_mapping=default_mapping),
            **kwargs)
        # orjson produces bytes.
        if return_bytes:
            return result
        return result.decode(encoding='utf-8')
    else:
        result = json.dumps(
            item,
            default=json_convertor_default(default_mapping=default_mapping),
            **kwargs)
        # The builtin json library produces str.
        if return_bytes:
            return result.encode(encoding='utf-8')
        return result


# json.loads that decodes through JSONDecoder (restores old_type/new_type).
json_loads = partial(json.loads, cls=JSONDecoder)