# gn-ai - A repository for GeneNetwork's AI tool development

#!/usr/bin/env python
import re
from collections.abc import MutableMapping, Iterable
from deepdiff.helper import SetOrdered
import logging

from deepdiff.helper import (
    strings, numbers, add_to_frozen_set, get_doc, dict_, RE_COMPILED_TYPE, ipranges
)

# Module-level logger named after this module; DeepSearch uses it to warn about sets.
logger = logging.getLogger(__name__)


# Value returned by get_doc for 'search_doc.rst'; it is assigned to grep.__doc__ below.
# NOTE(review): presumably the rendered text of that rst file - confirm in deepdiff.helper.
doc = get_doc('search_doc.rst')


class DeepSearch(dict):
    r"""
    **DeepSearch**

    Deep Search inside objects to find the item matching your criteria.

    **Parameters**

    obj : The object to search within

    item : The item to search for

    verbose_level : int >= 0, default = 1.
        Verbose level one shows the paths of found items.
        Verbose level 2 shows the path and value of the found items.

    exclude_paths: iterable of str, default = empty.
        Paths to exclude from the report. None is treated as empty.

    exclude_regex_paths: iterable of regex patterns, default = empty.
        Paths matching any of these patterns are excluded from the report.
        None is treated as empty.

    exclude_types: iterable of types, default = empty.
        Object types to exclude from the report. None is treated as empty.

    case_sensitive: Boolean, default = False
        Only applies when the item is a string; any other item is always
        compared case sensitively.

    match_string: Boolean, default = False
        If True, the value of the object or its children have to exactly match the item.
        If False, the value of the item can be a part of the value of the object or its children

    use_regexp: Boolean, default = False
        If True, the item is compiled as a regular expression. When the search
        is case insensitive the pattern is compiled with re.IGNORECASE.

    strict_checking: Boolean, default = True
        If True, it will check the type of the object to match, so when searching for '1234',
        it will NOT match the int 1234. Currently this only affects the numeric values searching.

    **Returns**

        A DeepSearch object that has the matched paths and matched values.

    **Raises**

        ValueError if an unknown keyword parameter is passed.
        TypeError if use_regexp is True and the item can not be compiled.

    **Supported data types**

    int, string, unicode, dictionary, list, tuple, set, frozenset, OrderedDict, NamedTuple and custom objects!

    **Examples**

    Importing
        >>> from deepdiff import DeepSearch
        >>> from pprint import pprint

    Search in list for string
        >>> obj = ["long somewhere", "string", 0, "somewhere great!"]
        >>> item = "somewhere"
        >>> ds = DeepSearch(obj, item, verbose_level=2)
        >>> print(ds)
        {'matched_values': {'root[0]': 'long somewhere', 'root[3]': 'somewhere great!'}}

    Search in nested data for string
        >>> obj = ["something somewhere", {"long": "somewhere", "string": 2, 0: 0, "somewhere": "around"}]
        >>> item = "somewhere"
        >>> ds = DeepSearch(obj, item, verbose_level=2)
        >>> pprint(ds, indent=2)
        { 'matched_paths': {"root[1]['somewhere']": 'around'},
          'matched_values': { 'root[0]': 'something somewhere',
                              "root[1]['long']": 'somewhere'}}

    """

    # Number of "set detected" warnings emitted so far; capped in __search.
    warning_num = 0

    def __init__(self,
                 obj,
                 item,
                 exclude_paths=SetOrdered(),
                 exclude_regex_paths=SetOrdered(),
                 exclude_types=SetOrdered(),
                 verbose_level=1,
                 case_sensitive=False,
                 match_string=False,
                 use_regexp=False,
                 strict_checking=True,
                 **kwargs):
        """Configure the search and immediately run it over ``obj``."""
        if kwargs:
            raise ValueError((
                "The following parameter(s) are not valid: %s\n"
                "The valid parameters are obj, item, exclude_paths, exclude_regex_paths, exclude_types,\n"
                "verbose_level, case_sensitive, match_string, use_regexp and strict_checking."
            ) % ', '.join(kwargs.keys()))

        # The docstring advertises None for the exclusion arguments, so accept it.
        exclude_paths = () if exclude_paths is None else exclude_paths
        exclude_regex_paths = () if exclude_regex_paths is None else exclude_regex_paths
        exclude_types = () if exclude_types is None else exclude_types

        self.obj = obj
        # Case folding only makes sense for string items.
        self.case_sensitive = case_sensitive if isinstance(item, strings) else True
        self.use_regexp = use_regexp
        if not self.case_sensitive and not self.use_regexp:
            # Plain-text search: compare the lower-cased item against lower-cased
            # text. A regex source must NOT be lower-cased (it would turn e.g.
            # \D into \d); it is compiled with re.IGNORECASE below instead.
            item = item.lower()
        self.exclude_paths = SetOrdered(exclude_paths)
        self.exclude_regex_paths = [re.compile(exclude_regex_path) for exclude_regex_path in exclude_regex_paths]
        self.exclude_types = SetOrdered(exclude_types)
        self.exclude_types_tuple = tuple(
            exclude_types)  # we need tuple for checking isinstance
        self.verbose_level = verbose_level
        self.update(
            matched_paths=self.__set_or_dict(),
            matched_values=self.__set_or_dict(),
            unprocessed=[])
        if not strict_checking and (isinstance(item, numbers) or isinstance(item, ipranges)):
            item = str(item)
        if self.use_regexp:
            # case_sensitive can only be False when the item is a string, so the
            # flag is never combined with an already compiled pattern.
            regex_flags = 0 if self.case_sensitive else re.IGNORECASE
            try:
                item = re.compile(item, regex_flags)
            except TypeError as e:
                raise TypeError(f"The passed item of {item} is not usable for regex: {e}") from None
        self.strict_checking = strict_checking

        # Cases where user wants to match exact string item
        self.match_string = match_string

        self.__search(obj, item, parents_ids=frozenset({id(obj)}))

        # Drop the report sections that ended up empty.
        empty_keys = [k for k, v in self.items() if not v]

        for k in empty_keys:
            del self[k]

    def __set_or_dict(self):
        """Return the container for a report section: dict at verbose_level >= 2, else ordered set."""
        return dict_() if self.verbose_level >= 2 else SetOrdered()

    def __report(self, report_key, key, value):
        """Record a match under ``report_key``; the value is only kept at verbose_level >= 2."""
        if self.verbose_level >= 2:
            self[report_key][key] = value
        else:
            self[report_key].add(key)

    def __search_obj(self,
                     obj,
                     item,
                     parent,
                     parents_ids=frozenset(),
                     is_namedtuple=False):
        """Search objects"""
        found = False
        if obj == item:
            found = True
            # We report the match but also continue inside the match to see if there are
            # further matches inside the `looped` object.
            self.__report(report_key='matched_values', key=parent, value=obj)

        try:
            if is_namedtuple:
                obj = obj._asdict()
            else:
                # Skip magic methods. Slightly hacky, but unless people are defining
                # new magic methods they want to search, it should work fine.
                obj = {i: getattr(obj, i) for i in dir(obj)
                       if not (i.startswith('__') and i.endswith('__'))}
        except AttributeError:
            try:
                obj = {i: getattr(obj, i) for i in obj.__slots__}
            except AttributeError:
                # Attributes could not be enumerated; remember the path unless
                # the object itself already matched.
                if not found:
                    self['unprocessed'].append("%s" % parent)

                return

        self.__search_dict(
            obj, item, parent, parents_ids, print_as_attribute=True)

    def __skip_this(self, item, parent):
        """Return True when ``parent`` is an excluded path or ``item`` is of an excluded type."""
        if parent in self.exclude_paths:
            return True
        if any(exclude_regex_path.search(parent) for exclude_regex_path in self.exclude_regex_paths):
            return True
        return isinstance(item, self.exclude_types_tuple)

    def __search_dict(self,
                      obj,
                      item,
                      parent,
                      parents_ids=frozenset(),
                      print_as_attribute=False):
        """Search dictionaries"""
        if print_as_attribute:
            parent_text = "%s.%s"
        else:
            parent_text = "%s[%s]"

        obj_keys = SetOrdered(obj.keys())
        # Only used by the plain-text path matching below; loop invariant.
        str_item = str(item)

        for item_key in obj_keys:
            if not print_as_attribute and isinstance(item_key, strings):
                item_key_str = "'%s'" % item_key
            else:
                item_key_str = item_key

            obj_child = obj[item_key]

            item_id = id(obj_child)

            # Do not descend into an object that is one of its own ancestors.
            if parents_ids and item_id in parents_ids:
                continue

            parents_ids_added = add_to_frozen_set(parents_ids, item_id)

            new_parent = parent_text % (parent, item_key_str)
            new_parent_cased = new_parent if self.case_sensitive else new_parent.lower()

            # In regex mode only the compiled pattern decides; the textual
            # representation of the pattern object is never matched against paths.
            if self.use_regexp:
                path_matched = item.search(new_parent_cased)
            elif self.match_string:
                path_matched = str_item == new_parent_cased
            else:
                path_matched = str_item in new_parent_cased

            if path_matched:
                self.__report(
                    report_key='matched_paths',
                    key=new_parent,
                    value=obj_child)

            self.__search(
                obj_child,
                item,
                parent=new_parent,
                parents_ids=parents_ids_added)

    def __search_iterable(self,
                          obj,
                          item,
                          parent="root",
                          parents_ids=frozenset()):
        """Search iterables except dictionaries, sets and strings."""
        for i, thing in enumerate(obj):
            new_parent = "{}[{}]".format(parent, i)
            if self.__skip_this(thing, parent=new_parent):
                continue

            if self.case_sensitive or not isinstance(thing, strings):
                thing_cased = thing
            else:
                thing_cased = thing.lower()

            if not self.use_regexp and thing_cased == item:
                self.__report(
                    report_key='matched_values', key=new_parent, value=thing)
            else:
                item_id = id(thing)
                # Do not descend into an object that is one of its own ancestors.
                if parents_ids and item_id in parents_ids:
                    continue
                parents_ids_added = add_to_frozen_set(parents_ids, item_id)
                self.__search(thing, item, new_parent, parents_ids_added)

    def __search_str(self, obj, item, parent):
        """Compare strings"""
        obj_text = obj if self.case_sensitive else obj.lower()

        is_matched = False
        if self.use_regexp:
            is_matched = item.search(obj_text)
        elif (self.match_string and item == obj_text) or (not self.match_string and item in obj_text):
            is_matched = True
        if is_matched:
            self.__report(report_key='matched_values', key=parent, value=obj)

    def __search_numbers(self, obj, item, parent):
        """Compare a number with the item; the textual form is only tried when not strict_checking."""
        if (
            item == obj or (
                not self.strict_checking and (
                    item == str(obj) or (
                        self.use_regexp and item.search(str(obj))
                    )
                )
            )
        ):
            self.__report(report_key='matched_values', key=parent, value=obj)

    def __search_tuple(self, obj, item, parent, parents_ids):
        """Search a tuple, treating anything that has ``_asdict`` as a namedtuple."""
        if hasattr(obj, '_asdict'):
            # We assume it is a namedtuple then
            self.__search_obj(
                obj, item, parent, parents_ids, is_namedtuple=True)
        else:
            # It must be a normal tuple
            self.__search_iterable(obj, item, parent, parents_ids)

    def __search(self, obj, item, parent="root", parents_ids=frozenset()):
        """The main search method"""
        # NOTE(review): the exclusion check receives the search `item`, not `obj`,
        # so exclude_types is evaluated against the item's type here (elements of
        # iterables are checked by type in __search_iterable). Confirm intent.
        if self.__skip_this(item, parent):
            return

        elif isinstance(obj, strings) and isinstance(item, (strings, RE_COMPILED_TYPE)):
            self.__search_str(obj, item, parent)

        elif isinstance(obj, strings) and isinstance(item, numbers):
            return

        elif isinstance(obj, ipranges):
            self.__search_str(str(obj), item, parent)

        elif isinstance(obj, numbers):
            self.__search_numbers(obj, item, parent)

        elif isinstance(obj, MutableMapping):
            self.__search_dict(obj, item, parent, parents_ids)

        elif isinstance(obj, tuple):
            self.__search_tuple(obj, item, parent, parents_ids)

        elif isinstance(obj, (set, frozenset)):
            if self.warning_num < 10:
                logger.warning(
                    "Set item detected in the path."
                    "'set' objects do NOT support indexing. But DeepSearch will still report a path."
                )
                self.warning_num += 1
            self.__search_iterable(obj, item, parent, parents_ids)

        elif isinstance(obj, Iterable) and not isinstance(obj, strings):
            self.__search_iterable(obj, item, parent, parents_ids)

        else:
            self.__search_obj(obj, item, parent, parents_ids)


class grep:
    # The class documentation is the module-level ``doc`` value.
    __doc__ = doc

    def __init__(self, item, **kwargs):
        """Remember the search item and the keyword options for the later search."""
        self.kwargs = kwargs
        self.item = item

    def __ror__(self, other):
        """Evaluate ``other | grep(item)`` by running DeepSearch over ``other``."""
        search_options = self.kwargs
        return DeepSearch(obj=other, item=self.item, **search_options)


if __name__ == "__main__":  # pragma: no cover
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod
    testmod()