aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/tokenizers/normalizers
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/tokenizers/normalizers')
-rw-r--r--.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py29
-rw-r--r--.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi595
2 files changed, 624 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py
new file mode 100644
index 00000000..15a16f1e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.py
@@ -0,0 +1,29 @@
+from .. import normalizers
+
+
+Normalizer = normalizers.Normalizer
+BertNormalizer = normalizers.BertNormalizer
+NFD = normalizers.NFD
+NFKD = normalizers.NFKD
+NFC = normalizers.NFC
+NFKC = normalizers.NFKC
+Sequence = normalizers.Sequence
+Lowercase = normalizers.Lowercase
+Prepend = normalizers.Prepend
+Strip = normalizers.Strip
+StripAccents = normalizers.StripAccents
+Nmt = normalizers.Nmt
+Precompiled = normalizers.Precompiled
+Replace = normalizers.Replace
+
+
+NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
+
+
+def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
+ if normalizer not in NORMALIZERS:
+ raise ValueError(
+ "{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
+ )
+
+ return NORMALIZERS[normalizer]()
diff --git a/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi
new file mode 100644
index 00000000..507d4473
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/tokenizers/normalizers/__init__.pyi
@@ -0,0 +1,595 @@
+# Generated content DO NOT EDIT
+class Normalizer:
+ """
+ Base class for all normalizers
+
+ This class is not supposed to be instantiated directly. Instead, any implementation of a
+ Normalizer will return an instance of this class when instantiated.
+ """
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class BertNormalizer(Normalizer):
+ """
+ BertNormalizer
+
+ Takes care of normalizing raw text before giving it to a Bert model.
+ This includes cleaning the text, handling accents, chinese chars and lowercasing
+
+ Args:
+ clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to clean the text, by removing any control characters
+ and replacing all whitespaces by the classic one.
+
+ handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to handle chinese chars by putting spaces around them.
+
+ strip_accents (:obj:`bool`, `optional`):
+ Whether to strip all accents. If this option is not specified (ie == None),
+ then it will be determined by the value for `lowercase` (as in the original Bert).
+
+ lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to lowercase.
+ """
+ def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Lowercase(Normalizer):
+ """
+ Lowercase Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class NFC(Normalizer):
+ """
+ NFC Unicode Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class NFD(Normalizer):
+ """
+ NFD Unicode Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class NFKC(Normalizer):
+ """
+ NFKC Unicode Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class NFKD(Normalizer):
+ """
+ NFKD Unicode Normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Nmt(Normalizer):
+ """
+ Nmt normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Precompiled(Normalizer):
+ """
+ Precompiled normalizer
+ Don't use manually it is used for compatiblity for SentencePiece.
+ """
+ def __init__(self, precompiled_charsmap):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Prepend(Normalizer):
+ """
+ Prepend normalizer
+ """
+ def __init__(self, prepend):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Replace(Normalizer):
+ """
+ Replace normalizer
+ """
+ def __init__(self, pattern, content):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Sequence(Normalizer):
+ """
+ Allows concatenating multiple other Normalizer as a Sequence.
+ All the normalizers run in sequence in the given order
+
+ Args:
+ normalizers (:obj:`List[Normalizer]`):
+ A list of Normalizer to be run as a sequence
+ """
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class Strip(Normalizer):
+ """
+ Strip normalizer
+ """
+ def __init__(self, left=True, right=True):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass
+
+class StripAccents(Normalizer):
+ """
+ StripAccents normalizer
+ """
+ def __init__(self):
+ pass
+
+ def normalize(self, normalized):
+ """
+ Normalize a :class:`~tokenizers.NormalizedString` in-place
+
+ This method allows to modify a :class:`~tokenizers.NormalizedString` to
+ keep track of the alignment information. If you just want to see the result
+ of the normalization on a raw string, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+
+ Args:
+ normalized (:class:`~tokenizers.NormalizedString`):
+ The normalized string on which to apply this
+ :class:`~tokenizers.normalizers.Normalizer`
+ """
+ pass
+
+ def normalize_str(self, sequence):
+ """
+ Normalize the given string
+
+ This method provides a way to visualize the effect of a
+ :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+ information. If you need to get/convert offsets, you can use
+ :meth:`~tokenizers.normalizers.Normalizer.normalize`
+
+ Args:
+ sequence (:obj:`str`):
+ A string to normalize
+
+ Returns:
+ :obj:`str`: A string after normalization
+ """
+ pass