diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/google/protobuf/text_encoding.py | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/google/protobuf/text_encoding.py')
-rw-r--r-- | .venv/lib/python3.12/site-packages/google/protobuf/text_encoding.py | 106 |
1 files changed, 106 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/google/protobuf/text_encoding.py b/.venv/lib/python3.12/site-packages/google/protobuf/text_encoding.py new file mode 100644 index 00000000..03c27dc1 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/google/protobuf/text_encoding.py @@ -0,0 +1,106 @@ +# Protocol Buffers - Google's data interchange format +# Copyright 2008 Google Inc. All rights reserved. +# +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file or at +# https://developers.google.com/open-source/licenses/bsd + +"""Encoding related utilities.""" +import re + +def _AsciiIsPrint(i): + return i >= 32 and i < 127 + +def _MakeStrEscapes(): + ret = {} + for i in range(0, 128): + if not _AsciiIsPrint(i): + ret[i] = r'\%03o' % i + ret[ord('\t')] = r'\t' # optional escape + ret[ord('\n')] = r'\n' # optional escape + ret[ord('\r')] = r'\r' # optional escape + ret[ord('"')] = r'\"' # necessary escape + ret[ord('\'')] = r"\'" # optional escape + ret[ord('\\')] = r'\\' # necessary escape + return ret + +# Maps int -> char, performing string escapes. +_str_escapes = _MakeStrEscapes() + +# Maps int -> char, performing byte escaping and string escapes +_byte_escapes = {i: chr(i) for i in range(0, 256)} +_byte_escapes.update(_str_escapes) +_byte_escapes.update({i: r'\%03o' % i for i in range(128, 256)}) + + +def _DecodeUtf8EscapeErrors(text_bytes): + ret = '' + while text_bytes: + try: + ret += text_bytes.decode('utf-8').translate(_str_escapes) + text_bytes = '' + except UnicodeDecodeError as e: + ret += text_bytes[:e.start].decode('utf-8').translate(_str_escapes) + ret += _byte_escapes[text_bytes[e.start]] + text_bytes = text_bytes[e.start+1:] + return ret + + +def CEscape(text, as_utf8) -> str: + """Escape a bytes string for use in an text protocol buffer. + + Args: + text: A byte string to be escaped. + as_utf8: Specifies if result may contain non-ASCII characters. + In Python 3 this allows unescaped non-ASCII Unicode characters. + In Python 2 the return value will be valid UTF-8 rather than only ASCII. + Returns: + Escaped string (str). + """ + # Python's text.encode() 'string_escape' or 'unicode_escape' codecs do not + # satisfy our needs; they encodes unprintable characters using two-digit hex + # escapes whereas our C++ unescaping function allows hex escapes to be any + # length. So, "\0011".encode('string_escape') ends up being "\\x011", which + # will be decoded in C++ as a single-character string with char code 0x11. + text_is_unicode = isinstance(text, str) + if as_utf8: + if text_is_unicode: + return text.translate(_str_escapes) + else: + return _DecodeUtf8EscapeErrors(text) + else: + if text_is_unicode: + text = text.encode('utf-8') + return ''.join([_byte_escapes[c] for c in text]) + + +_CUNESCAPE_HEX = re.compile(r'(\\+)x([0-9a-fA-F])(?![0-9a-fA-F])') + + +def CUnescape(text: str) -> bytes: + """Unescape a text string with C-style escape sequences to UTF-8 bytes. + + Args: + text: The data to parse in a str. + Returns: + A byte string. + """ + + def ReplaceHex(m): + # Only replace the match if the number of leading back slashes is odd. i.e. + # the slash itself is not escaped. + if len(m.group(1)) & 1: + return m.group(1) + 'x0' + m.group(2) + return m.group(0) + + # This is required because the 'string_escape' encoding doesn't + # allow single-digit hex escapes (like '\xf'). + result = _CUNESCAPE_HEX.sub(ReplaceHex, text) + + # Replaces Unicode escape sequences with their character equivalents. + result = result.encode('raw_unicode_escape').decode('raw_unicode_escape') + # Encode Unicode characters as UTF-8, then decode to Latin-1 escaping + # unprintable characters. + result = result.encode('utf-8').decode('unicode_escape') + # Convert Latin-1 text back to a byte string (latin-1 codec also works here). + return result.encode('latin-1') |