gn-ai - A repository for GeneNetwork's AI tool development

# cython: language_level=3

from cpython.exc cimport PyErr_NoMemory
from cpython.mem cimport PyMem_Free, PyMem_Malloc, PyMem_Realloc
from cpython.unicode cimport (
    PyUnicode_DATA,
    PyUnicode_DecodeASCII,
    PyUnicode_DecodeUTF8Stateful,
    PyUnicode_GET_LENGTH,
    PyUnicode_KIND,
    PyUnicode_READ,
)
from libc.stdint cimport uint8_t, uint64_t
from libc.string cimport memcpy, memset

from string import ascii_letters, digits


cdef str GEN_DELIMS = ":/?#[]@"
cdef str SUB_DELIMS_WITHOUT_QS = "!$'()*,"
cdef str SUB_DELIMS = SUB_DELIMS_WITHOUT_QS + '+?=;'
cdef str RESERVED = GEN_DELIMS + SUB_DELIMS
cdef str UNRESERVED = ascii_letters + digits + '-._~'
cdef str ALLOWED = UNRESERVED + SUB_DELIMS_WITHOUT_QS
cdef str QS = '+&=;'

DEF BUF_SIZE = 8 * 1024  # 8KiB
cdef char BUFFER[BUF_SIZE]

cdef inline Py_UCS4 _to_hex(uint8_t v) noexcept:
    if v < 10:
        return <Py_UCS4>(v+0x30)  # ord('0') == 0x30
    else:
        return <Py_UCS4>(v+0x41-10)  # ord('A') == 0x41


cdef inline int _from_hex(Py_UCS4 v) noexcept:
    if '0' <= v <= '9':
        return <int>(v) - 0x30  # ord('0') == 0x30
    elif 'A' <= v <= 'F':
        return <int>(v) - 0x41 + 10  # ord('A') == 0x41
    elif 'a' <= v <= 'f':
        return <int>(v) - 0x61 + 10  # ord('a') == 0x61
    else:
        return -1


cdef inline int _is_lower_hex(Py_UCS4 v) noexcept:
    return 'a' <= v <= 'f'


cdef inline Py_UCS4 _restore_ch(Py_UCS4 d1, Py_UCS4 d2):
    cdef int digit1 = _from_hex(d1)
    if digit1 < 0:
        return <Py_UCS4>-1
    cdef int digit2 = _from_hex(d2)
    if digit2 < 0:
        return <Py_UCS4>-1
    return <Py_UCS4>(digit1 << 4 | digit2)


cdef uint8_t ALLOWED_TABLE[16]
cdef uint8_t ALLOWED_NOTQS_TABLE[16]


cdef inline bint bit_at(uint8_t array[], uint64_t ch) noexcept:
    return array[ch >> 3] & (1 << (ch & 7))


cdef inline void set_bit(uint8_t array[], uint64_t ch) noexcept:
    array[ch >> 3] |= (1 << (ch & 7))


memset(ALLOWED_TABLE, 0, sizeof(ALLOWED_TABLE))
memset(ALLOWED_NOTQS_TABLE, 0, sizeof(ALLOWED_NOTQS_TABLE))

for i in range(128):
    if chr(i) in ALLOWED:
        set_bit(ALLOWED_TABLE, i)
        set_bit(ALLOWED_NOTQS_TABLE, i)
    if chr(i) in QS:
        set_bit(ALLOWED_NOTQS_TABLE, i)

# ----------------- writer ---------------------------

cdef struct Writer:
    char *buf
    Py_ssize_t size
    Py_ssize_t pos
    bint changed


cdef inline void _init_writer(Writer* writer):
    writer.buf = &BUFFER[0]
    writer.size = BUF_SIZE
    writer.pos = 0
    writer.changed = 0


cdef inline void _release_writer(Writer* writer):
    if writer.buf != BUFFER:
        PyMem_Free(writer.buf)


cdef inline int _write_char(Writer* writer, Py_UCS4 ch, bint changed):
    cdef char * buf
    cdef Py_ssize_t size

    if writer.pos == writer.size:
        # reallocate
        size = writer.size + BUF_SIZE
        if writer.buf == BUFFER:
            buf = <char*>PyMem_Malloc(size)
            if buf == NULL:
                PyErr_NoMemory()
                return -1
            memcpy(buf, writer.buf, writer.size)
        else:
            buf = <char*>PyMem_Realloc(writer.buf, size)
            if buf == NULL:
                PyErr_NoMemory()
                return -1
        writer.buf = buf
        writer.size = size
    writer.buf[writer.pos] = <char>ch
    writer.pos += 1
    writer.changed |= changed
    return 0


cdef inline int _write_pct(Writer* writer, uint8_t ch, bint changed):
    if _write_char(writer, '%', changed) < 0:
        return -1
    if _write_char(writer, _to_hex(<uint8_t>ch >> 4), changed) < 0:
        return -1
    return _write_char(writer, _to_hex(<uint8_t>ch & 0x0f), changed)


cdef inline int _write_utf8(Writer* writer, Py_UCS4 symbol):
    cdef uint64_t utf = <uint64_t> symbol

    if utf < 0x80:
        return _write_pct(writer, <uint8_t>utf, True)
    elif utf < 0x800:
        if _write_pct(writer, <uint8_t>(0xc0 | (utf >> 6)), True) < 0:
            return -1
        return _write_pct(writer,  <uint8_t>(0x80 | (utf & 0x3f)), True)
    elif 0xD800 <= utf <= 0xDFFF:
        # surogate pair, ignored
        return 0
    elif utf < 0x10000:
        if _write_pct(writer, <uint8_t>(0xe0 | (utf >> 12)), True) < 0:
            return -1
        if _write_pct(writer, <uint8_t>(0x80 | ((utf >> 6) & 0x3f)),
                      True) < 0:
            return -1
        return _write_pct(writer, <uint8_t>(0x80 | (utf & 0x3f)), True)
    elif utf > 0x10FFFF:
        # symbol is too large
        return 0
    else:
        if _write_pct(writer,  <uint8_t>(0xf0 | (utf >> 18)), True) < 0:
            return -1
        if _write_pct(writer,  <uint8_t>(0x80 | ((utf >> 12) & 0x3f)),
                      True) < 0:
            return -1
        if _write_pct(writer,  <uint8_t>(0x80 | ((utf >> 6) & 0x3f)),
                      True) < 0:
            return -1
        return _write_pct(writer, <uint8_t>(0x80 | (utf & 0x3f)), True)


# --------------------- end writer --------------------------


cdef class _Quoter:
    cdef bint _qs
    cdef bint _requote

    cdef uint8_t _safe_table[16]
    cdef uint8_t _protected_table[16]

    def __init__(
            self, *, str safe='', str protected='', bint qs=False, bint requote=True,
    ):
        cdef Py_UCS4 ch

        self._qs = qs
        self._requote = requote

        if not self._qs:
            memcpy(self._safe_table,
                   ALLOWED_NOTQS_TABLE,
                   sizeof(self._safe_table))
        else:
            memcpy(self._safe_table,
                   ALLOWED_TABLE,
                   sizeof(self._safe_table))
        for ch in safe:
            if ord(ch) > 127:
                raise ValueError("Only safe symbols with ORD < 128 are allowed")
            set_bit(self._safe_table, ch)

        memset(self._protected_table, 0, sizeof(self._protected_table))
        for ch in protected:
            if ord(ch) > 127:
                raise ValueError("Only safe symbols with ORD < 128 are allowed")
            set_bit(self._safe_table, ch)
            set_bit(self._protected_table, ch)

    def __call__(self, val):
        if val is None:
            return None
        if type(val) is not str:
            if isinstance(val, str):
                # derived from str
                val = str(val)
            else:
                raise TypeError("Argument should be str")
        return self._do_quote_or_skip(<str>val)

    cdef str _do_quote_or_skip(self, str val):
        cdef Py_UCS4 ch
        cdef Py_ssize_t length = PyUnicode_GET_LENGTH(val)
        cdef Py_ssize_t idx = length
        cdef bint must_quote = 0
        cdef Writer writer
        cdef int kind = PyUnicode_KIND(val)
        cdef const void *data = PyUnicode_DATA(val)

        # If everything in the string is in the safe
        # table and all ASCII, we can skip quoting
        while idx:
            idx -= 1
            ch = PyUnicode_READ(kind, data, idx)
            if ch >= 128 or not bit_at(self._safe_table, ch):
                must_quote = 1
                break

        if not must_quote:
            return val

        _init_writer(&writer)
        try:
            return self._do_quote(<str>val, length, kind, data, &writer)
        finally:
            _release_writer(&writer)

    cdef str _do_quote(
        self,
        str val,
        Py_ssize_t length,
        int kind,
        const void *data,
        Writer *writer
    ):
        cdef Py_UCS4 ch
        cdef int changed
        cdef Py_ssize_t idx = 0

        while idx < length:
            ch = PyUnicode_READ(kind, data, idx)
            idx += 1
            if ch == '%' and self._requote and idx <= length - 2:
                ch = _restore_ch(
                    PyUnicode_READ(kind, data, idx),
                    PyUnicode_READ(kind, data, idx + 1)
                )
                if ch != <Py_UCS4>-1:
                    idx += 2
                    if ch < 128:
                        if bit_at(self._protected_table, ch):
                            if _write_pct(writer, ch, True) < 0:
                                raise
                            continue

                        if bit_at(self._safe_table, ch):
                            if _write_char(writer, ch, True) < 0:
                                raise
                            continue

                    changed = (_is_lower_hex(PyUnicode_READ(kind, data, idx - 2)) or
                               _is_lower_hex(PyUnicode_READ(kind, data, idx - 1)))
                    if _write_pct(writer, ch, changed) < 0:
                        raise
                    continue
                else:
                    ch = '%'

            if self._write(writer, ch) < 0:
                raise

        if not writer.changed:
            return val
        else:
            return PyUnicode_DecodeASCII(writer.buf, writer.pos, "strict")

    cdef inline int _write(self, Writer *writer, Py_UCS4 ch):
        if self._qs:
            if ch == ' ':
                return _write_char(writer, '+', True)

        if ch < 128 and bit_at(self._safe_table, ch):
            return _write_char(writer, ch, False)

        return _write_utf8(writer, ch)


cdef class _Unquoter:
    cdef str _ignore
    cdef str _unsafe
    cdef bint _qs
    cdef _Quoter _quoter
    cdef _Quoter _qs_quoter

    def __init__(self, *, ignore="", unsafe="", qs=False):
        self._ignore = ignore
        self._unsafe = unsafe
        self._qs = qs
        self._quoter = _Quoter()
        self._qs_quoter = _Quoter(qs=True)

    def __call__(self, val):
        if val is None:
            return None
        if type(val) is not str:
            if isinstance(val, str):
                # derived from str
                val = str(val)
            else:
                raise TypeError("Argument should be str")
        return self._do_unquote(<str>val)

    cdef str _do_unquote(self, str val):
        cdef Py_ssize_t length = PyUnicode_GET_LENGTH(val)
        if length == 0:
            return val

        cdef list ret = []
        cdef char buffer[4]
        cdef Py_ssize_t buflen = 0
        cdef Py_ssize_t consumed
        cdef str unquoted
        cdef Py_UCS4 ch = 0
        cdef Py_ssize_t idx = 0
        cdef Py_ssize_t start_pct
        cdef int kind = PyUnicode_KIND(val)
        cdef const void *data = PyUnicode_DATA(val)
        cdef bint changed = 0
        while idx < length:
            ch = PyUnicode_READ(kind, data, idx)
            idx += 1
            if ch == '%' and idx <= length - 2:
                changed = 1
                ch = _restore_ch(
                    PyUnicode_READ(kind, data, idx),
                    PyUnicode_READ(kind, data, idx + 1)
                )
                if ch != <Py_UCS4>-1:
                    idx += 2
                    assert buflen < 4
                    buffer[buflen] = ch
                    buflen += 1
                    try:
                        unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
                                                                NULL, &consumed)
                    except UnicodeDecodeError:
                        start_pct = idx - buflen * 3
                        buffer[0] = ch
                        buflen = 1
                        ret.append(val[start_pct : idx - 3])
                        try:
                            unquoted = PyUnicode_DecodeUTF8Stateful(buffer, buflen,
                                                                    NULL, &consumed)
                        except UnicodeDecodeError:
                            buflen = 0
                            ret.append(val[idx - 3 : idx])
                            continue
                    if not unquoted:
                        assert consumed == 0
                        continue
                    assert consumed == buflen
                    buflen = 0
                    if self._qs and unquoted in '+=&;':
                        ret.append(self._qs_quoter(unquoted))
                    elif unquoted in self._unsafe or unquoted in self._ignore:
                        ret.append(self._quoter(unquoted))
                    else:
                        ret.append(unquoted)
                    continue
                else:
                    ch = '%'

            if buflen:
                start_pct = idx - 1 - buflen * 3
                ret.append(val[start_pct : idx - 1])
                buflen = 0

            if ch == '+':
                if not self._qs or ch in self._unsafe:
                    ret.append('+')
                else:
                    changed = 1
                    ret.append(' ')
                continue

            if ch in self._unsafe:
                changed = 1
                ret.append('%')
                h = hex(ord(ch)).upper()[2:]
                for ch in h:
                    ret.append(ch)
                continue

            ret.append(ch)

        if not changed:
            return val

        if buflen:
            ret.append(val[length - buflen * 3 : length])

        return ''.join(ret)