about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two version of R2R are here HEAD master
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py')
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py285
1 files changed, 285 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
new file mode 100644
index 00000000..3b1d687e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
@@ -0,0 +1,285 @@
+"""
+Code related to text extraction.
+
+Some parts are still in _page.py. In doubt, they will stay there.
+"""
+
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
+
+# Lower bound (code point) of the user-defined right-to-left character range.
+# The default of -1 defines no additional range; changed via set_custom_rtl().
+CUSTOM_RTL_MIN: int = -1
+# Upper bound (code point) of the user-defined right-to-left character range.
+# The default of -1 defines no additional range; changed via set_custom_rtl().
+CUSTOM_RTL_MAX: int = -1
+# Code points that handle_tj() inserts in the current insertion order,
+# whatever the writing direction; changed via set_custom_rtl().
+CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
+# NOTE(review): not referenced in this module; presumably consumed by the
+# layout-mode extraction code elsewhere in the package -- confirm.
+LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
+
+
+class OrientationNotFoundError(Exception):
+    """Raised when the orientation of a text matrix is not a requested one."""
+
+
+def set_custom_rtl(
+    _min: Union[str, int, None] = None,
+    _max: Union[str, int, None] = None,
+    specials: Union[str, List[int], None] = None,
+) -> Tuple[int, int, List[int]]:
+    """
+    Change the Right-To-Left and special characters custom parameters.
+
+    Args:
+        _min: The new minimum value for the range of custom characters that
+            will be written right to left.
+            If set to ``None``, the value will not be changed.
+            An integer is used as the code point as is; a single-character
+            string is converted to its Unicode code point with ``ord()``.
+            The default value is -1, which sets no additional range to be converted.
+        _max: The new maximum value for the range of custom characters that will
+            be written right to left.
+            If set to ``None``, the value will not be changed.
+            An integer is used as the code point as is; a single-character
+            string is converted to its Unicode code point with ``ord()``.
+            The default value is -1, which sets no additional range to be converted.
+        specials: The new list of special characters to be inserted in the
+            current insertion order.
+            If set to ``None``, the current value will not be changed.
+            If set to a string, it will be converted to the list of the code
+            points of its characters.
+            If set to a list, a copy is stored, so mutating the list
+            afterwards does not alter the configured value.
+            The default value is an empty list.
+
+    Returns:
+        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
+        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.
+
+    Raises:
+        TypeError: If ``_min`` or ``_max`` is a string whose length is not 1
+            (raised by ``ord()``).
+    """
+    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
+    if isinstance(_min, int):
+        CUSTOM_RTL_MIN = _min
+    elif isinstance(_min, str):
+        CUSTOM_RTL_MIN = ord(_min)
+    if isinstance(_max, int):
+        CUSTOM_RTL_MAX = _max
+    elif isinstance(_max, str):
+        CUSTOM_RTL_MAX = ord(_max)
+    if isinstance(specials, str):
+        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
+    elif isinstance(specials, list):
+        # Copy: keeping a reference to the caller's list would let a later
+        # mutation of that list silently change the global configuration.
+        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
+    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
+
+
+def mult(m: List[float], n: List[float]) -> List[float]:
+    """
+    Multiply two transformation matrices given as six-element lists.
+
+    A list ``[a, b, c, d, e, f]`` stands for the 3x3 matrix
+    ``[[a, b, 0], [c, d, 0], [e, f, 1]]``; the result is ``m x n`` in the
+    same six-element form.
+
+    Args:
+        m: Left operand.
+        n: Right operand.
+
+    Returns:
+        A new six-element list; neither input is modified.
+    """
+    a1, b1, c1, d1, e1, f1 = m[0], m[1], m[2], m[3], m[4], m[5]
+    a2, b2, c2, d2, e2, f2 = n[0], n[1], n[2], n[3], n[4], n[5]
+    top_row = [a1 * a2 + b1 * c2, a1 * b2 + b1 * d2]
+    middle_row = [c1 * a2 + d1 * c2, c1 * b2 + d1 * d2]
+    # Only the last row picks up the translation part of ``n``.
+    bottom_row = [e1 * a2 + f1 * c2 + e2, e1 * b2 + f1 * d2 + f2]
+    return top_row + middle_row + bottom_row
+
+
+def orient(m: List[float]) -> int:
+    """
+    Classify a transformation matrix as one of four orientations.
+
+    Args:
+        m: Matrix as a list; only ``m[3]`` and ``m[1]`` are inspected.
+
+    Returns:
+        0 if ``m[3]`` is above 1e-6, 180 if it is below -1e-6; otherwise
+        90 if ``m[1]`` is positive and 270 in every remaining case.
+    """
+    d = m[3]
+    if d > 1e-6:
+        return 0
+    if d < -1e-6:
+        return 180
+    # m[3] is (almost) zero: the sign of m[1] decides.
+    return 90 if m[1] > 0 else 270
+
+
+def crlf_space_check(
+    text: str,
+    cmtm_prev: Tuple[List[float], List[float]],
+    cmtm_matrix: Tuple[List[float], List[float]],
+    memo_cmtm: Tuple[List[float], List[float]],
+    cmap: Tuple[
+        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+    ],
+    orientations: Tuple[int, ...],
+    output: str,
+    font_size: float,
+    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+    spacewidth: float,
+) -> Tuple[str, str, List[float], List[float]]:
+    """
+    Insert a line break or a space when the text position has moved.
+
+    The combined matrix ``tm x cm`` of the current position is compared with
+    the one of the previous position. Depending on the orientation, a
+    displacement of more than ``0.8 * f`` in the line-feed direction flushes
+    ``text`` plus ``"\\n"`` into ``output``; otherwise a displacement of more
+    than ``spacewidth * f * 15`` along the line (with less than ``0.3 * f``
+    across it) appends a space to ``text``. ``f`` is ``font_size`` scaled by
+    the combined matrix. Nothing is inserted while ``output + text`` is empty
+    or already ends with the character that would be added.
+
+    Args:
+        text: Text collected but not yet moved to ``output``.
+        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
+        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
+        memo_cmtm: ``(cm, tm)`` matrices handed to ``visitor_text``.
+        cmap: Only ``cmap[3]`` is used; it is handed to ``visitor_text``.
+        orientations: Orientations (0, 90, 180, 270) that are accepted.
+        output: Text already emitted.
+        font_size: Font size before scaling by the matrices.
+        visitor_text: Optional callback invoked with
+            ``(text + "\\n", memo_cm, memo_tm, cmap[3], font_size)`` when a
+            line break is inserted.
+        spacewidth: Factor of the horizontal gap threshold (see above).
+
+    Returns:
+        ``(text, output, cm, tm)`` where ``cm`` and ``tm`` are copies of the
+        current matrices from ``cmtm_matrix``.
+
+    Raises:
+        OrientationNotFoundError: If the orientation of the current combined
+            matrix is not in ``orientations``.
+    """
+    cm_prev = cmtm_prev[0]
+    tm_prev = cmtm_prev[1]
+    cm_matrix = cmtm_matrix[0]
+    tm_matrix = cmtm_matrix[1]
+    memo_cm = memo_cmtm[0]
+    memo_tm = memo_cmtm[1]
+
+    m_prev = mult(tm_prev, cm_prev)
+    m = mult(tm_matrix, cm_matrix)
+    orientation = orient(m)
+    delta_x = m[4] - m_prev[4]
+    delta_y = m[5] - m_prev[5]
+    # Font size scaled by the linear part of the combined matrix.
+    f = font_size * math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
+    if orientation not in orientations:
+        raise OrientationNotFoundError
+    try:
+        # Express the displacement relative to the text direction:
+        # ``line_shift`` is positive when moving on to a following line,
+        # ``along`` is the displacement in the writing direction.
+        if orientation == 0:
+            line_shift, along = -delta_y, delta_x
+        elif orientation == 180:
+            line_shift, along = delta_y, delta_x
+        elif orientation == 90:
+            line_shift, along = delta_x, delta_y
+        else:  # 270, the only other value orient() returns
+            line_shift, along = -delta_x, delta_y
+
+        # Last character of ``output + text`` without building that string;
+        # empty when nothing has been collected yet.
+        last = text[-1:] or output[-1:]
+        if line_shift > 0.8 * f:
+            if last and last != "\n":
+                output += text + "\n"
+                if visitor_text is not None:
+                    visitor_text(
+                        text + "\n",
+                        memo_cm,
+                        memo_tm,
+                        cmap[3],
+                        font_size,
+                    )
+                text = ""
+        elif (
+            abs(line_shift) < f * 0.3
+            and abs(along) > spacewidth * f * 15
+            and last
+            and last != " "
+        ):
+            text += " "
+    except Exception:
+        # Deliberately best-effort: a failure while deciding on the separator
+        # (e.g. raised by the visitor callback) must not abort the extraction.
+        pass
+    tm_copy = tm_matrix.copy()
+    cm_copy = cm_matrix.copy()
+    return text, output, cm_copy, tm_copy
+
+
+def handle_tj(
+    text: str,
+    operands: List[Union[str, TextStringObject]],
+    cm_matrix: List[float],
+    tm_matrix: List[float],
+    cmap: Tuple[
+        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+    ],
+    orientations: Tuple[int, ...],
+    output: str,
+    font_size: float,
+    rtl_dir: bool,
+    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+) -> Tuple[str, bool]:
+    """
+    Add the first operand of a text-showing operation to ``text``.
+
+    Nothing is done when the orientation of ``tm_matrix x cm_matrix`` is not
+    in ``orientations`` or when ``operands`` is empty. A ``str`` operand is
+    appended unchanged. Any other operand is treated as raw bytes: it is
+    decoded with ``cmap[0]``, every resulting character is mapped through
+    ``cmap[1]`` and then, depending on its code point, appended to ``text``
+    (left-to-right) or prepended to it (right-to-left).
+
+    Args:
+        text: Text collected so far.
+        operands: Operands of the operator; only ``operands[0]`` is used.
+        cm_matrix: Current transformation matrix.
+        tm_matrix: Current text matrix.
+        cmap: ``cmap[0]`` is a codec name or a dict mapping byte values to
+            strings; ``cmap[1]`` maps decoded characters to replacement
+            strings; ``cmap[3]`` is handed to ``visitor_text``.
+        orientations: Orientations (0, 90, 180, 270) that are accepted.
+        output: Only rebound locally when the writing direction switches;
+            the new value is not returned, so the caller never sees it.
+        font_size: Handed to ``visitor_text``.
+        rtl_dir: ``True`` while text is being built right to left.
+        visitor_text: Optional callback invoked with
+            ``(text, cm_matrix, tm_matrix, cmap[3], font_size)`` each time
+            the writing direction switches.
+
+    Returns:
+        The updated ``text`` and ``rtl_dir``.
+    """
+    m = mult(tm_matrix, cm_matrix)
+    orientation = orient(m)
+    if orientation not in orientations or len(operands) == 0:
+        return text, rtl_dir
+
+    operand = operands[0]
+    if isinstance(operand, str):
+        text += operand
+        return text, rtl_dir
+
+    # From here on the operand is not a ``str``; it is used as raw bytes.
+    tt: bytes = operand
+    encoding = cmap[0]
+    if isinstance(encoding, str):
+        try:
+            t = tt.decode(encoding, "surrogatepass")  # apply str encoding
+        except Exception:
+            # the data does not match the expectation,
+            # we use the alternative ;
+            # text extraction may not be good
+            t = tt.decode(
+                "utf-16-be" if encoding == "charmap" else "charmap",
+                "surrogatepass",
+            )  # apply str encoding
+    else:  # apply dict encoding
+        t = "".join(
+            [encoding[x] if x in encoding else bytes((x,)).decode() for x in tt]
+        )
+
+    char_map = cmap[1]
+    # "\u0590 - \u08FF \uFB50 - \uFDFF"
+    for x in [char_map[c] if c in char_map else c for c in t]:
+        # x can be a sequence of characters (ex: habibi.pdf); such a sequence
+        # is given code 1 and therefore keeps the current insertion order.
+        xx = ord(x) if len(x) == 1 else 1
+        # fmt: off
+        if (
+            # cases where the current inserting order is kept
+            (xx <= 0x2F)                        # controls, space, punctuation up to "/"
+            or 0x3A <= xx <= 0x40               # ":" to "@" (digits x30-39 are excluded)
+            or 0x2000 <= xx <= 0x206F           # upper punctuations..
+            or 0x20A0 <= xx <= 0x21FF           # currency symbols .. arrows
+            or xx in CUSTOM_RTL_SPECIAL_CHARS   # customized....
+        ):
+            text = x + text if rtl_dir else text + x
+        elif (  # right-to-left characters set
+            0x0590 <= xx <= 0x08FF
+            or 0xFB1D <= xx <= 0xFDFF
+            or 0xFE70 <= xx <= 0xFEFF
+            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
+        ):
+            if not rtl_dir:
+                # Direction switch: hand the finished left-to-right run to
+                # the visitor and start a new right-to-left run.
+                rtl_dir = True
+                # NOTE(review): local rebinding only, see ``output`` above.
+                output += text
+                if visitor_text is not None:
+                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                text = ""
+            text = x + text
+        else:  # left-to-right
+            if rtl_dir:
+                # Direction switch: hand the finished right-to-left run to
+                # the visitor and start a new left-to-right run.
+                rtl_dir = False
+                # NOTE(review): local rebinding only, see ``output`` above.
+                output += text
+                if visitor_text is not None:
+                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                text = ""
+            text = text + x
+        # fmt: on
+    return text, rtl_dir