two versions of R2R are here HEAD master

author: S. Solomon Darnell 2025-03-28 21:52:21 -0500
committer: S. Solomon Darnell 2025-03-28 21:52:21 -0500
commit: 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree: ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_text_extraction
parent: cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download: gn-ai-master.tar.gz
7 files changed, 1342 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
new file mode 100644
index 00000000..3b1d687e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
@@ -0,0 +1,285 @@
+"""
+Code related to text extraction.
+
+Some parts are still in _page.py. In doubt, they will stay there.
+"""
+
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
+
+# Bounds of an extra code point range that handle_tj() treats as right-to-left.
+# With the -1 defaults the range test can never match, so no extra range is
+# active until set_custom_rtl() changes them.
+CUSTOM_RTL_MIN: int = -1
+CUSTOM_RTL_MAX: int = -1
+# Code points that handle_tj() inserts following the current writing direction.
+CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
+# Layout mode: a leftward x jump of more than this many space widths starts a
+# new BT group.
+LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
+
+
+# Raised by crlf_space_check() when the orientation computed from the current
+# matrices is not one of the orientations requested by the caller.
+class OrientationNotFoundError(Exception):
+    pass
+
+
+def set_custom_rtl(
+    _min: Union[str, int, None] = None,
+    _max: Union[str, int, None] = None,
+    specials: Union[str, List[int], None] = None,
+) -> Tuple[int, int, List[int]]:
+    """
+    Change the Right-To-Left and special characters custom parameters.
+
+    Args:
+        _min: The new minimum value for the range of custom characters that
+            will be written right to left.
+            If set to ``None``, the value will not be changed.
+            An integer is used as is; a one-character string is converted to
+            its Unicode code point with ``ord``.
+            The default value is -1, which sets no additional range to be converted.
+        _max: The new maximum value for the range of custom characters that will
+            be written right to left.
+            If set to ``None``, the value will not be changed.
+            An integer is used as is; a one-character string is converted to
+            its Unicode code point with ``ord``.
+            The default value is -1, which sets no additional range to be converted.
+        specials: The new list of special characters to be inserted in the
+            current insertion order.
+            If set to ``None``, the current value will not be changed.
+            If set to a string, it will be converted to a list of Unicode
+            code points. A list is copied, so later changes to the caller's
+            list do not alter the stored value.
+            The default value is an empty list.
+
+    Returns:
+        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
+        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.
+
+    Raises:
+        TypeError: if ``_min`` or ``_max`` is a string that is not exactly one
+            character long (raised by ``ord``).
+    """
+    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
+    if isinstance(_min, str):
+        CUSTOM_RTL_MIN = ord(_min)
+    elif isinstance(_min, int):
+        CUSTOM_RTL_MIN = _min
+    if isinstance(_max, str):
+        CUSTOM_RTL_MAX = ord(_max)
+    elif isinstance(_max, int):
+        CUSTOM_RTL_MAX = _max
+    if isinstance(specials, str):
+        CUSTOM_RTL_SPECIAL_CHARS = [ord(char) for char in specials]
+    elif isinstance(specials, list):
+        # Copy: keeping the caller's list would let it mutate module state.
+        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
+    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
+
+
+def mult(m: List[float], n: List[float]) -> List[float]:
+    """
+    Combine two 6-element matrices ``m`` and ``n`` into a new 6-element list.
+
+    The first four results are the products of the 2x2 parts held in
+    elements 0-3; the last two apply that part of ``n`` to elements 4-5 of
+    ``m`` and then add elements 4-5 of ``n``.
+    """
+    m0, m1, m2, m3, m4, m5 = (m[i] for i in range(6))
+    n0, n1, n2, n3, n4, n5 = (n[i] for i in range(6))
+    top_row = [m0 * n0 + m1 * n2, m0 * n1 + m1 * n3]
+    bottom_row = [m2 * n0 + m3 * n2, m2 * n1 + m3 * n3]
+    offsets = [m4 * n0 + m5 * n2 + n4, m4 * n1 + m5 * n3 + n5]
+    return top_row + bottom_row + offsets
+
+
+def orient(m: List[float]) -> int:
+    """
+    Classify a matrix as one of the orientations 0, 90, 180 or 270.
+
+    Element 3 decides between 0 and 180 when its magnitude exceeds 1e-6;
+    otherwise the sign of element 1 decides between 90 and 270.
+    """
+    vertical_scale = m[3]
+    if abs(vertical_scale) > 1e-6:
+        return 0 if vertical_scale > 0 else 180
+    return 90 if m[1] > 0 else 270
+
+
+def crlf_space_check(
+    text: str,
+    cmtm_prev: Tuple[List[float], List[float]],
+    cmtm_matrix: Tuple[List[float], List[float]],
+    memo_cmtm: Tuple[List[float], List[float]],
+    cmap: Tuple[
+        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+    ],
+    orientations: Tuple[int, ...],
+    output: str,
+    font_size: float,
+    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+    spacewidth: float,
+) -> Tuple[str, str, List[float], List[float]]:
+    """
+    Insert a line break or a space depending on how far the text position moved.
+
+    The previous and current positions are taken from elements 4 and 5 of
+    ``mult(tm, cm)``. Relative to the orientation returned by ``orient``:
+
+    * a move of more than ``0.8 * f`` in the line-advance direction flushes
+      ``text`` plus a newline into ``output`` (unless the last character
+      emitted is already a newline);
+    * otherwise, a move of less than ``0.3 * f`` across the line combined with
+      a move of more than ``spacewidth * f * 15`` along it appends a space
+      to ``text`` (unless the last character emitted is already a space),
+
+    where ``f`` is ``font_size`` scaled by a factor derived from the current
+    matrix. Nothing is inserted while both ``output`` and ``text`` are empty.
+
+    Args:
+        text: text collected since the last flush into ``output``.
+        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
+        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
+        memo_cmtm: ``(cm, tm)`` matrices handed to ``visitor_text``.
+        cmap: character map tuple; only ``cmap[3]`` is used here and it is
+            passed through to ``visitor_text``.
+        orientations: orientations that are accepted.
+        output: text flushed so far.
+        font_size: current font size.
+        visitor_text: optional callback, called with ``(text + "\\n", memo_cm,
+            memo_tm, cmap[3], font_size)`` when a line is flushed. Exceptions
+            it raises propagate to the caller, as they do in ``handle_tj``.
+        spacewidth: width of a space, see the space rule above.
+
+    Returns:
+        ``(text, output, cm_prev, tm_prev)`` where the two matrices are copies
+        of the current ``cm`` and ``tm``.
+
+    Raises:
+        OrientationNotFoundError: if the current orientation is not in
+            ``orientations``.
+    """
+    cm_prev = cmtm_prev[0]
+    tm_prev = cmtm_prev[1]
+    cm_matrix = cmtm_matrix[0]
+    tm_matrix = cmtm_matrix[1]
+    memo_cm = memo_cmtm[0]
+    memo_tm = memo_cmtm[1]
+
+    m_prev = mult(tm_prev, cm_prev)
+    m = mult(tm_matrix, cm_matrix)
+    orientation = orient(m)
+    if orientation not in orientations:
+        raise OrientationNotFoundError
+    delta_x = m[4] - m_prev[4]
+    delta_y = m[5] - m_prev[5]
+    k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
+    f = font_size * k
+
+    # advance: displacement that is positive when moving on to the next line
+    # across:  displacement perpendicular to the writing direction
+    # along:   displacement in the writing direction
+    if orientation == 0:
+        advance, across, along = -delta_y, delta_y, delta_x
+    elif orientation == 180:
+        advance, across, along = delta_y, delta_y, delta_x
+    elif orientation == 90:
+        advance, across, along = delta_x, delta_x, delta_y
+    else:  # 270, the only other value orient() returns
+        advance, across, along = -delta_x, delta_x, delta_y
+
+    # Last character emitted so far, "" if nothing was emitted yet. Looking at
+    # the tails avoids building ``output + text`` on every call.
+    last_char = text[-1:] or output[-1:]
+    if last_char:
+        if advance > 0.8 * f:
+            if last_char != "\n":
+                output += text + "\n"
+                if visitor_text is not None:
+                    visitor_text(
+                        text + "\n",
+                        memo_cm,
+                        memo_tm,
+                        cmap[3],
+                        font_size,
+                    )
+                text = ""
+        elif (
+            abs(across) < f * 0.3
+            and abs(along) > spacewidth * f * 15
+            and last_char != " "
+        ):
+            text += " "
+    return text, output, cm_matrix.copy(), tm_matrix.copy()
+
+
+def handle_tj(
+    text: str,
+    operands: List[Union[str, TextStringObject]],
+    cm_matrix: List[float],
+    tm_matrix: List[float],
+    cmap: Tuple[
+        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
+    ],
+    orientations: Tuple[int, ...],
+    output: str,
+    font_size: float,
+    rtl_dir: bool,
+    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
+) -> Tuple[str, bool]:
+    """
+    Add the text of ``operands[0]`` to ``text``, honouring writing direction.
+
+    Nothing is added when ``operands`` is empty or the orientation computed
+    from ``mult(tm_matrix, cm_matrix)`` is not in ``orientations``. A ``str``
+    operand is appended unchanged. Any other operand is decoded with
+    ``cmap[0]`` (a codec name or a byte -> str dict), each decoded character
+    is mapped through ``cmap[1]`` and then prepended or appended depending on
+    whether it is classified as right-to-left, left-to-right or neutral.
+
+    Args:
+        text: text collected so far for the current run.
+        operands: operands of the text show operator; only the first is used.
+        cm_matrix: current transformation matrix.
+        tm_matrix: current text matrix.
+        cmap: ``(encoding, character map, _, font dictionary)``; element 2 is
+            not used here and element 3 is only passed to ``visitor_text``.
+        orientations: orientations that are accepted.
+        output: text flushed so far. NOTE(review): this function extends its
+            local ``output`` on a direction change but does not return it, so
+            the caller's value is unaffected - confirm whether the text handed
+            to ``visitor_text`` at that point is meant to be kept.
+        font_size: current font size, passed to ``visitor_text``.
+        rtl_dir: True while the current run is written right to left.
+        visitor_text: optional callback, called with ``(text, cm_matrix,
+            tm_matrix, cmap[3], font_size)`` whenever the direction changes.
+
+    Returns:
+        ``(text, rtl_dir)``: the updated run text and writing direction.
+    """
+    m = mult(tm_matrix, cm_matrix)
+    orientation = orient(m)
+    if orientation not in orientations or len(operands) == 0:
+        return text, rtl_dir
+    if isinstance(operands[0], str):
+        text += operands[0]
+        return text, rtl_dir
+
+    # Not a str from here on, so the former "encode if str" step could never
+    # run; the operand is decoded as is.
+    t: str = ""
+    tt: bytes = operands[0]
+    if isinstance(cmap[0], str):
+        try:
+            t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
+        except Exception:
+            # the data does not match the expectation,
+            # we use the alternative ;
+            # text extraction may not be good
+            t = tt.decode(
+                "utf-16-be" if cmap[0] == "charmap" else "charmap",
+                "surrogatepass",
+            )  # apply str encoding
+    else:  # apply dict encoding
+        t = "".join(
+            cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt
+        )
+    # "\u0590 - \u08FF \uFB50 - \uFDFF"
+    for x in [cmap[1].get(char, char) for char in t]:
+        # x can be a sequence of bytes ; ex: habibi.pdf
+        # multi-character replacements are classified as code point 1, i.e.
+        # they keep the current inserting order
+        xx = ord(x) if len(x) == 1 else 1
+        # fmt: off
+        if (
+            # cases where the current inserting order is kept
+            (xx <= 0x2F)                        # punctuations but...
+            or 0x3A <= xx <= 0x40               # numbers (x30-39)
+            or 0x2000 <= xx <= 0x206F           # upper punctuations..
+            or 0x20A0 <= xx <= 0x21FF           # but (numbers) indices/exponents
+            or xx in CUSTOM_RTL_SPECIAL_CHARS   # customized....
+        ):
+            text = x + text if rtl_dir else text + x
+        elif (  # right-to-left characters set
+            0x0590 <= xx <= 0x08FF
+            or 0xFB1D <= xx <= 0xFDFF
+            or 0xFE70 <= xx <= 0xFEFF
+            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
+        ):
+            if not rtl_dir:
+                # direction switches to right-to-left: hand over the run
+                rtl_dir = True
+                output += text
+                if visitor_text is not None:
+                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                text = ""
+            text = x + text
+        else:  # left-to-right
+            if rtl_dir:
+                # direction switches back to left-to-right: hand over the run
+                rtl_dir = False
+                output += text
+                if visitor_text is not None:
+                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                text = ""
+            text = text + x
+        # fmt: on
+    return text, rtl_dir
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py
new file mode 100644
index 00000000..8f4d5929
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py
@@ -0,0 +1,16 @@
+"""Layout mode text extraction extension for pypdf"""
+from ._fixed_width_page import (
+    fixed_char_width,
+    fixed_width_page,
+    text_show_operations,
+    y_coordinate_groups,
+)
+from ._font import Font
+
+__all__ = [
+    "fixed_char_width",
+    "fixed_width_page",
+    "text_show_operations",
+    "y_coordinate_groups",
+    "Font",
+]
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
new file mode 100644
index 00000000..1be50095
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -0,0 +1,381 @@
+"""Extract PDF text preserving the layout of the source PDF"""
+
+import sys
+from itertools import groupby
+from math import ceil
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+from ..._utils import logger_warning
+from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
+from ._font import Font
+from ._text_state_manager import TextStateManager
+from ._text_state_params import TextStateParams
+
+if sys.version_info >= (3, 8):
+    from typing import Literal, TypedDict
+else:
+    from typing_extensions import Literal, TypedDict
+
+
+# Instances are built by bt_group(), which copies tx, ty, font_size and
+# font_height from a TextStateParams instance.
+class BTGroup(TypedDict):
+    """
+    Dict describing a line of text rendered within a BT/ET operator pair.
+    If multiple text show operations render text on the same line, the text
+    will be combined into a single BTGroup dict.
+
+    Keys:
+        tx: x coordinate of first character in BTGroup
+        ty: y coordinate of first character in BTGroup
+        font_size: nominal font size
+        font_height: effective font height
+        text: rendered text
+        displaced_tx: x coordinate of last character in BTGroup
+        flip_sort: -1 if page is upside down, else 1
+    """
+
+    tx: float
+    ty: float
+    font_size: float
+    font_height: float
+    text: str
+    displaced_tx: float
+    # multiplied with ty wherever groups are ordered or keyed by y coordinate
+    flip_sort: Literal[-1, 1]
+
+
+def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
+    """
+    Build a BTGroup from the text state of a text show operation.
+
+    Args:
+        tj_op (TextStateParams): text state supplying the start position
+            (tx, ty), font_size, font_height and the flip_vertical flag
+        rendered_text (str): rendered text
+        dispaced_tx (float): x coordinate of last character in BTGroup
+
+    Returns:
+        BTGroup: the new dict; ``flip_sort`` is -1 when ``tj_op.flip_vertical``
+            is truthy, else 1.
+    """
+    start_x = tj_op.tx
+    start_y = tj_op.ty
+    nominal_size = tj_op.font_size
+    effective_height = tj_op.font_height
+    sort_sign: Literal[-1, 1] = 1
+    if tj_op.flip_vertical:
+        sort_sign = -1
+    return BTGroup(
+        tx=start_x,
+        ty=start_y,
+        font_size=nominal_size,
+        font_height=effective_height,
+        text=rendered_text,
+        displaced_tx=dispaced_tx,
+        flip_sort=sort_sign,
+    )
+
+
+def recurs_to_target_op(
+    ops: Iterator[Tuple[List[Any], bytes]],
+    text_state_mgr: TextStateManager,
+    end_target: Literal[b"Q", b"ET"],
+    fonts: Dict[str, Font],
+    strip_rotated: bool = True,
+) -> Tuple[List[BTGroup], List[TextStateParams]]:
+    """
+    Recurse operators between BT/ET and/or q/Q operators managing the transform
+    stack and capturing text positioning and rendering data.
+
+    Operators are consumed from ``ops`` until ``end_target`` is read or the
+    iterator is exhausted. Nested b"q" and b"BT" operators are handled by
+    recursive calls sharing the same iterator and text state manager.
+
+    Args:
+        ops: iterator of operators in content stream
+        text_state_mgr: a TextStateManager instance
+        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
+        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
+        strip_rotated: when True, text show operations flagged as rotated are
+            left out of the BTGroups; they are still included in the returned
+            TextStateParams list. Defaults to True.
+
+    Returns:
+        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.
+    """
+    # 1 entry per line of text rendered within each BT/ET operation.
+    bt_groups: List[BTGroup] = []
+
+    # 1 entry per text show operator (Tj/TJ/'/")
+    tj_ops: List[TextStateParams] = []
+
+    if end_target == b"Q":
+        # add new q level. cm's added at this level will be popped at next b'Q'
+        text_state_mgr.add_q()
+
+    while True:
+        try:
+            operands, op = next(ops)
+        except StopIteration:
+            # content stream ended before end_target was seen
+            return bt_groups, tj_ops
+        if op == end_target:
+            if op == b"Q":
+                text_state_mgr.remove_q()
+            if op == b"ET":
+                # NOTE(review): this early return skips the reset_tm() call
+                # made at the end of the ET handling below - confirm intended.
+                if not tj_ops:
+                    return bt_groups, tj_ops
+                _text = ""
+                bt_idx = 0  # idx of first tj in this bt group
+                last_displaced_tx = tj_ops[bt_idx].displaced_tx
+                last_ty = tj_ops[bt_idx].ty
+                for _idx, _tj in enumerate(
+                    tj_ops
+                ):  # ... build text from new Tj operators
+                    if strip_rotated and _tj.rotated:
+                        continue
+                    # if the y position of the text is greater than the font height, assume
+                    # the text is on a new line and start a new group
+                    if abs(_tj.ty - last_ty) > _tj.font_height:
+                        if _text.strip():
+                            bt_groups.append(
+                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
+                            )
+                        bt_idx = _idx
+                        _text = ""
+
+                    # if the x position of the text is less than the last x position by
+                    # more than 5 spaces widths, assume the text order should be flipped
+                    # and start a new group
+                    if (
+                        last_displaced_tx - _tj.tx
+                        > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
+                    ):
+                        if _text.strip():
+                            bt_groups.append(
+                                bt_group(tj_ops[bt_idx], _text, last_displaced_tx)
+                            )
+                        bt_idx = _idx
+                        last_displaced_tx = _tj.displaced_tx
+                        _text = ""
+
+                    # calculate excess x translation based on ending tx of previous Tj.
+                    # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
+                    # applied to the first tj of a BTGroup in fixed_width_page().
+                    excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
+                    # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
+                    spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
+                    # a negative count yields no padding (" " * negative == "")
+                    new_text = f'{" " * spaces}{_tj.txt}'
+
+                    last_ty = _tj.ty
+                    _text = f"{_text}{new_text}"
+                    last_displaced_tx = _tj.displaced_tx
+                # flush the group still being built when the loop ends
+                if _text:
+                    bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
+                text_state_mgr.reset_tm()
+            return bt_groups, tj_ops
+        if op == b"q":
+            bts, tjs = recurs_to_target_op(
+                ops, text_state_mgr, b"Q", fonts, strip_rotated
+            )
+            bt_groups.extend(bts)
+            tj_ops.extend(tjs)
+        elif op == b"cm":
+            text_state_mgr.add_cm(*operands)
+        elif op == b"BT":
+            bts, tjs = recurs_to_target_op(
+                ops, text_state_mgr, b"ET", fonts, strip_rotated
+            )
+            bt_groups.extend(bts)
+            tj_ops.extend(tjs)
+        elif op == b"Tj":
+            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
+        elif op == b"TJ":
+            # operands[0] mixes bytes (text to show) with other entries that
+            # are applied as TD offsets between the shown pieces
+            _tj = text_state_mgr.text_state_params()
+            for tj_op in operands[0]:
+                if isinstance(tj_op, bytes):
+                    _tj = text_state_mgr.text_state_params(tj_op)
+                    tj_ops.append(_tj)
+                else:
+                    text_state_mgr.add_trm(_tj.displacement_matrix(TD_offset=tj_op))
+        elif op == b"'":
+            # move by one leading (TL), then show the text
+            text_state_mgr.reset_trm()
+            text_state_mgr.add_tm([0, -text_state_mgr.TL])
+            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
+        elif op == b'"':
+            # as b"'", after setting Tw and Tc from the first two operands
+            text_state_mgr.reset_trm()
+            text_state_mgr.set_state_param(b"Tw", operands[0])
+            text_state_mgr.set_state_param(b"Tc", operands[1])
+            text_state_mgr.add_tm([0, -text_state_mgr.TL])
+            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
+        elif op in (b"Td", b"Tm", b"TD", b"T*"):
+            text_state_mgr.reset_trm()
+            if op == b"Tm":
+                text_state_mgr.reset_tm()
+            elif op == b"TD":
+                # TD also sets the leading to the negated y operand
+                text_state_mgr.set_state_param(b"TL", -operands[1])
+            elif op == b"T*":
+                operands = [0, -text_state_mgr.TL]
+            text_state_mgr.add_tm(operands)
+        elif op == b"Tf":
+            # raises KeyError if the named font is missing from ``fonts``
+            text_state_mgr.set_font(fonts[operands[0]], operands[1])
+        else:  # handle Tc, Tw, Tz, TL, and Ts operators
+            text_state_mgr.set_state_param(op, operands)
+
+
+def y_coordinate_groups(
+    bt_groups: List[BTGroup], debug_path: Optional[Path] = None
+) -> Dict[int, List[BTGroup]]:
+    """
+    Group text operations by rendered y coordinate, i.e. the line number.
+
+    Groups are keyed by ``int(ty * flip_sort)``. Consecutive keys are then
+    merged when they differ by less than the smaller font height of the two
+    groups and no text in them starts at the same integer x position.
+
+    Args:
+        bt_groups: list of dicts as returned by text_show_operations(). Entries
+            sharing a key must be adjacent (``itertools.groupby`` only groups
+            consecutive items); the ordering produced by
+            text_show_operations() satisfies this.
+        debug_path (Path, optional): Path to a directory for saving debug output.
+
+    Returns:
+        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
+            keyed by y coordinate, each list sorted by ``tx``. Empty if
+            ``bt_groups`` is empty.
+    """
+    ty_groups = {
+        ty: sorted(grp, key=lambda x: x["tx"])
+        for ty, grp in groupby(
+            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
+        )
+    }
+    # Without any group there is nothing to merge; the guard also keeps a bare
+    # StopIteration from next() from escaping to the caller.
+    if ty_groups:
+        # combine groups whose y coordinates differ by less than the effective
+        # font height (accounts for mixed fonts and other minor oddities)
+        last_ty = next(iter(ty_groups))
+        last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
+        # iterate over a snapshot of the keys: merged keys are popped below
+        for ty in list(ty_groups)[1:]:
+            fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
+            txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
+            # prevent merge if both groups are rendering in the same x position.
+            no_text_overlap = not (txs & last_txs)
+            offset_less_than_font_height = abs(ty - last_ty) < fsz
+            if no_text_overlap and offset_less_than_font_height:
+                ty_groups[last_ty] = sorted(
+                    ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
+                )
+                last_txs |= txs
+            else:
+                last_ty = ty
+                last_txs = txs
+    if debug_path:  # pragma: no cover
+        import json
+
+        debug_path.joinpath("bt_groups.json").write_text(
+            json.dumps(ty_groups, indent=2, default=str), "utf-8"
+        )
+    return ty_groups
+
+
+def text_show_operations(
+    ops: Iterator[Tuple[List[Any], bytes]],
+    fonts: Dict[str, Font],
+    strip_rotated: bool = True,
+    debug_path: Optional[Path] = None,
+) -> List[BTGroup]:
+    """
+    Collect the text rendered by all BT/ET operator pairs of a content stream.
+
+    The resulting groups are shifted so that the smallest ``tx`` becomes 0 and
+    are ordered by descending ``ty * flip_sort``, then ascending ``tx``.
+
+    Args:
+        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
+        fonts (Dict[str, Font]): font dictionary
+        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
+        debug_path (Path, optional): Path to a directory for saving debug output.
+
+    Returns:
+        List[BTGroup]: list of dicts of text rendered by each BT operator
+    """
+    manager = TextStateManager()  # transformation stack manager
+    keep_debug = bool(debug_path)
+    collected: List[BTGroup] = []  # BT operator dict
+    shown_ops: List[TextStateParams] = []  # Tj/TJ operator data (debug only)
+    try:
+        rotation_reported = False
+        while True:
+            operands, op = next(ops)
+            if op not in (b"BT", b"q"):
+                # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
+                manager.set_state_param(op, operands)
+                continue
+            closing_op = b"ET" if op == b"BT" else b"Q"
+            groups, text_states = recurs_to_target_op(
+                ops, manager, closing_op, fonts, strip_rotated
+            )
+            # warn a single time per call, on the first rotated text found
+            if not rotation_reported and any(tj.rotated for tj in text_states):
+                rotation_reported = True
+                message = (
+                    "Rotated text discovered. Output will be incomplete."
+                    if strip_rotated
+                    else "Rotated text discovered. Layout will be degraded."
+                )
+                logger_warning(message, __name__)
+            collected.extend(groups)
+            if keep_debug:  # pragma: no cover
+                shown_ops.extend(text_states)
+    except StopIteration:
+        pass
+
+    # left align the data, i.e. decrement all tx values by min(tx)
+    left_edge = min((grp["tx"] for grp in collected), default=0.0)
+    ordered = sorted(
+        collected,
+        key=lambda grp: (grp["ty"] * grp["flip_sort"], -grp["tx"]),
+        reverse=True,
+    )
+    aligned = [
+        {  # type: ignore[misc]
+            **grp,
+            "tx": grp["tx"] - left_edge,
+            "displaced_tx": grp["displaced_tx"] - left_edge,
+        }
+        for grp in ordered
+    ]
+
+    if debug_path:  # pragma: no cover
+        import json
+
+        bts_dump = json.dumps(aligned, indent=2, default=str)
+        debug_path.joinpath("bts.json").write_text(bts_dump, "utf-8")
+        tjs_dump = json.dumps(
+            shown_ops, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
+        )
+        debug_path.joinpath("tjs.json").write_text(tjs_dump, "utf-8")
+    return aligned
+
+
+def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> float:
+    """
+    Calculate average character width weighted by the length of the rendered
+    text in each sample for conversion to fixed-width layout.
+
+    Groups whose text is empty carry no width information and are skipped.
+
+    Args:
+        bt_groups (List[BTGroup]): List of dicts of text rendered by each
+            BT operator
+        scale_weight (float): factor applied to every text length before the
+            width is derived; larger values yield a proportionally narrower
+            character width. Defaults to 1.25.
+
+    Returns:
+        float: fixed character width
+
+    Raises:
+        ZeroDivisionError: if no group contains text (or ``scale_weight`` is
+            0), since no width can be derived.
+    """
+    char_widths = []
+    for _bt in bt_groups:
+        _len = len(_bt["text"]) * scale_weight
+        if not _len:
+            # dividing by 0 here used to abort the whole computation
+            continue
+        char_widths.append(((_bt["displaced_tx"] - _bt["tx"]) / _len, _len))
+    total_weight = sum(_l for _, _l in char_widths)
+    if not total_weight:
+        raise ZeroDivisionError("no rendered text to derive a character width from")
+    return sum(_w * _l for _w, _l in char_widths) / total_weight
+
+
+def fixed_width_page(
+    ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
+) -> str:
+    """
+    Generate page text from text operations grouped by rendered y coordinate.
+
+    Args:
+        ty_groups: dict of text show ops as returned by y_coordinate_groups()
+        char_width: fixed character width; must not be 0 because x positions
+            are divided by it.
+        space_vertically: include blank lines inferred from y distance + font height.
+            Lines whose first group has a font height of 0 get no blank
+            lines in front of them.
+
+    Returns:
+        str: page text in a fixed width format that closely adheres to the rendered
+            layout in the source pdf.
+    """
+    lines: List[str] = []
+    last_y_coord = 0
+    for y_coord, line_data in ty_groups.items():
+        if space_vertically and lines:
+            font_height = line_data[0]["font_height"]
+            # a zero font height gives no usable line pitch (and would divide
+            # by zero), so no blank lines are inferred for it
+            if font_height:
+                blank_lines = int(abs(y_coord - last_y_coord) / font_height) - 1
+                # a negative count extends by nothing
+                lines.extend([""] * blank_lines)
+        line = ""
+        last_disp = 0.0
+        for bt_op in line_data:
+            offset = int(bt_op["tx"] // char_width)
+            # pad up to the target column, but only if this group starts to the
+            # right of where the previous one ended (bool used as 0/1 factor)
+            spaces = (offset - len(line)) * (ceil(last_disp) < int(bt_op["tx"]))
+            line = f"{line}{' ' * spaces}{bt_op['text']}"
+            last_disp = bt_op["displaced_tx"]
+        # leading blank lines are dropped: nothing is added until text was seen
+        if line.strip() or lines:
+            # characters with code points 14-31 are replaced by spaces
+            lines.append(
+                "".join(c if ord(c) < 14 or ord(c) > 31 else " " for c in line)
+            )
+        last_y_coord = y_coord
+    return "\n".join(ln.rstrip() for ln in lines if space_vertically or ln.strip())
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py
new file mode 100644
index 00000000..a912fddb
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py
@@ -0,0 +1,112 @@
+"""Font constants and classes for "layout" mode text operations"""
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, Sequence, Union
+
+from ...generic import IndirectObject
+from ._font_widths import STANDARD_WIDTHS
+
+
+@dataclass
+class Font:
+    """
+    A font object formatted for use during "layout" mode text extraction
+
+    Attributes:
+        subtype (str): font subtype
+        space_width (int | float): width of a space character
+        encoding (str | Dict[int, str]): font encoding
+        char_map (dict): character map
+        font_dictionary (dict): font dictionary
+        width_map (Dict[str, int]): character -> width lookup built in
+            ``__post_init__``; not an ``__init__`` argument.
+    """
+
+    subtype: str
+    space_width: Union[int, float]
+    encoding: Union[str, Dict[int, str]]
+    char_map: Dict[Any, Any]
+    font_dictionary: Dict[Any, Any]
+    width_map: Dict[str, int] = field(default_factory=dict, init=False)
+
+    def __post_init__(self) -> None:
+        # Fill width_map from, in order: /Widths (dict encodings only), the /W
+        # arrays of /DescendantFonts, and - only if still empty - the
+        # STANDARD_WIDTHS table selected by the /BaseFont name prefix.
+
+        # TrueType fonts have a /Widths array mapping character codes to widths
+        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
+            first_char = self.font_dictionary.get("/FirstChar", 0)
+            # codes missing from the encoding fall back to chr(code)
+            self.width_map = {
+                self.encoding.get(idx + first_char, chr(idx + first_char)): width
+                for idx, width in enumerate(self.font_dictionary["/Widths"])
+            }
+
+        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
+        if "/DescendantFonts" in self.font_dictionary:
+            d_font: Dict[Any, Any]
+            for d_font_idx, d_font in enumerate(
+                self.font_dictionary["/DescendantFonts"]
+            ):
+                # resolve indirect references and store the resolved object
+                # back into the font dictionary
+                while isinstance(d_font, IndirectObject):
+                    d_font = d_font.get_object()  # type: ignore[assignment]
+                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
+                # code point of each str key of char_map -> mapped value
+                # NOTE(review): ord() raises TypeError for keys longer than one
+                # character - confirm char_map never holds such keys.
+                ord_map = {
+                    ord(_target): _surrogate
+                    for _target, _surrogate in self.char_map.items()
+                    if isinstance(_target, str)
+                }
+                # /W width definitions have two valid formats which can be mixed and matched:
+                #   (1) A character start index followed by a list of widths, e.g.
+                #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
+                #   (2) A character start index, a character stop index, and a width, e.g.
+                #       `45 65 500` applies width 500 to characters 45-65.
+                # skip_count: entries already consumed by the format just handled
+                skip_count = 0
+                _w = d_font.get("/W", [])
+                for idx, w_entry in enumerate(_w):
+                    if skip_count:
+                        skip_count -= 1
+                        continue
+                    if not isinstance(w_entry, (int, float)):  # pragma: no cover
+                        # We should never get here due to skip_count above. Add a
+                        # warning and or use reader's "strict" to force an ex???
+                        continue
+                    # NOTE(review): the lookups at idx + 1 / idx + 2 below raise
+                    # IndexError if /W ends in the middle of an entry - confirm
+                    # whether truncated arrays need to be tolerated.
+                    # check for format (1): `int [int int int int ...]`
+                    if isinstance(_w[idx + 1], Sequence):
+                        start_idx, width_list = _w[idx : idx + 2]
+                        # only codes present in ord_map receive a width
+                        self.width_map.update(
+                            {
+                                ord_map[_cidx]: _width
+                                for _cidx, _width in zip(
+                                    range(start_idx, start_idx + len(width_list), 1),
+                                    width_list,
+                                )
+                                if _cidx in ord_map
+                            }
+                        )
+                        skip_count = 1
+                    # check for format (2): `int int int`
+                    if not isinstance(_w[idx + 1], Sequence) and not isinstance(
+                        _w[idx + 2], Sequence
+                    ):
+                        start_idx, stop_idx, const_width = _w[idx : idx + 3]
+                        # stop_idx is inclusive
+                        self.width_map.update(
+                            {
+                                ord_map[_cidx]: const_width
+                                for _cidx in range(start_idx, stop_idx + 1, 1)
+                                if _cidx in ord_map
+                            }
+                        )
+                        skip_count = 2
+        if not self.width_map and "/BaseFont" in self.font_dictionary:
+            # first STANDARD_WIDTHS key that prefixes the /BaseFont name wins;
+            # the table itself (not a copy) becomes width_map
+            for key in STANDARD_WIDTHS:
+                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
+                    self.width_map = STANDARD_WIDTHS[key]
+                    break
+
+    def word_width(self, word: str) -> float:
+        """Sum of character widths specified in PDF font for the supplied word"""
+        # characters missing from width_map count as twice the space width
+        return sum(
+            [self.width_map.get(char, self.space_width * 2) for char in word], 0.0
+        )
+
+    @staticmethod
+    def to_dict(font_instance: "Font") -> Dict[str, Any]:
+        """Dataclass to dict for json.dumps serialization."""
+        # shallow: field values are returned as they are, not copied
+        return {
+            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
+        }
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py
new file mode 100644
index 00000000..39092bcd
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py
@@ -0,0 +1,208 @@
+# Glyph widths (1/1000 text-space units) of the standard 14 fonts, per the Adobe Core 14 AFM metrics
+STANDARD_WIDTHS = {
+    "Helvetica": {  # regular-weight metrics, reused for bold/oblique/boldoblique (4 fonts)
+        " ": 278,
+        "!": 278,
+        '"': 355,
+        "#": 556,
+        "$": 556,
+        "%": 889,
+        "&": 667,
+        "'": 191,
+        "(": 333,
+        ")": 333,
+        "*": 389,
+        "+": 584,
+        ",": 278,
+        "-": 333,
+        ".": 278,
+        "/": 278,
+        "0": 556,
+        "1": 556,
+        "2": 556,
+        "3": 556,
+        "4": 556,
+        "5": 556,
+        "6": 556,
+        "7": 556,
+        "8": 556,
+        "9": 556,
+        ":": 278,
+        ";": 278,
+        "<": 584,
+        "=": 584,
+        ">": 584,
+        "?": 556,
+        "@": 1015,
+        "A": 667,
+        "B": 667,
+        "C": 722,
+        "D": 722,
+        "E": 667,
+        "F": 611,
+        "G": 778,
+        "H": 722,
+        "I": 278,
+        "J": 500,
+        "K": 667,
+        "L": 556,
+        "M": 833,
+        "N": 722,
+        "O": 778,
+        "P": 667,
+        "Q": 778,
+        "R": 722,
+        "S": 667,
+        "T": 611,
+        "U": 722,
+        "V": 667,
+        "W": 944,
+        "X": 667,
+        "Y": 667,
+        "Z": 611,
+        "[": 278,
+        "\\": 278,
+        "]": 278,
+        "^": 469,
+        "_": 556,
+        "`": 333,
+        "a": 556,
+        "b": 556,
+        "c": 500,
+        "d": 556,
+        "e": 556,
+        "f": 278,
+        "g": 556,
+        "h": 556,
+        "i": 222,
+        "j": 222,
+        "k": 500,
+        "l": 222,
+        "m": 833,
+        "n": 556,
+        "o": 556,
+        "p": 556,
+        "q": 556,
+        "r": 333,
+        "s": 500,
+        "t": 278,
+        "u": 556,
+        "v": 500,
+        "w": 722,
+        "x": 500,
+        "y": 500,
+        "z": 500,
+        "{": 334,
+        "|": 260,
+        "}": 334,
+        "~": 584,
+    },
+    "Times": {  # Times-Roman metrics, reused for bold/italic/bolditalic (4 fonts)
+        " ": 250,
+        "!": 333,
+        '"': 408,
+        "#": 500,
+        "$": 500,
+        "%": 833,
+        "&": 778,
+        "'": 180,
+        "(": 333,
+        ")": 333,
+        "*": 500,
+        "+": 564,
+        ",": 250,
+        "-": 333,
+        ".": 250,
+        "/": 278,
+        "0": 500,
+        "1": 500,
+        "2": 500,
+        "3": 500,
+        "4": 500,
+        "5": 500,
+        "6": 500,
+        "7": 500,
+        "8": 500,
+        "9": 500,
+        ":": 278,
+        ";": 278,
+        "<": 564,
+        "=": 564,
+        ">": 564,
+        "?": 444,
+        "@": 921,
+        "A": 722,
+        "B": 667,
+        "C": 667,
+        "D": 722,
+        "E": 611,
+        "F": 556,
+        "G": 722,
+        "H": 722,
+        "I": 333,
+        "J": 389,
+        "K": 722,
+        "L": 611,
+        "M": 889,
+        "N": 722,
+        "O": 722,
+        "P": 556,
+        "Q": 722,
+        "R": 667,
+        "S": 556,
+        "T": 611,
+        "U": 722,
+        "V": 722,
+        "W": 944,
+        "X": 722,
+        "Y": 722,
+        "Z": 611,
+        "[": 333,
+        "\\": 278,
+        "]": 333,
+        "^": 469,
+        "_": 500,
+        "`": 333,
+        "a": 444,
+        "b": 500,
+        "c": 444,
+        "d": 500,
+        "e": 444,
+        "f": 333,
+        "g": 500,
+        "h": 500,
+        "i": 278,
+        "j": 278,
+        "k": 500,
+        "l": 278,
+        "m": 778,
+        "n": 500,
+        "o": 500,
+        "p": 500,
+        "q": 500,
+        "r": 333,
+        "s": 389,
+        "t": 278,
+        "u": 500,
+        "v": 500,
+        "w": 722,
+        "x": 500,
+        "y": 500,
+        "z": 444,
+        "{": 480,
+        "|": 200,
+        "}": 480,
+        "~": 541,
+    },
+}
+# Courier is fixed width: every glyph advances 600 units.
+# The same table serves the bold, oblique and boldoblique variants (4 fonts).
+STANDARD_WIDTHS["Courier"] = dict.fromkeys(STANDARD_WIDTHS["Times"], 600)
+# ZapfDingbats (1 font): flat 1000-unit approximation for every character
+STANDARD_WIDTHS["ZapfDingbats"] = dict.fromkeys(STANDARD_WIDTHS["Times"], 1000)
+# Symbol (1 font): flat 500-unit approximation for every character
+STANDARD_WIDTHS["Symbol"] = dict.fromkeys(STANDARD_WIDTHS["Times"], 500)
+# add aliases per table H.3 on page 1110 of the PDF 1.7 standard
+STANDARD_WIDTHS["CourierNew"] = STANDARD_WIDTHS["Courier"]
+STANDARD_WIDTHS["Arial"] = STANDARD_WIDTHS["Helvetica"]
+STANDARD_WIDTHS["TimesNewRoman"] = STANDARD_WIDTHS["Times"]
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py
new file mode 100644
index 00000000..3c5d4736
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py
@@ -0,0 +1,213 @@
+"""manage the PDF transform stack during "layout" mode text extraction"""
+
+from collections import ChainMap, Counter
+from typing import Any, Dict, List, MutableMapping, Union
+from typing import ChainMap as ChainMapType
+from typing import Counter as CounterType
+
+from ...errors import PdfReadError
+from .. import mult
+from ._font import Font
+from ._text_state_params import TextStateParams
+
+# Layered stack of transform dicts. Each layer maps the int keys 0-5 to the
+# a/b/c/d/e/f matrix entries (floats) and the str keys "is_text"/"is_render"
+# to bool flags.
+TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]]
+# A single transform layer with the same key/value shape.
+TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]]
+
+
+class TextStateManager:
+    """
+    Tracks the current text state including cm/tm/trm transformation matrices.
+
+    Attributes:
+        transform_stack (ChainMap): ChainMap of cm/tm/trm transformation
+            matrices; ``maps[0]`` is the most recently added transform.
+        q_queue (Counter[int]): number of cm operators recorded for each
+            q nesting level
+        q_depth (List[int]): list of q operator nesting levels
+        Tc (float): character spacing
+        Tw (float): word spacing
+        Tz (float): horizontal scaling
+        TL (float): leading
+        Ts (float): text rise
+        font (Font): font object
+        font_size (int | float): font size
+    """
+
+    def __init__(self) -> None:
+        self.transform_stack: TextStateManagerChainMapType = ChainMap(
+            self.new_transform()
+        )
+        self.q_queue: CounterType[int] = Counter()
+        self.q_depth = [0]
+        self.Tc: float = 0.0
+        self.Tw: float = 0.0
+        self.Tz: float = 100.0
+        self.TL: float = 0.0
+        self.Ts: float = 0.0
+        self.font: Union[Font, None] = None
+        self.font_size: Union[int, float] = 0
+
+    def set_state_param(self, op: bytes, value: Union[float, List[Any]]) -> None:
+        """
+        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.
+
+        Args:
+            op: operator read from PDF stream as bytes. No action is taken
+                for unsupported operators (see supported operators above).
+            value (float | List[Any]): new parameter value. If a list,
+                value[0] is used.
+        """
+        if op not in (b"Tc", b"Tz", b"Tw", b"TL", b"Ts"):
+            return
+        # the operator name doubles as the attribute name
+        setattr(self, op.decode(), value[0] if isinstance(value, list) else value)
+
+    def set_font(self, font: Font, size: float) -> None:
+        """
+        Set the current font and font_size.
+
+        Args:
+            font (Font): a layout mode Font
+            size (float): font size
+        """
+        self.font = font
+        self.font_size = size
+
+    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
+        """
+        Create a TextStateParams instance to display a text string. Type[bytes] values
+        will be decoded implicitly.
+
+        Args:
+            value (str | bytes): text to associate with the captured state.
+
+        Raises:
+            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)
+
+        Returns:
+            TextStateParams: current text state parameters
+        """
+        if not isinstance(self.font, Font):
+            raise PdfReadError(
+                "font not set: is PDF missing a Tf operator?"
+            )  # pragma: no cover
+        if isinstance(value, bytes):
+            try:
+                if isinstance(self.font.encoding, str):
+                    # encoding is a codec name
+                    txt = value.decode(self.font.encoding, "surrogatepass")
+                else:
+                    # encoding is a per-byte lookup; bytes without an entry
+                    # are decoded individually
+                    txt = "".join(
+                        self.font.encoding[x]
+                        if x in self.font.encoding
+                        else bytes((x,)).decode()
+                        for x in value
+                    )
+            except (UnicodeEncodeError, UnicodeDecodeError):
+                # best-effort fallback when the font encoding cannot decode
+                txt = value.decode("utf-8", "replace")
+            # substitute characters that have an entry in the font's char_map
+            txt = "".join(
+                self.font.char_map[x] if x in self.font.char_map else x for x in txt
+            )
+        else:
+            txt = value
+        return TextStateParams(
+            txt,
+            self.font,
+            self.font_size,
+            self.Tc,
+            self.Tw,
+            self.Tz,
+            self.TL,
+            self.Ts,
+            self.effective_transform,
+        )
+
+    @staticmethod
+    def raw_transform(
+        _a: float = 1.0,
+        _b: float = 0.0,
+        _c: float = 0.0,
+        _d: float = 1.0,
+        _e: float = 0.0,
+        _f: float = 0.0,
+    ) -> Dict[int, float]:
+        """Only a/b/c/d/e/f matrix params, keyed 0-5 and coerced to float"""
+        return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f))))
+
+    @staticmethod
+    def new_transform(
+        _a: float = 1.0,
+        _b: float = 0.0,
+        _c: float = 0.0,
+        _d: float = 1.0,
+        _e: float = 0.0,
+        _f: float = 0.0,
+        is_text: bool = False,
+        is_render: bool = False,
+    ) -> TextStateManagerDictType:
+        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
+        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
+        result.update({"is_text": is_text, "is_render": is_render})
+        return result
+
+    def reset_tm(self) -> TextStateManagerChainMapType:
+        """Clear all transforms from chainmap having is_text==True or is_render==True"""
+        while (
+            self.transform_stack.maps[0]["is_text"]
+            or self.transform_stack.maps[0]["is_render"]
+        ):
+            self.transform_stack = self.transform_stack.parents
+        return self.transform_stack
+
+    def reset_trm(self) -> TextStateManagerChainMapType:
+        """Clear all transforms from chainmap having is_render==True"""
+        while self.transform_stack.maps[0]["is_render"]:
+            self.transform_stack = self.transform_stack.parents
+        return self.transform_stack
+
+    def remove_q(self) -> TextStateManagerChainMapType:
+        """
+        Rewind to stack prior state after closing a 'q' with internal 'cm' ops
+
+        Text transforms are dropped first, then the cm transforms recorded
+        for the nesting level being closed are removed. A surplus 'Q' (no
+        open nesting level left) only drops the text transforms instead of
+        raising IndexError.
+
+        NOTE(review): only the transform stack is rewound here; font,
+        font_size and Tc/Tw/Tz/TL/Ts are left as they are.
+        """
+        self.transform_stack = self.reset_tm()
+        if not self.q_depth:
+            # unbalanced Q in the content stream: nothing left to unwind
+            return self.transform_stack
+        cm_count = self.q_queue.pop(self.q_depth.pop(), 0)
+        self.transform_stack.maps = self.transform_stack.maps[cm_count:]
+        return self.transform_stack
+
+    def add_q(self) -> None:
+        """Open another q nesting level (appended to q_depth)"""
+        self.q_depth.append(len(self.q_depth))
+
+    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
+        """Concatenate an additional transform matrix"""
+        self.transform_stack = self.reset_tm()
+        # count this cm against the current q level; the slice keeps this a
+        # no-op when q_depth is empty
+        self.q_queue.update(self.q_depth[-1:])
+        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
+        return self.transform_stack
+
+    def _complete_matrix(self, operands: List[float]) -> List[float]:
+        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
+        if len(operands) == 2:  # this is a Td operator or equivalent
+            operands = [1.0, 0.0, 0.0, 1.0, *operands]
+        return operands
+
+    def add_tm(self, operands: List[float]) -> TextStateManagerChainMapType:
+        """Append a text transform matrix"""
+        self.transform_stack = self.transform_stack.new_child(
+            self.new_transform(  # type: ignore[misc]
+                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
+            )
+        )
+        return self.transform_stack
+
+    def add_trm(self, operands: List[float]) -> TextStateManagerChainMapType:
+        """Append a text rendering transform matrix"""
+        self.transform_stack = self.transform_stack.new_child(
+            self.new_transform(  # type: ignore[misc]
+                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
+            )
+        )
+        return self.transform_stack
+
+    @property
+    def effective_transform(self) -> List[float]:
+        """
+        Current effective transform accounting for cm, tm, and trm transforms
+
+        Returns:
+            List[float]: the six a/b/c/d/e/f entries, obtained by multiplying
+            the newest transform through every older one on the stack.
+        """
+        newest = self.transform_stack.maps[0]
+        # read only keys 0-5 so the 'is_text'/'is_render' flags never leak
+        # into the returned matrix
+        eff_transform: List[float] = [newest[idx] for idx in range(6)]
+        for transform in self.transform_stack.maps[1:]:
+            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
+        return eff_transform
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py
new file mode 100644
index 00000000..b6e6930c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py
@@ -0,0 +1,127 @@
+"""A dataclass that captures the CTM and Text State for a tj operation"""
+
+import math
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Union
+
+from .. import mult, orient
+from ._font import Font
+
+
+@dataclass
+class TextStateParams:
+    """
+    Snapshot of the text state and effective transform that apply to one
+    text value of a TJ or Tj PDF operation.
+
+    Attributes:
+        txt (str): the text to be rendered.
+        font (Font): font object
+        font_size (int | float): font size
+        Tc (float): character spacing. Defaults to 0.0.
+        Tw (float): word spacing. Defaults to 0.0.
+        Tz (float): horizontal scaling. Defaults to 100.0.
+        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
+        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
+        transform (List[float]): effective transformation matrix.
+        tx (float): x coord of rendered text, i.e. self.transform[4]
+        ty (float): y coord of rendered text. May differ from self.transform[5] per self.Ts.
+        displaced_tx (float): x coord immediately following rendered text
+        space_tx (float): tx for a space character
+        font_height (float): effective font height accounting for CTM
+        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
+        rotated (bool): True if the text orientation is rotated with respect to the page.
+    """
+
+    txt: str
+    font: Font
+    font_size: Union[int, float]
+    Tc: float = 0.0
+    Tw: float = 0.0
+    Tz: float = 100.0
+    TL: float = 0.0
+    Ts: float = 0.0
+    transform: List[float] = field(
+        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+    )
+    tx: float = field(default=0.0, init=False)
+    ty: float = field(default=0.0, init=False)
+    displaced_tx: float = field(default=0.0, init=False)
+    space_tx: float = field(default=0.0, init=False)
+    font_height: float = field(default=0.0, init=False)
+    flip_vertical: bool = field(default=False, init=False)
+    rotated: bool = field(default=False, init=False)
+
+    def __post_init__(self) -> None:
+        """Normalize a rotated transform, then derive the computed fields."""
+        if orient(self.transform) in (90, 270):
+            undo_quarter_turn = [
+                1.0,
+                -self.transform[1],
+                -self.transform[2],
+                1.0,
+                0.0,
+                0.0,
+            ]
+            self.transform = mult(undo_quarter_turn, self.transform)
+            self.rotated = True
+        # A true 180 degree rotation has BOTH self.transform[0] and
+        # self.transform[3] negative; a negative self.transform[3] alone only
+        # means the y coords are inverted.
+        is_half_turn = orient(self.transform) == 180 and self.transform[0] < -1e-6
+        if is_half_turn:
+            self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform)
+            self.rotated = True
+
+        self.displaced_tx = self.displaced_transform()[4]
+        self.tx = self.transform[4]
+        self.ty = self.render_transform()[5]
+
+        space_tx = round(self.word_tx(" "), 3)
+        if space_tx < 1e-6:
+            # The " " char may be assigned 0 width (e.g. for fine tuned
+            # spacing with TJ int operators a la crazyones.pdf). In that case
+            # derive space_tx from a TD_offset of -2 * font.space_width, where
+            # font.space_width is the space_width calculated in _cmap.py.
+            space_tx = round(self.word_tx("", self.font.space_width * -2), 3)
+        self.space_tx = space_tx
+
+        skew_y, scale_y = self.transform[1], self.transform[3]
+        self.font_height = self.font_size * math.sqrt(skew_y**2 + scale_y**2)
+        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
+        self.flip_vertical = scale_y < -1e-6  # inverts y axis
+
+    def font_size_matrix(self) -> List[float]:
+        """Matrix scaling by font size (x also by Tz percent) and raising by Ts."""
+        size = self.font_size
+        return [size * (self.Tz / 100.0), 0.0, 0.0, size, 0.0, self.Ts]
+
+    def displaced_transform(self) -> List[float]:
+        """Effective transform matrix after self.txt has been rendered."""
+        displacement = self.displacement_matrix()
+        return mult(displacement, self.transform)
+
+    def render_transform(self) -> List[float]:
+        """Effective transform matrix accounting for font size, Tz, and Ts."""
+        size_matrix = self.font_size_matrix()
+        return mult(size_matrix, self.transform)
+
+    def displacement_matrix(
+        self, word: Union[str, None] = None, TD_offset: float = 0.0
+    ) -> List[float]:
+        """
+        Translation matrix for the horizontal advance of a word.
+
+        Args:
+            word (str, optional): text to measure. When None (the default),
+                self.txt is measured instead.
+            TD_offset (float, optional): translation applied by TD operator. Defaults to 0.0.
+        """
+        if word is None:
+            word = self.txt
+        advance = self.word_tx(word, TD_offset)
+        return [1.0, 0.0, 0.0, 1.0, advance, 0.0]
+
+    def word_tx(self, word: str, TD_offset: float = 0.0) -> float:
+        """Horizontal text displacement of ``word`` under this text state."""
+        glyph_advance = self.font_size * (
+            (self.font.word_width(word) - TD_offset) / 1000.0
+        )
+        unscaled = glyph_advance + self.Tc + word.count(" ") * self.Tw
+        return unscaled * (self.Tz / 100.0)
+
+    @staticmethod
+    def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
+        """Dataclass to dict for json.dumps serialization; 'font' is left out."""
+        serializable: Dict[str, Any] = {}
+        for name in inst.__dataclass_fields__:
+            if name != "font":
+                serializable[name] = getattr(inst, name)
+        return serializable
author	S. Solomon Darnell	2025-03-28 21:52:21 -0500
committer	S. Solomon Darnell	2025-03-28 21:52:21 -0500
commit	4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
tree	ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_text_extraction
parent	cc961e04ba734dd72309fb548a2f97d67d578813 (diff)
download	gn-ai-master.tar.gz