aboutsummaryrefslogtreecommitdiff
path: root/.venv/lib/python3.12/site-packages/pypdf/_text_extraction
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_text_extraction')
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py285
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py16
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py381
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py112
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py208
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py213
-rw-r--r--.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py127
7 files changed, 1342 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
new file mode 100644
index 00000000..3b1d687e
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py
@@ -0,0 +1,285 @@
+"""
+Code related to text extraction.
+
+Some parts are still in _page.py. In doubt, they will stay there.
+"""
+
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding
+
+CUSTOM_RTL_MIN: int = -1
+CUSTOM_RTL_MAX: int = -1
+CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
+LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5
+
+
class OrientationNotFoundError(Exception):
    """Raised when the orientation of a text matrix is not a requested one."""
+
+
def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new lower bound of the custom range of characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is stored as is; a one-character string is converted
            to its code point with ``ord()``.
            The default value is -1, which sets no additional range to be converted.
        _max: The new upper bound of the custom range of characters that will
            be written right to left. Same conversion rules as ``_min``.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of code points.
            If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: if ``_min`` or ``_max`` is a string that is not exactly
            one character long (raised by ``ord()``).
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Store a copy: keeping the caller's list object would let later
        # mutations of that list silently change the module configuration.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
+
+
def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two transformation matrices stored as six-element lists.

    Each list ``[a, b, c, d, e, f]`` stands for the 3x3 matrix with rows
    ``(a, b, 0)``, ``(c, d, 0)`` and ``(e, f, 1)``; the result is ``m x n``
    in the same six-element form.
    """
    m_a, m_b, m_c, m_d, m_e, m_f = m[0], m[1], m[2], m[3], m[4], m[5]
    n_a, n_b, n_c, n_d, n_e, n_f = n[0], n[1], n[2], n[3], n[4], n[5]
    product = []
    # linear part: the 2x2 block
    product.append(m_a * n_a + m_b * n_c)
    product.append(m_a * n_b + m_b * n_d)
    product.append(m_c * n_a + m_d * n_c)
    product.append(m_c * n_b + m_d * n_d)
    # translation part: the offset of m is transformed by n, then n's is added
    product.append(m_e * n_a + m_f * n_c + n_e)
    product.append(m_e * n_b + m_f * n_d + n_f)
    return product
+
+
def orient(m: List[float]) -> int:
    """
    Classify a transformation matrix as one of four text orientations.

    The sign of ``m[3]`` decides between 0 and 180 degrees; when ``m[3]`` is
    within 1e-6 of zero, the sign of ``m[1]`` decides between 90 and 270.

    Returns:
        0, 90, 180 or 270.
    """
    vertical_scale = m[3]
    if vertical_scale > 1e-6:
        return 0
    if vertical_scale < -1e-6:
        return 180
    return 90 if m[1] > 0 else 270
+
+
def crlf_space_check(
    text: str,
    cmtm_prev: Tuple[List[float], List[float]],
    cmtm_matrix: Tuple[List[float], List[float]],
    memo_cmtm: Tuple[List[float], List[float]],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    spacewidth: float,
) -> Tuple[str, str, List[float], List[float]]:
    """
    Decide whether a line break or a space separates the previous and the
    current text position, and update ``text``/``output`` accordingly.

    Args:
        text: text collected since the last flush.
        cmtm_prev: ``(cm, tm)`` matrices of the previous text position.
        cmtm_matrix: ``(cm, tm)`` matrices of the current text position.
        memo_cmtm: ``(cm, tm)`` matrices handed to ``visitor_text`` on a flush.
        cmap: font mapping tuple; only ``cmap[3]`` is used here (passed on
            to ``visitor_text``).
        orientations: accepted orientations (0, 90, 180, 270).
        output: text already flushed.
        font_size: current font size.
        visitor_text: optional callback invoked with the flushed text.
        spacewidth: space width factor used for the "gap is a space" test.

    Returns:
        ``(text, output, cm_prev, tm_prev)`` where the two matrices are
        copies of the current ``cm``/``tm`` to be used as "previous" by the
        next call.

    Raises:
        OrientationNotFoundError: if the current orientation is not in
            ``orientations``.
    """
    cm_prev, tm_prev = cmtm_prev[0], cmtm_prev[1]
    cm_matrix, tm_matrix = cmtm_matrix[0], cmtm_matrix[1]
    memo_cm, memo_tm = memo_cmtm[0], memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # effective font size after scaling by the combined matrix
    k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
    f = font_size * k
    if orientation not in orientations:
        raise OrientationNotFoundError

    # orientation -> (movement across lines, movement along the line,
    #                 sign of the across-movement that means "next line")
    axes = {
        0: (delta_y, delta_x, -1),
        180: (delta_y, delta_x, 1),
        90: (delta_x, delta_y, 1),
        270: (delta_x, delta_y, -1),
    }
    try:
        if orientation in axes:
            across, along, sign = axes[orientation]
            buffered = output + text
            if sign * across > 0.8 * f:
                # Moved by more than 0.8 line heights: terminate the line,
                # unless nothing was written yet or it already ends with "\n".
                if buffered != "" and buffered[-1] != "\n":
                    output += text + "\n"
                    if visitor_text is not None:
                        visitor_text(
                            text + "\n",
                            memo_cm,
                            memo_tm,
                            cmap[3],
                            font_size,
                        )
                    text = ""
            elif (
                abs(across) < f * 0.3
                and abs(along) > spacewidth * f * 15
                and buffered != ""
                and buffered[-1] != " "
            ):
                # Same line but a wide horizontal gap: separate with a space.
                text += " "
    except Exception:
        # Deliberate best effort kept from the original implementation: a
        # failure in this heuristic (including an exception raised by
        # visitor_text) must not abort the text extraction.
        pass
    return text, output, cm_matrix.copy(), tm_matrix.copy()
+
+
def handle_tj(
    text: str,
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
) -> Tuple[str, bool]:
    """
    Append the string shown by a text show operator to ``text``.

    ``str`` operands are appended unchanged. Any other operand is treated as
    raw bytes: it is decoded with ``cmap[0]`` (a codec name or a code to
    string dict), mapped through ``cmap[1]``, and inserted character by
    character taking the writing direction into account.

    Args:
        text: text collected so far for the current run.
        operands: operator operands; only ``operands[0]`` is used.
        cm_matrix: current transformation matrix.
        tm_matrix: current text matrix.
        cmap: ``(encoding, character map, _, font dictionary)``.
        orientations: accepted orientations; other text is ignored.
        output: unused, kept for interface compatibility (see NOTE below).
        font_size: current font size (only passed to ``visitor_text``).
        rtl_dir: ``True`` while the current run is written right to left.
        visitor_text: optional callback receiving text that is flushed when
            the writing direction changes.

    Returns:
        ``(text, rtl_dir)`` after processing the operand.

    NOTE: when the writing direction changes, the text collected so far is
    handed to ``visitor_text`` and then discarded from ``text``. The original
    code also did ``output += text``, but ``output`` is a local ``str`` that
    is not returned, so that statement never reached the caller and has been
    removed. Preserving that text for the caller would need a wider return
    type.
    """
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation not in orientations or len(operands) == 0:
        return text, rtl_dir

    operand = operands[0]
    if isinstance(operand, str):
        return text + operand, rtl_dir

    # Not a str: raw bytes that still have to be decoded with the font encoding.
    tt: bytes = operand  # type: ignore[assignment]
    t: str = ""
    encoding = cmap[0]
    if isinstance(encoding, str):
        try:
            t = tt.decode(encoding, "surrogatepass")  # apply str encoding
        except Exception:
            # the data does not match the expectation,
            # we use the alternative ;
            # text extraction may not be good
            t = tt.decode(
                "utf-16-be" if encoding == "charmap" else "charmap",
                "surrogatepass",
            )  # apply str encoding
    else:  # apply dict encoding
        # NOTE(review): bytes((x,)).decode() is a UTF-8 decode, so an unmapped
        # code >= 0x80 raises UnicodeDecodeError here.
        t = "".join(
            [encoding[x] if x in encoding else bytes((x,)).decode() for x in tt]
        )

    def flush_run() -> None:
        # Hand the finished run to the visitor; the caller resets ``text``.
        if visitor_text is not None:
            visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)

    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    for x in [cmap[1].get(ch, ch) for ch in t]:
        # x can be a sequence of bytes ; ex: habibi.pdf
        xx = ord(x) if len(x) == 1 else 1
        # fmt: off
        if (
            # direction-neutral characters: the current inserting order is kept
            xx <= 0x2F                         # controls and punctuation below "0"
            or 0x3A <= xx <= 0x40              # ":" to "@" (digits 0x30-0x39 excluded)
            or 0x2000 <= xx <= 0x206F          # general punctuation
            or 0x20A0 <= xx <= 0x21FF          # currency symbols to arrows
            or xx in CUSTOM_RTL_SPECIAL_CHARS  # customized
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF
            or 0xFB1D <= xx <= 0xFDFF
            or 0xFE70 <= xx <= 0xFEFF
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                rtl_dir = True
                flush_run()
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                rtl_dir = False
                flush_run()
                text = ""
            text = text + x
        # fmt: on
    return text, rtl_dir
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py
new file mode 100644
index 00000000..8f4d5929
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py
@@ -0,0 +1,16 @@
+"""Layout mode text extraction extension for pypdf"""
+from ._fixed_width_page import (
+ fixed_char_width,
+ fixed_width_page,
+ text_show_operations,
+ y_coordinate_groups,
+)
+from ._font import Font
+
+__all__ = [
+ "fixed_char_width",
+ "fixed_width_page",
+ "text_show_operations",
+ "y_coordinate_groups",
+ "Font",
+]
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
new file mode 100644
index 00000000..1be50095
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -0,0 +1,381 @@
+"""Extract PDF text preserving the layout of the source PDF"""
+
+import sys
+from itertools import groupby
+from math import ceil
+from pathlib import Path
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+from ..._utils import logger_warning
+from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS
+from ._font import Font
+from ._text_state_manager import TextStateManager
+from ._text_state_params import TextStateParams
+
+if sys.version_info >= (3, 8):
+ from typing import Literal, TypedDict
+else:
+ from typing_extensions import Literal, TypedDict
+
+
class BTGroup(TypedDict):
    """
    One line of text rendered inside a BT/ET operator pair.

    Text show operations that render on the same line are merged into a
    single BTGroup.
    """

    # x coordinate of the first character in the group
    tx: float
    # y coordinate of the first character in the group
    ty: float
    # nominal font size
    font_size: float
    # effective font height
    font_height: float
    # rendered text
    text: str
    # x coordinate of the last character in the group
    displaced_tx: float
    # -1 if the page is upside down, else 1
    flip_sort: Literal[-1, 1]
+
+
def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup:
    """
    Build a BTGroup dict for one rendered line.

    Args:
        tj_op (TextStateParams): text show operation supplying position,
            font size, font height and vertical flip of the group
        rendered_text (str): text of the whole group
        dispaced_tx (float): x coordinate of last character in BTGroup

    Returns:
        BTGroup: the assembled group; ``flip_sort`` is -1 when
        ``tj_op.flip_vertical`` is truthy, else 1.
    """
    group: BTGroup = {
        "tx": tj_op.tx,
        "ty": tj_op.ty,
        "font_size": tj_op.font_size,
        "font_height": tj_op.font_height,
        "text": rendered_text,
        "displaced_tx": dispaced_tx,
        "flip_sort": -1 if tj_op.flip_vertical else 1,
    }
    return group
+
+
def _bt_groups_from_tj_ops(
    tj_ops: List[TextStateParams], strip_rotated: bool
) -> List[BTGroup]:
    """Merge the text show operations of one BT/ET pair into per-line BTGroups."""
    groups: List[BTGroup] = []
    if not tj_ops:
        return groups
    _text = ""
    bt_idx = 0  # idx of first tj in this bt group
    last_displaced_tx = tj_ops[bt_idx].displaced_tx
    last_ty = tj_ops[bt_idx].ty
    for _idx, _tj in enumerate(tj_ops):  # ... build text from new Tj operators
        if strip_rotated and _tj.rotated:
            continue
        # if the y position of the text changed by more than the font height,
        # assume the text is on a new line and start a new group
        if abs(_tj.ty - last_ty) > _tj.font_height:
            if _text.strip():
                groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
            bt_idx = _idx
            _text = ""

        # if the x position of the text is less than the last x position by
        # more than 5 spaces widths, assume the text order should be flipped
        # and start a new group
        if last_displaced_tx - _tj.tx > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS:
            if _text.strip():
                groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
            bt_idx = _idx
            last_displaced_tx = _tj.displaced_tx
            _text = ""

        # calculate excess x translation based on ending tx of previous Tj.
        # multiply by bool (_idx != bt_idx) to ensure spaces aren't double
        # applied to the first tj of a BTGroup in fixed_width_page().
        excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx)
        # space_tx could be 0 if either Tz or font_size was 0 for this _tj.
        spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0
        new_text = f'{" " * spaces}{_tj.txt}'

        last_ty = _tj.ty
        _text = f"{_text}{new_text}"
        last_displaced_tx = _tj.displaced_tx
    if _text:
        groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx))
    return groups


def recurs_to_target_op(
    ops: Iterator[Tuple[List[Any], bytes]],
    text_state_mgr: TextStateManager,
    end_target: Literal[b"Q", b"ET"],
    fonts: Dict[str, Font],
    strip_rotated: bool = True,
) -> Tuple[List[BTGroup], List[TextStateParams]]:
    """
    Recurse operators between BT/ET and/or q/Q operators managing the transform
    stack and capturing text positioning and rendering data.

    Args:
        ops: iterator of operators in content stream
        text_state_mgr: a TextStateManager instance
        end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op)
        fonts: font dictionary as returned by PageObject._layout_mode_fonts()
        strip_rotated: skip text show operations flagged as rotated when
            building BTGroups. Defaults to True.

    Returns:
        tuple: list of BTGroup dicts + list of TextStateParams dataclass instances.
        If ``ops`` is exhausted before ``end_target`` is seen, whatever was
        collected so far is returned.
    """
    # 1 entry per line of text rendered within each BT/ET operation.
    bt_groups: List[BTGroup] = []

    # 1 entry per text show operator (Tj/TJ/'/")
    tj_ops: List[TextStateParams] = []

    if end_target == b"Q":
        # add new q level. cm's added at this level will be popped at next b'Q'
        text_state_mgr.add_q()

    while True:
        try:
            operands, op = next(ops)
        except StopIteration:
            return bt_groups, tj_ops
        if op == end_target:
            if op == b"Q":
                text_state_mgr.remove_q()
            if op == b"ET":
                bt_groups.extend(_bt_groups_from_tj_ops(tj_ops, strip_rotated))
                # Reset the text matrix on EVERY ET. The original returned
                # early (skipping this reset) when the BT/ET pair contained no
                # text show operator, so Td/TD/Tm/T* positioning from an empty
                # text object carried into the next one.
                text_state_mgr.reset_tm()
            return bt_groups, tj_ops
        if op == b"q":
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"Q", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"cm":
            text_state_mgr.add_cm(*operands)
        elif op == b"BT":
            bts, tjs = recurs_to_target_op(
                ops, text_state_mgr, b"ET", fonts, strip_rotated
            )
            bt_groups.extend(bts)
            tj_ops.extend(tjs)
        elif op == b"Tj":
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b"TJ":
            _tj = text_state_mgr.text_state_params()
            for tj_op in operands[0]:
                if isinstance(tj_op, bytes):
                    _tj = text_state_mgr.text_state_params(tj_op)
                    tj_ops.append(_tj)
                else:
                    # non-bytes array entry: positioning adjustment between strings
                    text_state_mgr.add_trm(_tj.displacement_matrix(TD_offset=tj_op))
        elif op == b"'":
            text_state_mgr.reset_trm()
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[0]))
        elif op == b'"':
            text_state_mgr.reset_trm()
            text_state_mgr.set_state_param(b"Tw", operands[0])
            text_state_mgr.set_state_param(b"Tc", operands[1])
            text_state_mgr.add_tm([0, -text_state_mgr.TL])
            tj_ops.append(text_state_mgr.text_state_params(operands[2]))
        elif op in (b"Td", b"Tm", b"TD", b"T*"):
            text_state_mgr.reset_trm()
            if op == b"Tm":
                # Tm replaces the text matrix instead of adding to it
                text_state_mgr.reset_tm()
            elif op == b"TD":
                # TD also sets the leading to the negated y offset
                text_state_mgr.set_state_param(b"TL", -operands[1])
            elif op == b"T*":
                operands = [0, -text_state_mgr.TL]
            text_state_mgr.add_tm(operands)
        elif op == b"Tf":
            text_state_mgr.set_font(fonts[operands[0]], operands[1])
        else:  # handle Tc, Tw, Tz, TL, and Ts operators
            text_state_mgr.set_state_param(op, operands)
+
+
def y_coordinate_groups(
    bt_groups: List[BTGroup], debug_path: Optional[Path] = None
) -> Dict[int, List[BTGroup]]:
    """
    Group text operations by rendered y coordinate, i.e. the line number.

    Args:
        bt_groups: list of dicts as returned by text_show_operations(). Only
            consecutive entries with the same integer y coordinate are
            grouped, so the list must already be ordered by y coordinate.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator
        keyed by y coordinate. Empty if ``bt_groups`` is empty.
    """
    ty_groups = {
        ty: sorted(grp, key=lambda x: x["tx"])
        for ty, grp in groupby(
            bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"])
        )
    }
    # Nothing to merge for an empty page (the original raised StopIteration
    # from next(iter(...)) in that case).
    if ty_groups:
        # combine groups whose y coordinates differ by less than the effective
        # font height (accounts for mixed fonts and other minor oddities)
        last_ty = next(iter(ty_groups))
        last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()}
        # iterate over a snapshot of the keys: merged groups are popped below
        for ty in list(ty_groups)[1:]:
            fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty))
            txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()}
            # prevent merge if both groups are rendering in the same x position.
            no_text_overlap = not (txs & last_txs)
            offset_less_than_font_height = abs(ty - last_ty) < fsz
            if no_text_overlap and offset_less_than_font_height:
                ty_groups[last_ty] = sorted(
                    ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"]
                )
                last_txs |= txs
            else:
                last_ty = ty
                last_txs = txs
    if debug_path:  # pragma: no cover
        import json

        debug_path.joinpath("bt_groups.json").write_text(
            json.dumps(ty_groups, indent=2, default=str), "utf-8"
        )
    return ty_groups
+
+
def text_show_operations(
    ops: Iterator[Tuple[List[Any], bytes]],
    fonts: Dict[str, Font],
    strip_rotated: bool = True,
    debug_path: Optional[Path] = None,
) -> List[BTGroup]:
    """
    Collect the BTGroups rendered by all BT/ET operator pairs of a stream.

    Args:
        ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream
        fonts (Dict[str, Font]): font dictionary
        strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True.
        debug_path (Path, optional): Path to a directory for saving debug output.

    Returns:
        List[BTGroup]: list of dicts of text rendered by each BT operator,
        left aligned (the smallest tx becomes 0) and sorted by descending
        ``ty * flip_sort``, then ascending ``tx``.
    """
    state_mgr = TextStateManager()  # transformation stack manager
    collect_debug = bool(debug_path)
    bt_groups: List[BTGroup] = []  # BT operator dict
    tj_debug: List[TextStateParams] = []  # Tj/TJ operator data (debug only)
    try:
        rotation_reported = False
        while True:
            operands, op = next(ops)
            if op not in (b"BT", b"q"):
                # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops
                state_mgr.set_state_param(op, operands)
                continue
            closing_op = b"ET" if op == b"BT" else b"Q"
            bts, tjs = recurs_to_target_op(
                ops, state_mgr, closing_op, fonts, strip_rotated
            )
            if not rotation_reported and any(tj.rotated for tj in tjs):
                # warn only once per call
                rotation_reported = True
                if strip_rotated:
                    message = "Rotated text discovered. Output will be incomplete."
                else:
                    message = "Rotated text discovered. Layout will be degraded."
                logger_warning(message, __name__)
            bt_groups.extend(bts)
            if collect_debug:  # pragma: no cover
                tj_debug.extend(tjs)
    except StopIteration:
        # content stream exhausted
        pass

    # left align the data, i.e. decrement all tx values by min(tx)
    min_x = min((x["tx"] for x in bt_groups), default=0.0)
    ordered = sorted(
        bt_groups, key=lambda x: (x["ty"] * x["flip_sort"], -x["tx"]), reverse=True
    )
    bt_groups = [
        dict(ogrp, tx=ogrp["tx"] - min_x, displaced_tx=ogrp["displaced_tx"] - min_x)  # type: ignore[misc]
        for ogrp in ordered
    ]

    if debug_path:  # pragma: no cover
        import json

        bts_dump = json.dumps(bt_groups, indent=2, default=str)
        debug_path.joinpath("bts.json").write_text(bts_dump, "utf-8")
        tjs_dump = json.dumps(
            tj_debug, indent=2, default=lambda x: getattr(x, "to_dict", str)(x)
        )
        debug_path.joinpath("tjs.json").write_text(tjs_dump, "utf-8")
    return bt_groups
+
+
def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> float:
    """
    Calculate average character width weighted by the length of the rendered
    text in each sample for conversion to fixed-width layout.

    Args:
        bt_groups (List[BTGroup]): List of dicts of text rendered by each
            BT operator
        scale_weight (float): factor applied to the text length of every
            group; a larger value yields a smaller character width.

    Returns:
        float: fixed character width, or 0.0 when there is no text to
        measure (empty list, only empty texts, or ``scale_weight`` of 0).
    """
    char_widths = []
    for _bt in bt_groups:
        _len = len(_bt["text"]) * scale_weight
        if not _len:
            # a group without characters carries no width information and
            # would otherwise cause a division by zero
            continue
        char_widths.append(((_bt["displaced_tx"] - _bt["tx"]) / _len, _len))
    total_len = sum(_l for _, _l in char_widths)
    if not total_len:
        return 0.0
    return sum(_w * _l for _w, _l in char_widths) / total_len
+
+
def fixed_width_page(
    ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool
) -> str:
    """
    Generate page text from text operations grouped by rendered y coordinate.

    Args:
        ty_groups: dict of text show ops as returned by y_coordinate_groups()
        char_width: fixed character width. If 0, no horizontal padding is
            derived from the x coordinates.
        space_vertically: include blank lines inferred from y distance + font height.

    Returns:
        str: page text in a fixed width format that closely adheres to the rendered
        layout in the source pdf.
    """
    lines: List[str] = []
    last_y_coord = 0
    for y_coord, line_data in ty_groups.items():
        if space_vertically and lines:
            font_height = line_data[0]["font_height"]
            # A font height of 0 gives no usable line spacing (and would be a
            # division by zero); insert no blank lines in that case.
            if font_height:
                blank_lines = int(abs(y_coord - last_y_coord) / font_height) - 1
                lines.extend([""] * blank_lines)
        line = ""
        last_disp = 0.0
        for bt_op in line_data:
            # column of this group on the fixed width grid
            offset = int(bt_op["tx"] // char_width) if char_width else 0
            # pad only if this group starts right of where the previous ended
            spaces = (offset - len(line)) * (ceil(last_disp) < int(bt_op["tx"]))
            line = f"{line}{' ' * spaces}{bt_op['text']}"
            last_disp = bt_op["displaced_tx"]
        if line.strip() or lines:  # drop blank lines at the top of the page
            lines.append(
                # characters with code points 14..31 are replaced by a space
                "".join(c if ord(c) < 14 or ord(c) > 31 else " " for c in line)
            )
            last_y_coord = y_coord
    return "\n".join(ln.rstrip() for ln in lines if space_vertically or ln.strip())
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py
new file mode 100644
index 00000000..a912fddb
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py
@@ -0,0 +1,112 @@
+"""Font constants and classes for "layout" mode text operations"""
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, Sequence, Union
+
+from ...generic import IndirectObject
+from ._font_widths import STANDARD_WIDTHS
+
+
@dataclass
class Font:
    """
    A font object formatted for use during "layout" mode text extraction

    Attributes:
        subtype (str): font subtype
        space_width (int | float): width of a space character
        encoding (str | Dict[int, str]): font encoding
        char_map (dict): character map
        font_dictionary (dict): font dictionary
        width_map (Dict[str, int]): character to width mapping, filled in
            ``__post_init__`` from ``font_dictionary`` (not an init argument)
    """

    subtype: str
    space_width: Union[int, float]
    encoding: Union[str, Dict[int, str]]
    char_map: Dict[Any, Any]
    font_dictionary: Dict[Any, Any]
    width_map: Dict[str, int] = field(default_factory=dict, init=False)

    def __post_init__(self) -> None:
        """Populate ``width_map`` from the font dictionary."""
        self._add_simple_font_widths()

        # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts
        if "/DescendantFonts" in self.font_dictionary:
            d_font: Dict[Any, Any]
            for d_font_idx, d_font in enumerate(
                self.font_dictionary["/DescendantFonts"]
            ):
                while isinstance(d_font, IndirectObject):
                    d_font = d_font.get_object()  # type: ignore[assignment]
                # keep the resolved object so it is not dereferenced again
                self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font
                self._add_cid_font_widths(d_font)

        # fall back to the built-in tables when the PDF supplied no widths
        if not self.width_map and "/BaseFont" in self.font_dictionary:
            for key in STANDARD_WIDTHS:
                if self.font_dictionary["/BaseFont"].startswith(f"/{key}"):
                    self.width_map = STANDARD_WIDTHS[key]
                    break

    def _add_simple_font_widths(self) -> None:
        """Read the /Widths array (one width per code starting at /FirstChar)."""
        # TrueType fonts have a /Widths array mapping character codes to widths
        if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
            first_char = self.font_dictionary.get("/FirstChar", 0)
            self.width_map = {
                self.encoding.get(idx + first_char, chr(idx + first_char)): width
                for idx, width in enumerate(self.font_dictionary["/Widths"])
            }

    def _add_cid_font_widths(self, d_font: Dict[Any, Any]) -> None:
        """Merge the widths of one descendant font's /W array into ``width_map``."""
        # character code -> mapped string; keys that are not a single
        # character cannot be passed to ord() and are skipped
        ord_map = {
            ord(_target): _surrogate
            for _target, _surrogate in self.char_map.items()
            if isinstance(_target, str) and len(_target) == 1
        }
        # /W width definitions have two valid formats which can be mixed and matched:
        #   (1) A character start index followed by a list of widths, e.g.
        #       `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47.
        #   (2) A character start index, a character stop index, and a width, e.g.
        #       `45 65 500` applies width 500 to characters 45-65.
        _w = d_font.get("/W", [])
        n_entries = len(_w)
        idx = 0
        while idx < n_entries:
            if not isinstance(_w[idx], (int, float)):
                # not a start index; skip the stray entry
                idx += 1
                continue
            if idx + 1 < n_entries and isinstance(_w[idx + 1], Sequence):
                # format (1): `int [int int int int ...]`
                start_idx, width_list = _w[idx], _w[idx + 1]
                self.width_map.update(
                    {
                        ord_map[_cidx]: _width
                        for _cidx, _width in enumerate(width_list, start_idx)
                        if _cidx in ord_map
                    }
                )
                idx += 2
            elif idx + 2 < n_entries and not isinstance(_w[idx + 2], Sequence):
                # format (2): `int int int`
                start_idx, stop_idx, const_width = _w[idx : idx + 3]
                self.width_map.update(
                    {
                        ord_map[_cidx]: const_width
                        for _cidx in range(start_idx, stop_idx + 1)
                        if _cidx in ord_map
                    }
                )
                idx += 3
            else:
                # Either `int int [..]` (the second int starts a format (1)
                # entry) or a truncated array: move on one entry. The bounds
                # checks above replace the IndexError the original raised on
                # a truncated /W array.
                idx += 1

    def word_width(self, word: str) -> float:
        """
        Sum of character widths specified in PDF font for the supplied word.

        Characters without a known width count as two space widths.
        """
        unknown_width = self.space_width * 2
        return sum((self.width_map.get(char, unknown_width) for char in word), 0.0)

    @staticmethod
    def to_dict(font_instance: "Font") -> Dict[str, Any]:
        """Dataclass to dict for json.dumps serialization."""
        return {
            k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__
        }
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py
new file mode 100644
index 00000000..39092bcd
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py
@@ -0,0 +1,208 @@
# Widths for the standard 14 fonts as described on page 416 of the PDF 1.7 standard
#
# Each table maps the printable ASCII characters " " (0x20) .. "~" (0x7E) to a
# glyph width in 1/1000 text space units. The per-family width tuples below
# list the widths in code order, 16 codes per row.
_PRINTABLE_ASCII = [chr(code) for code in range(0x20, 0x7F)]

# fmt: off
# Helvetica widths. The previous table was garbled from "Q" onward (e.g.
# "a": 278, "x": 222, "|": 833) and used bold values for "?" and "@"; these
# are the regular Helvetica metrics, with "'" and "`" measured as
# quotesingle / grave.
_HELVETICA_WIDTHS = (
    278, 278, 355, 556, 556, 889, 667, 191, 333, 333, 389, 584, 278, 333, 278, 278,  # 0x20 " " .. "/"
    556, 556, 556, 556, 556, 556, 556, 556, 556, 556, 278, 278, 584, 584, 584, 556,  # 0x30 "0" .. "?"
    1015, 667, 667, 722, 722, 667, 611, 778, 722, 278, 500, 667, 556, 833, 722, 778,  # 0x40 "@" .. "O"
    667, 778, 722, 667, 611, 722, 667, 944, 667, 667, 611, 278, 278, 278, 469, 556,  # 0x50 "P" .. "_"
    333, 556, 556, 500, 556, 556, 278, 556, 556, 222, 222, 500, 222, 833, 556, 556,  # 0x60 "`" .. "o"
    556, 556, 333, 500, 278, 556, 500, 722, 500, 500, 500, 334, 260, 334, 584,  # 0x70 "p" .. "~"
)
# Times widths, values unchanged from the previous per-character table.
# NOTE(review): a few entries (e.g. "/", "m", "v", "{", "~") look different
# from the Times-Roman metrics; left as found - confirm before changing.
_TIMES_WIDTHS = (
    250, 333, 408, 500, 500, 833, 778, 180, 333, 333, 500, 564, 250, 333, 250, 564,  # 0x20 " " .. "/"
    500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 278, 278, 564, 564, 564, 444,  # 0x30 "0" .. "?"
    921, 722, 667, 667, 722, 611, 556, 722, 722, 333, 389, 722, 611, 889, 722, 722,  # 0x40 "@" .. "O"
    556, 722, 667, 556, 611, 722, 722, 944, 722, 722, 611, 333, 278, 333, 469, 500,  # 0x50 "P" .. "_"
    333, 444, 500, 444, 500, 444, 333, 500, 500, 278, 278, 500, 278, 722, 500, 500,  # 0x60 "`" .. "o"
    500, 500, 333, 389, 278, 500, 444, 722, 500, 444, 389, 348, 220, 348, 469,  # 0x70 "p" .. "~"
)
# fmt: on

STANDARD_WIDTHS = {
    # 4 fonts each, includes bold, oblique and boldoblique variants
    "Helvetica": dict(zip(_PRINTABLE_ASCII, _HELVETICA_WIDTHS)),
    "Times": dict(zip(_PRINTABLE_ASCII, _TIMES_WIDTHS)),
}
STANDARD_WIDTHS[
    "Courier"
] = {  # 4 fonts, includes bold, oblique and boldoblique variants
    c: 600 for c in STANDARD_WIDTHS["Times"]  # fixed width
}
STANDARD_WIDTHS["ZapfDingbats"] = {c: 1000 for c in STANDARD_WIDTHS["Times"]}  # 1 font
STANDARD_WIDTHS["Symbol"] = {c: 500 for c in STANDARD_WIDTHS["Times"]}  # 1 font
# add aliases per table H.3 on page 1110 of the PDF 1.7 standard
STANDARD_WIDTHS["CourierNew"] = STANDARD_WIDTHS["Courier"]
STANDARD_WIDTHS["Arial"] = STANDARD_WIDTHS["Helvetica"]
STANDARD_WIDTHS["TimesNewRoman"] = STANDARD_WIDTHS["Times"]
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py
new file mode 100644
index 00000000..3c5d4736
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py
@@ -0,0 +1,213 @@
+"""manage the PDF transform stack during "layout" mode text extraction"""
+
+from collections import ChainMap, Counter
+from typing import Any, Dict, List, MutableMapping, Union
+from typing import ChainMap as ChainMapType
+from typing import Counter as CounterType
+
+from ...errors import PdfReadError
+from .. import mult
+from ._font import Font
+from ._text_state_params import TextStateParams
+
# A transform mapping is keyed by the matrix indices 0-5 (int) plus the
# "is_text" / "is_render" flags (str); values are the matrix coefficients
# (float) or the flag states (bool).
_TransformKeyType = Union[int, str]
_TransformValueType = Union[float, bool]
TextStateManagerChainMapType = ChainMapType[_TransformKeyType, _TransformValueType]
TextStateManagerDictType = MutableMapping[_TransformKeyType, _TransformValueType]
+
+
class TextStateManager:
    """
    Tracks the current text state including cm/tm/trm transformation matrices.

    Attributes:
        transform_stack (ChainMap): ChainMap of cm/tm transformation matrices
        q_queue (Counter[int]): number of cm transforms concatenated at each
            q nesting level
        q_depth (List[int]): list of q operator nesting levels
        Tc (float): character spacing
        Tw (float): word spacing
        Tz (float): horizontal scaling
        TL (float): leading
        Ts (float): text rise
        font (Font | None): font object; None until set_font() is called
        font_size (int | float): font size
    """

    def __init__(self) -> None:
        self.transform_stack: TextStateManagerChainMapType = ChainMap(
            self.new_transform()
        )
        self.q_queue: CounterType[int] = Counter()
        self.q_depth: List[int] = [0]
        self.Tc: float = 0.0
        self.Tw: float = 0.0
        self.Tz: float = 100.0
        self.TL: float = 0.0
        self.Ts: float = 0.0
        self.font: Union[Font, None] = None
        self.font_size: Union[int, float] = 0

    def set_state_param(self, op: bytes, value: Union[float, List[Any]]) -> None:
        """
        Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators.

        Args:
            op: operator read from PDF stream as bytes. No action is taken
                for unsupported operators (see supported operators above).
            value (float | List[Any]): new parameter value. If a list,
                value[0] is used.
        """
        if op not in (b"Tc", b"Tz", b"Tw", b"TL", b"Ts"):
            return
        # the operator name doubles as the attribute name (e.g. b"Tc" -> self.Tc)
        setattr(self, op.decode(), value[0] if isinstance(value, list) else value)

    def set_font(self, font: Font, size: float) -> None:
        """
        Set the current font and font_size.

        Args:
            font (Font): a layout mode Font
            size (float): font size
        """
        self.font = font
        self.font_size = size

    def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams:
        """
        Create a TextStateParams instance to display a text string. Type[bytes] values
        will be decoded implicitly.

        Args:
            value (str | bytes): text to associate with the captured state.

        Raises:
            PdfReadError: if font not set (no Tf operator in incoming pdf content stream)

        Returns:
            TextStateParams: current text state parameters
        """
        if not isinstance(self.font, Font):
            raise PdfReadError(
                "font not set: is PDF missing a Tf operator?"
            )  # pragma: no cover
        if isinstance(value, bytes):
            try:
                if isinstance(self.font.encoding, str):
                    # encoding is a codec name
                    txt = value.decode(self.font.encoding, "surrogatepass")
                else:
                    # encoding is a byte -> str lookup table; bytes missing
                    # from the table are decoded individually
                    txt = "".join(
                        self.font.encoding[x]
                        if x in self.font.encoding
                        else bytes((x,)).decode()
                        for x in value
                    )
            except (UnicodeEncodeError, UnicodeDecodeError):
                # best effort fallback when the font encoding cannot decode value
                txt = value.decode("utf-8", "replace")
            # apply the font's character map; unmapped characters pass through
            txt = "".join(self.font.char_map.get(x, x) for x in txt)
        else:
            txt = value
        return TextStateParams(
            txt,
            self.font,
            self.font_size,
            self.Tc,
            self.Tw,
            self.Tz,
            self.TL,
            self.Ts,
            self.effective_transform,
        )

    @staticmethod
    def raw_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
    ) -> Dict[int, float]:
        """Only a/b/c/d/e/f matrix params, keyed by index 0-5 and coerced to float"""
        return {idx: float(val) for idx, val in enumerate((_a, _b, _c, _d, _e, _f))}

    @staticmethod
    def new_transform(
        _a: float = 1.0,
        _b: float = 0.0,
        _c: float = 0.0,
        _d: float = 1.0,
        _e: float = 0.0,
        _f: float = 0.0,
        is_text: bool = False,
        is_render: bool = False,
    ) -> TextStateManagerDictType:
        """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys"""
        result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f)
        result.update({"is_text": is_text, "is_render": is_render})
        return result

    def reset_tm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_text==True or is_render==True"""
        while (
            self.transform_stack.maps[0]["is_text"]
            or self.transform_stack.maps[0]["is_render"]
        ):
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def reset_trm(self) -> TextStateManagerChainMapType:
        """Clear all transforms from chainmap having is_render==True"""
        while self.transform_stack.maps[0]["is_render"]:
            self.transform_stack = self.transform_stack.parents
        return self.transform_stack

    def remove_q(self) -> TextStateManagerChainMapType:
        """
        Rewind to stack prior state after closing a 'q' with internal 'cm' ops.

        Text transforms are always cleared. If no nesting level is left to
        close (more 'Q' than 'q' operators in the content stream), the cm
        transforms are left untouched instead of raising IndexError.
        """
        self.transform_stack = self.reset_tm()
        if not self.q_depth:
            # unbalanced 'Q': nothing to rewind
            return self.transform_stack
        closed_level = self.q_depth.pop()
        # drop every cm transform that was concatenated at the closed level
        cm_count = self.q_queue.pop(closed_level, 0)
        self.transform_stack.maps = self.transform_stack.maps[cm_count:]
        return self.transform_stack

    def add_q(self) -> None:
        """Add another level to q_queue"""
        self.q_depth.append(len(self.q_depth))

    def add_cm(self, *args: Any) -> TextStateManagerChainMapType:
        """Concatenate an additional transform matrix"""
        self.transform_stack = self.reset_tm()
        # count this cm against the current q level; the slice keeps this a
        # no-op when q_depth is empty
        self.q_queue.update(self.q_depth[-1:])
        self.transform_stack = self.transform_stack.new_child(self.new_transform(*args))
        return self.transform_stack

    def _complete_matrix(self, operands: List[float]) -> List[float]:
        """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)"""
        if len(operands) == 2:  # this is a Td operator or equivalent
            operands = [1.0, 0.0, 0.0, 1.0, *operands]
        return operands

    def add_tm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    def add_trm(self, operands: List[float]) -> TextStateManagerChainMapType:
        """Append a text rendering transform matrix"""
        self.transform_stack = self.transform_stack.new_child(
            self.new_transform(  # type: ignore[misc]
                *self._complete_matrix(operands), is_text=True, is_render=True  # type: ignore[arg-type]
            )
        )
        return self.transform_stack

    @property
    def effective_transform(self) -> List[float]:
        """
        Current effective transform accounting for cm, tm, and trm transforms.

        Returns:
            List[float]: the six a/b/c/d/e/f matrix values. The 'is_text' and
            'is_render' flags stored with each transform are never included.
        """
        newest, *older = self.transform_stack.maps
        # read keys 0-5 explicitly so the boolean flag entries are left out
        eff_transform = [float(newest[idx]) for idx in range(6)]
        for transform in older:
            eff_transform = mult(eff_transform, transform)  # type: ignore[arg-type]  # dict has int keys 0-5
        return eff_transform
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py
new file mode 100644
index 00000000..b6e6930c
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py
@@ -0,0 +1,127 @@
+"""A dataclass that captures the CTM and Text State for a tj operation"""
+
+import math
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Union
+
+from .. import mult, orient
+from ._font import Font
+
+
@dataclass
class TextStateParams:
    """
    Snapshot of the text state and effective transform that apply to one text
    value of a TJ or Tj PDF operation.

    Attributes:
        txt (str): the text to be rendered.
        font (Font): font object
        font_size (int | float): font size
        Tc (float): character spacing. Defaults to 0.0.
        Tw (float): word spacing. Defaults to 0.0.
        Tz (float): horizontal scaling. Defaults to 100.0.
        TL (float): leading, vertical displacement between text lines. Defaults to 0.0.
        Ts (float): text rise. Used for super/subscripts. Defaults to 0.0.
        transform (List[float]): effective transformation matrix.
        tx (float): x coord of rendered text, i.e. self.transform[4]
        ty (float): y coord of rendered text. May differ from self.transform[5] per self.Ts.
        displaced_tx (float): x coord immediately following rendered text
        space_tx (float): tx for a space character
        font_height (float): effective font height accounting for CTM
        flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.)
        rotated (bool): True if the text orientation is rotated with respect to the page.
    """

    txt: str
    font: Font
    font_size: Union[int, float]
    Tc: float = 0.0
    Tw: float = 0.0
    Tz: float = 100.0
    TL: float = 0.0
    Ts: float = 0.0
    transform: List[float] = field(
        default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    )
    tx: float = field(default=0.0, init=False)
    ty: float = field(default=0.0, init=False)
    displaced_tx: float = field(default=0.0, init=False)
    space_tx: float = field(default=0.0, init=False)
    font_height: float = field(default=0.0, init=False)
    flip_vertical: bool = field(default=False, init=False)
    rotated: bool = field(default=False, init=False)

    def __post_init__(self) -> None:
        """Normalize rotated transforms, then compute the derived fields."""
        if orient(self.transform) in (90, 270):
            counter_matrix = [
                1.0,
                -self.transform[1],
                -self.transform[2],
                1.0,
                0.0,
                0.0,
            ]
            self.transform = mult(counter_matrix, self.transform)
            self.rotated = True
        # self.transform[0] AND self.transform[3] < 0 indicates true rotation.
        # If only self.transform[3] < 0, the y coords are simply inverted.
        if orient(self.transform) == 180 and self.transform[0] < -1e-6:
            half_turn = [-1.0, 0.0, 0.0, -1.0, 0.0, 0.0]
            self.transform = mult(half_turn, self.transform)
            self.rotated = True

        self.displaced_tx = self.displaced_transform()[4]
        self.tx = self.transform[4]
        self.ty = self.render_transform()[5]

        space_advance = round(self.word_tx(" "), 3)
        if space_advance < 1e-6:
            # if the " " char is assigned 0 width (e.g. for fine tuned spacing
            # with TJ int operators a la crazyones.pdf), calculate space_tx as
            # a TD_offset of -2 * font.space_width where font.space_width is
            # the space_width calculated in _cmap.py.
            space_advance = round(self.word_tx("", self.font.space_width * -2), 3)
        self.space_tx = space_advance

        coef_b, coef_d = self.transform[1], self.transform[3]
        self.font_height = self.font_size * math.sqrt(coef_b ** 2 + coef_d ** 2)
        # flip_vertical handles PDFs generated by Microsoft Word's "publish" command.
        self.flip_vertical = self.transform[3] < -1e-6  # inverts y axis

    def font_size_matrix(self) -> List[float]:
        """Matrix scaling by font size (x also by Tz percent), with Ts as y offset"""
        horizontal_scale = self.font_size * (self.Tz / 100.0)
        return [horizontal_scale, 0.0, 0.0, self.font_size, 0.0, self.Ts]

    def displaced_transform(self) -> List[float]:
        """Effective transform matrix after text has been rendered."""
        shift = self.displacement_matrix()
        return mult(shift, self.transform)

    def render_transform(self) -> List[float]:
        """Effective transform matrix accounting for font size, Tz, and Ts."""
        sizing = self.font_size_matrix()
        return mult(sizing, self.transform)

    def displacement_matrix(
        self, word: Union[str, None] = None, TD_offset: float = 0.0
    ) -> List[float]:
        """
        Text displacement matrix

        Args:
            word (str, optional): text to measure. Defaults to None in which case
                the displacement of self.txt is returned.
            TD_offset (float, optional): translation applied by TD operator. Defaults to 0.0.

        Returns:
            List[float]: pure translation matrix whose x component is the
            horizontal displacement of the word.
        """
        if word is None:
            word = self.txt
        horizontal_shift = self.word_tx(word, TD_offset)
        return [1.0, 0.0, 0.0, 1.0, horizontal_shift, 0.0]

    def word_tx(self, word: str, TD_offset: float = 0.0) -> float:
        """Horizontal text displacement for any word according this text state"""
        # font widths and TD_offset are divided by 1000 before scaling by font size
        glyph_advance = self.font_size * (
            (self.font.word_width(word) - TD_offset) / 1000.0
        )
        unscaled = glyph_advance + self.Tc + word.count(" ") * self.Tw
        return unscaled * (self.Tz / 100.0)

    @staticmethod
    def to_dict(inst: "TextStateParams") -> Dict[str, Any]:
        """Dataclass to dict for json.dumps serialization (the font field is omitted)"""
        serializable: Dict[str, Any] = {}
        for field_name in inst.__dataclass_fields__:
            if field_name == "font":
                continue
            serializable[field_name] = getattr(inst, field_name)
        return serializable