diff options
author | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
---|---|---|
committer | S. Solomon Darnell | 2025-03-28 21:52:21 -0500 |
commit | 4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch) | |
tree | ee3dc5af3b6313e921cd920906356f5d4febc4ed /.venv/lib/python3.12/site-packages/pypdf/_text_extraction | |
parent | cc961e04ba734dd72309fb548a2f97d67d578813 (diff) | |
download | gn-ai-master.tar.gz |
Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_text_extraction')
7 files changed, 1342 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py new file mode 100644 index 00000000..3b1d687e --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/__init__.py @@ -0,0 +1,285 @@ +""" +Code related to text extraction. + +Some parts are still in _page.py. In doubt, they will stay there. +""" + +import math +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +from ..generic import DictionaryObject, TextStringObject, encode_pdfdocencoding + +CUSTOM_RTL_MIN: int = -1 +CUSTOM_RTL_MAX: int = -1 +CUSTOM_RTL_SPECIAL_CHARS: List[int] = [] +LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5 + + +class OrientationNotFoundError(Exception): + pass + + +def set_custom_rtl( + _min: Union[str, int, None] = None, + _max: Union[str, int, None] = None, + specials: Union[str, List[int], None] = None, +) -> Tuple[int, int, List[int]]: + """ + Change the Right-To-Left and special characters custom parameters. + + Args: + _min: The new minimum value for the range of custom characters that + will be written right to left. + If set to ``None``, the value will not be changed. + If set to an integer or string, it will be converted to its ASCII code. + The default value is -1, which sets no additional range to be converted. + _max: The new maximum value for the range of custom characters that will + be written right to left. + If set to ``None``, the value will not be changed. + If set to an integer or string, it will be converted to its ASCII code. + The default value is -1, which sets no additional range to be converted. + specials: The new list of special characters to be inserted in the + current insertion order. + If set to ``None``, the current value will not be changed. + If set to a string, it will be converted to a list of ASCII codes. + The default value is an empty list. + + Returns: + A tuple containing the new values for ``CUSTOM_RTL_MIN``, + ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``. + """ + global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS + if isinstance(_min, int): + CUSTOM_RTL_MIN = _min + elif isinstance(_min, str): + CUSTOM_RTL_MIN = ord(_min) + if isinstance(_max, int): + CUSTOM_RTL_MAX = _max + elif isinstance(_max, str): + CUSTOM_RTL_MAX = ord(_max) + if isinstance(specials, str): + CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials] + elif isinstance(specials, list): + CUSTOM_RTL_SPECIAL_CHARS = specials + return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS + + +def mult(m: List[float], n: List[float]) -> List[float]: + return [ + m[0] * n[0] + m[1] * n[2], + m[0] * n[1] + m[1] * n[3], + m[2] * n[0] + m[3] * n[2], + m[2] * n[1] + m[3] * n[3], + m[4] * n[0] + m[5] * n[2] + n[4], + m[4] * n[1] + m[5] * n[3] + n[5], + ] + + +def orient(m: List[float]) -> int: + if m[3] > 1e-6: + return 0 + elif m[3] < -1e-6: + return 180 + elif m[1] > 0: + return 90 + else: + return 270 + + +def crlf_space_check( + text: str, + cmtm_prev: Tuple[List[float], List[float]], + cmtm_matrix: Tuple[List[float], List[float]], + memo_cmtm: Tuple[List[float], List[float]], + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], + orientations: Tuple[int, ...], + output: str, + font_size: float, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], + spacewidth: float, +) -> Tuple[str, str, List[float], List[float]]: + cm_prev = cmtm_prev[0] + tm_prev = cmtm_prev[1] + cm_matrix = cmtm_matrix[0] + tm_matrix = cmtm_matrix[1] + memo_cm = memo_cmtm[0] + memo_tm = memo_cmtm[1] + + m_prev = mult(tm_prev, cm_prev) + m = mult(tm_matrix, cm_matrix) + orientation = orient(m) + delta_x = m[4] - m_prev[4] + delta_y = m[5] - m_prev[5] + k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) + f = font_size * k + cm_prev = m + if orientation not in orientations: + raise OrientationNotFoundError + try: + if orientation == 0: + if delta_y < -0.8 * f: + if (output + text)[-1] != "\n": + output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + memo_cm, + memo_tm, + cmap[3], + font_size, + ) + text = "" + elif ( + abs(delta_y) < f * 0.3 + and abs(delta_x) > spacewidth * f * 15 + and (output + text)[-1] != " " + ): + text += " " + elif orientation == 180: + if delta_y > 0.8 * f: + if (output + text)[-1] != "\n": + output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + memo_cm, + memo_tm, + cmap[3], + font_size, + ) + text = "" + elif ( + abs(delta_y) < f * 0.3 + and abs(delta_x) > spacewidth * f * 15 + and (output + text)[-1] != " " + ): + text += " " + elif orientation == 90: + if delta_x > 0.8 * f: + if (output + text)[-1] != "\n": + output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + memo_cm, + memo_tm, + cmap[3], + font_size, + ) + text = "" + elif ( + abs(delta_x) < f * 0.3 + and abs(delta_y) > spacewidth * f * 15 + and (output + text)[-1] != " " + ): + text += " " + elif orientation == 270: + if delta_x < -0.8 * f: + if (output + text)[-1] != "\n": + output += text + "\n" + if visitor_text is not None: + visitor_text( + text + "\n", + memo_cm, + memo_tm, + cmap[3], + font_size, + ) + text = "" + elif ( + abs(delta_x) < f * 0.3 + and abs(delta_y) > spacewidth * f * 15 + and (output + text)[-1] != " " + ): + text += " " + except Exception: + pass + tm_prev = tm_matrix.copy() + cm_prev = cm_matrix.copy() + return text, output, cm_prev, tm_prev + + +def handle_tj( + text: str, + operands: List[Union[str, TextStringObject]], + cm_matrix: List[float], + tm_matrix: List[float], + cmap: Tuple[ + Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] + ], + orientations: Tuple[int, ...], + output: str, + font_size: float, + rtl_dir: bool, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], +) -> Tuple[str, bool]: + m = mult(tm_matrix, cm_matrix) + orientation = orient(m) + if orientation in orientations and len(operands) > 0: + if isinstance(operands[0], str): + text += operands[0] + else: + t: str = "" + tt: bytes = ( + encode_pdfdocencoding(operands[0]) + if isinstance(operands[0], str) + else operands[0] + ) + if isinstance(cmap[0], str): + try: + t = tt.decode(cmap[0], "surrogatepass") # apply str encoding + except Exception: + # the data does not match the expectation, + # we use the alternative ; + # text extraction may not be good + t = tt.decode( + "utf-16-be" if cmap[0] == "charmap" else "charmap", + "surrogatepass", + ) # apply str encoding + else: # apply dict encoding + t = "".join( + [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt] + ) + # "\u0590 - \u08FF \uFB50 - \uFDFF" + for x in [cmap[1][x] if x in cmap[1] else x for x in t]: + # x can be a sequence of bytes ; ex: habibi.pdf + if len(x) == 1: + xx = ord(x) + else: + xx = 1 + # fmt: off + if ( + # cases where the current inserting order is kept + (xx <= 0x2F) # punctuations but... + or 0x3A <= xx <= 0x40 # numbers (x30-39) + or 0x2000 <= xx <= 0x206F # upper punctuations.. + or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents + or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... + ): + text = x + text if rtl_dir else text + x + elif ( # right-to-left characters set + 0x0590 <= xx <= 0x08FF + or 0xFB1D <= xx <= 0xFDFF + or 0xFE70 <= xx <= 0xFEFF + or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX + ): + if not rtl_dir: + rtl_dir = True + output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + text = "" + text = x + text + else: # left-to-right + # print(">",xx,x,end="") + if rtl_dir: + rtl_dir = False + output += text + if visitor_text is not None: + visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + text = "" + text = text + x + # fmt: on + return text, rtl_dir diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py new file mode 100644 index 00000000..8f4d5929 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/__init__.py @@ -0,0 +1,16 @@ +"""Layout mode text extraction extension for pypdf""" +from ._fixed_width_page import ( + fixed_char_width, + fixed_width_page, + text_show_operations, + y_coordinate_groups, +) +from ._font import Font + +__all__ = [ + "fixed_char_width", + "fixed_width_page", + "text_show_operations", + "y_coordinate_groups", + "Font", +] diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py new file mode 100644 index 00000000..1be50095 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py @@ -0,0 +1,381 @@ +"""Extract PDF text preserving the layout of the source PDF""" + +import sys +from itertools import groupby +from math import ceil +from pathlib import Path +from typing import Any, Dict, Iterator, List, Optional, Tuple + +from ..._utils import logger_warning +from .. import LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS +from ._font import Font +from ._text_state_manager import TextStateManager +from ._text_state_params import TextStateParams + +if sys.version_info >= (3, 8): + from typing import Literal, TypedDict +else: + from typing_extensions import Literal, TypedDict + + +class BTGroup(TypedDict): + """ + Dict describing a line of text rendered within a BT/ET operator pair. + If multiple text show operations render text on the same line, the text + will be combined into a single BTGroup dict. + + Keys: + tx: x coordinate of first character in BTGroup + ty: y coordinate of first character in BTGroup + font_size: nominal font size + font_height: effective font height + text: rendered text + displaced_tx: x coordinate of last character in BTGroup + flip_sort: -1 if page is upside down, else 1 + """ + + tx: float + ty: float + font_size: float + font_height: float + text: str + displaced_tx: float + flip_sort: Literal[-1, 1] + + +def bt_group(tj_op: TextStateParams, rendered_text: str, dispaced_tx: float) -> BTGroup: + """ + BTGroup constructed from a TextStateParams instance, rendered text, and + displaced tx value. + + Args: + tj_op (TextStateParams): TextStateParams instance + rendered_text (str): rendered text + dispaced_tx (float): x coordinate of last character in BTGroup + """ + return BTGroup( + tx=tj_op.tx, + ty=tj_op.ty, + font_size=tj_op.font_size, + font_height=tj_op.font_height, + text=rendered_text, + displaced_tx=dispaced_tx, + flip_sort=-1 if tj_op.flip_vertical else 1, + ) + + +def recurs_to_target_op( + ops: Iterator[Tuple[List[Any], bytes]], + text_state_mgr: TextStateManager, + end_target: Literal[b"Q", b"ET"], + fonts: Dict[str, Font], + strip_rotated: bool = True, +) -> Tuple[List[BTGroup], List[TextStateParams]]: + """ + Recurse operators between BT/ET and/or q/Q operators managing the transform + stack and capturing text positioning and rendering data. + + Args: + ops: iterator of operators in content stream + text_state_mgr: a TextStateManager instance + end_target: Either b"Q" (ends b"q" op) or b"ET" (ends b"BT" op) + fonts: font dictionary as returned by PageObject._layout_mode_fonts() + + Returns: + tuple: list of BTGroup dicts + list of TextStateParams dataclass instances. + """ + # 1 entry per line of text rendered within each BT/ET operation. + bt_groups: List[BTGroup] = [] + + # 1 entry per text show operator (Tj/TJ/'/") + tj_ops: List[TextStateParams] = [] + + if end_target == b"Q": + # add new q level. cm's added at this level will be popped at next b'Q' + text_state_mgr.add_q() + + while True: + try: + operands, op = next(ops) + except StopIteration: + return bt_groups, tj_ops + if op == end_target: + if op == b"Q": + text_state_mgr.remove_q() + if op == b"ET": + if not tj_ops: + return bt_groups, tj_ops + _text = "" + bt_idx = 0 # idx of first tj in this bt group + last_displaced_tx = tj_ops[bt_idx].displaced_tx + last_ty = tj_ops[bt_idx].ty + for _idx, _tj in enumerate( + tj_ops + ): # ... build text from new Tj operators + if strip_rotated and _tj.rotated: + continue + # if the y position of the text is greater than the font height, assume + # the text is on a new line and start a new group + if abs(_tj.ty - last_ty) > _tj.font_height: + if _text.strip(): + bt_groups.append( + bt_group(tj_ops[bt_idx], _text, last_displaced_tx) + ) + bt_idx = _idx + _text = "" + + # if the x position of the text is less than the last x position by + # more than 5 spaces widths, assume the text order should be flipped + # and start a new group + if ( + last_displaced_tx - _tj.tx + > _tj.space_tx * LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS + ): + if _text.strip(): + bt_groups.append( + bt_group(tj_ops[bt_idx], _text, last_displaced_tx) + ) + bt_idx = _idx + last_displaced_tx = _tj.displaced_tx + _text = "" + + # calculate excess x translation based on ending tx of previous Tj. + # multiply by bool (_idx != bt_idx) to ensure spaces aren't double + # applied to the first tj of a BTGroup in fixed_width_page(). + excess_tx = round(_tj.tx - last_displaced_tx, 3) * (_idx != bt_idx) + # space_tx could be 0 if either Tz or font_size was 0 for this _tj. + spaces = int(excess_tx // _tj.space_tx) if _tj.space_tx else 0 + new_text = f'{" " * spaces}{_tj.txt}' + + last_ty = _tj.ty + _text = f"{_text}{new_text}" + last_displaced_tx = _tj.displaced_tx + if _text: + bt_groups.append(bt_group(tj_ops[bt_idx], _text, last_displaced_tx)) + text_state_mgr.reset_tm() + return bt_groups, tj_ops + if op == b"q": + bts, tjs = recurs_to_target_op( + ops, text_state_mgr, b"Q", fonts, strip_rotated + ) + bt_groups.extend(bts) + tj_ops.extend(tjs) + elif op == b"cm": + text_state_mgr.add_cm(*operands) + elif op == b"BT": + bts, tjs = recurs_to_target_op( + ops, text_state_mgr, b"ET", fonts, strip_rotated + ) + bt_groups.extend(bts) + tj_ops.extend(tjs) + elif op == b"Tj": + tj_ops.append(text_state_mgr.text_state_params(operands[0])) + elif op == b"TJ": + _tj = text_state_mgr.text_state_params() + for tj_op in operands[0]: + if isinstance(tj_op, bytes): + _tj = text_state_mgr.text_state_params(tj_op) + tj_ops.append(_tj) + else: + text_state_mgr.add_trm(_tj.displacement_matrix(TD_offset=tj_op)) + elif op == b"'": + text_state_mgr.reset_trm() + text_state_mgr.add_tm([0, -text_state_mgr.TL]) + tj_ops.append(text_state_mgr.text_state_params(operands[0])) + elif op == b'"': + text_state_mgr.reset_trm() + text_state_mgr.set_state_param(b"Tw", operands[0]) + text_state_mgr.set_state_param(b"Tc", operands[1]) + text_state_mgr.add_tm([0, -text_state_mgr.TL]) + tj_ops.append(text_state_mgr.text_state_params(operands[2])) + elif op in (b"Td", b"Tm", b"TD", b"T*"): + text_state_mgr.reset_trm() + if op == b"Tm": + text_state_mgr.reset_tm() + elif op == b"TD": + text_state_mgr.set_state_param(b"TL", -operands[1]) + elif op == b"T*": + operands = [0, -text_state_mgr.TL] + text_state_mgr.add_tm(operands) + elif op == b"Tf": + text_state_mgr.set_font(fonts[operands[0]], operands[1]) + else: # handle Tc, Tw, Tz, TL, and Ts operators + text_state_mgr.set_state_param(op, operands) + + +def y_coordinate_groups( + bt_groups: List[BTGroup], debug_path: Optional[Path] = None +) -> Dict[int, List[BTGroup]]: + """ + Group text operations by rendered y coordinate, i.e. the line number. + + Args: + bt_groups: list of dicts as returned by text_show_operations() + debug_path (Path, optional): Path to a directory for saving debug output. + + Returns: + Dict[int, List[BTGroup]]: dict of lists of text rendered by each BT operator + keyed by y coordinate + """ + ty_groups = { + ty: sorted(grp, key=lambda x: x["tx"]) + for ty, grp in groupby( + bt_groups, key=lambda bt_grp: int(bt_grp["ty"] * bt_grp["flip_sort"]) + ) + } + # combine groups whose y coordinates differ by less than the effective font height + # (accounts for mixed fonts and other minor oddities) + last_ty = next(iter(ty_groups)) + last_txs = {int(_t["tx"]) for _t in ty_groups[last_ty] if _t["text"].strip()} + for ty in list(ty_groups)[1:]: + fsz = min(ty_groups[_y][0]["font_height"] for _y in (ty, last_ty)) + txs = {int(_t["tx"]) for _t in ty_groups[ty] if _t["text"].strip()} + # prevent merge if both groups are rendering in the same x position. + no_text_overlap = not (txs & last_txs) + offset_less_than_font_height = abs(ty - last_ty) < fsz + if no_text_overlap and offset_less_than_font_height: + ty_groups[last_ty] = sorted( + ty_groups.pop(ty) + ty_groups[last_ty], key=lambda x: x["tx"] + ) + last_txs |= txs + else: + last_ty = ty + last_txs = txs + if debug_path: # pragma: no cover + import json + + debug_path.joinpath("bt_groups.json").write_text( + json.dumps(ty_groups, indent=2, default=str), "utf-8" + ) + return ty_groups + + +def text_show_operations( + ops: Iterator[Tuple[List[Any], bytes]], + fonts: Dict[str, Font], + strip_rotated: bool = True, + debug_path: Optional[Path] = None, +) -> List[BTGroup]: + """ + Extract text from BT/ET operator pairs. + + Args: + ops (Iterator[Tuple[List, bytes]]): iterator of operators in content stream + fonts (Dict[str, Font]): font dictionary + strip_rotated: Removes text if rotated w.r.t. to the page. Defaults to True. + debug_path (Path, optional): Path to a directory for saving debug output. + + Returns: + List[BTGroup]: list of dicts of text rendered by each BT operator + """ + state_mgr = TextStateManager() # transformation stack manager + debug = bool(debug_path) + bt_groups: List[BTGroup] = [] # BT operator dict + tj_debug: List[TextStateParams] = [] # Tj/TJ operator data (debug only) + try: + warned_rotation = False + while True: + operands, op = next(ops) + if op in (b"BT", b"q"): + bts, tjs = recurs_to_target_op( + ops, state_mgr, b"ET" if op == b"BT" else b"Q", fonts, strip_rotated + ) + if not warned_rotation and any(tj.rotated for tj in tjs): + warned_rotation = True + if strip_rotated: + logger_warning( + "Rotated text discovered. Output will be incomplete.", + __name__, + ) + else: + logger_warning( + "Rotated text discovered. Layout will be degraded.", + __name__, + ) + bt_groups.extend(bts) + if debug: # pragma: no cover + tj_debug.extend(tjs) + else: # set Tc, Tw, Tz, TL, and Ts if required. ignores all other ops + state_mgr.set_state_param(op, operands) + except StopIteration: + pass + + # left align the data, i.e. decrement all tx values by min(tx) + min_x = min((x["tx"] for x in bt_groups), default=0.0) + bt_groups = [ + dict(ogrp, tx=ogrp["tx"] - min_x, displaced_tx=ogrp["displaced_tx"] - min_x) # type: ignore[misc] + for ogrp in sorted( + bt_groups, key=lambda x: (x["ty"] * x["flip_sort"], -x["tx"]), reverse=True + ) + ] + + if debug_path: # pragma: no cover + import json + + debug_path.joinpath("bts.json").write_text( + json.dumps(bt_groups, indent=2, default=str), "utf-8" + ) + debug_path.joinpath("tjs.json").write_text( + json.dumps( + tj_debug, indent=2, default=lambda x: getattr(x, "to_dict", str)(x) + ), + "utf-8", + ) + return bt_groups + + +def fixed_char_width(bt_groups: List[BTGroup], scale_weight: float = 1.25) -> float: + """ + Calculate average character width weighted by the length of the rendered + text in each sample for conversion to fixed-width layout. + + Args: + bt_groups (List[BTGroup]): List of dicts of text rendered by each + BT operator + + Returns: + float: fixed character width + """ + char_widths = [] + for _bt in bt_groups: + _len = len(_bt["text"]) * scale_weight + char_widths.append(((_bt["displaced_tx"] - _bt["tx"]) / _len, _len)) + return sum(_w * _l for _w, _l in char_widths) / sum(_l for _, _l in char_widths) + + +def fixed_width_page( + ty_groups: Dict[int, List[BTGroup]], char_width: float, space_vertically: bool +) -> str: + """ + Generate page text from text operations grouped by rendered y coordinate. + + Args: + ty_groups: dict of text show ops as returned by y_coordinate_groups() + char_width: fixed character width + space_vertically: include blank lines inferred from y distance + font height. + + Returns: + str: page text in a fixed width format that closely adheres to the rendered + layout in the source pdf. + """ + lines: List[str] = [] + last_y_coord = 0 + for y_coord, line_data in ty_groups.items(): + if space_vertically and lines: + blank_lines = ( + int(abs(y_coord - last_y_coord) / line_data[0]["font_height"]) - 1 + ) + lines.extend([""] * blank_lines) + line = "" + last_disp = 0.0 + for bt_op in line_data: + offset = int(bt_op["tx"] // char_width) + spaces = (offset - len(line)) * (ceil(last_disp) < int(bt_op["tx"])) + line = f"{line}{' ' * spaces}{bt_op['text']}" + last_disp = bt_op["displaced_tx"] + if line.strip() or lines: + lines.append( + "".join(c if ord(c) < 14 or ord(c) > 31 else " " for c in line) + ) + last_y_coord = y_coord + return "\n".join(ln.rstrip() for ln in lines if space_vertically or ln.strip()) diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py new file mode 100644 index 00000000..a912fddb --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font.py @@ -0,0 +1,112 @@ +"""Font constants and classes for "layout" mode text operations""" + +from dataclasses import dataclass, field +from typing import Any, Dict, Sequence, Union + +from ...generic import IndirectObject +from ._font_widths import STANDARD_WIDTHS + + +@dataclass +class Font: + """ + A font object formatted for use during "layout" mode text extraction + + Attributes: + subtype (str): font subtype + space_width (int | float): width of a space character + encoding (str | Dict[int, str]): font encoding + char_map (dict): character map + font_dictionary (dict): font dictionary + """ + + subtype: str + space_width: Union[int, float] + encoding: Union[str, Dict[int, str]] + char_map: Dict[Any, Any] + font_dictionary: Dict[Any, Any] + width_map: Dict[str, int] = field(default_factory=dict, init=False) + + def __post_init__(self) -> None: + # TrueType fonts have a /Widths array mapping character codes to widths + if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary: + first_char = self.font_dictionary.get("/FirstChar", 0) + self.width_map = { + self.encoding.get(idx + first_char, chr(idx + first_char)): width + for idx, width in enumerate(self.font_dictionary["/Widths"]) + } + + # CID fonts have a /W array mapping character codes to widths stashed in /DescendantFonts + if "/DescendantFonts" in self.font_dictionary: + d_font: Dict[Any, Any] + for d_font_idx, d_font in enumerate( + self.font_dictionary["/DescendantFonts"] + ): + while isinstance(d_font, IndirectObject): + d_font = d_font.get_object() # type: ignore[assignment] + self.font_dictionary["/DescendantFonts"][d_font_idx] = d_font + ord_map = { + ord(_target): _surrogate + for _target, _surrogate in self.char_map.items() + if isinstance(_target, str) + } + # /W width definitions have two valid formats which can be mixed and matched: + # (1) A character start index followed by a list of widths, e.g. + # `45 [500 600 700]` applies widths 500, 600, 700 to characters 45-47. + # (2) A character start index, a character stop index, and a width, e.g. + # `45 65 500` applies width 500 to characters 45-65. + skip_count = 0 + _w = d_font.get("/W", []) + for idx, w_entry in enumerate(_w): + if skip_count: + skip_count -= 1 + continue + if not isinstance(w_entry, (int, float)): # pragma: no cover + # We should never get here due to skip_count above. Add a + # warning and or use reader's "strict" to force an ex??? + continue + # check for format (1): `int [int int int int ...]` + if isinstance(_w[idx + 1], Sequence): + start_idx, width_list = _w[idx : idx + 2] + self.width_map.update( + { + ord_map[_cidx]: _width + for _cidx, _width in zip( + range(start_idx, start_idx + len(width_list), 1), + width_list, + ) + if _cidx in ord_map + } + ) + skip_count = 1 + # check for format (2): `int int int` + if not isinstance(_w[idx + 1], Sequence) and not isinstance( + _w[idx + 2], Sequence + ): + start_idx, stop_idx, const_width = _w[idx : idx + 3] + self.width_map.update( + { + ord_map[_cidx]: const_width + for _cidx in range(start_idx, stop_idx + 1, 1) + if _cidx in ord_map + } + ) + skip_count = 2 + if not self.width_map and "/BaseFont" in self.font_dictionary: + for key in STANDARD_WIDTHS: + if self.font_dictionary["/BaseFont"].startswith(f"/{key}"): + self.width_map = STANDARD_WIDTHS[key] + break + + def word_width(self, word: str) -> float: + """Sum of character widths specified in PDF font for the supplied word""" + return sum( + [self.width_map.get(char, self.space_width * 2) for char in word], 0.0 + ) + + @staticmethod + def to_dict(font_instance: "Font") -> Dict[str, Any]: + """Dataclass to dict for json.dumps serialization.""" + return { + k: getattr(font_instance, k) for k in font_instance.__dataclass_fields__ + } diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py new file mode 100644 index 00000000..39092bcd --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_font_widths.py @@ -0,0 +1,208 @@ +# Widths for the standard 14 fonts as described on page 416 of the PDF 1.7 standard +STANDARD_WIDTHS = { + "Helvetica": { # 4 fonts, includes bold, oblique and boldoblique variants + " ": 278, + "!": 278, + '"': 355, + "#": 556, + "$": 556, + "%": 889, + "&": 667, + "'": 191, + "(": 333, + ")": 333, + "*": 389, + "+": 584, + ",": 278, + "-": 333, + ".": 278, + "/": 278, + "0": 556, + "1": 556, + "2": 556, + "3": 556, + "4": 556, + "5": 556, + "6": 556, + "7": 556, + "8": 556, + "9": 556, + ":": 278, + ";": 278, + "<": 584, + "=": 584, + ">": 584, + "?": 611, + "@": 975, + "A": 667, + "B": 667, + "C": 722, + "D": 722, + "E": 667, + "F": 611, + "G": 778, + "H": 722, + "I": 278, + "J": 500, + "K": 667, + "L": 556, + "M": 833, + "N": 722, + "O": 778, + "P": 667, + "Q": 944, + "R": 667, + "S": 667, + "T": 611, + "U": 278, + "V": 278, + "W": 584, + "X": 556, + "Y": 556, + "Z": 500, + "[": 556, + "\\": 556, + "]": 556, + "^": 278, + "_": 278, + "`": 278, + "a": 278, + "b": 278, + "c": 333, + "d": 556, + "e": 556, + "f": 556, + "g": 556, + "h": 556, + "i": 556, + "j": 556, + "k": 556, + "l": 556, + "m": 556, + "n": 278, + "o": 278, + "p": 556, + "q": 556, + "r": 500, + "s": 556, + "t": 556, + "u": 278, + "v": 500, + "w": 500, + "x": 222, + "y": 222, + "z": 556, + "{": 222, + "|": 833, + "}": 556, + "~": 556, + }, + "Times": { # 4 fonts, includes bold, oblique and boldoblique variants + " ": 250, + "!": 333, + '"': 408, + "#": 500, + "$": 500, + "%": 833, + "&": 778, + "'": 180, + "(": 333, + ")": 333, + "*": 500, + "+": 564, + ",": 250, + "-": 333, + ".": 250, + "/": 564, + "0": 500, + "1": 500, + "2": 500, + "3": 500, + "4": 500, + "5": 500, + "6": 500, + "7": 500, + "8": 500, + "9": 500, + ":": 278, + ";": 278, + "<": 564, + "=": 564, + ">": 564, + "?": 444, + "@": 921, + "A": 722, + "B": 667, + "C": 667, + "D": 722, + "E": 611, + "F": 556, + "G": 722, + "H": 722, + "I": 333, + "J": 389, + "K": 722, + "L": 611, + "M": 889, + "N": 722, + "O": 722, + "P": 556, + "Q": 722, + "R": 667, + "S": 556, + "T": 611, + "U": 722, + "V": 722, + "W": 944, + "X": 722, + "Y": 722, + "Z": 611, + "[": 333, + "\\": 278, + "]": 333, + "^": 469, + "_": 500, + "`": 333, + "a": 444, + "b": 500, + "c": 444, + "d": 500, + "e": 444, + "f": 333, + "g": 500, + "h": 500, + "i": 278, + "j": 278, + "k": 500, + "l": 278, + "m": 722, + "n": 500, + "o": 500, + "p": 500, + "q": 500, + "r": 333, + "s": 389, + "t": 278, + "u": 500, + "v": 444, + "w": 722, + "x": 500, + "y": 444, + "z": 389, + "{": 348, + "|": 220, + "}": 348, + "~": 469, + }, +} +STANDARD_WIDTHS[ + "Courier" +] = { # 4 fonts, includes bold, oblique and boldoblique variants + c: 600 for c in STANDARD_WIDTHS["Times"] # fixed width +} +STANDARD_WIDTHS["ZapfDingbats"] = {c: 1000 for c in STANDARD_WIDTHS["Times"]} # 1 font +STANDARD_WIDTHS["Symbol"] = {c: 500 for c in STANDARD_WIDTHS["Times"]} # 1 font +# add aliases per table H.3 on page 1110 of the PDF 1.7 standard +STANDARD_WIDTHS["CourierNew"] = STANDARD_WIDTHS["Courier"] +STANDARD_WIDTHS["Arial"] = STANDARD_WIDTHS["Helvetica"] +STANDARD_WIDTHS["TimesNewRoman"] = STANDARD_WIDTHS["Times"] diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py new file mode 100644 index 00000000..3c5d4736 --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_manager.py @@ -0,0 +1,213 @@ +"""manage the PDF transform stack during "layout" mode text extraction""" + +from collections import ChainMap, Counter +from typing import Any, Dict, List, MutableMapping, Union +from typing import ChainMap as ChainMapType +from typing import Counter as CounterType + +from ...errors import PdfReadError +from .. import mult +from ._font import Font +from ._text_state_params import TextStateParams + +TextStateManagerChainMapType = ChainMapType[Union[int, str], Union[float, bool]] +TextStateManagerDictType = MutableMapping[Union[int, str], Union[float, bool]] + + +class TextStateManager: + """ + Tracks the current text state including cm/tm/trm transformation matrices. + + Attributes: + transform_stack (ChainMap): ChainMap of cm/tm transformation matrices + q_queue (Counter[int]): Counter of q operators + q_depth (List[int]): list of q operator nesting levels + Tc (float): character spacing + Tw (float): word spacing + Tz (int): horizontal scaling + TL (float): leading + Ts (float): text rise + font (Font): font object + font_size (int | float): font size + """ + + def __init__(self) -> None: + self.transform_stack: TextStateManagerChainMapType = ChainMap( + self.new_transform() + ) + self.q_queue: CounterType[int] = Counter() + self.q_depth = [0] + self.Tc: float = 0.0 + self.Tw: float = 0.0 + self.Tz: float = 100.0 + self.TL: float = 0.0 + self.Ts: float = 0.0 + self.font: Union[Font, None] = None + self.font_size: Union[int, float] = 0 + + def set_state_param(self, op: bytes, value: Union[float, List[Any]]) -> None: + """ + Set a text state parameter. Supports Tc, Tz, Tw, TL, and Ts operators. + + Args: + op: operator read from PDF stream as bytes. No action is taken + for unsupported operators (see supported operators above). + value (float | List[Any]): new parameter value. If a list, + value[0] is used. + """ + if op not in [b"Tc", b"Tz", b"Tw", b"TL", b"Ts"]: + return + self.__setattr__(op.decode(), value[0] if isinstance(value, list) else value) + + def set_font(self, font: Font, size: float) -> None: + """ + Set the current font and font_size. + + Args: + font (Font): a layout mode Font + size (float): font size + """ + self.font = font + self.font_size = size + + def text_state_params(self, value: Union[bytes, str] = "") -> TextStateParams: + """ + Create a TextStateParams instance to display a text string. Type[bytes] values + will be decoded implicitly. + + Args: + value (str | bytes): text to associate with the captured state. + + Raises: + PdfReadError: if font not set (no Tf operator in incoming pdf content stream) + + Returns: + TextStateParams: current text state parameters + """ + if not isinstance(self.font, Font): + raise PdfReadError( + "font not set: is PDF missing a Tf operator?" + ) # pragma: no cover + if isinstance(value, bytes): + try: + if isinstance(self.font.encoding, str): + txt = value.decode(self.font.encoding, "surrogatepass") + else: + txt = "".join( + self.font.encoding[x] + if x in self.font.encoding + else bytes((x,)).decode() + for x in value + ) + except (UnicodeEncodeError, UnicodeDecodeError): + txt = value.decode("utf-8", "replace") + txt = "".join( + self.font.char_map[x] if x in self.font.char_map else x for x in txt + ) + else: + txt = value + return TextStateParams( + txt, + self.font, + self.font_size, + self.Tc, + self.Tw, + self.Tz, + self.TL, + self.Ts, + self.effective_transform, + ) + + @staticmethod + def raw_transform( + _a: float = 1.0, + _b: float = 0.0, + _c: float = 0.0, + _d: float = 1.0, + _e: float = 0.0, + _f: float = 0.0, + ) -> Dict[int, float]: + """Only a/b/c/d/e/f matrix params""" + return dict(zip(range(6), map(float, (_a, _b, _c, _d, _e, _f)))) + + @staticmethod + def new_transform( + _a: float = 1.0, + _b: float = 0.0, + _c: float = 0.0, + _d: float = 1.0, + _e: float = 0.0, + _f: float = 0.0, + is_text: bool = False, + is_render: bool = False, + ) -> TextStateManagerDictType: + """Standard a/b/c/d/e/f matrix params + 'is_text' and 'is_render' keys""" + result: Any = TextStateManager.raw_transform(_a, _b, _c, _d, _e, _f) + result.update({"is_text": is_text, "is_render": is_render}) + return result + + def reset_tm(self) -> TextStateManagerChainMapType: + """Clear all transforms from chainmap having is_text==True or is_render==True""" + while ( + self.transform_stack.maps[0]["is_text"] + or self.transform_stack.maps[0]["is_render"] + ): + self.transform_stack = self.transform_stack.parents + return self.transform_stack + + def reset_trm(self) -> TextStateManagerChainMapType: + """Clear all transforms from chainmap having is_render==True""" + while self.transform_stack.maps[0]["is_render"]: + self.transform_stack = self.transform_stack.parents + return self.transform_stack + + def remove_q(self) -> TextStateManagerChainMapType: + """Rewind to stack prior state after closing a 'q' with internal 'cm' ops""" + self.transform_stack = self.reset_tm() + self.transform_stack.maps = self.transform_stack.maps[ + self.q_queue.pop(self.q_depth.pop(), 0) : + ] + return self.transform_stack + + def add_q(self) -> None: + """Add another level to q_queue""" + self.q_depth.append(len(self.q_depth)) + + def add_cm(self, *args: Any) -> TextStateManagerChainMapType: + """Concatenate an additional transform matrix""" + self.transform_stack = self.reset_tm() + self.q_queue.update(self.q_depth[-1:]) + self.transform_stack = self.transform_stack.new_child(self.new_transform(*args)) + return self.transform_stack + + def _complete_matrix(self, operands: List[float]) -> List[float]: + """Adds a, b, c, and d to an "e/f only" operand set (e.g Td)""" + if len(operands) == 2: # this is a Td operator or equivalent + operands = [1.0, 0.0, 0.0, 1.0, *operands] + return operands + + def add_tm(self, operands: List[float]) -> TextStateManagerChainMapType: + """Append a text transform matrix""" + self.transform_stack = self.transform_stack.new_child( + self.new_transform( # type: ignore[misc] + *self._complete_matrix(operands), is_text=True # type: ignore[arg-type] + ) + ) + return self.transform_stack + + def add_trm(self, operands: List[float]) -> TextStateManagerChainMapType: + """Append a text rendering transform matrix""" + self.transform_stack = self.transform_stack.new_child( + self.new_transform( # type: ignore[misc] + *self._complete_matrix(operands), is_text=True, is_render=True # type: ignore[arg-type] + ) + ) + return self.transform_stack + + @property + def effective_transform(self) -> List[float]: + """Current effective transform accounting for cm, tm, and trm transforms""" + eff_transform = [*self.transform_stack.maps[0].values()] + for transform in self.transform_stack.maps[1:]: + eff_transform = mult(eff_transform, transform) # type: ignore[arg-type] # dict has int keys 0-5 + return eff_transform diff --git a/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py new file mode 100644 index 00000000..b6e6930c --- /dev/null +++ b/.venv/lib/python3.12/site-packages/pypdf/_text_extraction/_layout_mode/_text_state_params.py @@ -0,0 +1,127 @@ +"""A dataclass that captures the CTM and Text State for a tj operation""" + +import math +from dataclasses import dataclass, field +from typing import Any, Dict, List, Union + +from .. import mult, orient +from ._font import Font + + +@dataclass +class TextStateParams: + """ + Text state parameters and operator values for a single text value in a + TJ or Tj PDF operation. + + Attributes: + txt (str): the text to be rendered. + font (Font): font object + font_size (int | float): font size + Tc (float): character spacing. Defaults to 0.0. + Tw (float): word spacing. Defaults to 0.0. + Tz (float): horizontal scaling. Defaults to 100.0. + TL (float): leading, vertical displacement between text lines. Defaults to 0.0. + Ts (float): text rise. Used for super/subscripts. Defaults to 0.0. + transform (List[float]): effective transformation matrix. + tx (float): x cood of rendered text, i.e. self.transform[4] + ty (float): y cood of rendered text. May differ from self.transform[5] per self.Ts. + displaced_tx (float): x coord immediately following rendered text + space_tx (float): tx for a space character + font_height (float): effective font height accounting for CTM + flip_vertical (bool): True if y axis has been inverted (i.e. if self.transform[3] < 0.) + rotated (bool): True if the text orientation is rotated with respect to the page. + """ + + txt: str + font: Font + font_size: Union[int, float] + Tc: float = 0.0 + Tw: float = 0.0 + Tz: float = 100.0 + TL: float = 0.0 + Ts: float = 0.0 + transform: List[float] = field( + default_factory=lambda: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + ) + tx: float = field(default=0.0, init=False) + ty: float = field(default=0.0, init=False) + displaced_tx: float = field(default=0.0, init=False) + space_tx: float = field(default=0.0, init=False) + font_height: float = field(default=0.0, init=False) + flip_vertical: bool = field(default=False, init=False) + rotated: bool = field(default=False, init=False) + + def __post_init__(self) -> None: + if orient(self.transform) in (90, 270): + self.transform = mult( + [1.0, -self.transform[1], -self.transform[2], 1.0, 0.0, 0.0], + self.transform, + ) + self.rotated = True + # self.transform[0] AND self.transform[3] < 0 indicates true rotation. + # If only self.transform[3] < 0, the y coords are simply inverted. + if orient(self.transform) == 180 and self.transform[0] < -1e-6: + self.transform = mult([-1.0, 0.0, 0.0, -1.0, 0.0, 0.0], self.transform) + self.rotated = True + self.displaced_tx = self.displaced_transform()[4] + self.tx = self.transform[4] + self.ty = self.render_transform()[5] + self.space_tx = round(self.word_tx(" "), 3) + if self.space_tx < 1e-6: + # if the " " char is assigned 0 width (e.g. for fine tuned spacing + # with TJ int operators a la crazyones.pdf), calculate space_tx as + # a TD_offset of -2 * font.space_width where font.space_width is + # the space_width calculated in _cmap.py. + self.space_tx = round(self.word_tx("", self.font.space_width * -2), 3) + self.font_height = self.font_size * math.sqrt( + self.transform[1] ** 2 + self.transform[3] ** 2 + ) + # flip_vertical handles PDFs generated by Microsoft Word's "publish" command. + self.flip_vertical = self.transform[3] < -1e-6 # inverts y axis + + def font_size_matrix(self) -> List[float]: + """Font size matrix""" + return [ + self.font_size * (self.Tz / 100.0), + 0.0, + 0.0, + self.font_size, + 0.0, + self.Ts, + ] + + def displaced_transform(self) -> List[float]: + """Effective transform matrix after text has been rendered.""" + return mult(self.displacement_matrix(), self.transform) + + def render_transform(self) -> List[float]: + """Effective transform matrix accounting for font size, Tz, and Ts.""" + return mult(self.font_size_matrix(), self.transform) + + def displacement_matrix( + self, word: Union[str, None] = None, TD_offset: float = 0.0 + ) -> List[float]: + """ + Text displacement matrix + + Args: + word (str, optional): Defaults to None in which case self.txt displacement is + returned. + TD_offset (float, optional): translation applied by TD operator. Defaults to 0.0. + """ + word = word if word is not None else self.txt + return [1.0, 0.0, 0.0, 1.0, self.word_tx(word, TD_offset), 0.0] + + def word_tx(self, word: str, TD_offset: float = 0.0) -> float: + """Horizontal text displacement for any word according this text state""" + return ( + (self.font_size * ((self.font.word_width(word) - TD_offset) / 1000.0)) + + self.Tc + + word.count(" ") * self.Tw + ) * (self.Tz / 100.0) + + @staticmethod + def to_dict(inst: "TextStateParams") -> Dict[str, Any]: + """Dataclass to dict for json.dumps serialization""" + return {k: getattr(inst, k) for k in inst.__dataclass_fields__ if k != "font"} |