Diffstat (limited to '.venv/lib/python3.12/site-packages/pypdf/_cmap.py')
-rw-r--r--  .venv/lib/python3.12/site-packages/pypdf/_cmap.py  520
1 file changed, 520 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/pypdf/_cmap.py b/.venv/lib/python3.12/site-packages/pypdf/_cmap.py
new file mode 100644
index 00000000..9a2d10a6
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/pypdf/_cmap.py
@@ -0,0 +1,520 @@
+from binascii import unhexlify
+from math import ceil
+from typing import Any, Dict, List, Tuple, Union, cast
+
+from ._codecs import adobe_glyphs, charset_encoding
+from ._utils import b_, logger_error, logger_warning
+from .generic import (
+ DecodedStreamObject,
+ DictionaryObject,
+ IndirectObject,
+ NullObject,
+ StreamObject,
+)
+
+
+# Code freely inspired by @twiggy; see #711
+def build_char_map(
+ font_name: str, space_width: float, obj: DictionaryObject
+) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
+ """
+ Determine information about a font.
+
+ Args:
+ font_name: font name as a string
+ space_width: default space width if no data is found.
+ obj: XObject or Page where you can find a /Resource dictionary
+
+ Returns:
+        Font sub-type, space width criterion (50% of the space width), encoding,
+        character map, and the font dictionary. The font dictionary itself is
+        included for the curious.
+ """
+ ft: DictionaryObject = obj["/Resources"]["/Font"][font_name] # type: ignore
+ font_subtype, font_halfspace, font_encoding, font_map = build_char_map_from_dict(
+ space_width, ft
+ )
+ return font_subtype, font_halfspace, font_encoding, font_map, ft
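+
+# Usage sketch (illustrative only; the reader, page index and font name "/F1"
+# below are hypothetical, not taken from this module):
+#
+#     page = reader.pages[0]  # any object carrying a /Resources dictionary
+#     subtype, half_space, enc, cmap, font_dict = build_char_map("/F1", 200.0, page)
+#
+# half_space is 50% of the detected space width; text extraction code can
+# compare glyph displacements against it when deciding where to insert spaces.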
+
+
+def build_char_map_from_dict(
+ space_width: float, ft: DictionaryObject
+) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
+ """
+ Determine information about a font.
+
+ Args:
+        space_width: default space width if no data is found
+ (normally half the width of a character).
+ ft: Font Dictionary
+
+ Returns:
+        Font sub-type, space width criterion (50% of the space width), encoding,
+        and character map.
+ """
+ font_type: str = cast(str, ft["/Subtype"])
+
+ space_code = 32
+ encoding, space_code = parse_encoding(ft, space_code)
+ map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)
+
+    # `encoding` is either a string naming a codec used for decoding
+    # (1, 2 or a variable number of bytes per character) or a character table
+    # (handled for 1 byte per character only). An empty string means the
+    # /Encoding field is not present and the right encoding has to be selected
+    # from the cmap input data.
+ if encoding == "":
+ if -1 not in map_dict or map_dict[-1] == 1:
+            # No rule was found for fonts with neither /Encoding nor /ToUnicode.
+            # One example uses /Symbol,bold; an 8-bit encoding is used as the default.
+ encoding = "charmap"
+ else:
+ encoding = "utf-16-be"
+    # Apply the rule from PDF Reference 1.7, §5.9.1, first bullet:
+    #   if the cmap is not empty, the encoding should be discarded
+    #   (here it is turned into an identity mapping for those characters).
+    # If encoding is a str, it is expected to be an identity translation.
+ elif isinstance(encoding, dict):
+ for x in int_entry:
+ if x <= 255:
+ encoding[x] = chr(x)
+ try:
+ # override space_width with new params
+ space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
+ except Exception:
+ pass
+    # Assume the space code fits in one byte; otherwise fall back to two bytes (UTF-16-BE).
+ if isinstance(space_code, str):
+ try: # one byte
+ sp = space_code.encode("charmap")[0]
+ except Exception:
+ sp = space_code.encode("utf-16-be")
+            sp = sp[0] * 256 + sp[1]  # UTF-16-BE: high byte first
+ else:
+ sp = space_code
+ sp_width = compute_space_width(ft, sp, space_width)
+
+ return (
+ font_type,
+ float(sp_width / 2),
+ encoding,
+ # https://github.com/python/mypy/issues/4374
+ map_dict,
+ )
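+
+# Shape of the returned `encoding` value (illustrative values only):
+#
+#     "utf-16-be"                       # a codec name: decode string bytes directly
+#     {32: " ", 65: "A", 66: "B"}       # a table: one entry per single-byte code
+#
+# Callers typically branch on isinstance(encoding, dict) to pick the right path.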
+
+
+# used when missing data, e.g. font def missing
+unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
+ "Unknown",
+ 9999,
+ dict(zip(range(256), ["�"] * 256)),
+ {},
+)
+
+
+_predefined_cmap: Dict[str, str] = {
+ "/Identity-H": "utf-16-be",
+ "/Identity-V": "utf-16-be",
+ "/GB-EUC-H": "gbk",
+ "/GB-EUC-V": "gbk",
+ "/GBpc-EUC-H": "gb2312",
+ "/GBpc-EUC-V": "gb2312",
+ "/GBK-EUC-H": "gbk",
+ "/GBK-EUC-V": "gbk",
+ "/GBK2K-H": "gb18030",
+ "/GBK2K-V": "gb18030",
+ "/ETen-B5-H": "cp950",
+ "/ETen-B5-V": "cp950",
+ "/ETenms-B5-H": "cp950",
+ "/ETenms-B5-V": "cp950",
+ "/UniCNS-UTF16-H": "utf-16-be",
+ "/UniCNS-UTF16-V": "utf-16-be",
+    # "-UCS2-" CMaps are handled in code (see parse_encoding)
+}
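+
+# Illustrative effect of the table above (hypothetical font): a composite font
+# whose /Encoding is /GBK-EUC-H is assigned the Python codec name "gbk", so its
+# string bytes can later be decoded with bytes.decode("gbk").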
+
+# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
+_default_fonts_space_width: Dict[str, int] = {
+ "/Courier": 600,
+ "/Courier-Bold": 600,
+ "/Courier-BoldOblique": 600,
+ "/Courier-Oblique": 600,
+ "/Helvetica": 278,
+ "/Helvetica-Bold": 278,
+ "/Helvetica-BoldOblique": 278,
+ "/Helvetica-Oblique": 278,
+ "/Helvetica-Narrow": 228,
+ "/Helvetica-NarrowBold": 228,
+ "/Helvetica-NarrowBoldOblique": 228,
+ "/Helvetica-NarrowOblique": 228,
+ "/Times-Roman": 250,
+ "/Times-Bold": 250,
+ "/Times-BoldItalic": 250,
+ "/Times-Italic": 250,
+ "/Symbol": 250,
+ "/ZapfDingbats": 278,
+}
+
+
+def parse_encoding(
+ ft: DictionaryObject, space_code: int
+) -> Tuple[Union[str, Dict[int, str]], int]:
+ encoding: Union[str, List[str], Dict[int, str]] = []
+ if "/Encoding" not in ft:
+ try:
+ if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
+ encoding = dict(
+ zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
+ )
+ else:
+ encoding = "charmap"
+ return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
+ except Exception:
+ if cast(str, ft["/Subtype"]) == "/Type1":
+ return "charmap", space_code
+ else:
+ return "", space_code
+    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
+ if isinstance(enc, str):
+ try:
+ # already done : enc = NameObject.unnumber(enc.encode()).decode()
+ # for #xx decoding
+ if enc in charset_encoding:
+ encoding = charset_encoding[enc].copy()
+ elif enc in _predefined_cmap:
+ encoding = _predefined_cmap[enc]
+ elif "-UCS2-" in enc:
+ encoding = "utf-16-be"
+ else:
+ raise Exception("not found")
+ except Exception:
+ logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
+ encoding = enc
+ elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
+ try:
+ encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
+ except Exception:
+ logger_error(
+                f"Advanced encoding {enc['/BaseEncoding']} not implemented yet",
+ __name__,
+ )
+ encoding = charset_encoding["/StandardCoding"].copy()
+ else:
+ encoding = charset_encoding["/StandardCoding"].copy()
+ if "/Differences" in enc:
+ x: int = 0
+ o: Union[int, str]
+ for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
+ if isinstance(o, int):
+ x = o
+ else: # isinstance(o,str):
+ try:
+ encoding[x] = adobe_glyphs[o] # type: ignore
+ except Exception:
+ encoding[x] = o # type: ignore
+                if encoding[x] == " ":  # space glyph (e.g. /space) found
+ space_code = x
+ x += 1
+ if isinstance(encoding, list):
+ encoding = dict(zip(range(256), encoding))
+ return encoding, space_code
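+
+# Illustrative /Encoding dictionary (hypothetical values):
+#
+#     << /BaseEncoding /WinAnsiEncoding /Differences [ 65 /alpha /beta ] >>
+#
+# The base table is copied first, then the /Differences pairs are applied:
+# code 65 is remapped through adobe_glyphs["/alpha"] and code 66 through
+# adobe_glyphs["/beta"]; a difference that resolves to " " also updates
+# space_code.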
+
+
+def parse_to_unicode(
+ ft: DictionaryObject, space_code: int
+) -> Tuple[Dict[Any, Any], int, List[int]]:
+    # map_dict will store all translation codes;
+    # map_dict[-1] will hold the number of bytes used per character code
+ map_dict: Dict[Any, Any] = {}
+
+    # int_entry will provide the list of cmap keys as ints, to correct the encoding
+ int_entry: List[int] = []
+
+ if "/ToUnicode" not in ft:
+ if ft.get("/Subtype", "") == "/Type1":
+ return type1_alternative(ft, map_dict, space_code, int_entry)
+ else:
+ return {}, space_code, []
+ process_rg: bool = False
+ process_char: bool = False
+    multiline_rg: Union[
+        None, Tuple[int, int]
+    ] = None  # tuple = (current_char, last_char); cf #1285 for an example file
+ cm = prepare_cm(ft)
+ for line in cm.split(b"\n"):
+ process_rg, process_char, multiline_rg = process_cm_line(
+ line.strip(b" \t"),
+ process_rg,
+ process_char,
+ multiline_rg,
+ map_dict,
+ int_entry,
+ )
+
+ for a, value in map_dict.items():
+ if value == " ":
+ space_code = a
+ return map_dict, space_code, int_entry
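+
+# Resulting structure (illustrative): for a 2-byte CMap with a single bfchar
+# entry <0041> <0062>, map_dict ends up as {-1: 2, "A": "b"} and int_entry as
+# [0x41]; map_dict[-1] records how many bytes encode each character code, and
+# any entry mapping to " " overrides space_code.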
+
+
+def prepare_cm(ft: DictionaryObject) -> bytes:
+ tu = ft["/ToUnicode"]
+ cm: bytes
+ if isinstance(tu, StreamObject):
+ cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data())
+ elif isinstance(tu, str) and tu.startswith("/Identity"):
+        # Identity CMap: this minimal bfrange stub marks the map as 2 bytes per
+        # code, so the full range 0000-FFFF is processed as an identity mapping
+ cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
+ if isinstance(cm, str):
+ cm = cm.encode()
+    # cm has to be prepared first because of missing line returns in PDFs
+    # printed to PDF from Word
+ cm = (
+ cm.strip()
+ .replace(b"beginbfchar", b"\nbeginbfchar\n")
+ .replace(b"endbfchar", b"\nendbfchar\n")
+ .replace(b"beginbfrange", b"\nbeginbfrange\n")
+ .replace(b"endbfrange", b"\nendbfrange\n")
+        .replace(b"<<", b"\n{\n")  # text between << and >> is not used, but
+        .replace(b">>", b"\n}\n")  # marking it this way makes it easy to find again
+ )
+ ll = cm.split(b"<")
+ for i in range(len(ll)):
+ j = ll[i].find(b">")
+ if j >= 0:
+ if j == 0:
+ # string is empty: stash a placeholder here (see below)
+ # see https://github.com/py-pdf/pypdf/issues/1111
+ content = b"."
+ else:
+ content = ll[i][:j].replace(b" ", b"")
+ ll[i] = content + b" " + ll[i][j + 1 :]
+ cm = (
+ (b" ".join(ll))
+ .replace(b"[", b" [ ")
+ .replace(b"]", b" ]\n ")
+ .replace(b"\r", b"\n")
+ )
+ return cm
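+
+# Illustrative effect (hypothetical single-line fragment, e.g. from a PDF
+# printed to PDF from Word):
+#
+#     b"beginbfchar<0041> <0062>endbfchar"
+#
+# is normalized to roughly
+#
+#     b"\nbeginbfchar\n 0041   0062 \nendbfchar\n"
+#
+# i.e. keywords get their own lines and <...> hex strings are unwrapped so
+# that each line can later be split on whitespace.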
+
+
+def process_cm_line(
+ line: bytes,
+ process_rg: bool,
+ process_char: bool,
+ multiline_rg: Union[None, Tuple[int, int]],
+ map_dict: Dict[Any, Any],
+ int_entry: List[int],
+) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
+ if line == b"" or line[0] == 37: # 37 = %
+ return process_rg, process_char, multiline_rg
+ line = line.replace(b"\t", b" ")
+ if b"beginbfrange" in line:
+ process_rg = True
+ elif b"endbfrange" in line:
+ process_rg = False
+ elif b"beginbfchar" in line:
+ process_char = True
+ elif b"endbfchar" in line:
+ process_char = False
+ elif process_rg:
+ multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
+ elif process_char:
+ parse_bfchar(line, map_dict, int_entry)
+ return process_rg, process_char, multiline_rg
+
+
+def parse_bfrange(
+ line: bytes,
+ map_dict: Dict[Any, Any],
+ int_entry: List[int],
+ multiline_rg: Union[None, Tuple[int, int]],
+) -> Union[None, Tuple[int, int]]:
+ lst = [x for x in line.split(b" ") if x]
+ closure_found = False
+ if multiline_rg is not None:
+ fmt = b"%%0%dX" % (map_dict[-1] * 2)
+ a = multiline_rg[0] # a, b not in the current line
+ b = multiline_rg[1]
+ for sq in lst[0:]:
+ if sq == b"]":
+ closure_found = True
+ break
+ map_dict[
+ unhexlify(fmt % a).decode(
+ "charmap" if map_dict[-1] == 1 else "utf-16-be",
+ "surrogatepass",
+ )
+ ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
+ int_entry.append(a)
+ a += 1
+ else:
+ a = int(lst[0], 16)
+ b = int(lst[1], 16)
+ nbi = max(len(lst[0]), len(lst[1]))
+ map_dict[-1] = ceil(nbi / 2)
+ fmt = b"%%0%dX" % (map_dict[-1] * 2)
+ if lst[2] == b"[":
+ for sq in lst[3:]:
+ if sq == b"]":
+ closure_found = True
+ break
+ map_dict[
+ unhexlify(fmt % a).decode(
+ "charmap" if map_dict[-1] == 1 else "utf-16-be",
+ "surrogatepass",
+ )
+ ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
+ int_entry.append(a)
+ a += 1
+ else: # case without list
+ c = int(lst[2], 16)
+ fmt2 = b"%%0%dX" % max(4, len(lst[2]))
+ closure_found = True
+ while a <= b:
+ map_dict[
+ unhexlify(fmt % a).decode(
+ "charmap" if map_dict[-1] == 1 else "utf-16-be",
+ "surrogatepass",
+ )
+ ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
+ int_entry.append(a)
+ a += 1
+ c += 1
+ return None if closure_found else (a, b)
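+
+# Illustrative bfrange lines (hypothetical, shown after prepare_cm has already
+# stripped the <...> brackets) and the entries they produce:
+#
+#     b"0041 0043 0061"          -> "A"->"a", "B"->"b", "C"->"c", map_dict[-1] = 2
+#     b"0041 0042 [ 0061 0063 ]" -> "A"->"a", "B"->"c"
+#
+# int_entry collects the source codes (0x41, 0x42, ...); a "[" list that is not
+# closed on the same line is carried over to the next call via multiline_rg.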
+
+
+def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
+ lst = [x for x in line.split(b" ") if x]
+ map_dict[-1] = len(lst[0]) // 2
+ while len(lst) > 1:
+ map_to = ""
+ # placeholder (see above) means empty string
+ if lst[1] != b".":
+ map_to = unhexlify(lst[1]).decode(
+ "charmap" if len(lst[1]) < 4 else "utf-16-be", "surrogatepass"
+ ) # join is here as some cases where the code was split
+ map_dict[
+ unhexlify(lst[0]).decode(
+ "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
+ )
+ ] = map_to
+ int_entry.append(int(lst[0], 16))
+ lst = lst[2:]
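+
+# Illustrative bfchar line (hypothetical, after prepare_cm): b"0041 0062" adds
+# map_dict["A"] = "b" and appends 0x41 to int_entry; a target of b"." (the
+# placeholder prepare_cm inserts for an empty <> string) maps to "".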
+
+
+def compute_space_width(
+ ft: DictionaryObject, space_code: int, space_width: float
+) -> float:
+ sp_width: float = space_width * 2.0 # default value
+ w = []
+ w1 = {}
+ st: int = 0
+ if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
+ ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
+ try:
+ w1[-1] = cast(float, ft1["/DW"])
+ except Exception:
+ w1[-1] = 1000.0
+ if "/W" in ft1:
+ w = list(ft1["/W"])
+ else:
+ w = []
+ while len(w) > 0:
+ st = w[0] if isinstance(w[0], int) else w[0].get_object()
+ second = w[1].get_object()
+ if isinstance(second, int):
+                for x in range(st, second + 1):  # a /W range "c_first c_last w" is inclusive
+ w1[x] = w[2]
+ w = w[3:]
+ elif isinstance(second, list):
+ for y in second:
+ w1[st] = y
+ st += 1
+ w = w[2:]
+ else:
+                logger_warning(
+                    "unknown widths:\n" + repr(ft1["/W"]),
+                    __name__,
+                )
+ break
+ try:
+ sp_width = w1[space_code]
+ except Exception:
+ sp_width = (
+ w1[-1] / 2.0
+ ) # if using default we consider space will be only half size
+ elif "/Widths" in ft:
+ w = list(ft["/Widths"]) # type: ignore
+ try:
+ st = cast(int, ft["/FirstChar"])
+ en: int = cast(int, ft["/LastChar"])
+ if st > space_code or en < space_code:
+ raise Exception("Not in range")
+ if w[space_code - st] == 0:
+ raise Exception("null width")
+ sp_width = w[space_code - st]
+ except Exception:
+ if "/FontDescriptor" in ft and "/MissingWidth" in cast(
+ DictionaryObject, ft["/FontDescriptor"]
+ ):
+ sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore
+ else:
+ # will consider width of char as avg(width)/2
+ m = 0
+ cpt = 0
+ for x in w:
+ if x > 0:
+ m += x
+ cpt += 1
+ sp_width = m / max(1, cpt) / 2
+
+ if isinstance(sp_width, IndirectObject):
+ # According to
+ # 'Table 122 - Entries common to all font descriptors (continued)'
+ # the MissingWidth should be a number, but according to #2286 it can
+ # be an indirect object
+ obj = sp_width.get_object()
+ if obj is None or isinstance(obj, NullObject):
+ return 0.0
+ return obj # type: ignore
+
+ return sp_width
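+
+# Illustrative /W handling for a CID font (hypothetical values): with
+#
+#     /DW 1000   /W [ 1 [ 500 600 ] 10 20 250 ]
+#
+# CID 1 gets width 500, CID 2 gets 600, CIDs 10..20 get 250, and anything else
+# falls back to /DW = 1000; the width found for `space_code` (or half of /DW
+# when that CID is absent) is returned as the space width.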
+
+
+def type1_alternative(
+ ft: DictionaryObject,
+ map_dict: Dict[Any, Any],
+ space_code: int,
+ int_entry: List[int],
+) -> Tuple[Dict[Any, Any], int, List[int]]:
+ if "/FontDescriptor" not in ft:
+ return map_dict, space_code, int_entry
+ ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
+ if ft_desc is None:
+ return map_dict, space_code, int_entry
+ txt = ft_desc.get_object().get_data()
+ txt = txt.split(b"eexec\n")[0] # only clear part
+ txt = txt.split(b"/Encoding")[1] # to get the encoding part
+ lines = txt.replace(b"\r", b"\n").split(b"\n")
+ for li in lines:
+ if li.startswith(b"dup"):
+ words = [_w for _w in li.split(b" ") if _w != b""]
+ if len(words) > 3 and words[3] != b"put":
+ continue
+ try:
+ i = int(words[1])
+ except ValueError: # pragma: no cover
+ continue
+ try:
+ v = adobe_glyphs[words[2].decode()]
+ except KeyError:
+ if words[2].startswith(b"/uni"):
+ try:
+ v = chr(int(words[2][4:], 16))
+ except ValueError: # pragma: no cover
+ continue
+ else:
+ continue
+            if v == " ":  # the glyph (e.g. /space) resolves to a space
+ space_code = i
+ map_dict[chr(i)] = v
+ int_entry.append(i)
+ return map_dict, space_code, int_entry
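+
+# Illustrative /Encoding fragment from the clear part of an embedded Type1
+# /FontFile (hypothetical):
+#
+#     dup 32 /space put
+#     dup 65 /A put
+#
+# yields map_dict = {" ": " ", "A": "A"}, int_entry = [32, 65] and
+# space_code = 32 (the /space glyph resolves to " " through adobe_glyphs).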