about summary refs log tree commit diff
path: root/.venv/lib/python3.12/site-packages/PyPDF2/_cmap.py
diff options
context:
space:
mode:
Diffstat (limited to '.venv/lib/python3.12/site-packages/PyPDF2/_cmap.py')
-rw-r--r-- .venv/lib/python3.12/site-packages/PyPDF2/_cmap.py 413
1 files changed, 413 insertions, 0 deletions
diff --git a/.venv/lib/python3.12/site-packages/PyPDF2/_cmap.py b/.venv/lib/python3.12/site-packages/PyPDF2/_cmap.py
new file mode 100644
index 00000000..db082a82
--- /dev/null
+++ b/.venv/lib/python3.12/site-packages/PyPDF2/_cmap.py
@@ -0,0 +1,413 @@
+import warnings
+from binascii import unhexlify
+from math import ceil
+from typing import Any, Dict, List, Tuple, Union, cast
+
+from ._codecs import adobe_glyphs, charset_encoding
+from ._utils import logger_warning
+from .errors import PdfReadWarning
+from .generic import DecodedStreamObject, DictionaryObject, StreamObject
+
+
+# code freely inspired from @twiggy ; see #711
+def build_char_map(
+    font_name: str, space_width: float, obj: DictionaryObject
+) -> Tuple[
+    str, float, Union[str, Dict[int, str]], Dict, DictionaryObject
+]:  # font_type, space_width / 2, encoding, cmap, font dictionary
+    """Determine information about a font.
+
+    Args:
+        font_name: Key of the font inside ``obj["/Resources"]["/Font"]``.
+        space_width: Fallback width of the space character, used when the
+            font itself gives no usable width.
+        obj: Dictionary whose ``/Resources`` entry declares the font.
+
+    Returns:
+        A tuple ``(font sub-type, space width / 2, encoding, character map,
+        font dictionary)``.  ``encoding`` is either a codec name or a
+        ``{code: character}`` table.  The character map is the one built by
+        ``parse_to_unicode`` (its ``-1`` entry, when present, is the number
+        of bytes per character code).
+    """
+    ft: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
+    font_type: str = cast(str, ft["/Subtype"])
+
+    space_code = 32
+    encoding, space_code = parse_encoding(ft, space_code)
+    map_dict, space_code, int_entry = parse_to_unicode(ft, space_code)
+
+    # `encoding` is either a codec name (decoding 1, 2 or a variable number of
+    # bytes) or a one-byte character table.  An empty string means that no
+    # encoding could be determined, so the codec is chosen from the code width
+    # recorded by the /ToUnicode CMap.
+    if encoding == "":
+        if -1 not in map_dict or map_dict[-1] == 1:
+            # No rule was found for fonts with neither /Encoding nor
+            # /ToUnicode (seen with /Symbol,bold): default to 8-bit codes.
+            encoding = "charmap"
+        else:
+            encoding = "utf-16-be"
+    # PDF ref 1.7 §5.9.1, 1st bullet: when the CMap is not empty the encoding
+    # is discarded for the characters it covers (turned into identity here).
+    # A str encoding is expected to already be an identity translation.
+    elif isinstance(encoding, dict):
+        for x in int_entry:
+            if x <= 255:
+                encoding[x] = chr(x)
+    try:
+        # standard fonts override the caller-supplied space width
+        space_width = _default_fonts_space_width[cast(str, ft["/BaseFont"])]
+    except (KeyError, TypeError):
+        # no /BaseFont, or not one of the standard fonts: keep the fallback
+        pass
+    # The space code is first assumed to fit on one byte.
+    if isinstance(space_code, str):
+        try:  # one byte
+            sp = space_code.encode("charmap")[0]
+        except (UnicodeEncodeError, IndexError):
+            # Two-byte code: CMap keys are decoded as big-endian UTF-16 (with
+            # "surrogatepass"), so the integer code is rebuilt the same way,
+            # high byte first.
+            raw = space_code.encode("utf-16-be", "surrogatepass")
+            sp = raw[0] * 256 + raw[1]
+    else:
+        sp = space_code
+    sp_width = compute_space_width(ft, sp, space_width)
+
+    return (
+        font_type,
+        float(sp_width / 2),
+        encoding,
+        # https://github.com/python/mypy/issues/4374
+        map_dict,
+        ft,
+    )
+
+
+# Fallback character map used when data is missing (e.g. no font definition):
+# every one-byte code maps to the replacement character.
+unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
+    "Unknown",
+    9999,
+    dict.fromkeys(range(256), "�"),
+    {},
+)
+
+
+# Predefined CMap names mapped to the Python codec used to decode them; the
+# horizontal (-H) and vertical (-V) variants share one codec.
+_predefined_cmap: Dict[str, str] = {
+    f"{family}-{direction}": codec
+    for family, codec in (
+        ("/Identity", "utf-16-be"),
+        ("/GB-EUC", "gbk"),  # TBC
+        ("/GBpc-EUC", "gb2312"),  # TBC
+    )
+    for direction in ("H", "V")
+}
+
+
+# Width of the space glyph for the standard fonts, keyed by /BaseFont name.
+# Manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
+_default_fonts_space_width: Dict[str, int] = {
+    "/Courier": 600,  # was misspelled "/Courrier", so plain Courier never matched
+    "/Courier-Bold": 600,
+    "/Courier-BoldOblique": 600,
+    "/Courier-Oblique": 600,
+    "/Helvetica": 278,
+    "/Helvetica-Bold": 278,
+    "/Helvetica-BoldOblique": 278,
+    "/Helvetica-Oblique": 278,
+    "/Helvetica-Narrow": 228,
+    "/Helvetica-NarrowBold": 228,
+    "/Helvetica-NarrowBoldOblique": 228,
+    "/Helvetica-NarrowOblique": 228,
+    "/Times-Roman": 250,
+    "/Times-Bold": 250,
+    "/Times-BoldItalic": 250,
+    "/Times-Italic": 250,
+    "/Symbol": 250,
+    "/ZapfDingbats": 278,
+}
+
+
+def parse_encoding(
+    ft: DictionaryObject, space_code: int
+) -> Tuple[Union[str, Dict[int, str]], int]:
+    """Work out the encoding of font dictionary ``ft``.
+
+    Args:
+        ft: The font dictionary.
+        space_code: Character code currently assumed to be the space.
+
+    Returns:
+        ``(encoding, space_code)``.  ``encoding`` is a codec name, an empty
+        string when nothing could be determined, or a ``{code: character}``
+        table; ``space_code`` is updated when a /Differences entry is found
+        to be a space.
+    """
+    encoding: Union[str, List[str], Dict[int, str]] = []
+    if "/Encoding" not in ft:
+        try:
+            if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
+                encoding = dict(
+                    zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
+                )
+            else:
+                encoding = "charmap"
+            # NOTE(review): this returns a glyph *width* in the space-code
+            # slot.  The lookup is kept as is because its KeyError (font not
+            # in the standard table) is what routes every other font to the
+            # fallback below - confirm the intent before changing it.
+            return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
+        except Exception:
+            if cast(str, ft["/Subtype"]) == "/Type1":
+                return "charmap", space_code
+            else:
+                return "", space_code
+    enc: Union[str, DictionaryObject] = ft["/Encoding"].get_object()  # type: ignore
+    if isinstance(enc, str):
+        # named encoding; "#xx" escapes in the name are already decoded here
+        if enc in charset_encoding:
+            encoding = charset_encoding[enc].copy()
+        elif enc in _predefined_cmap:
+            encoding = _predefined_cmap[enc]
+        else:
+            warnings.warn(
+                f"Advanced encoding {enc} not implemented yet",
+                PdfReadWarning,
+            )
+            encoding = enc
+    elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
+        base_encoding = enc["/BaseEncoding"]
+        try:
+            encoding = charset_encoding[cast(str, base_encoding)].copy()
+        except Exception:
+            # report the unsupported base encoding itself (the message used
+            # to interpolate the still-empty `encoding` placeholder)
+            warnings.warn(
+                f"Advanced encoding {base_encoding} not implemented yet",
+                PdfReadWarning,
+            )
+            encoding = charset_encoding["/StandardCoding"].copy()
+    else:
+        encoding = charset_encoding["/StandardCoding"].copy()
+    # For a str `enc` this is a substring test and is false for the names
+    # handled above; for a dictionary it detects the /Differences array.
+    if "/Differences" in enc:
+        # /Differences alternates integer codes and glyph names: an integer
+        # sets the current code, each following name fills consecutive codes.
+        x: int = 0
+        o: Union[int, str]
+        for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
+            if isinstance(o, int):
+                x = o
+            else:  # isinstance(o,str):
+                try:
+                    encoding[x] = adobe_glyphs[o]  # type: ignore
+                except Exception:
+                    # unknown glyph name: keep the raw name as the character
+                    encoding[x] = o  # type: ignore
+                    # NOTE(review): the space is only detected when the raw
+                    # name is literally " ", not when a known glyph maps to
+                    # a space - confirm this is intended.
+                    if o == " ":
+                        space_code = x
+                x += 1
+    if isinstance(encoding, list):
+        encoding = dict(zip(range(256), encoding))
+    return encoding, space_code
+
+
+def parse_to_unicode(
+    ft: DictionaryObject, space_code: int
+) -> Tuple[Dict[Any, Any], int, List[int]]:
+    """Build the character map described by the font's /ToUnicode entry.
+
+    Returns ``(map_dict, space_code, int_entry)``: ``map_dict`` holds every
+    translation plus, under key ``-1``, the number of bytes per code;
+    ``space_code`` becomes the key of the last entry that maps to a space;
+    ``int_entry`` lists the mapped codes as integers so the caller can
+    correct the encoding.
+    """
+    if "/ToUnicode" not in ft:
+        return {}, space_code, []
+
+    map_dict: Dict[Any, Any] = {}
+    int_entry: List[int] = []
+
+    # parser state carried from one CMap line to the next
+    in_range: bool = False
+    in_char: bool = False
+    # (current_char, remaining size) of a range spanning several lines;
+    # cf #1285 for an example of such a file
+    pending_range: Union[None, Tuple[int, int]] = None
+
+    for raw_line in prepare_cm(ft).split(b"\n"):
+        in_range, in_char, pending_range = process_cm_line(
+            raw_line.strip(b" "),
+            in_range,
+            in_char,
+            pending_range,
+            map_dict,
+            int_entry,
+        )
+
+    for code, text in map_dict.items():
+        if text == " ":
+            space_code = code
+    return map_dict, space_code, int_entry
+
+
+def prepare_cm(ft: DictionaryObject) -> bytes:
+    """Return the /ToUnicode CMap of ``ft`` normalised for line-based parsing.
+
+    Section keywords are put on their own lines, ``<...>`` hex strings are
+    stripped of their brackets and inner spaces, and ``[`` / ``]`` are
+    surrounded by spaces.  A /ToUnicode value that is neither a stream nor
+    an ``/Identity*`` name yields ``b""``.
+    """
+    tu = ft["/ToUnicode"]
+    # Default to an empty CMap so that an unsupported /ToUnicode value gives
+    # an empty character map instead of an UnboundLocalError.
+    cm: bytes = b""
+    if isinstance(tu, StreamObject):
+        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
+    elif isinstance(tu, str) and tu.startswith("/Identity"):
+        # NOTE(review): the original comment claimed "the full range
+        # 0000-FFFF will be processed", but this literal only spans
+        # <0000>-<0001> - confirm the intended range.
+        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
+    if isinstance(cm, str):
+        cm = cm.encode()
+    # Keywords are isolated first because PDFs printed from Word may lack the
+    # line breaks around them.
+    cm = (
+        cm.strip()
+        .replace(b"beginbfchar", b"\nbeginbfchar\n")
+        .replace(b"endbfchar", b"\nendbfchar\n")
+        .replace(b"beginbfrange", b"\nbeginbfrange\n")
+        .replace(b"endbfrange", b"\nendbfrange\n")
+        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
+        .replace(b">>", b"\n}\n")  # some solution to find it back
+    )
+    ll = cm.split(b"<")
+    for i, chunk in enumerate(ll):
+        j = chunk.find(b">")
+        if j < 0:
+            continue  # no hex string in this chunk
+        if j == 0:
+            # Empty hex string "<>": leave a "." placeholder that the bfchar
+            # parser turns back into an empty string.
+            # see https://github.com/py-pdf/PyPDF2/issues/1111
+            content = b"."
+        else:
+            content = chunk[:j].replace(b" ", b"")
+        ll[i] = content + b" " + chunk[j + 1 :]
+    cm = (
+        (b" ".join(ll))
+        .replace(b"[", b" [ ")
+        .replace(b"]", b" ]\n ")
+        .replace(b"\r", b"\n")
+    )
+    return cm
+
+
+def process_cm_line(
+    l: bytes,
+    process_rg: bool,
+    process_char: bool,
+    multiline_rg: Union[None, Tuple[int, int]],
+    map_dict: Dict[Any, Any],
+    int_entry: List[int],
+) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
+    """Handle one prepared CMap line and return the updated parser state.
+
+    The state is ``(process_rg, process_char, multiline_rg)``: whether the
+    parser is inside a bfrange section, inside a bfchar section, and the
+    pending multi-line range if any.  Mapping lines are delegated to
+    ``parse_bfrange`` / ``parse_bfchar``, which fill ``map_dict`` and
+    ``int_entry``.
+    """
+    # blank lines and "%" comment lines leave the state untouched
+    if not l or l == b" " or l.startswith(b"%"):
+        return process_rg, process_char, multiline_rg
+
+    # section keywords only toggle the matching flag
+    if b"beginbfrange" in l:
+        return True, process_char, multiline_rg
+    if b"endbfrange" in l:
+        return False, process_char, multiline_rg
+    if b"beginbfchar" in l:
+        return process_rg, True, multiline_rg
+    if b"endbfchar" in l:
+        return process_rg, False, multiline_rg
+
+    # any other line is data for the section currently open (range first)
+    if process_rg:
+        return (
+            process_rg,
+            process_char,
+            parse_bfrange(l, map_dict, int_entry, multiline_rg),
+        )
+    if process_char:
+        parse_bfchar(l, map_dict, int_entry)
+    return process_rg, process_char, multiline_rg
+
+
+def _cmap_code_to_key(code: int, code_size: int) -> str:
+    """Return the ``map_dict`` key for integer ``code`` stored on ``code_size`` bytes."""
+    fmt = b"%%0%dX" % (code_size * 2)
+    return unhexlify(fmt % code).decode(
+        "charmap" if code_size == 1 else "utf-16-be",
+        "surrogatepass",
+    )
+
+
+def parse_bfrange(
+    l: bytes,
+    map_dict: Dict[Any, Any],
+    int_entry: List[int],
+    multiline_rg: Union[None, Tuple[int, int]],
+) -> Union[None, Tuple[int, int]]:
+    """Parse one prepared line of a ``bfrange`` section into ``map_dict``.
+
+    Args:
+        l: The line, as space-separated hex tokens (brackets already
+            removed from the hex strings).
+        map_dict: Character map to fill; key ``-1`` receives the number of
+            bytes per source code.
+        int_entry: Receives every mapped source code as an integer.
+        multiline_rg: ``None`` for a line opening a range, otherwise the
+            ``(next code, last code)`` state returned for the previous line
+            of a ``[...]`` list that was not closed yet.
+
+    Returns:
+        ``None`` when the range is complete, otherwise the
+        ``(next code, last code)`` state to pass with the following line.
+    """
+    lst = [x for x in l.split(b" ") if x]
+    closure_found = False
+    if multiline_rg is not None:
+        # Continuation of a "[ ... ]" list: the range bounds are not on this
+        # line, so every token is a destination (or the closing bracket).
+        # The code size fixed by the opening line is reused rather than
+        # being recomputed from destination tokens.
+        a, b = multiline_rg
+        code_size = map_dict[-1]
+        destinations = lst
+    else:
+        a = int(lst[0], 16)
+        b = int(lst[1], 16)
+        code_size = ceil(max(len(lst[0]), len(lst[1])) / 2)
+        map_dict[-1] = code_size
+        if lst[2] != b"[":
+            # "<first> <last> <dst>": consecutive codes map to consecutive
+            # destinations starting at <dst>
+            c = int(lst[2], 16)
+            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
+            while a <= b:
+                map_dict[_cmap_code_to_key(a, code_size)] = unhexlify(
+                    fmt2 % c
+                ).decode("utf-16-be", "surrogatepass")
+                int_entry.append(a)
+                a += 1
+                c += 1
+            return None
+        # "<first> <last> [ <dst> <dst> ... ]": one destination per code
+        destinations = lst[3:]
+    for sq in destinations:
+        if sq == b"]":
+            closure_found = True
+            break
+        map_dict[_cmap_code_to_key(a, code_size)] = unhexlify(sq).decode(
+            "utf-16-be", "surrogatepass"
+        )
+        int_entry.append(a)
+        a += 1
+    return None if closure_found else (a, b)
+
+
+def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
+    """Parse one prepared line of a ``bfchar`` section into ``map_dict``.
+
+    The line is a sequence of ``source destination`` hex token pairs; a
+    trailing unpaired token is ignored.  ``map_dict[-1]`` is set to the
+    number of bytes of the first source code, and every source code is also
+    appended to ``int_entry`` as an integer.
+    """
+    tokens = [tok for tok in l.split(b" ") if tok]
+    code_size = len(tokens[0]) // 2
+    map_dict[-1] = code_size
+    source_codec = "charmap" if code_size == 1 else "utf-16-be"
+    for source, destination in zip(tokens[0::2], tokens[1::2]):
+        if destination == b".":
+            # "." is the placeholder standing for an empty hex string
+            target = ""
+        else:
+            target_codec = "charmap" if len(destination) < 4 else "utf-16-be"
+            target = unhexlify(destination).decode(target_codec, "surrogatepass")
+        key = unhexlify(source).decode(source_codec, "surrogatepass")
+        map_dict[key] = target
+        int_entry.append(int(source, 16))
+
+
+def compute_space_width(
+    ft: DictionaryObject, space_code: int, space_width: float
+) -> float:
+    """Return the width to use for the space character of font ``ft``.
+
+    Args:
+        ft: The font dictionary.
+        space_code: Character code of the space.
+        space_width: Fallback width; twice this value is returned when the
+            font has neither /DescendantFonts nor /Widths.
+
+    Returns:
+        The width found for ``space_code``, or a fallback: half the default
+        width for composite fonts; /MissingWidth or half the average
+        positive width for simple fonts.
+    """
+    sp_width: float = space_width * 2  # default value
+    w = []
+    w1 = {}
+    st: int = 0
+    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
+        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
+        # w1[-1] holds the default width
+        try:
+            w1[-1] = cast(float, ft1["/DW"])
+        except Exception:
+            w1[-1] = 1000.0
+        w = list(ft1["/W"]) if "/W" in ft1 else []
+        # /W is a sequence of "first [w1 w2 ...]" or "first last w" groups
+        while len(w) > 0:
+            if len(w) < 2:
+                # truncated array: stop instead of raising IndexError
+                logger_warning(
+                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
+                    __name__,
+                )
+                break
+            st = w[0]
+            second = w[1]
+            if isinstance(second, int):
+                if len(w) < 3:
+                    logger_warning(
+                        "unknown widths : \n" + (ft1["/W"]).__repr__(),
+                        __name__,
+                    )
+                    break
+                # "first last w": the range includes `last`
+                for x in range(st, second + 1):
+                    w1[x] = w[2]
+                w = w[3:]
+            elif isinstance(second, list):
+                # "first [w1 w2 ...]": consecutive codes starting at `first`
+                for y in second:
+                    w1[st] = y
+                    st += 1
+                w = w[2:]
+            else:
+                logger_warning(
+                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
+                    __name__,
+                )
+                break
+        try:
+            sp_width = w1[space_code]
+        except Exception:
+            sp_width = (
+                w1[-1] / 2.0
+            )  # if using default we consider space will be only half size
+    elif "/Widths" in ft:
+        w = list(ft["/Widths"])  # type: ignore
+        try:
+            st = cast(int, ft["/FirstChar"])
+            en: int = cast(int, ft["/LastChar"])
+            # An out-of-range code or a null width both divert to the
+            # fallback below, as does any missing key.
+            if st > space_code or en < space_code:
+                raise Exception("Not in range")
+            if w[space_code - st] == 0:
+                raise Exception("null width")
+            sp_width = w[space_code - st]
+        except Exception:
+            if "/FontDescriptor" in ft and "/MissingWidth" in cast(
+                DictionaryObject, ft["/FontDescriptor"]
+            ):
+                sp_width = ft["/FontDescriptor"]["/MissingWidth"]  # type: ignore
+            else:
+                # will consider width of char as avg(width)/2
+                m = 0
+                cpt = 0
+                for x in w:
+                    if x > 0:
+                        m += x
+                        cpt += 1
+                sp_width = m / max(1, cpt) / 2
+    return sp_width