Skip to content

Commit 15c42ff

Browse files
ENH: Add basic support for JBIG2 by using jbig2dec (#3163)
Closes #1989.
1 parent 77238e0 commit 15c42ff

File tree

6 files changed

+225
-4
lines changed

6 files changed

+225
-4
lines changed

.github/workflows/github-ci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ jobs:
7070
sudo apt-get update
7171
- name: Install APT dependencies
7272
run:
73-
sudo apt-get install ghostscript poppler-utils
73+
sudo apt-get install ghostscript jbig2dec poppler-utils
7474
- name: Checkout Code
7575
uses: actions/checkout@v4
7676
with:

docs/user/installation.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,14 @@ If you plan to use image extraction, you need Pillow:
4848
pip install pypdf[image]
4949
```
5050

51+
For JBIG2 support, you need to install a global OS-level package as well:
52+
[`jbig2dec`](https://github.com/ArtifexSoftware/jbig2dec). The installation procedure
53+
depends on your operating system. On Ubuntu, for example, use the following:
54+
55+
```
56+
sudo apt-get install jbig2dec
57+
```
58+
5159
## Python Version Support
5260

5361
Since pypdf 4.0, every release, including point releases, should work with all

pypdf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ class FilterTypes(StrEnum):
245245
CCITT_FAX_DECODE = "/CCITTFaxDecode" # abbreviation: CCF
246246
DCT_DECODE = "/DCTDecode" # abbreviation: DCT
247247
JPX_DECODE = "/JPXDecode"
248+
JBIG2_DECODE = "/JBIG2Decode"
248249

249250

250251
class FilterTypeAbbreviations:

pypdf/filters.py

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,16 @@
3535
__author_email__ = "biziqe@mathieu.fenniak.net"
3636

3737
import math
38+
import os
39+
import shutil
3840
import struct
41+
import subprocess
3942
import zlib
4043
from base64 import a85decode
4144
from dataclasses import dataclass
4245
from io import BytesIO
46+
from pathlib import Path
47+
from tempfile import TemporaryDirectory
4348
from typing import Any, Dict, List, Optional, Tuple, Union, cast
4449

4550
from ._codecs._codecs import LzwCodec as _LzwCodec
@@ -56,13 +61,15 @@
5661
from .constants import ImageAttributes as IA
5762
from .constants import LzwFilterParameters as LZW
5863
from .constants import StreamAttributes as SA
59-
from .errors import DeprecationError, PdfReadError, PdfStreamError
64+
from .errors import DependencyError, DeprecationError, PdfReadError, PdfStreamError
6065
from .generic import (
6166
ArrayObject,
6267
BooleanObject,
6368
DictionaryObject,
6469
IndirectObject,
6570
NullObject,
71+
StreamObject,
72+
is_null_or_none,
6673
)
6774

6875

@@ -641,6 +648,67 @@ def decode(
641648
return tiff_header + data
642649

643650

651+
# Path of the ``jbig2dec`` executable as resolved by ``shutil.which``;
# ``None`` when no such executable is found on the search path.
JBIG2DEC_BINARY = shutil.which("jbig2dec")
652+
653+
654+
class JBIG2Decode:
    """
    Decode ``/JBIG2Decode`` streams by delegating to the external ``jbig2dec`` tool.

    The embedded JBIG2 data, and the ``/JBIG2Globals`` stream when one is
    given, are written to a temporary directory. ``jbig2dec`` is then run on
    these files and the PNG it writes to standard output is returned.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Convert embedded JBIG2 data to PNG using the ``jbig2dec`` binary.

        Args:
            data: The embedded JBIG2 image data.
            decode_parms: Optional decode parameters. Only ``/JBIG2Globals``
                is evaluated, and only if it resolves to a stream object.
            **kwargs: Unused.

        Returns:
            The bytes ``jbig2dec`` wrote to standard output (PNG format requested).

        Raises:
            DependencyError: If no ``jbig2dec`` binary is available or the
                binary does not know the ``--embedded`` option.
            PdfStreamError: If ``jbig2dec`` exits with a non-zero exit code.
        """
        if JBIG2DEC_BINARY is None:
            raise DependencyError("jbig2dec binary is not available.")

        with TemporaryDirectory() as tempdir:
            directory = Path(tempdir)
            paths: List[Path] = []

            # The globals file has to precede the image file on the command line.
            if decode_parms and "/JBIG2Globals" in decode_parms:
                jbig2_globals = decode_parms["/JBIG2Globals"]
                if not is_null_or_none(jbig2_globals):
                    pointer = jbig2_globals.get_object()
                    # The isinstance check also rejects None/null targets,
                    # so no separate null check or assert is required.
                    if isinstance(pointer, StreamObject):
                        path = directory.joinpath("globals.jbig2")
                        path.write_bytes(pointer.get_data())
                        paths.append(path)

            path = directory.joinpath("image.jbig2")
            path.write_bytes(data)
            paths.append(path)

            # Force the C locale: the messages on stderr are matched against
            # a fixed English text below.
            environment = os.environ.copy()
            environment["LC_ALL"] = "C"
            result = subprocess.run(  # noqa: S603
                [JBIG2DEC_BINARY, "--embedded", "--format", "png", "--output", "-", *paths],
                capture_output=True,
                env=environment,
            )
            if b"unrecognized option '--embedded'" in result.stderr:
                raise DependencyError("jbig2dec>=0.15 is required.")
            if result.stderr:
                # Diagnostics are informational only. Never let undecodable
                # bytes mask the actual outcome of the decoding.
                for line in result.stderr.decode("utf-8", errors="replace").splitlines():
                    logger_warning(line, __name__)
            if result.returncode != 0:
                raise PdfStreamError(f"Unable to decode JBIG2 data. Exit code: {result.returncode}")
        return result.stdout

    @staticmethod
    def _is_binary_compatible() -> bool:
        """
        Report whether a ``jbig2dec`` binary of at least version 0.15 is available.

        Returns:
            ``False`` if there is no binary, it cannot be executed, or its
            ``--version`` output has no second space-separated field;
            otherwise the result of comparing that field against ``0.15``.
        """
        if not JBIG2DEC_BINARY:  # pragma: no cover
            return False
        try:
            result = subprocess.run(  # noqa: S603
                [JBIG2DEC_BINARY, "--version"],
                capture_output=True,
                text=True,
            )
        except OSError:  # pragma: no cover
            # A binary which cannot be started is not usable either.
            return False

        # The version is taken from the text following the first space.
        parts = result.stdout.strip().split(" ", maxsplit=1)
        if len(parts) < 2:  # pragma: no cover
            return False

        from ._utils import Version
        return Version(parts[1]) >= Version("0.15")
710+
711+
644712
def decode_stream_data(stream: Any) -> bytes:
645713
"""
646714
Decode the stream data based on the specified filters.
@@ -691,6 +759,8 @@ def decode_stream_data(stream: Any) -> bytes:
691759
data = DCTDecode.decode(data)
692760
elif filter_name == FT.JPX_DECODE:
693761
data = JPXDecode.decode(data)
762+
elif filter_name == FT.JBIG2_DECODE:
763+
data = JBIG2Decode.decode(data, params)
694764
elif filter_name == "/Crypt":
695765
if "/Name" in params or "/Type" in params:
696766
raise NotImplementedError(
@@ -828,6 +898,13 @@ def _apply_alpha(
828898
".tiff",
829899
False,
830900
)
901+
elif lfilters == FT.JBIG2_DECODE:
902+
img, image_format, extension, invert_color = (
903+
Image.open(BytesIO(data), formats=("PNG",)),
904+
"PNG",
905+
".png",
906+
False,
907+
)
831908
elif mode == "CMYK":
832909
img, image_format, extension, invert_color = (
833910
_extended_image_frombytes(mode, size, data),

tests/test_filters.py

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,31 @@
66
from io import BytesIO
77
from itertools import product as cartesian_product
88
from pathlib import Path
9+
from unittest import mock
910

1011
import pytest
1112
from PIL import Image, ImageOps
1213

1314
from pypdf import PdfReader
14-
from pypdf.errors import DeprecationError, PdfReadError
15+
from pypdf.errors import DependencyError, DeprecationError, PdfReadError, PdfStreamError
1516
from pypdf.filters import (
1617
ASCII85Decode,
1718
ASCIIHexDecode,
1819
CCITParameters,
1920
CCITTFaxDecode,
2021
CCITTParameters,
2122
FlateDecode,
23+
JBIG2Decode,
24+
)
25+
from pypdf.generic import (
26+
ArrayObject,
27+
ContentStream,
28+
DictionaryObject,
29+
IndirectObject,
30+
NameObject,
31+
NullObject,
32+
NumberObject,
2233
)
23-
from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NumberObject
2434

2535
from . import PILContext, get_data_from_url
2636
from .test_encryption import HAS_AES
@@ -697,3 +707,86 @@ def test_flate_decode__not_rectangular(caplog):
697707
expected = get_data_from_url(url, name=name)
698708
assert actual_image.getvalue() == expected
699709
assert caplog.messages == ["Image data is not rectangular. Adding padding."]
710+
711+
712+
def test_jbig2decode__binary_errors():
    """Check the dependency errors for a missing and for an outdated jbig2dec binary."""
    # Case 1: No binary has been found at all.
    with mock.patch("pypdf.filters.JBIG2DEC_BINARY", None), \
            pytest.raises(DependencyError, match="jbig2dec binary is not available."):
        JBIG2Decode.decode(b"dummy")

    # Case 2: The binary rejects the `--embedded` option.
    usage_output = (
        b"jbig2dec: unrecognized option '--embedded'\n"
        b"Usage: jbig2dec [options] <file.jbig2>\n"
        b"   or  jbig2dec [options] <global_stream> <page_stream>\n"
    )
    completed = subprocess.CompletedProcess(
        args=["dummy"],
        returncode=0,
        stdout=b"",
        stderr=usage_output,
    )
    with mock.patch("pypdf.filters.JBIG2DEC_BINARY", "/usr/bin/jbig2dec"), \
            mock.patch("pypdf.filters.subprocess.run", return_value=completed), \
            pytest.raises(DependencyError, match="jbig2dec>=0.15 is required."):
        JBIG2Decode.decode(b"dummy")
729+
730+
731+
@pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec")
def test_jbig2decode__edge_cases(caplog):
    """Run JBIG2Decode.decode with usable, absent and unusable globals as well as broken data."""
    image_data = (
        b'\x00\x00\x00\x010\x00\x01\x00\x00\x00\x13\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x06"'
        b'\x00\x01\x00\x00\x00\x1c\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x9f\xa8_\xff\xac'
    )
    jbig2_globals = b"\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x18\x00\x00\x03\xff\xfd\xff\x02\xfe\xfe\xfe\x00\x00\x00\x01\x00\x00\x00\x01R\xd0u7\xff\xac"  # noqa: E501

    # All coordinates of the 5x5 image, x being the outer dimension.
    coordinates = list(cartesian_product(range(5), repeat=2))
    no_dictionary_warnings = [
        "jbig2dec WARNING text region refers to no symbol dictionaries (segment 0x00000002)",
        "jbig2dec WARNING ignoring out of range symbol ID (0/0) (segment 0x00000002)"
    ]

    # Validation: Is our image data valid?
    globals_stream = ContentStream(stream=None, pdf=None)
    globals_stream.set_data(jbig2_globals)
    decoded = JBIG2Decode.decode(image_data, decode_parms=DictionaryObject({"/JBIG2Globals": globals_stream}))
    image = Image.open(BytesIO(decoded), formats=("PNG",))
    for x, y in coordinates:
        assert image.getpixel((x, y)) == (255 if x < 3 else 0), (x, y)
    assert caplog.messages == []

    # Each of these yields a completely white image together with warnings:
    #   * no decode parameters at all,
    #   * JBIG2Globals being NULL,
    #   * JBIG2Globals being a plain DictionaryObject.
    parameters_without_globals = (
        None,
        DictionaryObject({"/JBIG2Globals": NullObject()}),
        DictionaryObject({"/JBIG2Globals": DictionaryObject()}),
    )
    for decode_parms in parameters_without_globals:
        decoded = JBIG2Decode.decode(image_data, decode_parms=decode_parms)
        image = Image.open(BytesIO(decoded), formats=("PNG",))
        for x, y in coordinates:
            assert image.getpixel((x, y)) == 255, (x, y)
        assert caplog.messages == no_dictionary_warnings
        caplog.clear()

    # Invalid input.
    with pytest.raises(PdfStreamError, match="Unable to decode JBIG2 data. Exit code: 1"):
        JBIG2Decode.decode(b"aaaaaa")
    assert caplog.messages == [
        "jbig2dec FATAL ERROR page has no image, cannot be completed",
        "jbig2dec WARNING unable to complete page"
    ]

tests/test_images.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from PIL import Image, ImageChops, ImageDraw
1616

1717
from pypdf import PageObject, PdfReader, PdfWriter
18+
from pypdf.filters import JBIG2Decode
1819
from pypdf.generic import ContentStream, NameObject, NullObject
1920

2021
from . import get_data_from_url
@@ -530,3 +531,44 @@ def test_inline_image_containing_ei_in_body():
530531
output = BytesIO()
531532
writer.write(output)
532533
assert expected in output.getvalue()
534+
535+
536+
@pytest.mark.enable_socket
@pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec")
def test_jbig2decode():
    """Extract the first JBIG2 image of a sample PDF and compare it with a reference PNG."""
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf",
        name="jbig2.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    extracted = next(iter(first_page.images))
    assert extracted.image.size == (5138, 6630)
    assert extracted.image.mode == "1"
    assert extracted.image.format == "PNG"

    reference_bytes = get_data_from_url(
        "https://github.com/user-attachments/assets/d6f88c80-a2e0-4ea9-b1e0-34442041d004",
        name="jbig2.png",
    )
    reference = Image.open(BytesIO(reference_bytes))

    assert image_similarity(extracted.image, reference) >= 0.999
554+
555+
556+
@pytest.mark.enable_socket
@pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec")
def test_jbig2decode__jbig2globals():
    """Extract a JBIG2 image relying on ``/JBIG2Globals`` and compare it with a reference PNG."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/20119148/out.pdf",
        name="jbig2_globals.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    extracted = next(iter(first_page.images))
    assert extracted.image.size == (1067, 1067)
    assert extracted.image.mode == "1"
    assert extracted.image.format == "PNG"

    reference_bytes = get_data_from_url(
        "https://github.com/user-attachments/assets/7ac41ee3-9c13-44cf-aa74-8f106287e354",
        name="jbig2_globals.png",
    )
    reference = Image.open(BytesIO(reference_bytes))

    # Wrong image: 0.9618265964800714
    assert image_similarity(extracted.image, reference) >= 0.999

0 commit comments

Comments
 (0)