Meta/jbig2_to_pdf.py: Allow jbig2 files with random-access organization

jbig2 data in PDFs is in the embedded organization, which is like the
sequential organization with the file header removed.

That means jbig2 files using the random-access organization need to
be transformed to be supported. A random-access jbig2 has all segment
headers at the start, followed by the data of all segments. Decode
all headers and rewrite them to the sequential organization, where
each segment header is followed by that segment's data.

The motivation is that almost all of the jbig2 files in
ghostpdl/test/jbig2 use the random-access organization.
This commit is contained in:
Nico Weber 2024-04-01 19:48:26 -07:00 committed by Tim Flynn
parent c1c1ad8678
commit 2872c37993
Notes: sideshowbarker 2024-07-17 01:12:07 +09:00

View file

@ -9,15 +9,80 @@ Usage is a bit clunky (use Build/lagom/bin/file to get the dimensions):
% open foo.pdf
"""
from dataclasses import dataclass
import argparse
import sys
import struct
import textwrap
# Segment type value of the end-of-file segment; the segment header scan
# stops once a header with this type has been read.
EndOfFile = 51
def dedent(b):
    """Return the bytes object `b` with common leading whitespace removed.

    textwrap.dedent() only works on str, so the bytes are round-tripped
    through latin1, which maps every byte value to exactly one code point.
    """
    as_text = b.decode('latin1')
    stripped = textwrap.dedent(as_text)
    return stripped.encode('latin1')
@dataclass
class SegmentHeader:
    """Fields of one parsed jbig2 segment header."""

    # Number of bytes the header occupies, data length field included.
    segment_header_size: int

    # Segment type: the low six bits of the header's flags byte.
    type: int

    # The raw header bytes, exactly as they appeared in the input.
    bytes: bytes

    # Length in bytes of the segment data belonging to this header.
    data_size: int
def read_segment_header(data, offset):
    """Parse the jbig2 segment header that starts at `offset` in `data`.

    Only headers with at most 4 referred-to segments and an explicit data
    length are supported.

    Returns a SegmentHeader carrying the header's size in bytes, the segment
    type, the raw header bytes and the length of the segment's data.

    Raises Exception if the header announces more than 4 referred-to
    segments, or if its data length field is 0xffff_ffff.
    """
    (segment_number,) = struct.unpack_from('>I', data, offset)

    # Flags byte: low six bits are the segment type, bit 6 selects a
    # 4-byte (instead of 1-byte) page association field.
    flags = data[offset + 4]
    segment_type = flags & 0x3f
    has_wide_page_association = bool(flags & 0x40)

    # The top three bits of the following byte hold the referred-to count.
    referred_count = data[offset + 5] >> 5
    if referred_count > 4:
        raise Exception('cannot handle more than 4 referred-to segments')

    # The width of each referred-to segment number depends on how large this
    # segment's own number is.
    if segment_number > 65536:
        ref_size = 4
    elif segment_number > 256:
        ref_size = 2
    else:
        ref_size = 1

    page_association_size = 4 if has_wide_page_association else 1
    length_field_offset = (
        4 + 1 + 1 + ref_size * referred_count + page_association_size
    )

    (data_size,) = struct.unpack_from('>I', data, offset + length_field_offset)
    if data_size == 0xffff_ffff:
        raise Exception('cannot handle indeterminate length')

    # The 4-byte data length field is the last part of the header.
    header_size = length_field_offset + 4
    raw_header = data[offset:offset + header_size]
    return SegmentHeader(header_size, segment_type, raw_header, data_size)
def random_access_to_sequential(data):
    """Convert random-access organized jbig2 segment data to sequential.

    `data` must start at the first segment header (file header already
    removed). In the random-access organization all segment headers come
    first, up to and including the end-of-file segment header, followed by
    the data of all segments in the same order.

    Returns bytes in which every segment header is immediately followed by
    that segment's data.

    Raises whatever read_segment_header() raises for headers it cannot
    handle or cannot read.
    """
    # Pass 1: collect headers until the end-of-file segment. Afterwards
    # `offset` points at the data of the first segment.
    offset = 0
    segment_headers = []
    while True:
        segment_header = read_segment_header(data, offset)
        segment_headers.append(segment_header)
        offset += segment_header.segment_header_size
        if segment_header.type == EndOfFile:
            break

    # Pass 2: interleave each header with its data. The pieces are gathered
    # in a list and joined once, so the output is not recopied for every
    # segment the way repeated `bytes +=` would.
    pieces = []
    for segment_header in segment_headers:
        pieces.append(segment_header.bytes)
        pieces.append(data[offset:offset + segment_header.data_size])
        offset += segment_header.data_size
    return b''.join(pieces)
def main():
parser = argparse.ArgumentParser(
epilog=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
@ -36,13 +101,14 @@ def main():
# strip jbig2 header
image_data = image_data[8:]
if image_data[0] & 1 == 0:
print('random-access jbig2 does not work', file=sys.stderr)
sys.exit(1)
is_random_access = image_data[0] & 1 == 0
if image_data[0] & 2 == 0:
image_data = image_data[4:]
image_data = image_data[1:]
if is_random_access:
image_data = random_access_to_sequential(image_data)
start = dedent(b'''\
%PDF-1.4
%\265\266