LibPDF: Allow reading documents with incremental updates

The PDF spec allows incremental changes of a document by appending a
new XRef table and file trailer to it. These will only contain the
changed objects and will point back to the previous change, forming an
arbitrarily long chain of XRef sections and file trailers.

Every one of those XRef sections may be encoded as an XRef stream as
well, in which case the trailer is part of the stream dictionary as
usual. To make this easier, I made it so every XRef table may "own" a
trailer. This means that the main file trailer is now part of the main
XRef table.
This commit is contained in:
Julian Offenhäuser 2023-02-11 20:39:40 +01:00 committed by Linus Groh
parent 0c230f5ff0
commit 34350ee9e7
Notes: sideshowbarker 2024-07-17 00:29:38 +09:00
3 changed files with 41 additions and 25 deletions

View file

@ -186,14 +186,12 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
// The linearization parameter dictionary has just been parsed, and the xref table
// comes immediately after it. We are in the correct spot.
m_xref_table = TRY(parse_xref_table());
if (!m_trailer)
m_trailer = TRY(parse_file_trailer());
// Also parse the main xref table and merge into the first-page xref table. Note
// that we don't use the main xref table offset from the linearization dict because
// for some reason, it specified the offset of the whitespace after the object
// index start and length? So it's much easier to do it this way.
auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
auto main_xref_table_offset = m_xref_table->trailer()->get_value(CommonNames::Prev).to_int();
m_reader.move_to(main_xref_table_offset);
auto main_xref_table = TRY(parse_xref_table());
TRY(m_xref_table->merge(move(*main_xref_table)));
@ -267,15 +265,31 @@ PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table()
return error("No xref");
m_reader.set_reading_forwards();
auto xref_offset_value = parse_number();
if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>())
return error("Invalid xref offset");
auto xref_offset = xref_offset_value.value().get<int>();
auto xref_offset_value = TRY(parse_number());
auto xref_offset = TRY(m_document->resolve_to<int>(xref_offset_value));
m_reader.move_to(xref_offset);
m_xref_table = TRY(parse_xref_table());
if (!m_trailer)
m_trailer = TRY(parse_file_trailer());
// As per 7.5.6 Incremental Updates:
// When a conforming reader reads the file, it shall build its cross-reference
// information in such a way that the most recent copy of each object shall be
// the one accessed from the file.
// NOTE: This means that we have to follow back the chain of XRef table sections
// and only add objects that were not already specified in a previously
// parsed (and thus newer) XRef section.
while (1) {
auto xref_table = TRY(parse_xref_table());
if (!m_xref_table)
m_xref_table = xref_table;
else
TRY(m_xref_table->merge(move(*xref_table)));
if (!xref_table->trailer() || !xref_table->trailer()->contains(CommonNames::Prev))
break;
auto offset = TRY(m_document->resolve_to<int>(xref_table->trailer()->get_value(CommonNames::Prev)));
m_reader.move_to(offset);
}
return validate_xref_table_and_fix_if_necessary();
}
@ -406,7 +420,7 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
}
}
m_trailer = dict;
table->set_trailer(dict);
return table;
}
@ -424,10 +438,7 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
auto table = adopt_ref(*new XRefTable());
do {
if (m_reader.matches("trailer"))
return table;
while (m_reader.matches_number()) {
Vector<XRefEntry> entries;
auto starting_index_value = TRY(parse_number());
@ -470,7 +481,11 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
}
table->add_section({ starting_index, object_count, entries });
} while (m_reader.matches_number());
}
m_reader.consume_whitespace();
if (m_reader.matches("trailer"))
table->set_trailer(TRY(parse_file_trailer()));
return table;
}

View file

@ -20,7 +20,7 @@ public:
Linearized,
};
[[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
[[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_xref_table->trailer(); }
// Parses the header and initializes the xref table and trailer
PDFErrorOr<void> initialize();
@ -94,7 +94,6 @@ private:
bool navigate_to_after_startxref();
RefPtr<XRefTable> m_xref_table;
RefPtr<DictObject> m_trailer;
Optional<LinearizationDictionary> m_linearization_dictionary;
};

View file

@ -35,7 +35,7 @@ public:
{
auto this_size = m_entries.size();
auto other_size = other.m_entries.size();
m_entries.ensure_capacity(other_size);
TRY(m_entries.try_ensure_capacity(other_size));
for (size_t i = 0; i < other_size; i++) {
auto other_entry = other.m_entries[i];
@ -46,12 +46,9 @@ public:
auto this_entry = m_entries[i];
if (this_entry.byte_offset == invalid_byte_offset) {
// Only add values that we don't already have.
if (this_entry.byte_offset == invalid_byte_offset)
m_entries[i] = other_entry;
} else if (other_entry.byte_offset != invalid_byte_offset) {
// Both xref tables have an entry for the same object index
return Error { Error::Type::Parse, "Conflicting xref entry during merge" };
}
}
return {};
@ -68,8 +65,12 @@ public:
m_entries.append(entry);
}
void set_trailer(RefPtr<DictObject> trailer) { m_trailer = trailer; }
ALWAYS_INLINE Vector<XRefEntry>& entries() { return m_entries; }
ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
[[nodiscard]] ALWAYS_INLINE bool has_object(size_t index) const
{
return index < m_entries.size() && m_entries[index].byte_offset != -1;
@ -113,6 +114,7 @@ private:
friend struct AK::Formatter<PDF::XRefTable>;
Vector<XRefEntry> m_entries;
RefPtr<DictObject> m_trailer;
};
}