LibPDF: Allow reading documents with incremental updates

The PDF spec allows a document to be changed incrementally by appending a new XRef table and file trailer to it. These contain only the changed objects and point back to the previous update, forming an arbitrarily long chain of XRef sections and file trailers. Any of those XRef sections may instead be encoded as an XRef stream, in which case the trailer is part of the stream dictionary as usual. To make this easier to handle, every XRef table may now "own" a trailer. This means that the main file trailer is now part of the main XRef table.
Author: https://github.com/janso3 Commit: https://github.com/SerenityOS/serenity/commit/34350ee9e7 Pull-request: https://github.com/SerenityOS/serenity/pull/17443 Reviewed-by: https://github.com/linusg Reviewed-by: https://github.com/rtobar ✅
2024-09-30 08:41:15 +00:00 · 2023-02-11 20:39:40 +01:00 · 2023-02-11 20:39:40 +01:00 · 34350ee9e7 · 2024-07-17 00:29:38 +09:00
parent 0c230f5ff0
commit 34350ee9e7
3 changed files with 41 additions and 25 deletions
--- a/Userland/Libraries/LibPDF/DocumentParser.cpp
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@ -186,14 +186,12 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
    // The linearization parameter dictionary has just been parsed, and the xref table
    // comes immediately after it. We are in the correct spot.
    m_xref_table = TRY(parse_xref_table());
-    if (!m_trailer)
-        m_trailer = TRY(parse_file_trailer());

    // Also parse the main xref table and merge into the first-page xref table. Note
    // that we don't use the main xref table offset from the linearization dict because
    // for some reason, it specified the offset of the whitespace after the object
    // index start and length? So it's much easier to do it this way.
-    auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
+    auto main_xref_table_offset = m_xref_table->trailer()->get_value(CommonNames::Prev).to_int();
    m_reader.move_to(main_xref_table_offset);
    auto main_xref_table = TRY(parse_xref_table());
    TRY(m_xref_table->merge(move(*main_xref_table)));
@ -267,15 +265,31 @@ PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table()
        return error("No xref");

    m_reader.set_reading_forwards();
-    auto xref_offset_value = parse_number();
-    if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>())
-        return error("Invalid xref offset");
-    auto xref_offset = xref_offset_value.value().get<int>();
-
+    auto xref_offset_value = TRY(parse_number());
+    auto xref_offset = TRY(m_document->resolve_to<int>(xref_offset_value));
    m_reader.move_to(xref_offset);
-    m_xref_table = TRY(parse_xref_table());
-    if (!m_trailer)
-        m_trailer = TRY(parse_file_trailer());
+
+    // As per 7.5.6 Incremental Updates:
+    // When a conforming reader reads the file, it shall build its cross-reference
+    // information in such a way that the most recent copy of each object shall be
+    // the one accessed from the file.
+    // NOTE: This means that we have to follow back the chain of XRef table sections
+    //       and only add objects that were not already specified in a previous
+    //       (and thus newer) XRef section.
+    while (1) {
+        auto xref_table = TRY(parse_xref_table());
+        if (!m_xref_table)
+            m_xref_table = xref_table;
+        else
+            TRY(m_xref_table->merge(move(*xref_table)));
+
+        if (!xref_table->trailer() || !xref_table->trailer()->contains(CommonNames::Prev))
+            break;
+
+        auto offset = TRY(m_document->resolve_to<int>(xref_table->trailer()->get_value(CommonNames::Prev)));
+        m_reader.move_to(offset);
+    }
+
    return validate_xref_table_and_fix_if_necessary();
 }

@ -406,7 +420,7 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
        }
    }

-    m_trailer = dict;
+    table->set_trailer(dict);

    return table;
 }
@ -424,10 +438,7 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()

    auto table = adopt_ref(*new XRefTable());

-    do {
-        if (m_reader.matches("trailer"))
-            return table;
-
+    while (m_reader.matches_number()) {
        Vector<XRefEntry> entries;

        auto starting_index_value = TRY(parse_number());
@ -470,7 +481,11 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
        }

        table->add_section({ starting_index, object_count, entries });
-    } while (m_reader.matches_number());
+    }
+
+    m_reader.consume_whitespace();
+    if (m_reader.matches("trailer"))
+        table->set_trailer(TRY(parse_file_trailer()));

    return table;
 }
--- a/Userland/Libraries/LibPDF/DocumentParser.h
+++ b/Userland/Libraries/LibPDF/DocumentParser.h
@ -20,7 +20,7 @@ public:
        Linearized,
    };

-    [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
+    [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_xref_table->trailer(); }

    // Parses the header and initializes the xref table and trailer
    PDFErrorOr<void> initialize();
@ -94,7 +94,6 @@ private:
    bool navigate_to_after_startxref();

    RefPtr<XRefTable> m_xref_table;
-    RefPtr<DictObject> m_trailer;
    Optional<LinearizationDictionary> m_linearization_dictionary;
 };

--- a/Userland/Libraries/LibPDF/XRefTable.h
+++ b/Userland/Libraries/LibPDF/XRefTable.h
@ -35,7 +35,7 @@ public:
    {
        auto this_size = m_entries.size();
        auto other_size = other.m_entries.size();
-        m_entries.ensure_capacity(other_size);
+        TRY(m_entries.try_ensure_capacity(other_size));

        for (size_t i = 0; i < other_size; i++) {
            auto other_entry = other.m_entries[i];
@ -46,12 +46,9 @@ public:

            auto this_entry = m_entries[i];

-            if (this_entry.byte_offset == invalid_byte_offset) {
+            // Only add values that we don't already have.
+            if (this_entry.byte_offset == invalid_byte_offset)
                m_entries[i] = other_entry;
-            } else if (other_entry.byte_offset != invalid_byte_offset) {
-                // Both xref tables have an entry for the same object index
-                return Error { Error::Type::Parse, "Conflicting xref entry during merge" };
-            }
        }

        return {};
@ -68,8 +65,12 @@ public:
            m_entries.append(entry);
    }

+    void set_trailer(RefPtr<DictObject> trailer) { m_trailer = trailer; }
+
    ALWAYS_INLINE Vector<XRefEntry>& entries() { return m_entries; }

+    ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
+
    [[nodiscard]] ALWAYS_INLINE bool has_object(size_t index) const
    {
        return index < m_entries.size() && m_entries[index].byte_offset != -1;
@ -113,6 +114,7 @@ private:
    friend struct AK::Formatter<PDF::XRefTable>;

    Vector<XRefEntry> m_entries;
+    RefPtr<DictObject> m_trailer;
 };

 }