Update PdfParser to 2.10.0

2024-05-16 12:36:43 -04:00 · 2024-05-16 12:36:43 -04:00 · fb7e295490
parent 4f679114c3
commit fb7e295490
24 changed files with 1080 additions and 621 deletions
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Config.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Config.php
@ -82,6 +82,13 @@ class Config
     */
    private $dataTmFontInfoHasToBeIncluded = false;

+    /**
+     * Whether to attempt to read PDFs even if they are marked as encrypted.
+     *
+     * @var bool
+     */
+    private $ignoreEncryption = false;
+
    public function getFontSpaceLimit()
    {
        return $this->fontSpaceLimit;
@ -151,4 +158,18 @@ class Config
    {
        $this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded;
    }
+
+    public function getIgnoreEncryption(): bool
+    {
+        return $this->ignoreEncryption;
+    }
+
+    /**
+     * @deprecated this is a temporary workaround, don't rely on it
+     * @see https://github.com/smalot/pdfparser/pull/653
+     */
+    public function setIgnoreEncryption(bool $ignoreEncryption): void
+    {
+        $this->ignoreEncryption = $ignoreEncryption;
+    }
 }
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Document.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Document.php
@ -255,7 +255,7 @@ class Document
                            if ('rdf:li' == $val['tag']) {
                                $metadata[] = $val['value'];

-                                // Else assign a value to this property
+                            // Else assign a value to this property
                            } else {
                                $metadata[$val['tag']] = $val['value'];
                            }
@ -263,12 +263,20 @@ class Document
                        break;

                    case 'close':
-                        // If the value of this property is a single-
-                        // element array where the element is of type
-                        // string, use the value of the first list item
-                        // as the value for this property
-                        if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
-                            $metadata = $metadata[0];
+                        // If the value of this property is an array
+                        if (\is_array($metadata)) {
+                            // If the value is a single element array
+                            // where the element is of type string, use
+                            // the value of the first list item as the
+                            // value for this property
+                            if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
+                                $metadata = $metadata[0];
+                            } elseif (0 == \count($metadata)) {
+                                // if the value is an empty array, set
+                                // the value of this property to the empty
+                                // string
+                                $metadata = '';
+                            }
                        }

                        // Move down one level in the stack
@ -328,12 +336,12 @@ class Document
        return null;
    }

-    public function hasObjectsByType(string $type, string $subtype = null): bool
+    public function hasObjectsByType(string $type, ?string $subtype = null): bool
    {
        return 0 < \count($this->getObjectsByType($type, $subtype));
    }

-    public function getObjectsByType(string $type, string $subtype = null): array
+    public function getObjectsByType(string $type, ?string $subtype = null): array
    {
        if (!isset($this->dictionary[$type])) {
            return [];
@ -410,7 +418,7 @@ class Document
        throw new \Exception('Missing catalog.');
    }

-    public function getText(int $pageLimit = null): string
+    public function getText(?int $pageLimit = null): string
    {
        $texts = [];
        $pages = $this->getPages();
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element.php
@ -49,13 +49,13 @@ use Smalot\PdfParser\Element\ElementXRef;
 class Element
 {
    /**
-     * @var Document
+     * @var Document|null
     */
    protected $document;

    protected $value;

-    public function __construct($value, Document $document = null)
+    public function __construct($value, ?Document $document = null)
    {
        $this->value = $value;
        $this->document = $document;
@ -96,7 +96,7 @@ class Element
        return (string) $this->value;
    }

-    public static function parse(string $content, Document $document = null, int &$position = 0)
+    public static function parse(string $content, ?Document $document = null, int &$position = 0)
    {
        $args = \func_get_args();
        $only_values = isset($args[3]) ? $args[3] : false;
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php
@ -42,7 +42,7 @@ use Smalot\PdfParser\PDFObject;
 */
 class ElementArray extends Element
 {
-    public function __construct($value, Document $document = null)
+    public function __construct($value, ?Document $document = null)
    {
        parent::__construct($value, $document);
    }
@ -107,7 +107,7 @@ class ElementArray extends Element
     *
     * @return bool|ElementArray
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*\[(?P<array>.*)/is', $content, $match)) {
            preg_match_all('/(.*?)(\[|\])/s', trim($content), $matches);
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php
@ -61,7 +61,7 @@ class ElementBoolean extends Element
    /**
     * @return bool|ElementBoolean
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*(?P<value>true|false)/is', $content, $match)) {
            $value = $match['value'];
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php
@ -40,7 +40,7 @@ use Smalot\PdfParser\Document;
 class ElementDate extends ElementString
 {
    /**
-     * @var array
+     * @var array<int,string>
     */
    protected static $formats = [
        4 => 'Y',
@ -98,7 +98,7 @@ class ElementDate extends ElementString
    /**
     * @return bool|ElementDate
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*\(D\:(?P<name>.*?)\)/s', $content, $match)) {
            $name = $match['name'];
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php
@ -42,7 +42,7 @@ class ElementHexa extends ElementString
    /**
     * @return bool|ElementHexa|ElementDate
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*\<(?P<name>[A-F0-9]+)\>/is', $content, $match)) {
            $name = $match['name'];
@ -64,15 +64,21 @@ class ElementHexa extends ElementString
    public static function decode(string $value): string
    {
        $text = '';
-        $length = \strlen($value);

-        if ('00' === substr($value, 0, 2)) {
-            for ($i = 0; $i < $length; $i += 4) {
+        // Filter $value of non-hexadecimal characters
+        $value = (string) preg_replace('/[^0-9a-f]/i', '', $value);
+
+        // Check for leading zeros (indicating 4-hex-digit code units), or
+        // the UTF-16BE byte order mark (FEFF)
+        if ('00' === substr($value, 0, 2) || 'feff' === strtolower(substr($value, 0, 4))) {
+            $value = (string) preg_replace('/^feff/i', '', $value);
+            for ($i = 0, $length = \strlen($value); $i < $length; $i += 4) {
                $hex = substr($value, $i, 4);
                $text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';';
            }
        } else {
-            for ($i = 0; $i < $length; $i += 2) {
+            // Otherwise decode each pair of hex digits as a single byte
+            for ($i = 0, $length = \strlen($value); $i < $length; $i += 2) {
                $hex = substr($value, $i, 2);
                $text .= \chr(hexdec($hex));
            }
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php
@ -54,7 +54,7 @@ class ElementName extends Element
    /**
     * @return bool|ElementName
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*\/([A-Z0-9\-\+,#\.]+)/is', $content, $match)) {
            $name = $match[1];
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php
@ -58,7 +58,7 @@ class ElementNull extends Element
    /**
     * @return bool|ElementNull
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*(null)/s', $content, $match)) {
            $offset += strpos($content, 'null') + \strlen('null');
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php
@ -48,7 +48,7 @@ class ElementNumeric extends Element
    /**
     * @return bool|ElementNumeric
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*(?P<value>\-?[0-9\.]+)/s', $content, $match)) {
            $value = $match['value'];
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php
@ -54,7 +54,7 @@ class ElementString extends Element
    /**
     * @return bool|ElementString
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*\((?P<name>.*)/s', $content, $match)) {
            $name = $match['name'];
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php
@ -44,7 +44,7 @@ class ElementStruct extends Element
    /**
     * @return false|Header
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*<<(?P<struct>.*)/is', $content)) {
            preg_match_all('/(.*?)(<<|>>)/s', trim($content), $matches);
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php
@ -83,7 +83,7 @@ class ElementXRef extends Element
    /**
     * @return bool|ElementXRef
     */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
    {
        if (preg_match('/^\s*(?P<id>[0-9]+\s+[0-9]+\s+R)/s', $content, $match)) {
            $id = $match['id'];
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php
@ -145,6 +145,12 @@ class Encoding extends PDFObject
    {
        // Load reference table charset.
        $baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent());
+
+        // Check for empty BaseEncoding field value
+        if (!\is_string($baseEncoding) || 0 == \strlen($baseEncoding)) {
+            $baseEncoding = 'StandardEncoding';
+        }
+
        $className = '\\Smalot\\PdfParser\\Encoding\\'.$baseEncoding;

        if (!class_exists($className)) {
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
@ -178,7 +178,7 @@ class PDFDocEncoding
            "\xfc" => "\u{00fc}", // udieresis
            "\xfd" => "\u{00fd}", // yacute
            "\xfe" => "\u{00fe}", // thorn
-            "\xff" => "\u{00ff}",  // ydieresis
+            "\xff" => "\u{00ff}", // ydieresis
        ];
    }

@ -186,4 +186,4 @@ class PDFDocEncoding
    {
        return strtr($content, static::getCodePage());
    }
-}
+}
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Font.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Font.php
@ -134,9 +134,16 @@ class Font extends PDFObject

    /**
     * Convert unicode character code to "utf-8" encoded string.
+     *
+     * @param int|float $code Unicode character code. Will be cast to int internally.
     */
-    public static function uchr(int $code): string
+    public static function uchr($code): string
    {
+        // note:
+        // $code was typed as int before, but changed in https://github.com/smalot/pdfparser/pull/623
+        // because in some cases uchr was called with a float instead of an integer.
+        $code = (int) $code;
+
        if (!isset(self::$uchrCache[$code])) {
            // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
            // therefore, we use mb_convert_encoding() instead
@ -272,11 +279,13 @@ class Font extends PDFObject
    /**
     * Calculate text width with data from header 'Widths'. If width of character is not found then character is added to missing array.
     */
-    public function calculateTextWidth(string $text, array &$missing = null): ?float
+    public function calculateTextWidth(string $text, ?array &$missing = null): ?float
    {
        $index_map = array_flip($this->table);
        $details = $this->getDetails();
-        $widths = $details['Widths'];
+
+        // Usually the Widths key is set in the $details array; if it isn't, use an empty array instead.
+        $widths = $details['Widths'] ?? [];

        // Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar
        $width_map = array_flip(range($details['FirstChar'], $details['LastChar']));
@ -312,12 +321,12 @@ class Font extends PDFObject
        }

        $text = '';
-        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
+        $parts = preg_split('/(<[a-f0-9\s]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);

        foreach ($parts as $part) {
-            if (preg_match('/^<.*>$/s', $part) && false === stripos($part, '<?xml')) {
-                // strip line breaks
-                $part = preg_replace("/[\r\n]/", '', $part);
+            if (preg_match('/^<[a-f0-9\s]+>$/si', $part)) {
+                // strip whitespace
+                $part = preg_replace("/\s/", '', $part);
                $part = trim($part, '<>');
                if ($add_braces) {
                    $text .= '(';
@ -342,18 +351,20 @@ class Font extends PDFObject
     */
    public static function decodeOctal(string $text): string
    {
-        $parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
-        $text = '';
+        // Replace all double backslashes \\ with a special string
+        $text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']);

-        foreach ($parts as $part) {
-            if (preg_match('/^\\\\[0-7]{3}$/', $part)) {
-                $text .= \chr(octdec(trim($part, '\\')));
-            } else {
-                $text .= $part;
-            }
-        }
+        // Now we can replace all octal codes without worrying about
+        // escaped backslashes
+        $text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) {
+            return \chr(octdec($m[1]));
+        }, $text);

-        return $text;
+        // Unescape any parentheses
+        $text = str_replace(['\\(', '\\)'], ['(', ')'], $text);
+
+        // Replace instances of the special string with a single backslash
+        return str_replace('[**pdfparserdblslsh**]', '\\', $text);
    }

    /**
@ -361,18 +372,9 @@ class Font extends PDFObject
     */
    public static function decodeEntities(string $text): string
    {
-        $parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
-        $text = '';
-
-        foreach ($parts as $part) {
-            if (preg_match('/^#\d{2}$/', $part)) {
-                $text .= \chr(hexdec(trim($part, '#')));
-            } else {
-                $text .= $part;
-            }
-        }
-
-        return $text;
+        return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) {
+            return \chr(hexdec($m[1]));
+        }, $text);
    }

    /**
@ -384,7 +386,7 @@ class Font extends PDFObject
     */
    public static function decodeUnicode(string $text): string
    {
-        if (preg_match('/^\xFE\xFF/i', $text)) {
+        if ("\xFE\xFF" === substr($text, 0, 2)) {
            // Strip U+FEFF byte order marker.
            $decode = substr($text, 2);
            $text = '';
@ -409,16 +411,17 @@ class Font extends PDFObject
    /**
     * Decode text by commands array.
     */
-    public function decodeText(array $commands): string
+    public function decodeText(array $commands, float $fontFactor = 4): string
    {
        $word_position = 0;
        $words = [];
-        $font_space = $this->getFontSpaceLimit();
+        $font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4;

        foreach ($commands as $command) {
            switch ($command[PDFObject::TYPE]) {
                case 'n':
-                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
+                    $offset = (float) trim($command[PDFObject::COMMAND]);
+                    if ($offset - (float) $font_space < 0) {
                        $word_position = \count($words);
                    }
                    continue 2;
@ -434,8 +437,8 @@ class Font extends PDFObject

            // replace escaped chars
            $text = str_replace(
-                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
-                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
+                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ ', '\b'],
+                [\chr(92), \chr(40), \chr(41), \chr(10), \chr(13), \chr(9), \chr(12), \chr(32), \chr(8)],
                $text
            );

@ -449,9 +452,32 @@ class Font extends PDFObject

        foreach ($words as &$word) {
            $word = $this->decodeContent($word);
+            $word = str_replace("\t", ' ', $word);
        }

-        return implode(' ', $words);
+        // Remove internal "words" that are just spaces, but leave them
+        // if they are at either end of the array of words. This fixes,
+        // for   example,   lines   that   are   justified   to   fill
+        // a whole row.
+        for ($x = \count($words) - 2; $x >= 1; --$x) {
+            if ('' === trim($words[$x], ' ')) {
+                unset($words[$x]);
+            }
+        }
+        $words = array_values($words);
+
+        // Cut down on the number of unnecessary internal spaces by
+        // imploding the words on a double null byte, and checking if
+        // the text includes extra spaces on either side of it. If so,
+        // merge where appropriate.
+        $words = implode("\x00\x00", $words);
+        $words = str_replace(
+            [" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"],
+            ['  ', ' ', ' ', ' '],
+            $words
+        );
+
+        return $words;
    }

    /**
@ -459,8 +485,14 @@ class Font extends PDFObject
     *
     * @param bool $unicode This parameter is deprecated and might be removed in a future release
     */
-    public function decodeContent(string $text, bool &$unicode = null): string
+    public function decodeContent(string $text, ?bool &$unicode = null): string
    {
+        // If this string begins with a UTF-16BE BOM, then decode it
+        // directly as Unicode
+        if ("\xFE\xFF" === substr($text, 0, 2)) {
+            return $this->decodeUnicode($text);
+        }
+
        if ($this->has('ToUnicode')) {
            return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
        }
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Header.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Header.php
@ -43,7 +43,7 @@ use Smalot\PdfParser\Element\ElementXRef;
 class Header
 {
    /**
-     * @var Document
+     * @var Document|null
     */
    protected $document;

@ -56,7 +56,7 @@ class Header
     * @param Element[] $elements list of elements
     * @param Document  $document document
     */
-    public function __construct(array $elements = [], Document $document = null)
+    public function __construct(array $elements = [], ?Document $document = null)
    {
        $this->elements = $elements;
        $this->document = $document;
--- a/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Page.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Page.php
@ -176,7 +176,7 @@ class Page extends PDFObject
        }*/
    }

-    public function getText(self $page = null): string
+    public function getText(?self $page = null): string
    {
        if ($contents = $this->get('Contents')) {
            if ($contents instanceof ElementMissing) {
@ -312,7 +312,7 @@ class Page extends PDFObject
        return new self($pdfObject->document, $header, $new_content, $config);
    }

-    public function getTextArray(self $page = null): array
+    public function getTextArray(?self $page = null): array
    {
        if ($this->isFpdf()) {
            $pdfObject = $this->getPDFObjectForFpdf();
@ -400,8 +400,6 @@ class Page extends PDFObject
            }
            $sectionsText = $content->getSectionsText($content->getContent());
            foreach ($sectionsText as $sectionText) {
-                $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
-
                $commandsText = $content->getCommandsText($sectionText);
                foreach ($commandsText as $command) {
                    $extractedData[] = $command;
@ -420,7 +418,7 @@ class Page extends PDFObject
     *
     * @return array An array with the data and the internal representation
     */
-    public function extractDecodedRawData(array $extractedRawData = null): array
+    public function extractDecodedRawData(?array $extractedRawData = null): array
    {
        if (!isset($extractedRawData) || !$extractedRawData) {
            $extractedRawData = $this->extractRawData();
@ -500,7 +498,7 @@ class Page extends PDFObject
     *
     * @return array An array with the text command of the page
     */
-    public function getDataCommands(array $extractedDecodedRawData = null): array
+    public function getDataCommands(?array $extractedDecodedRawData = null): array
    {
        if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
            $extractedDecodedRawData = $this->extractDecodedRawData();
@ -651,7 +649,7 @@ class Page extends PDFObject
     * @return array an array with the data of the page including the Tm information
     *               of any text in the page
     */
-    public function getDataTm(array $dataCommands = null): array
+    public function getDataTm(?array $dataCommands = null): array
    {
        if (!isset($dataCommands) || !$dataCommands) {
            $dataCommands = $this->getDataCommands();
@ -701,6 +699,12 @@ class Page extends PDFObject
        $extractedTexts = $this->getTextArray();
        $extractedData = [];
        foreach ($dataCommands as $command) {
+            // If we've used up all the texts from getTextArray(), exit
+            // so we aren't accessing non-existent array indices
+            // Fixes 'undefined array key' errors in Issues #575, #576
+            if (\count($extractedTexts) <= \count($extractedData)) {
+                break;
+            }
            $currentText = $extractedTexts[\count($extractedData)];
            switch ($command['o']) {
                /*
@ -712,21 +716,13 @@ class Page extends PDFObject
                    $Tl = $defaultTl;
                    $Tx = 0;
                    $Ty = 0;
-                    $fontId = $defaultFontId;
-                    $fontSize = $defaultFontSize;
                    break;

                    /*
                     * ET
-                     * End a text object, discarding the text matrix
+                     * End a text object
                     */
                case 'ET':
-                    $Tm = $defaultTm;
-                    $Tl = $defaultTl;
-                    $Tx = 0;
-                    $Ty = 0;
-                    $fontId = $defaultFontId;
-                    $fontSize = $defaultFontSize;
                    break;

                    /*
@ -741,7 +737,7 @@ class Page extends PDFObject

                    /*
                     * tx ty Td
-                     * Move to the start of the next line, offset form the start of the
+                     * Move to the start of the next line, offset from the start of the
                     * current line by tx, ty.
                     */
                case 'Td':
@ -898,7 +894,7 @@ class Page extends PDFObject
     *               "near" the x,y coordinate, an empty array is returned. If Both, x
     *               and y coordinates are null, null is returned.
     */
-    public function getTextXY(float $x = null, float $y = null, float $xError = 0, float $yError = 0): array
+    public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
    {
        if (!isset($this->dataTm) || !$this->dataTm) {
            $this->getDataTm();
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php
@ -60,7 +60,7 @@ class Parser

    protected $rawDataParser;

-    public function __construct($cfg = [], Config $config = null)
+    public function __construct($cfg = [], ?Config $config = null)
    {
        $this->config = $config ?: new Config();
        $this->rawDataParser = new RawDataParser($cfg, $this->config);
@ -77,6 +77,7 @@ class Parser
    public function parseFile(string $filename): Document
    {
        $content = file_get_contents($filename);
+
        /*
         * 2018/06/20 @doganoo as multiple times a
         * users have complained that the parseFile()
@ -101,7 +102,7 @@ class Parser
        // Create structure from raw data.
        list($xref, $data) = $this->rawDataParser->parseData($content);

-        if (isset($xref['trailer']['encrypt'])) {
+        if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) {
            throw new \Exception('Secured pdf file are currently not supported.');
        }

--- a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php
@ -233,32 +233,32 @@ class FilterHelper
     */
    protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string
    {
-        /*
-         * gzuncompress may throw a not catchable E_WARNING in case of an error (like $data is empty)
-         * the following set_error_handler changes an E_WARNING to an E_ERROR, which is catchable.
-         */
-        set_error_handler(function ($errNo, $errStr) {
-            if (\E_WARNING === $errNo) {
-                throw new \Exception($errStr);
-            } else {
-                // fallback to default php error handler
-                return false;
-            }
-        });
+        // The uncatchable E_WARNING for "data error" is suppressed
+        // with @ so execution may proceed with an alternate
+        // decompression method.
+        $decoded = @gzuncompress($data, $decodeMemoryLimit);

-        $decoded = null;
-
-        // initialize string to return
-        try {
-            $decoded = gzuncompress($data, $decodeMemoryLimit);
-            if (false === $decoded) {
-                throw new \Exception('decodeFilterFlateDecode: invalid code');
+        if (false === $decoded) {
+            // If gzuncompress() failed, try again using the compress.zlib://
+            // wrapper to decode it in a file-based context.
+            // See: https://www.php.net/manual/en/function.gzuncompress.php#79042
+            // Issue: https://github.com/smalot/pdfparser/issues/592
+            $ztmp = tmpfile();
+            if (false != $ztmp) {
+                fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data);
+                $file = stream_get_meta_data($ztmp)['uri'];
+                if (0 === $decodeMemoryLimit) {
+                    $decoded = file_get_contents('compress.zlib://'.$file);
+                } else {
+                    $decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit);
+                }
+                fclose($ztmp);
            }
-        } catch (\Exception $e) {
-            throw $e;
-        } finally {
-            // Restore old handler just in case it was customized outside of PDFParser.
-            restore_error_handler();
+        }
+
+        if (false === \is_string($decoded) || '' === $decoded) {
+            // If the result is not a string or is empty, decoding failed.
+            throw new \Exception('decodeFilterFlateDecode: invalid data');
        }

        return $decoded;
--- a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php
@ -47,12 +47,14 @@ use Smalot\PdfParser\Config;
 class RawDataParser
 {
    /**
-     * @var \Smalot\PdfParser\Config
+     * @var Config
     */
    private $config;

    /**
     * Configuration array.
+     *
+     * @var array<string,bool>
     */
    protected $cfg = [
        // if `true` ignore filter decoding errors
@ -67,7 +69,7 @@ class RawDataParser
    /**
     * @param array $cfg Configuration array, default is []
     */
-    public function __construct($cfg = [], Config $config = null)
+    public function __construct($cfg = [], ?Config $config = null)
    {
        // merge given array with default values
        $this->cfg = array_merge($this->cfg, $cfg);
@ -125,7 +127,7 @@ class RawDataParser
        // decode the stream
        $remaining_filters = [];
        foreach ($filters as $filter) {
-            if (\in_array($filter, $this->filterHelper->getAvailableFilters())) {
+            if (\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
                try {
                    $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
                } catch (\Exception $e) {
@ -402,14 +404,19 @@ class RawDataParser
                    }
                    $prev_row = $ddata[$k];
                } // end for each row
-                // complete decoding
+            // complete decoding
            } else {
                // number of bytes in a row
                $rowlen = array_sum($wb);
-                // convert the stream into an array of integers
-                $sdata = unpack('C*', $xrefcrs[1][3][0]);
-                // split the rows
-                $ddata = array_chunk($sdata, $rowlen);
+                if (0 < $rowlen) {
+                    // convert the stream into an array of integers
+                    $sdata = unpack('C*', $xrefcrs[1][3][0]);
+                    // split the rows
+                    $ddata = array_chunk($sdata, $rowlen);
+                } else {
+                    // if the row length is zero, $ddata should be an empty array as well
+                    $ddata = [];
+                }
            }

            $sdata = [];
@ -609,7 +616,7 @@ class RawDataParser
     *
     * @return array containing object type, raw value and offset to next object
     */
-    protected function getRawObject(string $pdfData, int $offset = 0, array $headerDic = null): array
+    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
    {
        $objtype = ''; // object type to be returned
        $objval = ''; // object value to be returned
@ -756,7 +763,7 @@ class RawDataParser
                    // start stream object
                    $objtype = 'stream';
                    $offset += 6;
-                    if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
+                    if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
                        $offset += \strlen($matches[0]);

                        // we get stream length here to later help preg_match test less data
@ -857,39 +864,39 @@ class RawDataParser
     */
    protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
    {
-        $startxrefPreg = preg_match(
-            '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
+        // If the $offset is currently pointed at whitespace, bump it
+        // forward until it isn't; affects loosely targeted offsets
+        // for the 'xref' keyword
+        // See: https://github.com/smalot/pdfparser/issues/673
+        $bumpOffset = $offset;
+        while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
+            ++$bumpOffset;
+        }
+
+        // Find all startxref tables from this $offset forward
+        $startxrefPreg = preg_match_all(
+            '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
            $pdfData,
-            $matches,
-            \PREG_OFFSET_CAPTURE,
+            $startxrefMatches,
+            \PREG_SET_ORDER,
            $offset
        );

-        if (0 == $offset) {
-            // find last startxref
-            $pregResult = preg_match_all(
-                '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
-                $pdfData,
-                $matches,
-                \PREG_SET_ORDER,
-                $offset
-            );
-            if (0 == $pregResult) {
-                throw new \Exception('Unable to find startxref');
-            }
-            $matches = array_pop($matches);
-            $startxref = $matches[1];
-        } elseif (strpos($pdfData, 'xref', $offset) == $offset) {
-            // Already pointing at the xref table
-            $startxref = $offset;
-        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
-            // Cross-Reference Stream object
-            $startxref = $offset;
-        } elseif ($startxrefPreg) {
-            // startxref found
-            $startxref = $matches[1][0];
-        } else {
+        if (0 == $startxrefPreg) {
+            // No startxref tables were found
            throw new \Exception('Unable to find startxref');
+        } elseif (0 == $offset) {
+            // Use the last startxref in the document
+            $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
+        } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) {
+            // Already pointing at the xref table
+            $startxref = $bumpOffset;
+        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
+            // Cross-Reference Stream object
+            $startxref = $bumpOffset;
+        } else {
+            // Use the next startxref from this $offset
+            $startxref = (int) $startxrefMatches[0][1];
        }

        if ($startxref > \strlen($pdfData)) {
@ -901,8 +908,15 @@ class RawDataParser
            // Cross-Reference
            $xref = $this->decodeXref($pdfData, $startxref, $xref);
        } else {
-            // Cross-Reference Stream
-            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
+            // Check if the $pdfData might have the wrong line-endings
+            $pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
+            if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) {
+                // Return Unix-line-ending flag
+                $xref = ['Unix' => true];
+            } else {
+                // Cross-Reference Stream
+                $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
+            }
        }
        if (empty($xref)) {
            throw new \Exception('Unable to find xref');
@ -937,6 +951,12 @@ class RawDataParser
        // get xref and trailer data
        $xref = $this->getXrefData($pdfData);

+        // If we found Unix line-endings
+        if (isset($xref['Unix'])) {
+            $pdfData = str_replace("\r\n", "\n", $pdfData);
+            $xref = $this->getXrefData($pdfData);
+        }
+
        // parse all document objects
        $objects = [];
        foreach ($xref['xref'] as $obj => $offset) {
--- a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php
@ -41,7 +41,7 @@ use Smalot\PdfParser\PDFObject;
 */
 class Form extends Page
 {
-    public function getText(Page $page = null): string
+    public function getText(?Page $page = null): string
    {
        $header = new Header([], $this->document);
        $contents = new PDFObject($this->document, $header, $this->content, $this->config);
--- a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php
@ -40,7 +40,7 @@ use Smalot\PdfParser\PDFObject;
 */
 class Image extends PDFObject
 {
-    public function getText(Page $page = null): string
+    public function getText(?Page $page = null): string
    {
        return '';
    }