From fb7e29549024da73706906fafb6f2f8d3a217bcc Mon Sep 17 00:00:00 2001
From: Brian Huisman <bhuisman@greywyvern.com>
Date: Thu, 16 May 2024 12:36:43 -0400
Subject: [PATCH] Update PdfParser to 2.10.0

---
 .../pdfparser/src/Smalot/PdfParser/Config.php |   21 +
 .../src/Smalot/PdfParser/Document.php         |   28 +-
 .../src/Smalot/PdfParser/Element.php          |    6 +-
 .../Smalot/PdfParser/Element/ElementArray.php |    4 +-
 .../PdfParser/Element/ElementBoolean.php      |    2 +-
 .../Smalot/PdfParser/Element/ElementDate.php  |    4 +-
 .../Smalot/PdfParser/Element/ElementHexa.php  |   16 +-
 .../Smalot/PdfParser/Element/ElementName.php  |    2 +-
 .../Smalot/PdfParser/Element/ElementNull.php  |    2 +-
 .../PdfParser/Element/ElementNumeric.php      |    2 +-
 .../PdfParser/Element/ElementString.php       |    2 +-
 .../PdfParser/Element/ElementStruct.php       |    2 +-
 .../Smalot/PdfParser/Element/ElementXRef.php  |    2 +-
 .../src/Smalot/PdfParser/Encoding.php         |    6 +
 .../PdfParser/Encoding/PDFDocEncoding.php     |    4 +-
 .../pdfparser/src/Smalot/PdfParser/Font.php   |  106 +-
 .../pdfparser/src/Smalot/PdfParser/Header.php |    4 +-
 .../src/Smalot/PdfParser/PDFObject.php        | 1299 +++++++++++------
 .../pdfparser/src/Smalot/PdfParser/Page.php   |   32 +-
 .../pdfparser/src/Smalot/PdfParser/Parser.php |    5 +-
 .../Smalot/PdfParser/RawData/FilterHelper.php |   48 +-
 .../PdfParser/RawData/RawDataParser.php       |  100 +-
 .../src/Smalot/PdfParser/XObject/Form.php     |    2 +-
 .../src/Smalot/PdfParser/XObject/Image.php    |    2 +-
 24 files changed, 1080 insertions(+), 621 deletions(-)

diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Config.php b/orcinus/pdfparser/src/Smalot/PdfParser/Config.php
index ff69d3e..e44b164 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Config.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Config.php
@@ -82,6 +82,13 @@ class Config
      */
     private $dataTmFontInfoHasToBeIncluded = false;
 
+    /**
+     * Whether to attempt to read PDFs even if they are marked as encrypted.
+     *
+     * @var bool
+     */
+    private $ignoreEncryption = false;
+
     public function getFontSpaceLimit()
     {
         return $this->fontSpaceLimit;
@@ -151,4 +158,18 @@ class Config
     {
         $this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded;
     }
+
+    public function getIgnoreEncryption(): bool
+    {
+        return $this->ignoreEncryption;
+    }
+
+    /**
+     * @deprecated this is a temporary workaround, don't rely on it
+     * @see https://github.com/smalot/pdfparser/pull/653
+     */
+    public function setIgnoreEncryption(bool $ignoreEncryption): void
+    {
+        $this->ignoreEncryption = $ignoreEncryption;
+    }
 }
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Document.php b/orcinus/pdfparser/src/Smalot/PdfParser/Document.php
index d2cec38..016787a 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Document.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Document.php
@@ -255,7 +255,7 @@ class Document
                             if ('rdf:li' == $val['tag']) {
                                 $metadata[] = $val['value'];
 
-                                // Else assign a value to this property
+                            // Else assign a value to this property
                             } else {
                                 $metadata[$val['tag']] = $val['value'];
                             }
@@ -263,12 +263,20 @@ class Document
                         break;
 
                     case 'close':
-                        // If the value of this property is a single-
-                        // element array where the element is of type
-                        // string, use the value of the first list item
-                        // as the value for this property
-                        if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
-                            $metadata = $metadata[0];
+                        // If the value of this property is an array
+                        if (\is_array($metadata)) {
+                            // If the value is a single element array
+                            // where the element is of type string, use
+                            // the value of the first list item as the
+                            // value for this property
+                            if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
+                                $metadata = $metadata[0];
+                            } elseif (0 == \count($metadata)) {
+                                // if the value is an empty array, set
+                                // the value of this property to the empty
+                                // string
+                                $metadata = '';
+                            }
                         }
 
                         // Move down one level in the stack
@@ -328,12 +336,12 @@ class Document
         return null;
     }
 
-    public function hasObjectsByType(string $type, string $subtype = null): bool
+    public function hasObjectsByType(string $type, ?string $subtype = null): bool
     {
         return 0 < \count($this->getObjectsByType($type, $subtype));
     }
 
-    public function getObjectsByType(string $type, string $subtype = null): array
+    public function getObjectsByType(string $type, ?string $subtype = null): array
     {
         if (!isset($this->dictionary[$type])) {
             return [];
@@ -410,7 +418,7 @@ class Document
         throw new \Exception('Missing catalog.');
     }
 
-    public function getText(int $pageLimit = null): string
+    public function getText(?int $pageLimit = null): string
     {
         $texts = [];
         $pages = $this->getPages();
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element.php
index 0ce6c42..8066030 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element.php
@@ -49,13 +49,13 @@ use Smalot\PdfParser\Element\ElementXRef;
 class Element
 {
     /**
-     * @var Document
+     * @var Document|null
      */
     protected $document;
 
     protected $value;
 
-    public function __construct($value, Document $document = null)
+    public function __construct($value, ?Document $document = null)
     {
         $this->value = $value;
         $this->document = $document;
@@ -96,7 +96,7 @@ class Element
         return (string) $this->value;
     }
 
-    public static function parse(string $content, Document $document = null, int &$position = 0)
+    public static function parse(string $content, ?Document $document = null, int &$position = 0)
     {
         $args = \func_get_args();
         $only_values = isset($args[3]) ? $args[3] : false;
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php
index 6ad2220..b54bf84 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php
@@ -42,7 +42,7 @@ use Smalot\PdfParser\PDFObject;
  */
 class ElementArray extends Element
 {
-    public function __construct($value, Document $document = null)
+    public function __construct($value, ?Document $document = null)
     {
         parent::__construct($value, $document);
     }
@@ -107,7 +107,7 @@ class ElementArray extends Element
      *
      * @return bool|ElementArray
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*\[(?P<array>.*)/is', $content, $match)) {
             preg_match_all('/(.*?)(\[|\])/s', trim($content), $matches);
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php
index 4831a4a..55fb463 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php
@@ -61,7 +61,7 @@ class ElementBoolean extends Element
     /**
      * @return bool|ElementBoolean
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*(?P<value>true|false)/is', $content, $match)) {
             $value = $match['value'];
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php
index c4d3984..f1f2df6 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php
@@ -40,7 +40,7 @@ use Smalot\PdfParser\Document;
 class ElementDate extends ElementString
 {
     /**
-     * @var array
+     * @var array<int,string>
      */
     protected static $formats = [
         4 => 'Y',
@@ -98,7 +98,7 @@ class ElementDate extends ElementString
     /**
      * @return bool|ElementDate
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*\(D\:(?P<name>.*?)\)/s', $content, $match)) {
             $name = $match['name'];
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php
index d031461..3fc3413 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php
@@ -42,7 +42,7 @@ class ElementHexa extends ElementString
     /**
      * @return bool|ElementHexa|ElementDate
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*\<(?P<name>[A-F0-9]+)\>/is', $content, $match)) {
             $name = $match['name'];
@@ -64,15 +64,21 @@ class ElementHexa extends ElementString
     public static function decode(string $value): string
     {
         $text = '';
-        $length = \strlen($value);
 
-        if ('00' === substr($value, 0, 2)) {
-            for ($i = 0; $i < $length; $i += 4) {
+        // Filter $value of non-hexadecimal characters
+        $value = (string) preg_replace('/[^0-9a-f]/i', '', $value);
+
+        // Check for leading zeros (4-digit hexadecimal indicator), or
+        // the UTF-16BE BOM
+        if ('00' === substr($value, 0, 2) || 'feff' === strtolower(substr($value, 0, 4))) {
+            $value = (string) preg_replace('/^feff/i', '', $value);
+            for ($i = 0, $length = \strlen($value); $i < $length; $i += 4) {
                 $hex = substr($value, $i, 4);
                 $text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';';
             }
         } else {
-            for ($i = 0; $i < $length; $i += 2) {
+            // Otherwise decode this as 2-digit hexadecimal
+            for ($i = 0, $length = \strlen($value); $i < $length; $i += 2) {
                 $hex = substr($value, $i, 2);
                 $text .= \chr(hexdec($hex));
             }
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php
index 0f8d06b..6e8d97a 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php
@@ -54,7 +54,7 @@ class ElementName extends Element
     /**
      * @return bool|ElementName
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*\/([A-Z0-9\-\+,#\.]+)/is', $content, $match)) {
             $name = $match[1];
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php
index 8757630..9af8843 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php
@@ -58,7 +58,7 @@ class ElementNull extends Element
     /**
      * @return bool|ElementNull
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*(null)/s', $content, $match)) {
             $offset += strpos($content, 'null') + \strlen('null');
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php
index 80885c1..5454acc 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php
@@ -48,7 +48,7 @@ class ElementNumeric extends Element
     /**
      * @return bool|ElementNumeric
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*(?P<value>\-?[0-9\.]+)/s', $content, $match)) {
             $value = $match['value'];
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php
index a18ba5f..011bcf4 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php
@@ -54,7 +54,7 @@ class ElementString extends Element
     /**
      * @return bool|ElementString
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*\((?P<name>.*)/s', $content, $match)) {
             $name = $match['name'];
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php
index 7c95559..c37b6da 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php
@@ -44,7 +44,7 @@ class ElementStruct extends Element
     /**
      * @return false|Header
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*<<(?P<struct>.*)/is', $content)) {
             preg_match_all('/(.*?)(<<|>>)/s', trim($content), $matches);
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php
index 50531a7..ebba71a 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php
@@ -83,7 +83,7 @@ class ElementXRef extends Element
     /**
      * @return bool|ElementXRef
      */
-    public static function parse(string $content, Document $document = null, int &$offset = 0)
+    public static function parse(string $content, ?Document $document = null, int &$offset = 0)
     {
         if (preg_match('/^\s*(?P<id>[0-9]+\s+[0-9]+\s+R)/s', $content, $match)) {
             $id = $match['id'];
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php
index 6018eec..511411b 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php
@@ -145,6 +145,12 @@ class Encoding extends PDFObject
     {
         // Load reference table charset.
         $baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent());
+
+        // Check for empty BaseEncoding field value
+        if (!\is_string($baseEncoding) || 0 == \strlen($baseEncoding)) {
+            $baseEncoding = 'StandardEncoding';
+        }
+
         $className = '\\Smalot\\PdfParser\\Encoding\\'.$baseEncoding;
 
         if (!class_exists($className)) {
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
index 60e5616..70bc48c 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
@@ -178,7 +178,7 @@ class PDFDocEncoding
             "\xfc" => "\u{00fc}", // udieresis
             "\xfd" => "\u{00fd}", // yacute
             "\xfe" => "\u{00fe}", // thorn
-            "\xff" => "\u{00ff}",  // ydieresis
+            "\xff" => "\u{00ff}", // ydieresis
         ];
     }
 
@@ -186,4 +186,4 @@ class PDFDocEncoding
     {
         return strtr($content, static::getCodePage());
     }
-}
\ No newline at end of file
+}
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Font.php b/orcinus/pdfparser/src/Smalot/PdfParser/Font.php
index 9e4db9f..cfe85d7 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Font.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Font.php
@@ -134,9 +134,16 @@ class Font extends PDFObject
 
     /**
      * Convert unicode character code to "utf-8" encoded string.
+     *
+     * @param int|float $code Unicode character code. Will be cast to int internally!
      */
-    public static function uchr(int $code): string
+    public static function uchr($code): string
     {
+        // note:
+        // $code was typed as int before, but changed in https://github.com/smalot/pdfparser/pull/623
+        // because in some cases uchr was called with a float instead of an integer.
+        $code = (int) $code;
+
         if (!isset(self::$uchrCache[$code])) {
             // html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
             // therefore, we use mb_convert_encoding() instead
@@ -272,11 +279,13 @@ class Font extends PDFObject
     /**
      * Calculate text width with data from header 'Widths'. If width of character is not found then character is added to missing array.
      */
-    public function calculateTextWidth(string $text, array &$missing = null): ?float
+    public function calculateTextWidth(string $text, ?array &$missing = null): ?float
     {
         $index_map = array_flip($this->table);
         $details = $this->getDetails();
-        $widths = $details['Widths'];
+
+        // Usually, Widths key is set in $details array, but if it isn't, use an empty array instead.
+        $widths = $details['Widths'] ?? [];
 
         // Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar
         $width_map = array_flip(range($details['FirstChar'], $details['LastChar']));
@@ -312,12 +321,12 @@ class Font extends PDFObject
         }
 
         $text = '';
-        $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
+        $parts = preg_split('/(<[a-f0-9\s]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
 
         foreach ($parts as $part) {
-            if (preg_match('/^<.*>$/s', $part) && false === stripos($part, '<?xml')) {
-                // strip line breaks
-                $part = preg_replace("/[\r\n]/", '', $part);
+            if (preg_match('/^<[a-f0-9\s]+>$/si', $part)) {
+                // strip whitespace
+                $part = preg_replace("/\s/", '', $part);
                 $part = trim($part, '<>');
                 if ($add_braces) {
                     $text .= '(';
@@ -342,18 +351,20 @@ class Font extends PDFObject
      */
     public static function decodeOctal(string $text): string
     {
-        $parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
-        $text = '';
+        // Replace all double backslashes \\ with a special string
+        $text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']);
 
-        foreach ($parts as $part) {
-            if (preg_match('/^\\\\[0-7]{3}$/', $part)) {
-                $text .= \chr(octdec(trim($part, '\\')));
-            } else {
-                $text .= $part;
-            }
-        }
+        // Now we can replace all octal codes without worrying about
+        // escaped backslashes
+        $text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) {
+            return \chr(octdec($m[1]));
+        }, $text);
 
-        return $text;
+        // Unescape any parentheses
+        $text = str_replace(['\\(', '\\)'], ['(', ')'], $text);
+
+        // Replace instances of the special string with a single backslash
+        return str_replace('[**pdfparserdblslsh**]', '\\', $text);
     }
 
     /**
@@ -361,18 +372,9 @@ class Font extends PDFObject
      */
     public static function decodeEntities(string $text): string
     {
-        $parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
-        $text = '';
-
-        foreach ($parts as $part) {
-            if (preg_match('/^#\d{2}$/', $part)) {
-                $text .= \chr(hexdec(trim($part, '#')));
-            } else {
-                $text .= $part;
-            }
-        }
-
-        return $text;
+        return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) {
+            return \chr(hexdec($m[1]));
+        }, $text);
     }
 
     /**
@@ -384,7 +386,7 @@ class Font extends PDFObject
      */
     public static function decodeUnicode(string $text): string
     {
-        if (preg_match('/^\xFE\xFF/i', $text)) {
+        if ("\xFE\xFF" === substr($text, 0, 2)) {
             // Strip U+FEFF byte order marker.
             $decode = substr($text, 2);
             $text = '';
@@ -409,16 +411,17 @@ class Font extends PDFObject
     /**
      * Decode text by commands array.
      */
-    public function decodeText(array $commands): string
+    public function decodeText(array $commands, float $fontFactor = 4): string
     {
         $word_position = 0;
         $words = [];
-        $font_space = $this->getFontSpaceLimit();
+        $font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4;
 
         foreach ($commands as $command) {
             switch ($command[PDFObject::TYPE]) {
                 case 'n':
-                    if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
+                    $offset = (float) trim($command[PDFObject::COMMAND]);
+                    if ($offset - (float) $font_space < 0) {
                         $word_position = \count($words);
                     }
                     continue 2;
@@ -434,8 +437,8 @@ class Font extends PDFObject
 
             // replace escaped chars
             $text = str_replace(
-                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
-                ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
+                ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ ', '\b'],
+                [\chr(92), \chr(40), \chr(41), \chr(10), \chr(13), \chr(9), \chr(12), \chr(32), \chr(8)],
                 $text
             );
 
@@ -449,9 +452,32 @@ class Font extends PDFObject
 
         foreach ($words as &$word) {
             $word = $this->decodeContent($word);
+            $word = str_replace("\t", ' ', $word);
         }
 
-        return implode(' ', $words);
+        // Remove internal "words" that are just spaces, but leave them
+        // if they are at either end of the array of words. This fixes,
+        // for   example,   lines   that   are   justified   to   fill
+        // a whole row.
+        for ($x = \count($words) - 2; $x >= 1; --$x) {
+            if ('' === trim($words[$x], ' ')) {
+                unset($words[$x]);
+            }
+        }
+        $words = array_values($words);
+
+        // Cut down on the number of unnecessary internal spaces by
+        // imploding the words on two null bytes, and checking if the
+        // text includes extra spaces on either side. If so, merge
+        // where appropriate.
+        $words = implode("\x00\x00", $words);
+        $words = str_replace(
+            [" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"],
+            ['  ', ' ', ' ', ' '],
+            $words
+        );
+
+        return $words;
     }
 
     /**
@@ -459,8 +485,14 @@ class Font extends PDFObject
      *
      * @param bool $unicode This parameter is deprecated and might be removed in a future release
      */
-    public function decodeContent(string $text, bool &$unicode = null): string
+    public function decodeContent(string $text, ?bool &$unicode = null): string
     {
+        // If this string begins with a UTF-16BE BOM, then decode it
+        // directly as Unicode
+        if ("\xFE\xFF" === substr($text, 0, 2)) {
+            return $this->decodeUnicode($text);
+        }
+
         if ($this->has('ToUnicode')) {
             return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
         }
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Header.php b/orcinus/pdfparser/src/Smalot/PdfParser/Header.php
index 562897c..b58773a 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Header.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Header.php
@@ -43,7 +43,7 @@ use Smalot\PdfParser\Element\ElementXRef;
 class Header
 {
     /**
-     * @var Document
+     * @var Document|null
      */
     protected $document;
 
@@ -56,7 +56,7 @@ class Header
      * @param Element[] $elements list of elements
      * @param Document  $document document
      */
-    public function __construct(array $elements = [], Document $document = null)
+    public function __construct(array $elements = [], ?Document $document = null)
     {
         $this->elements = $elements;
         $this->document = $document;
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php b/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php
index c879176..87b5a6c 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php
@@ -54,7 +54,7 @@ class PDFObject
     public static $recursionStack = [];
 
     /**
-     * @var Document
+     * @var Document|null
      */
     protected $document;
 
@@ -69,15 +69,20 @@ class PDFObject
     protected $content;
 
     /**
-     * @var Config
+     * @var Config|null
      */
     protected $config;
 
+    /**
+     * @var bool
+     */
+    protected $addPositionWhitespace = false;
+
     public function __construct(
         Document $document,
-        Header $header = null,
-        string $content = null,
-        Config $config = null
+        ?Header $header = null,
+        ?string $content = null,
+        ?Config $config = null
     ) {
         $this->document = $document;
         $this->header = $header ?? new Header();
@@ -127,6 +132,16 @@ class PDFObject
         return $this->content;
     }
 
+    /**
+     * Creates a duplicate of the document stream with
+     * strings and other items replaced by $char. Formerly
+     * getSectionsText() used this output to more easily gather offset
+     * values to extract text from the *actual* document stream.
+     *
+     * @deprecated function is no longer used and will be removed in a future release
+     *
+     * @internal
+     */
     public function cleanContent(string $content, string $char = 'X')
     {
         $char = $char[0];
@@ -186,48 +201,298 @@ class PDFObject
         return $content;
     }
 
-    public function getSectionsText(?string $content): array
+    /**
+     * Takes a string of PDF document stream text and formats
+     * it into a multi-line string with one PDF command on each line,
+     * separated by \r\n. If the given string is null, or binary data
+     * is detected instead of a document stream then return an empty
+     * string.
+     */
+    private function formatContent(?string $content): string
     {
-        $sections = [];
-        $content = ' '.$content.' ';
-        $textCleaned = $this->cleanContent($content, '_');
+        if (null === $content) {
+            return '';
+        }
 
-        // Extract text blocks.
-        if (preg_match_all('/(\sQ)?\s+BT[\s|\(|\[]+(.*?)\s*ET(\sq)?/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) {
-            foreach ($matches[2] as $pos => $part) {
-                $text = $part[0];
-                if ('' === $text) {
-                    continue;
-                }
-                $offset = $part[1];
-                $section = substr($content, $offset, \strlen($text));
+        // Outside of (String) and inline image content in PDF document
+        // streams, all text should conform to UTF-8. Test for binary
+        // content by deleting everything after the first open-
+        // parenthesis ( which indicates the beginning of a string, or
+        // the first ID command which indicates the beginning of binary
+        // inline image content. Then test what remains for valid
+        // UTF-8. If it's not UTF-8, return an empty string as this
+        // $content is most likely binary. Unfortunately, using
+        // mb_check_encoding(..., 'UTF-8') is not strict enough, so the
+        // following regexp, adapted from the W3, is used. See:
+        // https://www.w3.org/International/questions/qa-forms-utf-8.en
+        // We use preg_replace() instead of preg_match() to avoid "JIT
+        // stack limit exhausted" errors on larger files.
+        $utf8Filter = preg_replace('/(
+            [\x09\x0A\x0D\x20-\x7E] |            # ASCII
+            [\xC2-\xDF][\x80-\xBF] |             # non-overlong 2-byte
+            \xE0[\xA0-\xBF][\x80-\xBF] |         # excluding overlongs
+            [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} |  # straight 3-byte
+            \xED[\x80-\x9F][\x80-\xBF] |         # excluding surrogates
+            \xF0[\x90-\xBF][\x80-\xBF]{2} |      # planes 1-3
+            [\xF1-\xF3][\x80-\xBF]{3} |          # planes 4-15
+            \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
+        )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));
 
-                // Removes BDC and EMC markup.
-                $section = preg_replace('/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s', '${3}', $section.' ');
+        if ('' !== $utf8Filter) {
+            return '';
+        }
 
-                // Add Q and q flags if detected around BT/ET.
-                // @see: https://github.com/smalot/pdfparser/issues/387
-                $section = trim((!empty($matches[1][$pos][0]) ? "Q\n" : '').$section).(!empty($matches[3][$pos][0]) ? "\nq" : '');
+        // Find all inline image content and splice in placeholders (by
+        // byte offset) so the data isn't affected by the next steps
+        $pdfInlineImages = [];
+        $offsetBI = 0;
+        while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) {
+            // Attempt to determine if this instance of the 'BI' command
+            // actually occurred within a (string) using the following
+            // steps:
 
-                $sections[] = $section;
+            // Step 1: Remove any escaped parentheses from the alleged
+            // image characteristics data
+            $para = str_replace(['\\(', '\\)'], '', $text[1][0]);
+
+            // Step 2: Remove all correctly ordered and balanced
+            // parentheses from (strings)
+            do {
+                $paraTest = $para;
+                $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest);
+            } while ($para != $paraTest);
+
+            $paraOpen = strpos($para, '(');
+            $paraClose = strpos($para, ')');
+
+            // Check: If the remaining text contains a close parenthesis
+            // ')' AND it occurs before any open parenthesis, then we
+            // are almost certain to be inside a (string)
+            if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) {
+                // Bump the search offset forward and match again
+                $offsetBI = (int) $text[1][1];
+                continue;
+            }
+
+            // Step 3: Double check that this is actually inline image
+            // data by parsing the alleged image characteristics as a
+            // dictionary
+            $dict = $this->parseDictionary('<<'.$text[1][0].'>>');
+
+            // Check if an image Width and Height are set in the dict
+            if ((isset($dict['W']) || isset($dict['Width']))
+                && (isset($dict['H']) || isset($dict['Height']))) {
+                $id = uniqid('IMAGE_', true);
+                $pdfInlineImages[$id] = [
+                    preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]),
+                    preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]),
+                ];
+                $content = substr_replace(
+                    $content,
+                    '^^^'.$id.'^^^',
+                    (int) $text[0][1],
+                    \strlen($text[0][0])
+                );
+            } else {
+                // If there was no valid dictionary, or a height and width
+                // weren't specified, then we don't know what this is, so
+                // just leave it alone; bump the search offset forward and
+                // match again
+                $offsetBI = (int) $text[1][1];
             }
         }
 
-        // Extract 'do' commands.
-        if (preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) {
-            foreach ($matches[1] as $part) {
-                $text = $part[0];
-                $offset = $part[1];
-                $section = substr($content, $offset, \strlen($text));
+        // Find all strings () and replace them so they aren't affected
+        // by the next steps
+        $pdfstrings = [];
+        $attempt = '(';
+        while (preg_match('/'.preg_quote($attempt, '/').'.*?(?<![^\\\\]\\\\)\)/s', $content, $text)) {
+            // PDF strings can contain unescaped parentheses as long as
+            // they're balanced, so check for balanced parentheses
+            $left = preg_match_all('/(?<![^\\\\]\\\\)\(/', $text[0]);
+            $right = preg_match_all('/(?<![^\\\\]\\\\)\)/', $text[0]);
 
-                $sections[] = $section;
+            if ($left == $right) {
+                // Swap the string for a unique placeholder, by byte offset
+                $id = uniqid('STRING_', true);
+                $pdfstrings[$id] = $text[0];
+                $content = substr_replace(
+                    $content,
+                    '@@@'.$id.'@@@',
+                    strpos($content, $text[0]),
+                    \strlen($text[0])
+                );
+
+                // Reset to search for the next string
+                $attempt = '(';
+            } else {
+                // We had unbalanced parentheses, so use the current
+                // match as a base to find a longer string
+                $attempt = $text[0];
+            }
+        }
+
+        // Remove all carriage returns and line-feeds from the document stream
+        $content = str_replace(["\r", "\n"], ' ', trim($content));
+
+        // Find all dictionary << >> commands and replace them so they
+        // aren't affected by the next steps
+        $dictstore = [];
+        while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) {
+            $dictid = uniqid('DICT_', true);
+            $dictstore[$dictid] = $dicttext[1];
+            $content = substr_replace(
+                $content,
+                ' ###'.$dictid.'###'.$dicttext[2],
+                strpos($content, $dicttext[0]),
+                \strlen($dicttext[0])
+            );
+        }
+
+        // Normalize white-space in the document stream
+        $content = preg_replace('/\s{2,}/', ' ', $content);
+
+        // Find all valid PDF operators and add \r\n after each; this
+        // ensures there is just one command on every line
+        // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A
+        // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A
+        // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while
+        //       PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions
+        //       appear here in the list for completeness.
+        $operators = [
+            'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS',
+            'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs',
+            'g', 'G',  'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n',
+            'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC',
+            'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw',
+            'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"',
+        ];
+        foreach ($operators as $operator) {
+            $content = preg_replace(
+                '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/',
+                $operator."\r\n",
+                $content
+            );
+        }
+
+        // Restore the original content of the dictionary << >> commands
+        $dictstore = array_reverse($dictstore, true);
+        foreach ($dictstore as $id => $dict) {
+            $content = str_replace('###'.$id.'###', $dict, $content);
+        }
+
+        // Restore the original string content
+        $pdfstrings = array_reverse($pdfstrings, true);
+        foreach ($pdfstrings as $id => $text) {
+            // Strings may contain escaped newlines, or literal newlines
+            // and we should clean these up before replacing the string
+            // back into the content stream; this ensures no strings are
+            // split between two lines (every command must be on one line)
+            $text = str_replace(
+                ["\\\r\n", "\\\r", "\\\n", "\r", "\n"],
+                ['', '', '', '\r', '\n'],
+                $text
+            );
+
+            $content = str_replace('@@@'.$id.'@@@', $text, $content);
+        }
+
+        // Restore the original content of any inline images
+        $pdfInlineImages = array_reverse($pdfInlineImages, true);
+        foreach ($pdfInlineImages as $id => $image) {
+            $content = str_replace(
+                '^^^'.$id.'^^^',
+                "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n",
+                $content
+            );
+        }
+
+        $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
+
+        return $content;
+    }
+
+    /**
+     * Takes an entire, unformatted document stream, normalizes it via
+     * formatContent(), and keeps only the commands needed for text
+     * positioning/extraction: everything inside BT ... ET, plus q, Q,
+     * BDC, BMC, DP, MP, EMC, cm, Tf and Do outside of text blocks.
+     * Returns the unprocessed commands, one per array element.
+     *
+     * @internal
+     */
+    public function getSectionsText(?string $content): array
+    {
+        $sections = [];
+
+        // A cleaned stream has one command on every line, so split the
+        // cleaned stream content on \r\n into an array
+        $textCleaned = preg_split(
+            '/(\r\n|\n|\r)/',
+            $this->formatContent($content),
+            -1,
+            \PREG_SPLIT_NO_EMPTY
+        );
+
+        $inTextBlock = false;
+        foreach ($textCleaned as $line) {
+            $line = trim($line);
+
+            // Skip empty lines
+            if ('' === $line) {
+                continue;
+            }
+
+            // If a 'BT' is encountered, set the $inTextBlock flag
+            if (preg_match('/BT$/', $line)) {
+                $inTextBlock = true;
+                $sections[] = $line;
+
+            // If an 'ET' is encountered, unset the $inTextBlock flag
+            } elseif ('ET' == $line) {
+                $inTextBlock = false;
+                $sections[] = $line;
+            } elseif ($inTextBlock) {
+                // If we are inside a BT ... ET text block, save all lines
+                $sections[] = trim($line);
+            } else {
+                // Otherwise, if we are outside of a text block, only
+                // save specific, necessary lines. Care should be taken
+                // to ensure a command being checked for *only* matches
+                // that command. For instance, a simple search for 'c'
+                // may also match the 'sc' command. See the command
+                // list in the formatContent() method above.
+                // Add more commands to save here as you find them in
+                // weird PDFs!
+                if ('q' == $line[-1] || 'Q' == $line[-1]) {
+                    // Save/restore graphics state (tests last character only)
+                    $sections[] = $line;
+                } elseif (preg_match('/(?<!\w)B[DM]C$/', $line)) {
+                    // Begin marked content sequence
+                    $sections[] = $line;
+                } elseif (preg_match('/(?<!\w)[DM]P$/', $line)) {
+                    // Marked content point
+                    $sections[] = $line;
+                } elseif (preg_match('/(?<!\w)EMC$/', $line)) {
+                    // End marked content sequence
+                    $sections[] = $line;
+                } elseif (preg_match('/(?<!\w)cm$/', $line)) {
+                    // Graphics position change commands
+                    $sections[] = $line;
+                } elseif (preg_match('/(?<!\w)Tf$/', $line)) {
+                    // Font change commands
+                    $sections[] = $line;
+                } elseif (preg_match('/(?<!\w)Do$/', $line)) {
+                    // Invoke named XObject command
+                    $sections[] = $line;
+                }
             }
         }
 
         return $sections;
     }
 
-    private function getDefaultFont(Page $page = null): Font
+    private function getDefaultFont(?Page $page = null): Font
     {
         $fonts = [];
         if (null !== $page) {
@@ -247,145 +512,262 @@ class PDFObject
     }
 
     /**
+     * Decode a '[]TJ' command with $font; if the result contains
+     * control characters or NUL bytes, retry with each of the page's
+     * fonts and return the first clean decode (else the original).
+     *
+     * @internal
+     *
+     * @param array<int,array<string,string|bool>> $command
+     */
+    private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string
+    {
+        $orig_text = $font->decodeText($command, $fontFactor);
+        $text = $orig_text;
+
+        // If we make this a Config option, we can add a check if it's
+        // enabled here.
+        if (null !== $page) {
+            $font_ids = array_keys($page->getFonts());
+
+            // If the decoded text contains control characters the font
+            // being used is probably the wrong one, so try the page's
+            // other fonts. x09 to x0d are allowed as whitespace. The raw
+            // strpos() catches NUL bytes when /u rejects invalid UTF-8.
+            while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos($text, "\x00")) {
+                // If we're out of font IDs, then give up and use the
+                // original string
+                if (0 == \count($font_ids)) {
+                    return $orig_text;
+                }
+
+                // Try the next font ID; keep $text if no font resolves
+                $next_font = $page->getFont((string) array_shift($font_ids));
+                $text = null === $next_font ? $text : $next_font->decodeText($command, $fontFactor);
+            }
+        }
+
+        return $text;
+    }
+
+    /**
+     * Parse a full PDF dictionary string (outer << >> included) into
+     * a nested array; unmatched ] or >> closers are ignored.
+     *
+     * @internal
+     *
      * @throws \Exception
      */
-    public function getText(Page $page = null): string
+    public function parseDictionary(string $dictionary): array
     {
-        $result = '';
+        // Normalize whitespace
+        $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));
+
+        if ('<<' != substr($dictionary, 0, 2)) {
+            throw new \Exception('Not a valid dictionary object.');
+        }
+
+        $parsed = [];
+        $stack = [];
+        $currentName = '';
+        $arrayTypeNumeric = false;
+
+        // Remove outer layer of dictionary, and split on tokens
+        $split = preg_split(
+            '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/',
+            trim(preg_replace('/^<<|>>$/', '', $dictionary)),
+            -1,
+            \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE
+        );
+
+        foreach ($split as $token) {
+            $token = trim($token);
+            switch ($token) {
+                case '':
+                    break;
+
+                    // Open numeric array
+                case '[':
+                    $parsed[$currentName] = [];
+                    $arrayTypeNumeric = true;
+
+                    // Push the current level and descend into the new array
+                    $stack[\count($stack)] = &$parsed;
+                    $parsed = &$parsed[$currentName];
+                    $currentName = '';
+                    break;
+
+                    // Open hashed array
+                case '<<':
+                    $parsed[$currentName] = [];
+                    $arrayTypeNumeric = false;
+
+                    // Push the current level and descend into the new array
+                    $stack[\count($stack)] = &$parsed;
+                    $parsed = &$parsed[$currentName];
+                    $currentName = '';
+                    break;
+
+                    // Close numeric array
+                case ']':
+                    // Revert string type arrays back to a single element
+                    if ([] !== $stack && \is_array($parsed) && 1 == \count($parsed)
+                        && isset($parsed[0]) && \is_string($parsed[0])
+                        && '' !== $parsed[0] && '/' != $parsed[0][0]) {
+                        $parsed = '['.$parsed[0].']';
+                    }
+                    // Close hashed array: pop one level, ignoring stray closers
+                    // no break
+                case '>>':
+                    $arrayTypeNumeric = false;
+                    if ([] !== $stack) {
+                        $parsed = &$stack[\count($stack) - 1];
+                        unset($stack[\count($stack) - 1]);
+                    }
+                    break;
+
+                default:
+                    // If value begins with a slash, then this is a name
+                    // Add it to the appropriate array
+                    if ('/' == substr($token, 0, 1)) {
+                        $currentName = substr($token, 1);
+                        if (true == $arrayTypeNumeric) {
+                            $parsed[] = $currentName;
+                            $currentName = '';
+                        }
+                    } elseif ('' != $currentName) {
+                        if (false == $arrayTypeNumeric) {
+                            $parsed[$currentName] = $token;
+                        }
+                        $currentName = '';
+                    } elseif ('' == $currentName) {
+                        $parsed[] = $token;
+                    }
+            }
+        }
+
+        return $parsed;
+    }
+
+    /**
+     * Returns the text content of a PDF as a string. Attempts to add
+     * whitespace for spacing and line-breaks where appropriate.
+     *
+     * Delegates to getTextArray() with the addPositionWhitespace flag
+     * set, so position-based whitespace is inserted for human readers.
+     * The flag is cleared again even if getTextArray() throws.
+     */
+    public function getText(?Page $page = null): string
+    {
+        $this->addPositionWhitespace = true;
+        try {
+            return implode('', $this->getTextArray($page)).' ';
+        } finally {
+            $this->addPositionWhitespace = false;
+        }
+    }
+
+    /**
+     * Returns the text content of a PDF as an array of strings. No
+     * extra whitespace is inserted besides what is actually encoded in
+     * the PDF text.
+     *
+     * @throws \Exception
+     */
+    public function getTextArray(?Page $page = null): array
+    {
+        $result = [];
+        $text = [];
+
+        $marked_stack = [];
+        $last_written_position = false;
+
         $sections = $this->getSectionsText($this->content);
         $current_font = $this->getDefaultFont($page);
-        $clipped_font = $current_font;
+        $current_font_size = 1;
+        $current_text_leading = 0;
 
-        $current_position_td = ['x' => false, 'y' => false];
-        $current_position_tm = ['x' => false, 'y' => false];
+        $current_position = ['x' => false, 'y' => false];
+        $current_position_tm = [
+            'a' => 1, 'b' => 0, 'c' => 0,
+            'i' => 0, 'j' => 1, 'k' => 0,
+            'x' => 0, 'y' => 0, 'z' => 1,
+        ];
+        $current_position_td = ['x' => 0, 'y' => 0];
+        $current_position_cm = [
+            'a' => 1, 'b' => 0, 'c' => 0,
+            'i' => 0, 'j' => 1, 'k' => 0,
+            'x' => 0, 'y' => 0, 'z' => 1,
+        ];
+
+        $clipped_font = [];
+        $clipped_position_cm = [];
 
         self::$recursionStack[] = $this->getUniqueId();
 
         foreach ($sections as $section) {
             $commands = $this->getCommandsText($section);
-            $reverse_text = false;
-            $text = '';
-
             foreach ($commands as $command) {
                 switch ($command[self::OPERATOR]) {
+                    // Begin text object
+                    case 'BT':
+                        // Reset text positioning matrices
+                        $current_position_tm = [
+                            'a' => 1, 'b' => 0, 'c' => 0,
+                            'i' => 0, 'j' => 1, 'k' => 0,
+                            'x' => 0, 'y' => 0, 'z' => 1,
+                        ];
+                        $current_position_td = ['x' => 0, 'y' => 0];
+                        $current_text_leading = 0;
+                        break;
+
+                        // Begin marked content sequence with property list
+                    case 'BDC':
+                        if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) {
+                            $dict = $this->parseDictionary($match[1]);
+
+                            // Check for ActualText block
+                            if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) {
+                                if ('[' == $dict['ActualText'][0]) {
+                                    // Simulate a 'TJ' command on the stack
+                                    $marked_stack[] = [
+                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0],
+                                    ];
+                                } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) {
+                                    // Simulate a 'Tj' command on the stack
+                                    $marked_stack[] = [
+                                        'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0],
+                                    ];
+                                }
+                            }
+                        }
+                        break;
+
+                        // Begin marked content sequence
                     case 'BMC':
                         if ('ReversedChars' == $command[self::COMMAND]) {
-                            $reverse_text = true;
+                            // Upon encountering a ReversedChars command,
+                            // add the characters we've built up so far to
+                            // the result array
+                            $result = array_merge($result, $text);
+
+                            // Start a fresh $text array that will contain
+                            // reversed characters
+                            $text = [];
+
+                            // Add the reversed text flag to the stack
+                            $marked_stack[] = ['ReversedChars' => true];
                         }
                         break;
 
-                        // set character spacing
-                    case 'Tc':
-                        break;
-
-                        // move text current point
-                    case 'Td':
-                        $args = preg_split('/\s/s', $command[self::COMMAND]);
-                        $y = array_pop($args);
-                        $x = array_pop($args);
-                        if (((float) $x <= 0)
-                            || (false !== $current_position_td['y'] && (float) $y < (float) $current_position_td['y'])
-                        ) {
-                            // vertical offset
-                            $text .= "\n";
-                        } elseif (false !== $current_position_td['x'] && (float) $x > (float)
-                            $current_position_td['x']
-                        ) {
-                            $text .= $this->config->getHorizontalOffset();
-                        }
-                        $current_position_td = ['x' => $x, 'y' => $y];
-                        break;
-
-                        // move text current point and set leading
-                    case 'TD':
-                        $args = preg_split('/\s/s', $command[self::COMMAND]);
-                        $y = array_pop($args);
-                        $x = array_pop($args);
-                        if ((float) $y < 0) {
-                            $text .= "\n";
-                        } elseif ((float) $x <= 0) {
-                            $text .= ' ';
-                        }
-                        break;
-
-                    case 'Tf':
-                        list($id) = preg_split('/\s/s', $command[self::COMMAND]);
-                        $id = trim($id, '/');
-                        if (null !== $page) {
-                            $new_font = $page->getFont($id);
-                            // If an invalid font ID is given, do not update the font.
-                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
-                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
-                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
-                            // But we want to make sure that malformed PDFs do not simply crash.
-                            if (null !== $new_font) {
-                                $current_font = $new_font;
-                            }
-                        }
-                        break;
-
-                    case 'Q':
-                        // Use clip: restore font.
-                        $current_font = $clipped_font;
-                        break;
-
-                    case 'q':
-                        // Use clip: save font.
-                        $clipped_font = $current_font;
-                        break;
-
-                    case "'":
-                    case 'Tj':
-                        $command[self::COMMAND] = [$command];
-                        // no break
-                    case 'TJ':
-                        $sub_text = $current_font->decodeText($command[self::COMMAND]);
-                        $text .= $sub_text;
-                        break;
-
-                        // set leading
-                    case 'TL':
-                        $text .= ' ';
-                        break;
-
-                    case 'Tm':
-                        $args = preg_split('/\s/s', $command[self::COMMAND]);
-                        $y = array_pop($args);
-                        $x = array_pop($args);
-                        if (false !== $current_position_tm['x']) {
-                            $delta = abs((float) $x - (float) $current_position_tm['x']);
-                            if ($delta > 10) {
-                                $text .= "\t";
-                            }
-                        }
-                        if (false !== $current_position_tm['y']) {
-                            $delta = abs((float) $y - (float) $current_position_tm['y']);
-                            if ($delta > 10) {
-                                $text .= "\n";
-                            }
-                        }
-                        $current_position_tm = ['x' => $x, 'y' => $y];
-                        break;
-
-                        // set super/subscripting text rise
-                    case 'Ts':
-                        break;
-
-                        // set word spacing
-                    case 'Tw':
-                        break;
-
-                        // set horizontal scaling
-                    case 'Tz':
-                        $text .= "\n";
-                        break;
-
-                        // move to start of next line
-                    case 'T*':
-                        $text .= "\n";
-                        break;
-
-                    case 'Da':
+                        // set graphics position matrix
+                    case 'cm':
+                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
+                        $current_position_cm = [
+                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
+                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
+                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
+                        ];
                         break;
 
                     case 'Do':
@@ -395,112 +777,247 @@ class PDFObject
                             $xobject = $page->getXObject($id);
 
                             // @todo $xobject could be a ElementXRef object, which would then throw an error
-                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) {
+                            if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) {
                                 // Not a circular reference.
-                                $text .= $xobject->getText($page);
+                                $text[] = $xobject->getText($page);
                             }
                         }
                         break;
 
-                    case 'rg':
-                    case 'RG':
+                        // Marked content point with (DP) & without (MP) property list
+                    case 'DP':
+                    case 'MP':
                         break;
 
-                    case 're':
+                        // End text object
+                    case 'ET':
                         break;
 
-                    case 'co':
+                        // Save the currently selected font, font size and graphics matrix
+                    case 'q':
+                        $clipped_font[] = [$current_font, $current_font_size];
+                        $clipped_position_cm[] = $current_position_cm;
                         break;
 
-                    case 'cs':
+                        // Restore the previously saved font, font size and graphics matrix
+                    case 'Q':
+                        list($current_font, $current_font_size) = array_pop($clipped_font);
+                        $current_position_cm = array_pop($clipped_position_cm);
                         break;
 
-                    case 'gs':
-                        break;
+                        // End marked content sequence
+                    case 'EMC':
+                        $data = false;
+                        if (\count($marked_stack)) {
+                            $marked = array_pop($marked_stack);
+                            $action = key($marked);
+                            $data = $marked[$action];
 
-                    case 'en':
-                        break;
+                            switch ($action) {
+                                // If we are in ReversedChars mode...
+                                case 'ReversedChars':
+                                    // Reverse the characters we've built up so far
+                                    foreach ($text as $key => $t) {
+                                        $text[$key] = implode('', array_reverse(
+                                            mb_str_split($t, 1, mb_internal_encoding())
+                                        ));
+                                    }
 
-                    case 'sc':
-                    case 'SC':
-                        break;
+                                    // Add these characters to the result array
+                                    $result = array_merge($result, $text);
 
-                    case 'g':
-                    case 'G':
-                        break;
+                                    // Start a fresh $text array that will contain
+                                    // non-reversed characters
+                                    $text = [];
+                                    break;
 
-                    case 'V':
-                        break;
-
-                    case 'vo':
-                    case 'Vo':
-                        break;
-
-                    default:
-                }
-            }
-
-            // Fix Hebrew and other reverse text oriented languages.
-            // @see: https://github.com/smalot/pdfparser/issues/398
-            if ($reverse_text) {
-                $chars = mb_str_split($text, 1, mb_internal_encoding());
-                $text = implode('', array_reverse($chars));
-            }
-
-            $result .= $text;
-        }
-
-        return $result.' ';
-    }
-
-    /**
-     * @throws \Exception
-     */
-    public function getTextArray(Page $page = null): array
-    {
-        $text = [];
-        $sections = $this->getSectionsText($this->content);
-        $current_font = new Font($this->document, null, null, $this->config);
-
-        foreach ($sections as $section) {
-            $commands = $this->getCommandsText($section);
-
-            foreach ($commands as $command) {
-                switch ($command[self::OPERATOR]) {
-                    // set character spacing
-                    case 'Tc':
-                        break;
-
-                        // move text current point
-                    case 'Td':
-                        break;
-
-                        // move text current point and set leading
-                    case 'TD':
-                        break;
-
-                    case 'Tf':
-                        if (null !== $page) {
-                            list($id) = preg_split('/\s/s', $command[self::COMMAND]);
-                            $id = trim($id, '/');
-                            $current_font = $page->getFont($id);
+                                case 'ActualText':
+                                    // Use the content of the ActualText as a command
+                                    $command = $data;
+                                    break;
+                            }
                         }
-                        break;
 
+                        // If this EMC command has been transformed into a 'Tj'
+                        // or 'TJ' command because of being ActualText, then bypass
+                        // the break to proceed to the writing section below.
+                        if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) {
+                            break;
+                        }
+
+                        // no break
                     case "'":
+                    case '"':
+                        if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) {
+                            // Move to next line and write text
+                            $current_position['x'] = 0;
+                            $current_position_td['x'] = 0;
+                            $current_position_td['y'] += $current_text_leading;
+                        }
+                        // no break
                     case 'Tj':
                         $command[self::COMMAND] = [$command];
                         // no break
                     case 'TJ':
-                        $sub_text = $current_font->decodeText($command[self::COMMAND]);
-                        $text[] = $sub_text;
+                        // Check the marked content stack for flags
+                        $actual_text = false;
+                        $reverse_text = false;
+                        foreach ($marked_stack as $marked) {
+                            if (isset($marked['ActualText'])) {
+                                $actual_text = true;
+                            }
+                            if (isset($marked['ReversedChars'])) {
+                                $reverse_text = true;
+                            }
+                        }
+
+                        // Account for text position ONLY just before we write text
+                        if (false === $actual_text && \is_array($last_written_position)) {
+                            // If $last_written_position is an array, that
+                            // means we have stored text position coordinates
+                            // for placing an ActualText
+                            $currentX = $last_written_position[0];
+                            $currentY = $last_written_position[1];
+                            $last_written_position = false;
+                        } else {
+                            $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x'];
+                            $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y'];
+                        }
+                        $whiteSpace = '';
+
+                        $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i'];
+                        $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
+
+                        if (true === $this->addPositionWhitespace && false !== $current_position['x']) {
+                            $curY = $currentY - $current_position['y'];
+                            if (abs($curY) >= abs($factorY) / 4) {
+                                $whiteSpace = "\n";
+                            } else {
+                                if (true === $reverse_text) {
+                                    $curX = $current_position['x'] - $currentX;
+                                } else {
+                                    $curX = $currentX - $current_position['x'];
+                                }
+
+                                // In abs($factorX * 7) below, the 7 is chosen arbitrarily
+                                // as the number of apparent "spaces" in a document we
+                                // would need before considering them a "tab". In the
+                                // future, we might offer this value to users as a config
+                                // option.
+                                if ($curX >= abs($factorX * 7)) {
+                                    $whiteSpace = "\t";
+                                } elseif ($curX >= abs($factorX * 2)) {
+                                    $whiteSpace = ' ';
+                                }
+                            }
+                        }
+
+                        $newtext = $this->getTJUsingFontFallback(
+                            $current_font,
+                            $command[self::COMMAND],
+                            $page,
+                            $factorX
+                        );
+
+                        // If there is no ActualText pending then write
+                        if (false === $actual_text) {
+                            $newtext = str_replace(["\r", "\n"], '', $newtext);
+                            if (false !== $reverse_text) {
+                                // If we are in ReversedChars mode, add the whitespace last
+                                $text[] = preg_replace('/  $/', ' ', $newtext.$whiteSpace);
+                            } else {
+                                // Otherwise add the whitespace first
+                                if (' ' === $whiteSpace && isset($text[\count($text) - 1])) {
+                                    $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]);
+                                }
+                                $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext);
+                            }
+
+                            // Record the position of this inserted text for comparison
+                            // with the next text block.
+                            // Provide a 'fudge' factor guess on how wide this text block
+                            // is based on the number of characters. This helps limit the
+                            // number of tabs inserted, but isn't perfect.
+                            $factor = $factorX / 2;
+                            $current_position = [
+                                'x' => $currentX - mb_strlen($newtext) * $factor,
+                                'y' => $currentY,
+                            ];
+                        } elseif (false === $last_written_position) {
+                            // If there is an ActualText in the pipeline
+                            // store the position this undisplayed text
+                            // *would* have been written to, so the
+                            // ActualText is displayed in the right spot
+                            $last_written_position = [$currentX, $currentY];
+                            $current_position['x'] = $currentX;
+                        }
+                        break;
+
+                        // move to start of next line
+                    case 'T*':
+                        $current_position['x'] = 0;
+                        $current_position_td['x'] = 0;
+                        $current_position_td['y'] += $current_text_leading;
+                        break;
+
+                        // set character spacing
+                    case 'Tc':
+                        break;
+
+                        // move text current point (Td); TD additionally sets the text leading
+                    case 'Td':
+                    case 'TD':
+                        // move text current point
+                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
+                        $y = (float) array_pop($args);
+                        $x = (float) array_pop($args);
+
+                        if ('TD' == $command[self::OPERATOR]) {
+                            $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j'];
+                        }
+
+                        $current_position_td = [
+                            'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'],
+                            'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'],
+                        ];
+                        break;
+
+                    case 'Tf':
+                        $args = preg_split('/\s/s', $command[self::COMMAND]);
+                        $size = (float) array_pop($args);
+                        $id = trim(array_pop($args), '/');
+                        if (null !== $page) {
+                            $new_font = $page->getFont($id);
+                            // If an invalid font ID is given, do not update the font.
+                            // This should theoretically never happen, as the PDF spec states for the Tf operator:
+                            // "The specified font value shall match a resource name in the Font entry of the default resource dictionary"
+                            // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435)
+                            // But we want to make sure that malformed PDFs do not simply crash.
+                            if (null !== $new_font) {
+                                $current_font = $new_font;
+                                $current_font_size = $size;
+                            }
+                        }
                         break;
 
                         // set leading
                     case 'TL':
+                        $y = (float) $command[self::COMMAND];
+                        $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j'];
                         break;
 
+                        // set text position matrix
                     case 'Tm':
+                        $args = preg_split('/\s+/s', $command[self::COMMAND]);
+                        $current_position_tm = [
+                            'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0,
+                            'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0,
+                            'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1,
+                        ];
+                        break;
+
+                        // set text rendering mode (NOTE: the PDF operator for this is 'Tr'; 'Ts' is text rise)
+                    case 'Ts':
                         break;
 
                         // set super/subscripting text rise
@@ -513,59 +1030,6 @@ class PDFObject
 
                         // set horizontal scaling
                     case 'Tz':
-                        // $text .= "\n";
-                        break;
-
-                        // move to start of next line
-                    case 'T*':
-                        // $text .= "\n";
-                        break;
-
-                    case 'Da':
-                        break;
-
-                    case 'Do':
-                        if (null !== $page) {
-                            $args = preg_split('/\s/s', $command[self::COMMAND]);
-                            $id = trim(array_pop($args), '/ ');
-                            if ($xobject = $page->getXObject($id)) {
-                                $text[] = $xobject->getText($page);
-                            }
-                        }
-                        break;
-
-                    case 'rg':
-                    case 'RG':
-                        break;
-
-                    case 're':
-                        break;
-
-                    case 'co':
-                        break;
-
-                    case 'cs':
-                        break;
-
-                    case 'gs':
-                        break;
-
-                    case 'en':
-                        break;
-
-                    case 'sc':
-                    case 'SC':
-                        break;
-
-                    case 'g':
-                    case 'G':
-                        break;
-
-                    case 'V':
-                        break;
-
-                    case 'vo':
-                    case 'Vo':
                         break;
 
                     default:
@@ -573,198 +1037,103 @@ class PDFObject
             }
         }
 
-        return $text;
+        $result = array_merge($result, $text);
+
+        return $result;
     }
 
+    /**
+     * getCommandsText() expects the content of $text_part to be an
+     * already formatted, single-line command from a document stream.
+     * The companion function getSectionsText() returns a document
+     * stream as an array of single commands for just this purpose.
+     * Because of this, the argument $offset is no longer used, and
+     * may be removed in a future PdfParser release.
+     *
+     * A better name for this function would be getCommandText()
+     * since it now always works on just one command.
+     */
     public function getCommandsText(string $text_part, int &$offset = 0): array
     {
         $commands = $matches = [];
 
-        while ($offset < \strlen($text_part)) {
-            $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset);
-            $char = $text_part[$offset];
+        preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);
 
-            $operator = '';
-            $type = '';
-            $command = false;
-
-            switch ($char) {
-                case '/':
-                    $type = $char;
-                    if (preg_match(
-                        '/\G\/([A-Z0-9\._,\+]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si',
-                        $text_part,
-                        $matches,
-                        0,
-                        $offset
-                    )
-                    ) {
-                        $operator = $matches[2];
-                        $command = $matches[1];
-                        $offset += \strlen($matches[0]);
-                    } elseif (preg_match(
-                        '/\G\/([A-Z0-9\._,\+]+)\s+([A-Z]+)\s*/si',
-                        $text_part,
-                        $matches,
-                        0,
-                        $offset
-                    )
-                    ) {
-                        $operator = $matches[2];
-                        $command = $matches[1];
-                        $offset += \strlen($matches[0]);
-                    }
-                    break;
-
-                case '[':
-                case ']':
-                    // array object
-                    $type = $char;
-                    if ('[' == $char) {
-                        ++$offset;
-                        // get elements
-                        $command = $this->getCommandsText($text_part, $offset);
-
-                        if (preg_match(
-                            '/\G\s*[A-Z]{1,2}\s*/si',
-                            $text_part,
-                            $matches,
-                            0,
-                            $offset
-                        )
-                        ) {
-                            $operator = trim($matches[0]);
-                            $offset += \strlen($matches[0]);
-                        }
-                    } else {
-                        ++$offset;
-                        break;
-                    }
-                    break;
-
-                case '<':
-                case '>':
-                    // array object
-                    $type = $char;
-                    ++$offset;
-                    if ('<' == $char) {
-                        $strpos = strpos($text_part, '>', $offset);
-                        $command = substr($text_part, $offset, $strpos - $offset);
-                        $offset = $strpos + 1;
-                    }
-
-                    if (preg_match(
-                        '/\G\s*[A-Z]{1,2}\s*/si',
-                        $text_part,
-                        $matches,
-                        0,
-                        $offset
-                    )
-                    ) {
-                        $operator = trim($matches[0]);
-                        $offset += \strlen($matches[0]);
-                    }
-                    break;
-
-                case '(':
-                case ')':
-                    ++$offset;
-                    $type = $char;
-                    $strpos = $offset;
-                    if ('(' == $char) {
-                        $open_bracket = 1;
-                        while ($open_bracket > 0) {
-                            if (!isset($text_part[$strpos])) {
-                                break;
-                            }
-                            $ch = $text_part[$strpos];
-                            switch ($ch) {
-                                case '\\':
-                                    // REVERSE SOLIDUS (5Ch) (Backslash)
-                                    // skip next character
-                                    ++$strpos;
-                                    break;
-
-                                case '(':
-                                    // LEFT PARENHESIS (28h)
-                                    ++$open_bracket;
-                                    break;
-
-                                case ')':
-                                    // RIGHT PARENTHESIS (29h)
-                                    --$open_bracket;
-                                    break;
-                            }
-                            ++$strpos;
-                        }
-                        $command = substr($text_part, $offset, $strpos - $offset - 1);
-                        $offset = $strpos;
-
-                        if (preg_match(
-                            '/\G\s*([A-Z\']{1,2})\s*/si',
-                            $text_part,
-                            $matches,
-                            0,
-                            $offset
-                        )
-                        ) {
-                            $operator = $matches[1];
-                            $offset += \strlen($matches[0]);
-                        }
-                    }
-                    break;
-
-                default:
-                    if ('ET' == substr($text_part, $offset, 2)) {
-                        break;
-                    } elseif (preg_match(
-                        '/\G\s*(?P<data>([0-9\.\-]+\s*?)+)\s+(?P<id>[A-Z]{1,3})\s*/si',
-                        $text_part,
-                        $matches,
-                        0,
-                        $offset
-                    )
-                    ) {
-                        $operator = trim($matches['id']);
-                        $command = trim($matches['data']);
-                        $offset += \strlen($matches[0]);
-                    } elseif (preg_match(
-                        '/\G\s*([0-9\.\-]+\s*?)+\s*/si',
-                        $text_part,
-                        $matches,
-                        0,
-                        $offset
-                    )
-                    ) {
-                        $type = 'n';
-                        $command = trim($matches[0]);
-                        $offset += \strlen($matches[0]);
-                    } elseif (preg_match(
-                        '/\G\s*([A-Z\*]+)\s*/si',
-                        $text_part,
-                        $matches,
-                        0,
-                        $offset
-                    )
-                    ) {
-                        $type = '';
-                        $operator = $matches[1];
-                        $command = '';
-                        $offset += \strlen($matches[0]);
-                    }
-            }
-
-            if (false !== $command) {
-                $commands[] = [
-                    self::TYPE => $type,
-                    self::OPERATOR => $operator,
-                    self::COMMAND => $command,
-                ];
-            } else {
-                break;
-            }
+        // If no valid command is detected, return an empty array
+        if (!isset($matches[1]) || !isset($matches[2]) || !isset($matches[3])) {
+            return [];
         }
 
+        $type = $matches[2];
+        $operator = $matches[3];
+        $command = trim($matches[1]);
+
+        if ('TJ' == $operator) {
+            $subcommand = [];
+            $command = trim($command, '[]');
+            do {
+                $oldCommand = $command;
+
+                // Search for parentheses string () format
+                if (preg_match('/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/', $command, $tjmatch)) {
+                    $subcommand[] = [
+                        self::TYPE => '(',
+                        self::OPERATOR => 'TJ',
+                        self::COMMAND => $tjmatch[1],
+                    ];
+                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
+                        $subcommand[] = [
+                            self::TYPE => 'n',
+                            self::OPERATOR => '',
+                            self::COMMAND => $tjmatch[2],
+                        ];
+                    }
+                    $command = substr($command, \strlen($tjmatch[0]));
+                }
+
+                // Search for hexadecimal <> format
+                if (preg_match('/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i', $command, $tjmatch)) {
+                    $tjmatch[1] = preg_replace('/\s/', '', $tjmatch[1]);
+                    $subcommand[] = [
+                        self::TYPE => '<',
+                        self::OPERATOR => 'TJ',
+                        self::COMMAND => $tjmatch[1],
+                    ];
+                    if (isset($tjmatch[2]) && trim($tjmatch[2])) {
+                        $subcommand[] = [
+                            self::TYPE => 'n',
+                            self::OPERATOR => '',
+                            self::COMMAND => $tjmatch[2],
+                        ];
+                    }
+                    $command = substr($command, \strlen($tjmatch[0]));
+                }
+            } while ($command != $oldCommand);
+
+            $command = $subcommand;
+        } elseif ('Tj' == $operator || "'" == $operator || '"' == $operator) {
+            // Depending on the string type, trim the data of the
+            // appropriate delimiters
+            if ('(' == $type) {
+                // Don't use trim() here since a () string may end with
+                // a balanced or escaped right parentheses, and trim()
+                // will delete both. Both strings below are valid:
+                //   eg. (String())
+                //   eg. (String\))
+                $command = preg_replace('/^\(|\)$/', '', $command);
+            } elseif ('<' == $type) {
+                $command = trim($command, '<>');
+            }
+        } elseif ('/' == $type) {
+            $command = substr($command, 1);
+        }
+
+        $commands[] = [
+            self::TYPE => $type,
+            self::OPERATOR => $operator,
+            self::COMMAND => $command,
+        ];
+
         return $commands;
     }
 
@@ -772,7 +1141,7 @@ class PDFObject
         Document $document,
         Header $header,
         ?string $content,
-        Config $config = null
+        ?Config $config = null
     ): self {
         switch ($header->get('Type')->getContent()) {
             case 'XObject':
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Page.php b/orcinus/pdfparser/src/Smalot/PdfParser/Page.php
index fbc1987..d6ffaf0 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Page.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Page.php
@@ -176,7 +176,7 @@ class Page extends PDFObject
         }*/
     }
 
-    public function getText(self $page = null): string
+    public function getText(?self $page = null): string
     {
         if ($contents = $this->get('Contents')) {
             if ($contents instanceof ElementMissing) {
@@ -312,7 +312,7 @@ class Page extends PDFObject
         return new self($pdfObject->document, $header, $new_content, $config);
     }
 
-    public function getTextArray(self $page = null): array
+    public function getTextArray(?self $page = null): array
     {
         if ($this->isFpdf()) {
             $pdfObject = $this->getPDFObjectForFpdf();
@@ -400,8 +400,6 @@ class Page extends PDFObject
             }
             $sectionsText = $content->getSectionsText($content->getContent());
             foreach ($sectionsText as $sectionText) {
-                $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
-
                 $commandsText = $content->getCommandsText($sectionText);
                 foreach ($commandsText as $command) {
                     $extractedData[] = $command;
@@ -420,7 +418,7 @@ class Page extends PDFObject
      *
      * @return array An array with the data and the internal representation
      */
-    public function extractDecodedRawData(array $extractedRawData = null): array
+    public function extractDecodedRawData(?array $extractedRawData = null): array
     {
         if (!isset($extractedRawData) || !$extractedRawData) {
             $extractedRawData = $this->extractRawData();
@@ -500,7 +498,7 @@ class Page extends PDFObject
      *
      * @return array An array with the text command of the page
      */
-    public function getDataCommands(array $extractedDecodedRawData = null): array
+    public function getDataCommands(?array $extractedDecodedRawData = null): array
     {
         if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
             $extractedDecodedRawData = $this->extractDecodedRawData();
@@ -651,7 +649,7 @@ class Page extends PDFObject
      * @return array an array with the data of the page including the Tm information
      *               of any text in the page
      */
-    public function getDataTm(array $dataCommands = null): array
+    public function getDataTm(?array $dataCommands = null): array
     {
         if (!isset($dataCommands) || !$dataCommands) {
             $dataCommands = $this->getDataCommands();
@@ -701,6 +699,12 @@ class Page extends PDFObject
         $extractedTexts = $this->getTextArray();
         $extractedData = [];
         foreach ($dataCommands as $command) {
+            // If we've used up all the texts from getTextArray(), exit
+            // so we aren't accessing non-existent array indices
+            // Fixes 'undefined array key' errors in Issues #575, #576
+            if (\count($extractedTexts) <= \count($extractedData)) {
+                break;
+            }
             $currentText = $extractedTexts[\count($extractedData)];
             switch ($command['o']) {
                 /*
@@ -712,21 +716,13 @@ class Page extends PDFObject
                     $Tl = $defaultTl;
                     $Tx = 0;
                     $Ty = 0;
-                    $fontId = $defaultFontId;
-                    $fontSize = $defaultFontSize;
                     break;
 
                     /*
                      * ET
-                     * End a text object, discarding the text matrix
+                     * End a text object
                      */
                 case 'ET':
-                    $Tm = $defaultTm;
-                    $Tl = $defaultTl;
-                    $Tx = 0;
-                    $Ty = 0;
-                    $fontId = $defaultFontId;
-                    $fontSize = $defaultFontSize;
                     break;
 
                     /*
@@ -741,7 +737,7 @@ class Page extends PDFObject
 
                     /*
                      * tx ty Td
-                     * Move to the start of the next line, offset form the start of the
+                     * Move to the start of the next line, offset from the start of the
                      * current line by tx, ty.
                      */
                 case 'Td':
@@ -898,7 +894,7 @@ class Page extends PDFObject
      *               "near" the x,y coordinate, an empty array is returned. If Both, x
      *               and y coordinates are null, null is returned.
      */
-    public function getTextXY(float $x = null, float $y = null, float $xError = 0, float $yError = 0): array
+    public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
     {
         if (!isset($this->dataTm) || !$this->dataTm) {
             $this->getDataTm();
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php b/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php
index 3078f9e..b051f11 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php
@@ -60,7 +60,7 @@ class Parser
 
     protected $rawDataParser;
 
-    public function __construct($cfg = [], Config $config = null)
+    public function __construct($cfg = [], ?Config $config = null)
     {
         $this->config = $config ?: new Config();
         $this->rawDataParser = new RawDataParser($cfg, $this->config);
@@ -77,6 +77,7 @@ class Parser
     public function parseFile(string $filename): Document
     {
         $content = file_get_contents($filename);
+
         /*
          * 2018/06/20 @doganoo as multiple times a
          * users have complained that the parseFile()
@@ -101,7 +102,7 @@ class Parser
         // Create structure from raw data.
         list($xref, $data) = $this->rawDataParser->parseData($content);
 
-        if (isset($xref['trailer']['encrypt'])) {
+        if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) {
             throw new \Exception('Secured pdf file are currently not supported.');
         }
 
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php
index c8d2740..a6f11b3 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php
@@ -233,32 +233,32 @@ class FilterHelper
      */
     protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string
     {
-        /*
-         * gzuncompress may throw a not catchable E_WARNING in case of an error (like $data is empty)
-         * the following set_error_handler changes an E_WARNING to an E_ERROR, which is catchable.
-         */
-        set_error_handler(function ($errNo, $errStr) {
-            if (\E_WARNING === $errNo) {
-                throw new \Exception($errStr);
-            } else {
-                // fallback to default php error handler
-                return false;
-            }
-        });
+        // gzuncompress() raises an E_WARNING (not an exception) on a "data
+        // error"; it is @-suppressed here so execution can fall through to
+        // the alternate decompression method below.
+        $decoded = @gzuncompress($data, $decodeMemoryLimit);
 
-        $decoded = null;
-
-        // initialize string to return
-        try {
-            $decoded = gzuncompress($data, $decodeMemoryLimit);
-            if (false === $decoded) {
-                throw new \Exception('decodeFilterFlateDecode: invalid code');
+        if (false === $decoded) {
+            // If gzuncompress() failed, try again using the compress.zlib://
+            // wrapper to decode it in a file-based context.
+            // See: https://www.php.net/manual/en/function.gzuncompress.php#79042
+            // Issue: https://github.com/smalot/pdfparser/issues/592
+            $ztmp = tmpfile();
+            if (false != $ztmp) {
+                fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data);
+                $file = stream_get_meta_data($ztmp)['uri'];
+                if (0 === $decodeMemoryLimit) {
+                    $decoded = file_get_contents('compress.zlib://'.$file);
+                } else {
+                    $decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit);
+                }
+                fclose($ztmp);
             }
-        } catch (\Exception $e) {
-            throw $e;
-        } finally {
-            // Restore old handler just in case it was customized outside of PDFParser.
-            restore_error_handler();
+        }
+
+        if (false === \is_string($decoded) || '' === $decoded) {
+            // A non-string result or an empty string means decoding failed.
+            throw new \Exception('decodeFilterFlateDecode: invalid data');
         }
 
         return $decoded;
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php
index 1a4583c..5e17083 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php
@@ -47,12 +47,14 @@ use Smalot\PdfParser\Config;
 class RawDataParser
 {
     /**
-     * @var \Smalot\PdfParser\Config
+     * @var Config
      */
     private $config;
 
     /**
      * Configuration array.
+     *
+     * @var array<string,bool>
      */
     protected $cfg = [
         // if `true` ignore filter decoding errors
@@ -67,7 +69,7 @@ class RawDataParser
     /**
      * @param array $cfg Configuration array, default is []
      */
-    public function __construct($cfg = [], Config $config = null)
+    public function __construct($cfg = [], ?Config $config = null)
     {
         // merge given array with default values
         $this->cfg = array_merge($this->cfg, $cfg);
@@ -125,7 +127,7 @@ class RawDataParser
         // decode the stream
         $remaining_filters = [];
         foreach ($filters as $filter) {
-            if (\in_array($filter, $this->filterHelper->getAvailableFilters())) {
+            if (\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
                 try {
                     $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
                 } catch (\Exception $e) {
@@ -402,14 +404,19 @@ class RawDataParser
                     }
                     $prev_row = $ddata[$k];
                 } // end for each row
-                // complete decoding
+            // complete decoding
             } else {
                 // number of bytes in a row
                 $rowlen = array_sum($wb);
-                // convert the stream into an array of integers
-                $sdata = unpack('C*', $xrefcrs[1][3][0]);
-                // split the rows
-                $ddata = array_chunk($sdata, $rowlen);
+                if (0 < $rowlen) {
+                    // convert the stream into an array of integers
+                    $sdata = unpack('C*', $xrefcrs[1][3][0]);
+                    // split the rows
+                    $ddata = array_chunk($sdata, $rowlen);
+                } else {
+                    // if the row length is zero, $ddata should be an empty array as well
+                    $ddata = [];
+                }
             }
 
             $sdata = [];
@@ -609,7 +616,7 @@ class RawDataParser
      *
      * @return array containing object type, raw value and offset to next object
      */
-    protected function getRawObject(string $pdfData, int $offset = 0, array $headerDic = null): array
+    protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
     {
         $objtype = ''; // object type to be returned
         $objval = ''; // object value to be returned
@@ -756,7 +763,7 @@ class RawDataParser
                     // start stream object
                     $objtype = 'stream';
                     $offset += 6;
-                    if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
+                    if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
                         $offset += \strlen($matches[0]);
 
                         // we get stream length here to later help preg_match test less data
@@ -857,39 +864,39 @@ class RawDataParser
      */
     protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
     {
-        $startxrefPreg = preg_match(
-            '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
+        // If the $offset is currently pointed at whitespace, bump it
+        // forward until it isn't; affects loosely targeted offsets
+        // for the 'xref' keyword
+        // See: https://github.com/smalot/pdfparser/issues/673
+        $bumpOffset = $offset;
+        while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
+            ++$bumpOffset;
+        }
+
+        // Find all startxref tables from this $offset forward
+        $startxrefPreg = preg_match_all(
+            '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
             $pdfData,
-            $matches,
-            \PREG_OFFSET_CAPTURE,
+            $startxrefMatches,
+            \PREG_SET_ORDER,
             $offset
         );
 
-        if (0 == $offset) {
-            // find last startxref
-            $pregResult = preg_match_all(
-                '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
-                $pdfData,
-                $matches,
-                \PREG_SET_ORDER,
-                $offset
-            );
-            if (0 == $pregResult) {
-                throw new \Exception('Unable to find startxref');
-            }
-            $matches = array_pop($matches);
-            $startxref = $matches[1];
-        } elseif (strpos($pdfData, 'xref', $offset) == $offset) {
-            // Already pointing at the xref table
-            $startxref = $offset;
-        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
-            // Cross-Reference Stream object
-            $startxref = $offset;
-        } elseif ($startxrefPreg) {
-            // startxref found
-            $startxref = $matches[1][0];
-        } else {
+        if (0 == $startxrefPreg) {
+            // No startxref tables were found
             throw new \Exception('Unable to find startxref');
+        } elseif (0 == $offset) {
+            // Use the last startxref in the document
+            $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
+        } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) {
+            // Already pointing at the xref table
+            $startxref = $bumpOffset;
+        } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
+            // Cross-Reference Stream object
+            $startxref = $bumpOffset;
+        } else {
+            // Use the next startxref from this $offset
+            $startxref = (int) $startxrefMatches[0][1];
         }
 
         if ($startxref > \strlen($pdfData)) {
@@ -901,8 +908,15 @@ class RawDataParser
             // Cross-Reference
             $xref = $this->decodeXref($pdfData, $startxref, $xref);
         } else {
-            // Cross-Reference Stream
-            $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
+            // Check whether $startxref points at 'xref' once CRLF is converted to LF
+            $pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
+            if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) {
+                // Return Unix-line-ending flag
+                $xref = ['Unix' => true];
+            } else {
+                // Cross-Reference Stream
+                $xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
+            }
         }
         if (empty($xref)) {
             throw new \Exception('Unable to find xref');
@@ -937,6 +951,12 @@ class RawDataParser
         // get xref and trailer data
         $xref = $this->getXrefData($pdfData);
 
+        // If getXrefData() flagged that the xref offsets assume Unix (LF) line-endings
+        if (isset($xref['Unix'])) {
+            $pdfData = str_replace("\r\n", "\n", $pdfData);
+            $xref = $this->getXrefData($pdfData);
+        }
+
         // parse all document objects
         $objects = [];
         foreach ($xref['xref'] as $obj => $offset) {
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php
index 7caec8c..8e60647 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php
@@ -41,7 +41,7 @@ use Smalot\PdfParser\PDFObject;
  */
 class Form extends Page
 {
-    public function getText(Page $page = null): string
+    public function getText(?Page $page = null): string
     {
         $header = new Header([], $this->document);
         $contents = new PDFObject($this->document, $header, $this->content, $this->config);
diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php
index 1265582..6dc6b0a 100644
--- a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php
@@ -40,7 +40,7 @@ use Smalot\PdfParser\PDFObject;
  */
 class Image extends PDFObject
 {
-    public function getText(Page $page = null): string
+    public function getText(?Page $page = null): string
     {
         return '';
     }