From fb7e29549024da73706906fafb6f2f8d3a217bcc Mon Sep 17 00:00:00 2001 From: Brian Huisman Date: Thu, 16 May 2024 12:36:43 -0400 Subject: [PATCH] Update PdfParser to 2.10.0 --- .../pdfparser/src/Smalot/PdfParser/Config.php | 21 + .../src/Smalot/PdfParser/Document.php | 28 +- .../src/Smalot/PdfParser/Element.php | 6 +- .../Smalot/PdfParser/Element/ElementArray.php | 4 +- .../PdfParser/Element/ElementBoolean.php | 2 +- .../Smalot/PdfParser/Element/ElementDate.php | 4 +- .../Smalot/PdfParser/Element/ElementHexa.php | 16 +- .../Smalot/PdfParser/Element/ElementName.php | 2 +- .../Smalot/PdfParser/Element/ElementNull.php | 2 +- .../PdfParser/Element/ElementNumeric.php | 2 +- .../PdfParser/Element/ElementString.php | 2 +- .../PdfParser/Element/ElementStruct.php | 2 +- .../Smalot/PdfParser/Element/ElementXRef.php | 2 +- .../src/Smalot/PdfParser/Encoding.php | 6 + .../PdfParser/Encoding/PDFDocEncoding.php | 4 +- .../pdfparser/src/Smalot/PdfParser/Font.php | 106 +- .../pdfparser/src/Smalot/PdfParser/Header.php | 4 +- .../src/Smalot/PdfParser/PDFObject.php | 1299 +++++++++++------ .../pdfparser/src/Smalot/PdfParser/Page.php | 32 +- .../pdfparser/src/Smalot/PdfParser/Parser.php | 5 +- .../Smalot/PdfParser/RawData/FilterHelper.php | 48 +- .../PdfParser/RawData/RawDataParser.php | 100 +- .../src/Smalot/PdfParser/XObject/Form.php | 2 +- .../src/Smalot/PdfParser/XObject/Image.php | 2 +- 24 files changed, 1080 insertions(+), 621 deletions(-) diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Config.php b/orcinus/pdfparser/src/Smalot/PdfParser/Config.php index ff69d3e..e44b164 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Config.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Config.php @@ -82,6 +82,13 @@ class Config */ private $dataTmFontInfoHasToBeIncluded = false; + /** + * Whether to attempt to read PDFs even if they are marked as encrypted. + * + * @var bool + */ + private $ignoreEncryption = false; + public function getFontSpaceLimit() { return $this->fontSpaceLimit; @@ -151,4 +158,18 @@ class Config { $this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded; } + + public function getIgnoreEncryption(): bool + { + return $this->ignoreEncryption; + } + + /** + * @deprecated this is a temporary workaround, don't rely on it + * @see https://github.com/smalot/pdfparser/pull/653 + */ + public function setIgnoreEncryption(bool $ignoreEncryption): void + { + $this->ignoreEncryption = $ignoreEncryption; + } } diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Document.php b/orcinus/pdfparser/src/Smalot/PdfParser/Document.php index d2cec38..016787a 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Document.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Document.php @@ -255,7 +255,7 @@ class Document if ('rdf:li' == $val['tag']) { $metadata[] = $val['value']; - // Else assign a value to this property + // Else assign a value to this property } else { $metadata[$val['tag']] = $val['value']; } @@ -263,12 +263,20 @@ class Document break; case 'close': - // If the value of this property is a single- - // element array where the element is of type - // string, use the value of the first list item - // as the value for this property - if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) { - $metadata = $metadata[0]; + // If the value of this property is an array + if (\is_array($metadata)) { + // If the value is a single element array + // where the element is of type string, use + // the value of the first list item as the + // value for this property + if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) { + $metadata = $metadata[0]; + } elseif (0 == \count($metadata)) { + // if the value is an empty array, set + // the value of this property to the empty + // string + $metadata = ''; + } } // Move down one level in the stack @@ -328,12 +336,12 @@ class Document return null; } - public function hasObjectsByType(string $type, string $subtype = null): bool + public function hasObjectsByType(string $type, ?string $subtype = null): bool { return 0 < \count($this->getObjectsByType($type, $subtype)); } - public function getObjectsByType(string $type, string $subtype = null): array + public function getObjectsByType(string $type, ?string $subtype = null): array { if (!isset($this->dictionary[$type])) { return []; @@ -410,7 +418,7 @@ class Document throw new \Exception('Missing catalog.'); } - public function getText(int $pageLimit = null): string + public function getText(?int $pageLimit = null): string { $texts = []; $pages = $this->getPages(); diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element.php index 0ce6c42..8066030 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element.php @@ -49,13 +49,13 @@ use Smalot\PdfParser\Element\ElementXRef; class Element { /** - * @var Document + * @var Document|null */ protected $document; protected $value; - public function __construct($value, Document $document = null) + public function __construct($value, ?Document $document = null) { $this->value = $value; $this->document = $document; @@ -96,7 +96,7 @@ class Element return (string) $this->value; } - public static function parse(string $content, Document $document = null, int &$position = 0) + public static function parse(string $content, ?Document $document = null, int &$position = 0) { $args = \func_get_args(); $only_values = isset($args[3]) ? $args[3] : false; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php index 6ad2220..b54bf84 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementArray.php @@ -42,7 +42,7 @@ use Smalot\PdfParser\PDFObject; */ class ElementArray extends Element { - public function __construct($value, Document $document = null) + public function __construct($value, ?Document $document = null) { parent::__construct($value, $document); } @@ -107,7 +107,7 @@ class ElementArray extends Element * * @return bool|ElementArray */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\[(?P.*)/is', $content, $match)) { preg_match_all('/(.*?)(\[|\])/s', trim($content), $matches); diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php index 4831a4a..55fb463 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementBoolean.php @@ -61,7 +61,7 @@ class ElementBoolean extends Element /** * @return bool|ElementBoolean */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*(?Ptrue|false)/is', $content, $match)) { $value = $match['value']; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php index c4d3984..f1f2df6 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementDate.php @@ -40,7 +40,7 @@ use Smalot\PdfParser\Document; class ElementDate extends ElementString { /** - * @var array + * @var array */ protected static $formats = [ 4 => 'Y', @@ -98,7 +98,7 @@ class ElementDate extends ElementString /** * @return bool|ElementDate */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\(D\:(?P.*?)\)/s', $content, $match)) { $name = $match['name']; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php index d031461..3fc3413 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementHexa.php @@ -42,7 +42,7 @@ class ElementHexa extends ElementString /** * @return bool|ElementHexa|ElementDate */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\<(?P[A-F0-9]+)\>/is', $content, $match)) { $name = $match['name']; @@ -64,15 +64,21 @@ class ElementHexa extends ElementString public static function decode(string $value): string { $text = ''; - $length = \strlen($value); - if ('00' === substr($value, 0, 2)) { - for ($i = 0; $i < $length; $i += 4) { + // Filter $value of non-hexadecimal characters + $value = (string) preg_replace('/[^0-9a-f]/i', '', $value); + + // Check for leading zeros (4-byte hexadecimal indicator), or + // the BE BOM + if ('00' === substr($value, 0, 2) || 'feff' === strtolower(substr($value, 0, 4))) { + $value = (string) preg_replace('/^feff/i', '', $value); + for ($i = 0, $length = \strlen($value); $i < $length; $i += 4) { $hex = substr($value, $i, 4); $text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';'; } } else { - for ($i = 0; $i < $length; $i += 2) { + // Otherwise decode this as 2-byte hexadecimal + for ($i = 0, $length = \strlen($value); $i < $length; $i += 2) { $hex = substr($value, $i, 2); $text .= \chr(hexdec($hex)); } diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php index 0f8d06b..6e8d97a 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementName.php @@ -54,7 +54,7 @@ class ElementName extends Element /** * @return bool|ElementName */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\/([A-Z0-9\-\+,#\.]+)/is', $content, $match)) { $name = $match[1]; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php index 8757630..9af8843 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNull.php @@ -58,7 +58,7 @@ class ElementNull extends Element /** * @return bool|ElementNull */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*(null)/s', $content, $match)) { $offset += strpos($content, 'null') + \strlen('null'); diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php index 80885c1..5454acc 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementNumeric.php @@ -48,7 +48,7 @@ class ElementNumeric extends Element /** * @return bool|ElementNumeric */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*(?P\-?[0-9\.]+)/s', $content, $match)) { $value = $match['value']; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php index a18ba5f..011bcf4 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementString.php @@ -54,7 +54,7 @@ class ElementString extends Element /** * @return bool|ElementString */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*\((?P.*)/s', $content, $match)) { $name = $match['name']; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php index 7c95559..c37b6da 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementStruct.php @@ -44,7 +44,7 @@ class ElementStruct extends Element /** * @return false|Header */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*<<(?P.*)/is', $content)) { preg_match_all('/(.*?)(<<|>>)/s', trim($content), $matches); diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php index 50531a7..ebba71a 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Element/ElementXRef.php @@ -83,7 +83,7 @@ class ElementXRef extends Element /** * @return bool|ElementXRef */ - public static function parse(string $content, Document $document = null, int &$offset = 0) + public static function parse(string $content, ?Document $document = null, int &$offset = 0) { if (preg_match('/^\s*(?P[0-9]+\s+[0-9]+\s+R)/s', $content, $match)) { $id = $match['id']; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php index 6018eec..511411b 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php @@ -145,6 +145,12 @@ class Encoding extends PDFObject { // Load reference table charset. $baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent()); + + // Check for empty BaseEncoding field value + if (!\is_string($baseEncoding) || 0 == \strlen($baseEncoding)) { + $baseEncoding = 'StandardEncoding'; + } + $className = '\\Smalot\\PdfParser\\Encoding\\'.$baseEncoding; if (!class_exists($className)) { diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php index 60e5616..70bc48c 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php @@ -178,7 +178,7 @@ class PDFDocEncoding "\xfc" => "\u{00fc}", // udieresis "\xfd" => "\u{00fd}", // yacute "\xfe" => "\u{00fe}", // thorn - "\xff" => "\u{00ff}", // ydieresis + "\xff" => "\u{00ff}", // ydieresis ]; } @@ -186,4 +186,4 @@ class PDFDocEncoding { return strtr($content, static::getCodePage()); } -} \ No newline at end of file +} diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Font.php b/orcinus/pdfparser/src/Smalot/PdfParser/Font.php index 9e4db9f..cfe85d7 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Font.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Font.php @@ -134,9 +134,16 @@ class Font extends PDFObject /** * Convert unicode character code to "utf-8" encoded string. + * + * @param int|float $code Unicode character code. Will be casted to int internally! */ - public static function uchr(int $code): string + public static function uchr($code): string { + // note: + // $code was typed as int before, but changed in https://github.com/smalot/pdfparser/pull/623 + // because in some cases uchr was called with a float instead of an integer. + $code = (int) $code; + if (!isset(self::$uchrCache[$code])) { // html_entity_decode() will not work with UTF-16 or UTF-32 char entities, // therefore, we use mb_convert_encoding() instead @@ -272,11 +279,13 @@ class Font extends PDFObject /** * Calculate text width with data from header 'Widths'. If width of character is not found then character is added to missing array. */ - public function calculateTextWidth(string $text, array &$missing = null): ?float + public function calculateTextWidth(string $text, ?array &$missing = null): ?float { $index_map = array_flip($this->table); $details = $this->getDetails(); - $widths = $details['Widths']; + + // Usually, Widths key is set in $details array, but if it isn't use an empty array instead. + $widths = $details['Widths'] ?? []; // Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar $width_map = array_flip(range($details['FirstChar'], $details['LastChar'])); @@ -312,12 +321,12 @@ class Font extends PDFObject } $text = ''; - $parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE); + $parts = preg_split('/(<[a-f0-9\s]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE); foreach ($parts as $part) { - if (preg_match('/^<.*>$/s', $part) && false === stripos($part, '$/si', $part)) { + // strip whitespace + $part = preg_replace("/\s/", '', $part); $part = trim($part, '<>'); if ($add_braces) { $text .= '('; @@ -342,18 +351,20 @@ class Font extends PDFObject */ public static function decodeOctal(string $text): string { - $parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE); - $text = ''; + // Replace all double backslashes \\ with a special string + $text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']); - foreach ($parts as $part) { - if (preg_match('/^\\\\[0-7]{3}$/', $part)) { - $text .= \chr(octdec(trim($part, '\\'))); - } else { - $text .= $part; - } - } + // Now we can replace all octal codes without worrying about + // escaped backslashes + $text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) { + return \chr(octdec($m[1])); + }, $text); - return $text; + // Unescape any parentheses + $text = str_replace(['\\(', '\\)'], ['(', ')'], $text); + + // Replace instances of the special string with a single backslash + return str_replace('[**pdfparserdblslsh**]', '\\', $text); } /** @@ -361,18 +372,9 @@ class Font extends PDFObject */ public static function decodeEntities(string $text): string { - $parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE); - $text = ''; - - foreach ($parts as $part) { - if (preg_match('/^#\d{2}$/', $part)) { - $text .= \chr(hexdec(trim($part, '#'))); - } else { - $text .= $part; - } - } - - return $text; + return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) { + return \chr(hexdec($m[1])); + }, $text); } /** @@ -384,7 +386,7 @@ class Font extends PDFObject */ public static function decodeUnicode(string $text): string { - if (preg_match('/^\xFE\xFF/i', $text)) { + if ("\xFE\xFF" === substr($text, 0, 2)) { // Strip U+FEFF byte order marker. $decode = substr($text, 2); $text = ''; @@ -409,16 +411,17 @@ class Font extends PDFObject /** * Decode text by commands array. */ - public function decodeText(array $commands): string + public function decodeText(array $commands, float $fontFactor = 4): string { $word_position = 0; $words = []; - $font_space = $this->getFontSpaceLimit(); + $font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4; foreach ($commands as $command) { switch ($command[PDFObject::TYPE]) { case 'n': - if ((float) trim($command[PDFObject::COMMAND]) < $font_space) { + $offset = (float) trim($command[PDFObject::COMMAND]); + if ($offset - (float) $font_space < 0) { $word_position = \count($words); } continue 2; @@ -434,8 +437,8 @@ class Font extends PDFObject // replace escaped chars $text = str_replace( - ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '], - ['\\', '(', ')', "\n", "\r", "\t", "\f", ' '], + ['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ ', '\b'], + [\chr(92), \chr(40), \chr(41), \chr(10), \chr(13), \chr(9), \chr(12), \chr(32), \chr(8)], $text ); @@ -449,9 +452,32 @@ class Font extends PDFObject foreach ($words as &$word) { $word = $this->decodeContent($word); + $word = str_replace("\t", ' ', $word); } - return implode(' ', $words); + // Remove internal "words" that are just spaces, but leave them + // if they are at either end of the array of words. This fixes, + // for example, lines that are justified to fill + // a whole row. + for ($x = \count($words) - 2; $x >= 1; --$x) { + if ('' === trim($words[$x], ' ')) { + unset($words[$x]); + } + } + $words = array_values($words); + + // Cut down on the number of unnecessary internal spaces by + // imploding the string on the null byte, and checking if the + // text includes extra spaces on either side. If so, merge + // where appropriate. + $words = implode("\x00\x00", $words); + $words = str_replace( + [" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"], + [' ', ' ', ' ', ' '], + $words + ); + + return $words; } /** @@ -459,8 +485,14 @@ class Font extends PDFObject * * @param bool $unicode This parameter is deprecated and might be removed in a future release */ - public function decodeContent(string $text, bool &$unicode = null): string + public function decodeContent(string $text, ?bool &$unicode = null): string { + // If this string begins with a UTF-16BE BOM, then decode it + // directly as Unicode + if ("\xFE\xFF" === substr($text, 0, 2)) { + return $this->decodeUnicode($text); + } + if ($this->has('ToUnicode')) { return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text); } diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Header.php b/orcinus/pdfparser/src/Smalot/PdfParser/Header.php index 562897c..b58773a 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Header.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Header.php @@ -43,7 +43,7 @@ use Smalot\PdfParser\Element\ElementXRef; class Header { /** - * @var Document + * @var Document|null */ protected $document; @@ -56,7 +56,7 @@ class Header * @param Element[] $elements list of elements * @param Document $document document */ - public function __construct(array $elements = [], Document $document = null) + public function __construct(array $elements = [], ?Document $document = null) { $this->elements = $elements; $this->document = $document; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php b/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php index c879176..87b5a6c 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/PDFObject.php @@ -54,7 +54,7 @@ class PDFObject public static $recursionStack = []; /** - * @var Document + * @var Document|null */ protected $document; @@ -69,15 +69,20 @@ class PDFObject protected $content; /** - * @var Config + * @var Config|null */ protected $config; + /** + * @var bool + */ + protected $addPositionWhitespace = false; + public function __construct( Document $document, - Header $header = null, - string $content = null, - Config $config = null + ?Header $header = null, + ?string $content = null, + ?Config $config = null ) { $this->document = $document; $this->header = $header ?? new Header(); @@ -127,6 +132,16 @@ class PDFObject return $this->content; } + /** + * Creates a duplicate of the document stream with + * strings and other items replaced by $char. Formerly + * getSectionsText() used this output to more easily gather offset + * values to extract text from the *actual* document stream. + * + * @deprecated function is no longer used and will be removed in a future release + * + * @internal + */ public function cleanContent(string $content, string $char = 'X') { $char = $char[0]; @@ -186,48 +201,298 @@ class PDFObject return $content; } - public function getSectionsText(?string $content): array + /** + * Takes a string of PDF document stream text and formats + * it into a multi-line string with one PDF command on each line, + * separated by \r\n. If the given string is null, or binary data + * is detected instead of a document stream then return an empty + * string. + */ + private function formatContent(?string $content): string { - $sections = []; - $content = ' '.$content.' '; - $textCleaned = $this->cleanContent($content, '_'); + if (null === $content) { + return ''; + } - // Extract text blocks. - if (preg_match_all('/(\sQ)?\s+BT[\s|\(|\[]+(.*?)\s*ET(\sq)?/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) { - foreach ($matches[2] as $pos => $part) { - $text = $part[0]; - if ('' === $text) { - continue; - } - $offset = $part[1]; - $section = substr($content, $offset, \strlen($text)); + // Outside of (String) and inline image content in PDF document + // streams, all text should conform to UTF-8. Test for binary + // content by deleting everything after the first open- + // parenthesis ( which indicates the beginning of a string, or + // the first ID command which indicates the beginning of binary + // inline image content. Then test what remains for valid + // UTF-8. If it's not UTF-8, return an empty string as this + // $content is most likely binary. Unfortunately, using + // mb_check_encoding(..., 'UTF-8') is not strict enough, so the + // following regexp, adapted from the W3, is used. See: + // https://www.w3.org/International/questions/qa-forms-utf-8.en + // We use preg_replace() instead of preg_match() to avoid "JIT + // stack limit exhausted" errors on larger files. + $utf8Filter = preg_replace('/( + [\x09\x0A\x0D\x20-\x7E] | # ASCII + [\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte + \xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs + [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte + \xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates + \xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3 + [\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15 + \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content)); - // Removes BDC and EMC markup. - $section = preg_replace('/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s', '${3}', $section.' '); + if ('' !== $utf8Filter) { + return ''; + } - // Add Q and q flags if detected around BT/ET. - // @see: https://github.com/smalot/pdfparser/issues/387 - $section = trim((!empty($matches[1][$pos][0]) ? "Q\n" : '').$section).(!empty($matches[3][$pos][0]) ? "\nq" : ''); + // Find all inline image content and replace them so they aren't + // affected by the next steps + $pdfInlineImages = []; + $offsetBI = 0; + while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) { + // Attempt to detemine if this instance of the 'BI' command + // actually occured within a (string) using the following + // steps: - $sections[] = $section; + // Step 1: Remove any escaped parentheses from the alleged + // image characteristics data + $para = str_replace(['\\(', '\\)'], '', $text[1][0]); + + // Step 2: Remove all correctly ordered and balanced + // parentheses from (strings) + do { + $paraTest = $para; + $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest); + } while ($para != $paraTest); + + $paraOpen = strpos($para, '('); + $paraClose = strpos($para, ')'); + + // Check: If the remaining text contains a close parenthesis + // ')' AND it occurs before any open parenthesis, then we + // are almost certain to be inside a (string) + if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) { + // Bump the search offset forward and match again + $offsetBI = (int) $text[1][1]; + continue; + } + + // Step 3: Double check that this is actually inline image + // data by parsing the alleged image characteristics as a + // dictionary + $dict = $this->parseDictionary('<<'.$text[1][0].'>>'); + + // Check if an image Width and Height are set in the dict + if ((isset($dict['W']) || isset($dict['Width'])) + && (isset($dict['H']) || isset($dict['Height']))) { + $id = uniqid('IMAGE_', true); + $pdfInlineImages[$id] = [ + preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]), + preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]), + ]; + $content = preg_replace( + '/'.preg_quote($text[0][0], '/').'/', + '^^^'.$id.'^^^', + $content, + 1 + ); + } else { + // If there was no valid dictionary, or a height and width + // weren't specified, then we don't know what this is, so + // just leave it alone; bump the search offset forward and + // match again + $offsetBI = (int) $text[1][1]; } } - // Extract 'do' commands. - if (preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) { - foreach ($matches[1] as $part) { - $text = $part[0]; - $offset = $part[1]; - $section = substr($content, $offset, \strlen($text)); + // Find all strings () and replace them so they aren't affected + // by the next steps + $pdfstrings = []; + $attempt = '('; + while (preg_match('/'.preg_quote($attempt, '/').'.*?(?> commands and replace them so they + // aren't affected by the next steps + $dictstore = []; + while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) { + $dictid = uniqid('DICT_', true); + $dictstore[$dictid] = $dicttext[1]; + $content = preg_replace( + '/'.preg_quote($dicttext[0], '/').'/', + ' ###'.$dictid.'###'.$dicttext[2], + $content, + 1 + ); + } + + // Normalize white-space in the document stream + $content = preg_replace('/\s{2,}/', ' ', $content); + + // Find all valid PDF operators and add \r\n after each; this + // ensures there is just one command on every line + // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A + // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A + // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while + // PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions + // appear here in the list for completeness. + $operators = [ + 'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS', + 'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs', + 'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n', + 'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC', + 'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw', + 'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"', + ]; + foreach ($operators as $operator) { + $content = preg_replace( + '/(?> commands + $dictstore = array_reverse($dictstore, true); + foreach ($dictstore as $id => $dict) { + $content = str_replace('###'.$id.'###', $dict, $content); + } + + // Restore the original string content + $pdfstrings = array_reverse($pdfstrings, true); + foreach ($pdfstrings as $id => $text) { + // Strings may contain escaped newlines, or literal newlines + // and we should clean these up before replacing the string + // back into the content stream; this ensures no strings are + // split between two lines (every command must be on one line) + $text = str_replace( + ["\\\r\n", "\\\r", "\\\n", "\r", "\n"], + ['', '', '', '\r', '\n'], + $text + ); + + $content = str_replace('@@@'.$id.'@@@', $text, $content); + } + + // Restore the original content of any inline images + $pdfInlineImages = array_reverse($pdfInlineImages, true); + foreach ($pdfInlineImages as $id => $image) { + $content = str_replace( + '^^^'.$id.'^^^', + "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n", + $content + ); + } + + $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content)); + + return $content; + } + + /** + * getSectionsText() now takes an entire, unformatted + * document stream as a string, cleans it, then filters out + * commands that aren't needed for text positioning/extraction. It + * returns an array of unprocessed PDF commands, one command per + * element. + * + * @internal + */ + public function getSectionsText(?string $content): array + { + $sections = []; + + // A cleaned stream has one command on every line, so split the + // cleaned stream content on \r\n into an array + $textCleaned = preg_split( + '/(\r\n|\n|\r)/', + $this->formatContent($content), + -1, + \PREG_SPLIT_NO_EMPTY + ); + + $inTextBlock = false; + foreach ($textCleaned as $line) { + $line = trim($line); + + // Skip empty lines + if ('' === $line) { + continue; + } + + // If a 'BT' is encountered, set the $inTextBlock flag + if (preg_match('/BT$/', $line)) { + $inTextBlock = true; + $sections[] = $line; + + // If an 'ET' is encountered, unset the $inTextBlock flag + } elseif ('ET' == $line) { + $inTextBlock = false; + $sections[] = $line; + } elseif ($inTextBlock) { + // If we are inside a BT ... ET text block, save all lines + $sections[] = trim($line); + } else { + // Otherwise, if we are outside of a text block, only + // save specific, necessary lines. Care should be taken + // to ensure a command being checked for *only* matches + // that command. For instance, a simple search for 'c' + // may also match the 'sc' command. See the command + // list in the formatContent() method above. + // Add more commands to save here as you find them in + // weird PDFs! + if ('q' == $line[-1] || 'Q' == $line[-1]) { + // Save and restore graphics state commands + $sections[] = $line; + } elseif (preg_match('/(?> $command + */ + private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string + { + $orig_text = $font->decodeText($command, $fontFactor); + $text = $orig_text; + + // If we make this a Config option, we can add a check if it's + // enabled here. + if (null !== $page) { + $font_ids = array_keys($page->getFonts()); + + // If the decoded text contains UTF-8 control characters + // then the font page being used is probably the wrong one. + // Loop through the rest of the fonts to see if we can get + // a good decode. Allow x09 to x0d which are whitespace. + while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos(bin2hex($text), '00')) { + // If we're out of font IDs, then give up and use the + // original string + if (0 == \count($font_ids)) { + return $orig_text; + } + + // Try the next font ID + $font = $page->getFont(array_shift($font_ids)); + $text = $font->decodeText($command, $fontFactor); + } + } + + return $text; + } + + /** + * Expects a string that is a full PDF dictionary object, + * including the outer enclosing << >> angle brackets + * + * @internal + * * @throws \Exception */ - public function getText(Page $page = null): string + public function parseDictionary(string $dictionary): array { - $result = ''; + // Normalize whitespace + $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary)); + + if ('<<' != substr($dictionary, 0, 2)) { + throw new \Exception('Not a valid dictionary object.'); + } + + $parsed = []; + $stack = []; + $currentName = ''; + $arrayTypeNumeric = false; + + // Remove outer layer of dictionary, and split on tokens + $split = preg_split( + '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/', + trim(preg_replace('/^<<|>>$/', '', $dictionary)), + -1, + \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE + ); + + foreach ($split as $token) { + $token = trim($token); + switch ($token) { + case '': + break; + + // Open numeric array + case '[': + $parsed[$currentName] = []; + $arrayTypeNumeric = true; + + // Move up one level in the stack + $stack[\count($stack)] = &$parsed; + $parsed = &$parsed[$currentName]; + $currentName = ''; + break; + + // Open hashed array + case '<<': + $parsed[$currentName] = []; + $arrayTypeNumeric = false; + + // Move up one level in the stack + $stack[\count($stack)] = &$parsed; + $parsed = &$parsed[$currentName]; + $currentName = ''; + break; + + // Close numeric array + case ']': + // Revert string type arrays back to a single element + if (\is_array($parsed) && 1 == \count($parsed) + && isset($parsed[0]) && \is_string($parsed[0]) + && '' !== $parsed[0] && '/' != $parsed[0][0]) { + $parsed = '['.$parsed[0].']'; + } + // Close hashed array + // no break + case '>>': + $arrayTypeNumeric = false; + + // Move down one level in the stack + $parsed = &$stack[\count($stack) - 1]; + unset($stack[\count($stack) - 1]); + break; + + default: + // If value begins with a slash, then this is a name + // Add it to the appropriate array + if ('/' == substr($token, 0, 1)) { + $currentName = substr($token, 1); + if (true == $arrayTypeNumeric) { + $parsed[] = $currentName; + $currentName = ''; + } + } elseif ('' != $currentName) { + if (false == $arrayTypeNumeric) { + $parsed[$currentName] = $token; + } + $currentName = ''; + } elseif ('' == $currentName) { + $parsed[] = $token; + } + } + } + + return $parsed; + } + + /** + * Returns the text content of a PDF as a string. Attempts to add + * whitespace for spacing and line-breaks where appropriate. + * + * getText() leverages getTextArray() to get the content + * of the document, setting the addPositionWhitespace flag to true + * so whitespace is inserted in a logical way for reading by + * humans. + */ + public function getText(?Page $page = null): string + { + $this->addPositionWhitespace = true; + $result = $this->getTextArray($page); + $this->addPositionWhitespace = false; + + return implode('', $result).' '; + } + + /** + * Returns the text content of a PDF as an array of strings. No + * extra whitespace is inserted besides what is actually encoded in + * the PDF text. + * + * @throws \Exception + */ + public function getTextArray(?Page $page = null): array + { + $result = []; + $text = []; + + $marked_stack = []; + $last_written_position = false; + $sections = $this->getSectionsText($this->content); $current_font = $this->getDefaultFont($page); - $clipped_font = $current_font; + $current_font_size = 1; + $current_text_leading = 0; - $current_position_td = ['x' => false, 'y' => false]; - $current_position_tm = ['x' => false, 'y' => false]; + $current_position = ['x' => false, 'y' => false]; + $current_position_tm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + $current_position_td = ['x' => 0, 'y' => 0]; + $current_position_cm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + + $clipped_font = []; + $clipped_position_cm = []; self::$recursionStack[] = $this->getUniqueId(); foreach ($sections as $section) { $commands = $this->getCommandsText($section); - $reverse_text = false; - $text = ''; - foreach ($commands as $command) { switch ($command[self::OPERATOR]) { + // Begin text object + case 'BT': + // Reset text positioning matrices + $current_position_tm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + $current_position_td = ['x' => 0, 'y' => 0]; + $current_text_leading = 0; + break; + + // Begin marked content sequence with property list + case 'BDC': + if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) { + $dict = $this->parseDictionary($match[1]); + + // Check for ActualText block + if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) { + if ('[' == $dict['ActualText'][0]) { + // Simulate a 'TJ' command on the stack + $marked_stack[] = [ + 'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0], + ]; + } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) { + // Simulate a 'Tj' command on the stack + $marked_stack[] = [ + 'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0], + ]; + } + } + } + break; + + // Begin marked content sequence case 'BMC': if ('ReversedChars' == $command[self::COMMAND]) { - $reverse_text = true; + // Upon encountering a ReversedChars command, + // add the characters we've built up so far to + // the result array + $result = array_merge($result, $text); + + // Start a fresh $text array that will contain + // reversed characters + $text = []; + + // Add the reversed text flag to the stack + $marked_stack[] = ['ReversedChars' => true]; } break; - // set character spacing - case 'Tc': - break; - - // move text current point - case 'Td': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if (((float) $x <= 0) - || (false !== $current_position_td['y'] && (float) $y < (float) $current_position_td['y']) - ) { - // vertical offset - $text .= "\n"; - } elseif (false !== $current_position_td['x'] && (float) $x > (float) - $current_position_td['x'] - ) { - $text .= $this->config->getHorizontalOffset(); - } - $current_position_td = ['x' => $x, 'y' => $y]; - break; - - // move text current point and set leading - case 'TD': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if ((float) $y < 0) { - $text .= "\n"; - } elseif ((float) $x <= 0) { - $text .= ' '; - } - break; - - case 'Tf': - list($id) = preg_split('/\s/s', $command[self::COMMAND]); - $id = trim($id, '/'); - if (null !== $page) { - $new_font = $page->getFont($id); - // If an invalid font ID is given, do not update the font. - // This should theoretically never happen, as the PDF spec states for the Tf operator: - // "The specified font value shall match a resource name in the Font entry of the default resource dictionary" - // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435) - // But we want to make sure that malformed PDFs do not simply crash. - if (null !== $new_font) { - $current_font = $new_font; - } - } - break; - - case 'Q': - // Use clip: restore font. - $current_font = $clipped_font; - break; - - case 'q': - // Use clip: save font. - $clipped_font = $current_font; - break; - - case "'": - case 'Tj': - $command[self::COMMAND] = [$command]; - // no break - case 'TJ': - $sub_text = $current_font->decodeText($command[self::COMMAND]); - $text .= $sub_text; - break; - - // set leading - case 'TL': - $text .= ' '; - break; - - case 'Tm': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if (false !== $current_position_tm['x']) { - $delta = abs((float) $x - (float) $current_position_tm['x']); - if ($delta > 10) { - $text .= "\t"; - } - } - if (false !== $current_position_tm['y']) { - $delta = abs((float) $y - (float) $current_position_tm['y']); - if ($delta > 10) { - $text .= "\n"; - } - } - $current_position_tm = ['x' => $x, 'y' => $y]; - break; - - // set super/subscripting text rise - case 'Ts': - break; - - // set word spacing - case 'Tw': - break; - - // set horizontal scaling - case 'Tz': - $text .= "\n"; - break; - - // move to start of next line - case 'T*': - $text .= "\n"; - break; - - case 'Da': + // set graphics position matrix + case 'cm': + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $current_position_cm = [ + 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, + 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, + 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, + ]; break; case 'Do': @@ -395,112 +777,247 @@ class PDFObject $xobject = $page->getXObject($id); // @todo $xobject could be a ElementXRef object, which would then throw an error - if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) { + if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack, true)) { // Not a circular reference. - $text .= $xobject->getText($page); + $text[] = $xobject->getText($page); } } break; - case 'rg': - case 'RG': + // Marked content point with (DP) & without (MP) property list + case 'DP': + case 'MP': break; - case 're': + // End text object + case 'ET': break; - case 'co': + // Store current selected font and graphics matrix + case 'q': + $clipped_font[] = [$current_font, $current_font_size]; + $clipped_position_cm[] = $current_position_cm; break; - case 'cs': + // Restore previous selected font and graphics matrix + case 'Q': + list($current_font, $current_font_size) = array_pop($clipped_font); + $current_position_cm = array_pop($clipped_position_cm); break; - case 'gs': - break; + // End marked content sequence + case 'EMC': + $data = false; + if (\count($marked_stack)) { + $marked = array_pop($marked_stack); + $action = key($marked); + $data = $marked[$action]; - case 'en': - break; + switch ($action) { + // If we are in ReversedChars mode... + case 'ReversedChars': + // Reverse the characters we've built up so far + foreach ($text as $key => $t) { + $text[$key] = implode('', array_reverse( + mb_str_split($t, 1, mb_internal_encoding()) + )); + } - case 'sc': - case 'SC': - break; + // Add these characters to the result array + $result = array_merge($result, $text); - case 'g': - case 'G': - break; + // Start a fresh $text array that will contain + // non-reversed characters + $text = []; + break; - case 'V': - break; - - case 'vo': - case 'Vo': - break; - - default: - } - } - - // Fix Hebrew and other reverse text oriented languages. - // @see: https://github.com/smalot/pdfparser/issues/398 - if ($reverse_text) { - $chars = mb_str_split($text, 1, mb_internal_encoding()); - $text = implode('', array_reverse($chars)); - } - - $result .= $text; - } - - return $result.' '; - } - - /** - * @throws \Exception - */ - public function getTextArray(Page $page = null): array - { - $text = []; - $sections = $this->getSectionsText($this->content); - $current_font = new Font($this->document, null, null, $this->config); - - foreach ($sections as $section) { - $commands = $this->getCommandsText($section); - - foreach ($commands as $command) { - switch ($command[self::OPERATOR]) { - // set character spacing - case 'Tc': - break; - - // move text current point - case 'Td': - break; - - // move text current point and set leading - case 'TD': - break; - - case 'Tf': - if (null !== $page) { - list($id) = preg_split('/\s/s', $command[self::COMMAND]); - $id = trim($id, '/'); - $current_font = $page->getFont($id); + case 'ActualText': + // Use the content of the ActualText as a command + $command = $data; + break; + } } - break; + // If this EMC command has been transformed into a 'Tj' + // or 'TJ' command because of being ActualText, then bypass + // the break to proceed to the writing section below. + if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) { + break; + } + + // no break case "'": + case '"': + if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) { + // Move to next line and write text + $current_position['x'] = 0; + $current_position_td['x'] = 0; + $current_position_td['y'] += $current_text_leading; + } + // no break case 'Tj': $command[self::COMMAND] = [$command]; // no break case 'TJ': - $sub_text = $current_font->decodeText($command[self::COMMAND]); - $text[] = $sub_text; + // Check the marked content stack for flags + $actual_text = false; + $reverse_text = false; + foreach ($marked_stack as $marked) { + if (isset($marked['ActualText'])) { + $actual_text = true; + } + if (isset($marked['ReversedChars'])) { + $reverse_text = true; + } + } + + // Account for text position ONLY just before we write text + if (false === $actual_text && \is_array($last_written_position)) { + // If $last_written_position is an array, that + // means we have stored text position coordinates + // for placing an ActualText + $currentX = $last_written_position[0]; + $currentY = $last_written_position[1]; + $last_written_position = false; + } else { + $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x']; + $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y']; + } + $whiteSpace = ''; + + $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i']; + $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j']; + + if (true === $this->addPositionWhitespace && false !== $current_position['x']) { + $curY = $currentY - $current_position['y']; + if (abs($curY) >= abs($factorY) / 4) { + $whiteSpace = "\n"; + } else { + if (true === $reverse_text) { + $curX = $current_position['x'] - $currentX; + } else { + $curX = $currentX - $current_position['x']; + } + + // In abs($factorX * 7) below, the 7 is chosen arbitrarily + // as the number of apparent "spaces" in a document we + // would need before considering them a "tab". In the + // future, we might offer this value to users as a config + // option. + if ($curX >= abs($factorX * 7)) { + $whiteSpace = "\t"; + } elseif ($curX >= abs($factorX * 2)) { + $whiteSpace = ' '; + } + } + } + + $newtext = $this->getTJUsingFontFallback( + $current_font, + $command[self::COMMAND], + $page, + $factorX + ); + + // If there is no ActualText pending then write + if (false === $actual_text) { + $newtext = str_replace(["\r", "\n"], '', $newtext); + if (false !== $reverse_text) { + // If we are in ReversedChars mode, add the whitespace last + $text[] = preg_replace('/ $/', ' ', $newtext.$whiteSpace); + } else { + // Otherwise add the whitespace first + if (' ' === $whiteSpace && isset($text[\count($text) - 1])) { + $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]); + } + $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext); + } + + // Record the position of this inserted text for comparison + // with the next text block. + // Provide a 'fudge' factor guess on how wide this text block + // is based on the number of characters. This helps limit the + // number of tabs inserted, but isn't perfect. + $factor = $factorX / 2; + $current_position = [ + 'x' => $currentX - mb_strlen($newtext) * $factor, + 'y' => $currentY, + ]; + } elseif (false === $last_written_position) { + // If there is an ActualText in the pipeline + // store the position this undisplayed text + // *would* have been written to, so the + // ActualText is displayed in the right spot + $last_written_position = [$currentX, $currentY]; + $current_position['x'] = $currentX; + } + break; + + // move to start of next line + case 'T*': + $current_position['x'] = 0; + $current_position_td['x'] = 0; + $current_position_td['y'] += $current_text_leading; + break; + + // set character spacing + case 'Tc': + break; + + // move text current point and set leading + case 'Td': + case 'TD': + // move text current point + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $y = (float) array_pop($args); + $x = (float) array_pop($args); + + if ('TD' == $command[self::OPERATOR]) { + $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j']; + } + + $current_position_td = [ + 'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'], + 'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'], + ]; + break; + + case 'Tf': + $args = preg_split('/\s/s', $command[self::COMMAND]); + $size = (float) array_pop($args); + $id = trim(array_pop($args), '/'); + if (null !== $page) { + $new_font = $page->getFont($id); + // If an invalid font ID is given, do not update the font. + // This should theoretically never happen, as the PDF spec states for the Tf operator: + // "The specified font value shall match a resource name in the Font entry of the default resource dictionary" + // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435) + // But we want to make sure that malformed PDFs do not simply crash. + if (null !== $new_font) { + $current_font = $new_font; + $current_font_size = $size; + } + } break; // set leading case 'TL': + $y = (float) $command[self::COMMAND]; + $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j']; break; + // set text position matrix case 'Tm': + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $current_position_tm = [ + 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, + 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, + 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, + ]; + break; + + // set text rendering mode + case 'Ts': break; // set super/subscripting text rise @@ -513,59 +1030,6 @@ class PDFObject // set horizontal scaling case 'Tz': - // $text .= "\n"; - break; - - // move to start of next line - case 'T*': - // $text .= "\n"; - break; - - case 'Da': - break; - - case 'Do': - if (null !== $page) { - $args = preg_split('/\s/s', $command[self::COMMAND]); - $id = trim(array_pop($args), '/ '); - if ($xobject = $page->getXObject($id)) { - $text[] = $xobject->getText($page); - } - } - break; - - case 'rg': - case 'RG': - break; - - case 're': - break; - - case 'co': - break; - - case 'cs': - break; - - case 'gs': - break; - - case 'en': - break; - - case 'sc': - case 'SC': - break; - - case 'g': - case 'G': - break; - - case 'V': - break; - - case 'vo': - case 'Vo': break; default: @@ -573,198 +1037,103 @@ class PDFObject } } - return $text; + $result = array_merge($result, $text); + + return $result; } + /** + * getCommandsText() expects the content of $text_part to be an + * already formatted, single-line command from a document stream. + * The companion function getSectionsText() returns a document + * stream as an array of single commands for just this purpose. + * Because of this, the argument $offset is no longer used, and + * may be removed in a future PdfParser release. + * + * A better name for this function would be getCommandText() + * since it now always works on just one command. + */ public function getCommandsText(string $text_part, int &$offset = 0): array { $commands = $matches = []; - while ($offset < \strlen($text_part)) { - $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset); - $char = $text_part[$offset]; + preg_match('/^(([\/\[\(<])?.*)(?getCommandsText($text_part, $offset); - - if (preg_match( - '/\G\s*[A-Z]{1,2}\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = trim($matches[0]); - $offset += \strlen($matches[0]); - } - } else { - ++$offset; - break; - } - break; - - case '<': - case '>': - // array object - $type = $char; - ++$offset; - if ('<' == $char) { - $strpos = strpos($text_part, '>', $offset); - $command = substr($text_part, $offset, $strpos - $offset); - $offset = $strpos + 1; - } - - if (preg_match( - '/\G\s*[A-Z]{1,2}\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = trim($matches[0]); - $offset += \strlen($matches[0]); - } - break; - - case '(': - case ')': - ++$offset; - $type = $char; - $strpos = $offset; - if ('(' == $char) { - $open_bracket = 1; - while ($open_bracket > 0) { - if (!isset($text_part[$strpos])) { - break; - } - $ch = $text_part[$strpos]; - switch ($ch) { - case '\\': - // REVERSE SOLIDUS (5Ch) (Backslash) - // skip next character - ++$strpos; - break; - - case '(': - // LEFT PARENHESIS (28h) - ++$open_bracket; - break; - - case ')': - // RIGHT PARENTHESIS (29h) - --$open_bracket; - break; - } - ++$strpos; - } - $command = substr($text_part, $offset, $strpos - $offset - 1); - $offset = $strpos; - - if (preg_match( - '/\G\s*([A-Z\']{1,2})\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = $matches[1]; - $offset += \strlen($matches[0]); - } - } - break; - - default: - if ('ET' == substr($text_part, $offset, 2)) { - break; - } elseif (preg_match( - '/\G\s*(?P([0-9\.\-]+\s*?)+)\s+(?P[A-Z]{1,3})\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = trim($matches['id']); - $command = trim($matches['data']); - $offset += \strlen($matches[0]); - } elseif (preg_match( - '/\G\s*([0-9\.\-]+\s*?)+\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $type = 'n'; - $command = trim($matches[0]); - $offset += \strlen($matches[0]); - } elseif (preg_match( - '/\G\s*([A-Z\*]+)\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $type = ''; - $operator = $matches[1]; - $command = ''; - $offset += \strlen($matches[0]); - } - } - - if (false !== $command) { - $commands[] = [ - self::TYPE => $type, - self::OPERATOR => $operator, - self::COMMAND => $command, - ]; - } else { - break; - } + // If no valid command is detected, return an empty array + if (!isset($matches[1]) || !isset($matches[2]) || !isset($matches[3])) { + return []; } + $type = $matches[2]; + $operator = $matches[3]; + $command = trim($matches[1]); + + if ('TJ' == $operator) { + $subcommand = []; + $command = trim($command, '[]'); + do { + $oldCommand = $command; + + // Search for parentheses string () format + if (preg_match('/^ *\((.*?)(? '(', + self::OPERATOR => 'TJ', + self::COMMAND => $tjmatch[1], + ]; + if (isset($tjmatch[2]) && trim($tjmatch[2])) { + $subcommand[] = [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => $tjmatch[2], + ]; + } + $command = substr($command, \strlen($tjmatch[0])); + } + + // Search for hexadecimal <> format + if (preg_match('/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i', $command, $tjmatch)) { + $tjmatch[1] = preg_replace('/\s/', '', $tjmatch[1]); + $subcommand[] = [ + self::TYPE => '<', + self::OPERATOR => 'TJ', + self::COMMAND => $tjmatch[1], + ]; + if (isset($tjmatch[2]) && trim($tjmatch[2])) { + $subcommand[] = [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => $tjmatch[2], + ]; + } + $command = substr($command, \strlen($tjmatch[0])); + } + } while ($command != $oldCommand); + + $command = $subcommand; + } elseif ('Tj' == $operator || "'" == $operator || '"' == $operator) { + // Depending on the string type, trim the data of the + // appropriate delimiters + if ('(' == $type) { + // Don't use trim() here since a () string may end with + // a balanced or escaped right parentheses, and trim() + // will delete both. Both strings below are valid: + // eg. (String()) + // eg. (String\)) + $command = preg_replace('/^\(|\)$/', '', $command); + } elseif ('<' == $type) { + $command = trim($command, '<>'); + } + } elseif ('/' == $type) { + $command = substr($command, 1); + } + + $commands[] = [ + self::TYPE => $type, + self::OPERATOR => $operator, + self::COMMAND => $command, + ]; + return $commands; } @@ -772,7 +1141,7 @@ class PDFObject Document $document, Header $header, ?string $content, - Config $config = null + ?Config $config = null ): self { switch ($header->get('Type')->getContent()) { case 'XObject': diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Page.php b/orcinus/pdfparser/src/Smalot/PdfParser/Page.php index fbc1987..d6ffaf0 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Page.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Page.php @@ -176,7 +176,7 @@ class Page extends PDFObject }*/ } - public function getText(self $page = null): string + public function getText(?self $page = null): string { if ($contents = $this->get('Contents')) { if ($contents instanceof ElementMissing) { @@ -312,7 +312,7 @@ class Page extends PDFObject return new self($pdfObject->document, $header, $new_content, $config); } - public function getTextArray(self $page = null): array + public function getTextArray(?self $page = null): array { if ($this->isFpdf()) { $pdfObject = $this->getPDFObjectForFpdf(); @@ -400,8 +400,6 @@ class Page extends PDFObject } $sectionsText = $content->getSectionsText($content->getContent()); foreach ($sectionsText as $sectionText) { - $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => '']; - $commandsText = $content->getCommandsText($sectionText); foreach ($commandsText as $command) { $extractedData[] = $command; @@ -420,7 +418,7 @@ class Page extends PDFObject * * @return array An array with the data and the internal representation */ - public function extractDecodedRawData(array $extractedRawData = null): array + public function extractDecodedRawData(?array $extractedRawData = null): array { if (!isset($extractedRawData) || !$extractedRawData) { $extractedRawData = $this->extractRawData(); @@ -500,7 +498,7 @@ class Page extends PDFObject * * @return array An array with the text command of the page */ - public function getDataCommands(array $extractedDecodedRawData = null): array + public function getDataCommands(?array $extractedDecodedRawData = null): array { if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) { $extractedDecodedRawData = $this->extractDecodedRawData(); @@ -651,7 +649,7 @@ class Page extends PDFObject * @return array an array with the data of the page including the Tm information * of any text in the page */ - public function getDataTm(array $dataCommands = null): array + public function getDataTm(?array $dataCommands = null): array { if (!isset($dataCommands) || !$dataCommands) { $dataCommands = $this->getDataCommands(); @@ -701,6 +699,12 @@ class Page extends PDFObject $extractedTexts = $this->getTextArray(); $extractedData = []; foreach ($dataCommands as $command) { + // If we've used up all the texts from getTextArray(), exit + // so we aren't accessing non-existent array indices + // Fixes 'undefined array key' errors in Issues #575, #576 + if (\count($extractedTexts) <= \count($extractedData)) { + break; + } $currentText = $extractedTexts[\count($extractedData)]; switch ($command['o']) { /* @@ -712,21 +716,13 @@ class Page extends PDFObject $Tl = $defaultTl; $Tx = 0; $Ty = 0; - $fontId = $defaultFontId; - $fontSize = $defaultFontSize; break; /* * ET - * End a text object, discarding the text matrix + * End a text object */ case 'ET': - $Tm = $defaultTm; - $Tl = $defaultTl; - $Tx = 0; - $Ty = 0; - $fontId = $defaultFontId; - $fontSize = $defaultFontSize; break; /* @@ -741,7 +737,7 @@ class Page extends PDFObject /* * tx ty Td - * Move to the start of the next line, offset form the start of the + * Move to the start of the next line, offset from the start of the * current line by tx, ty. */ case 'Td': @@ -898,7 +894,7 @@ class Page extends PDFObject * "near" the x,y coordinate, an empty array is returned. If Both, x * and y coordinates are null, null is returned. */ - public function getTextXY(float $x = null, float $y = null, float $xError = 0, float $yError = 0): array + public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array { if (!isset($this->dataTm) || !$this->dataTm) { $this->getDataTm(); diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php b/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php index 3078f9e..b051f11 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/Parser.php @@ -60,7 +60,7 @@ class Parser protected $rawDataParser; - public function __construct($cfg = [], Config $config = null) + public function __construct($cfg = [], ?Config $config = null) { $this->config = $config ?: new Config(); $this->rawDataParser = new RawDataParser($cfg, $this->config); @@ -77,6 +77,7 @@ class Parser public function parseFile(string $filename): Document { $content = file_get_contents($filename); + /* * 2018/06/20 @doganoo as multiple times a * users have complained that the parseFile() @@ -101,7 +102,7 @@ class Parser // Create structure from raw data. list($xref, $data) = $this->rawDataParser->parseData($content); - if (isset($xref['trailer']['encrypt'])) { + if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) { throw new \Exception('Secured pdf file are currently not supported.'); } diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php index c8d2740..a6f11b3 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/FilterHelper.php @@ -233,32 +233,32 @@ class FilterHelper */ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string { - /* - * gzuncompress may throw a not catchable E_WARNING in case of an error (like $data is empty) - * the following set_error_handler changes an E_WARNING to an E_ERROR, which is catchable. - */ - set_error_handler(function ($errNo, $errStr) { - if (\E_WARNING === $errNo) { - throw new \Exception($errStr); - } else { - // fallback to default php error handler - return false; - } - }); + // Uncatchable E_WARNING for "data error" is @ suppressed + // so execution may proceed with an alternate decompression + // method. + $decoded = @gzuncompress($data, $decodeMemoryLimit); - $decoded = null; - - // initialize string to return - try { - $decoded = gzuncompress($data, $decodeMemoryLimit); - if (false === $decoded) { - throw new \Exception('decodeFilterFlateDecode: invalid code'); + if (false === $decoded) { + // If gzuncompress() failed, try again using the compress.zlib:// + // wrapper to decode it in a file-based context. + // See: https://www.php.net/manual/en/function.gzuncompress.php#79042 + // Issue: https://github.com/smalot/pdfparser/issues/592 + $ztmp = tmpfile(); + if (false != $ztmp) { + fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data); + $file = stream_get_meta_data($ztmp)['uri']; + if (0 === $decodeMemoryLimit) { + $decoded = file_get_contents('compress.zlib://'.$file); + } else { + $decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit); + } + fclose($ztmp); } - } catch (\Exception $e) { - throw $e; - } finally { - // Restore old handler just in case it was customized outside of PDFParser. - restore_error_handler(); + } + + if (false === \is_string($decoded) || '' === $decoded) { + // If the decoded string is empty, that means decoding failed. + throw new \Exception('decodeFilterFlateDecode: invalid data'); } return $decoded; diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php index 1a4583c..5e17083 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -47,12 +47,14 @@ use Smalot\PdfParser\Config; class RawDataParser { /** - * @var \Smalot\PdfParser\Config + * @var Config */ private $config; /** * Configuration array. + * + * @var array */ protected $cfg = [ // if `true` ignore filter decoding errors @@ -67,7 +69,7 @@ class RawDataParser /** * @param array $cfg Configuration array, default is [] */ - public function __construct($cfg = [], Config $config = null) + public function __construct($cfg = [], ?Config $config = null) { // merge given array with default values $this->cfg = array_merge($this->cfg, $cfg); @@ -125,7 +127,7 @@ class RawDataParser // decode the stream $remaining_filters = []; foreach ($filters as $filter) { - if (\in_array($filter, $this->filterHelper->getAvailableFilters())) { + if (\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) { try { $stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit()); } catch (\Exception $e) { @@ -402,14 +404,19 @@ class RawDataParser } $prev_row = $ddata[$k]; } // end for each row - // complete decoding + // complete decoding } else { // number of bytes in a row $rowlen = array_sum($wb); - // convert the stream into an array of integers - $sdata = unpack('C*', $xrefcrs[1][3][0]); - // split the rows - $ddata = array_chunk($sdata, $rowlen); + if (0 < $rowlen) { + // convert the stream into an array of integers + $sdata = unpack('C*', $xrefcrs[1][3][0]); + // split the rows + $ddata = array_chunk($sdata, $rowlen); + } else { + // if the row length is zero, $ddata should be an empty array as well + $ddata = []; + } } $sdata = []; @@ -609,7 +616,7 @@ class RawDataParser * * @return array containing object type, raw value and offset to next object */ - protected function getRawObject(string $pdfData, int $offset = 0, array $headerDic = null): array + protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array { $objtype = ''; // object type to be returned $objval = ''; // object value to be returned @@ -756,7 +763,7 @@ class RawDataParser // start stream object $objtype = 'stream'; $offset += 6; - if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) { + if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) { $offset += \strlen($matches[0]); // we get stream length here to later help preg_match test less data @@ -857,39 +864,39 @@ class RawDataParser */ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array { - $startxrefPreg = preg_match( - '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', + // If the $offset is currently pointed at whitespace, bump it + // forward until it isn't; affects loosely targetted offsets + // for the 'xref' keyword + // See: https://github.com/smalot/pdfparser/issues/673 + $bumpOffset = $offset; + while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) { + ++$bumpOffset; + } + + // Find all startxref tables from this $offset forward + $startxrefPreg = preg_match_all( + '/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $pdfData, - $matches, - \PREG_OFFSET_CAPTURE, + $startxrefMatches, + \PREG_SET_ORDER, $offset ); - if (0 == $offset) { - // find last startxref - $pregResult = preg_match_all( - '/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', - $pdfData, - $matches, - \PREG_SET_ORDER, - $offset - ); - if (0 == $pregResult) { - throw new \Exception('Unable to find startxref'); - } - $matches = array_pop($matches); - $startxref = $matches[1]; - } elseif (strpos($pdfData, 'xref', $offset) == $offset) { - // Already pointing at the xref table - $startxref = $offset; - } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) { - // Cross-Reference Stream object - $startxref = $offset; - } elseif ($startxrefPreg) { - // startxref found - $startxref = $matches[1][0]; - } else { + if (0 == $startxrefPreg) { + // No startxref tables were found throw new \Exception('Unable to find startxref'); + } elseif (0 == $offset) { + // Use the last startxref in the document + $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1]; + } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) { + // Already pointing at the xref table + $startxref = $bumpOffset; + } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) { + // Cross-Reference Stream object + $startxref = $bumpOffset; + } else { + // Use the next startxref from this $offset + $startxref = (int) $startxrefMatches[0][1]; } if ($startxref > \strlen($pdfData)) { @@ -901,8 +908,15 @@ class RawDataParser // Cross-Reference $xref = $this->decodeXref($pdfData, $startxref, $xref); } else { - // Cross-Reference Stream - $xref = $this->decodeXrefStream($pdfData, $startxref, $xref); + // Check if the $pdfData might have the wrong line-endings + $pdfDataUnix = str_replace("\r\n", "\n", $pdfData); + if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) { + // Return Unix-line-ending flag + $xref = ['Unix' => true]; + } else { + // Cross-Reference Stream + $xref = $this->decodeXrefStream($pdfData, $startxref, $xref); + } } if (empty($xref)) { throw new \Exception('Unable to find xref'); @@ -937,6 +951,12 @@ class RawDataParser // get xref and trailer data $xref = $this->getXrefData($pdfData); + // If we found Unix line-endings + if (isset($xref['Unix'])) { + $pdfData = str_replace("\r\n", "\n", $pdfData); + $xref = $this->getXrefData($pdfData); + } + // parse all document objects $objects = []; foreach ($xref['xref'] as $obj => $offset) { diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php index 7caec8c..8e60647 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Form.php @@ -41,7 +41,7 @@ use Smalot\PdfParser\PDFObject; */ class Form extends Page { - public function getText(Page $page = null): string + public function getText(?Page $page = null): string { $header = new Header([], $this->document); $contents = new PDFObject($this->document, $header, $this->content, $this->config); diff --git a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php index 1265582..6dc6b0a 100644 --- a/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php +++ b/orcinus/pdfparser/src/Smalot/PdfParser/XObject/Image.php @@ -40,7 +40,7 @@ use Smalot\PdfParser\PDFObject; */ class Image extends PDFObject { - public function getText(Page $page = null): string + public function getText(?Page $page = null): string { return ''; }