Update PdfParser to 2.10.0

This commit is contained in:
Brian Huisman 2024-05-16 12:36:43 -04:00
parent 4f679114c3
commit fb7e295490
24 changed files with 1080 additions and 621 deletions

View file

@ -82,6 +82,13 @@ class Config
*/
private $dataTmFontInfoHasToBeIncluded = false;
/**
* Whether to attempt to read PDFs even if they are marked as encrypted.
*
* @var bool
*/
private $ignoreEncryption = false;
public function getFontSpaceLimit()
{
return $this->fontSpaceLimit;
@ -151,4 +158,18 @@ class Config
{
$this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded;
}
/**
* Tell whether PDFs marked as encrypted should still be read.
*
* @return bool the current value of the ignoreEncryption setting
*/
public function getIgnoreEncryption(): bool
{
return $this->ignoreEncryption;
}
/**
* Enable or disable attempting to read PDFs that are marked as encrypted.
*
* @param bool $ignoreEncryption new value for the ignoreEncryption setting
*
* @deprecated this is a temporary workaround, don't rely on it
* @see https://github.com/smalot/pdfparser/pull/653
*/
public function setIgnoreEncryption(bool $ignoreEncryption): void
{
$this->ignoreEncryption = $ignoreEncryption;
}
}

View file

@ -255,7 +255,7 @@ class Document
if ('rdf:li' == $val['tag']) {
$metadata[] = $val['value'];
// Else assign a value to this property
// Else assign a value to this property
} else {
$metadata[$val['tag']] = $val['value'];
}
@ -263,12 +263,20 @@ class Document
break;
case 'close':
// If the value of this property is a single-
// element array where the element is of type
// string, use the value of the first list item
// as the value for this property
if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
$metadata = $metadata[0];
// If the value of this property is an array
if (\is_array($metadata)) {
// If the value is a single element array
// where the element is of type string, use
// the value of the first list item as the
// value for this property
if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
$metadata = $metadata[0];
} elseif (0 == \count($metadata)) {
// if the value is an empty array, set
// the value of this property to the empty
// string
$metadata = '';
}
}
// Move down one level in the stack
@ -328,12 +336,12 @@ class Document
return null;
}
public function hasObjectsByType(string $type, string $subtype = null): bool
public function hasObjectsByType(string $type, ?string $subtype = null): bool
{
return 0 < \count($this->getObjectsByType($type, $subtype));
}
public function getObjectsByType(string $type, string $subtype = null): array
public function getObjectsByType(string $type, ?string $subtype = null): array
{
if (!isset($this->dictionary[$type])) {
return [];
@ -410,7 +418,7 @@ class Document
throw new \Exception('Missing catalog.');
}
public function getText(int $pageLimit = null): string
public function getText(?int $pageLimit = null): string
{
$texts = [];
$pages = $this->getPages();

View file

@ -49,13 +49,13 @@ use Smalot\PdfParser\Element\ElementXRef;
class Element
{
/**
* @var Document
* @var Document|null
*/
protected $document;
protected $value;
public function __construct($value, Document $document = null)
public function __construct($value, ?Document $document = null)
{
$this->value = $value;
$this->document = $document;
@ -96,7 +96,7 @@ class Element
return (string) $this->value;
}
public static function parse(string $content, Document $document = null, int &$position = 0)
public static function parse(string $content, ?Document $document = null, int &$position = 0)
{
$args = \func_get_args();
$only_values = isset($args[3]) ? $args[3] : false;

View file

@ -42,7 +42,7 @@ use Smalot\PdfParser\PDFObject;
*/
class ElementArray extends Element
{
public function __construct($value, Document $document = null)
public function __construct($value, ?Document $document = null)
{
parent::__construct($value, $document);
}
@ -107,7 +107,7 @@ class ElementArray extends Element
*
* @return bool|ElementArray
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*\[(?P<array>.*)/is', $content, $match)) {
preg_match_all('/(.*?)(\[|\])/s', trim($content), $matches);

View file

@ -61,7 +61,7 @@ class ElementBoolean extends Element
/**
* @return bool|ElementBoolean
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*(?P<value>true|false)/is', $content, $match)) {
$value = $match['value'];

View file

@ -40,7 +40,7 @@ use Smalot\PdfParser\Document;
class ElementDate extends ElementString
{
/**
* @var array
* @var array<int,string>
*/
protected static $formats = [
4 => 'Y',
@ -98,7 +98,7 @@ class ElementDate extends ElementString
/**
* @return bool|ElementDate
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*\(D\:(?P<name>.*?)\)/s', $content, $match)) {
$name = $match['name'];

View file

@ -42,7 +42,7 @@ class ElementHexa extends ElementString
/**
* @return bool|ElementHexa|ElementDate
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*\<(?P<name>[A-F0-9]+)\>/is', $content, $match)) {
$name = $match['name'];
@ -64,15 +64,21 @@ class ElementHexa extends ElementString
public static function decode(string $value): string
{
$text = '';
$length = \strlen($value);
if ('00' === substr($value, 0, 2)) {
for ($i = 0; $i < $length; $i += 4) {
// Filter $value of non-hexadecimal characters
$value = (string) preg_replace('/[^0-9a-f]/i', '', $value);
// Check for leading zeros (4-byte hexadecimal indicator), or
// the BE BOM
if ('00' === substr($value, 0, 2) || 'feff' === strtolower(substr($value, 0, 4))) {
$value = (string) preg_replace('/^feff/i', '', $value);
for ($i = 0, $length = \strlen($value); $i < $length; $i += 4) {
$hex = substr($value, $i, 4);
$text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';';
}
} else {
for ($i = 0; $i < $length; $i += 2) {
// Otherwise decode this as 2-byte hexadecimal
for ($i = 0, $length = \strlen($value); $i < $length; $i += 2) {
$hex = substr($value, $i, 2);
$text .= \chr(hexdec($hex));
}

View file

@ -54,7 +54,7 @@ class ElementName extends Element
/**
* @return bool|ElementName
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*\/([A-Z0-9\-\+,#\.]+)/is', $content, $match)) {
$name = $match[1];

View file

@ -58,7 +58,7 @@ class ElementNull extends Element
/**
* @return bool|ElementNull
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*(null)/s', $content, $match)) {
$offset += strpos($content, 'null') + \strlen('null');

View file

@ -48,7 +48,7 @@ class ElementNumeric extends Element
/**
* @return bool|ElementNumeric
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*(?P<value>\-?[0-9\.]+)/s', $content, $match)) {
$value = $match['value'];

View file

@ -54,7 +54,7 @@ class ElementString extends Element
/**
* @return bool|ElementString
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*\((?P<name>.*)/s', $content, $match)) {
$name = $match['name'];

View file

@ -44,7 +44,7 @@ class ElementStruct extends Element
/**
* @return false|Header
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*<<(?P<struct>.*)/is', $content)) {
preg_match_all('/(.*?)(<<|>>)/s', trim($content), $matches);

View file

@ -83,7 +83,7 @@ class ElementXRef extends Element
/**
* @return bool|ElementXRef
*/
public static function parse(string $content, Document $document = null, int &$offset = 0)
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
{
if (preg_match('/^\s*(?P<id>[0-9]+\s+[0-9]+\s+R)/s', $content, $match)) {
$id = $match['id'];

View file

@ -145,6 +145,12 @@ class Encoding extends PDFObject
{
// Load reference table charset.
$baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent());
// Check for empty BaseEncoding field value
if (!\is_string($baseEncoding) || 0 == \strlen($baseEncoding)) {
$baseEncoding = 'StandardEncoding';
}
$className = '\\Smalot\\PdfParser\\Encoding\\'.$baseEncoding;
if (!class_exists($className)) {

View file

@ -178,7 +178,7 @@ class PDFDocEncoding
"\xfc" => "\u{00fc}", // udieresis
"\xfd" => "\u{00fd}", // yacute
"\xfe" => "\u{00fe}", // thorn
"\xff" => "\u{00ff}", // ydieresis
"\xff" => "\u{00ff}", // ydieresis
];
}
@ -186,4 +186,4 @@ class PDFDocEncoding
{
return strtr($content, static::getCodePage());
}
}
}

View file

@ -134,9 +134,16 @@ class Font extends PDFObject
/**
* Convert unicode character code to "utf-8" encoded string.
*
* @param int|float $code Unicode character code. Will be cast to int internally!
*/
public static function uchr(int $code): string
public static function uchr($code): string
{
// note:
// $code was typed as int before, but changed in https://github.com/smalot/pdfparser/pull/623
// because in some cases uchr was called with a float instead of an integer.
$code = (int) $code;
if (!isset(self::$uchrCache[$code])) {
// html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
// therefore, we use mb_convert_encoding() instead
@ -272,11 +279,13 @@ class Font extends PDFObject
/**
* Calculate text width with data from header 'Widths'. If width of character is not found then character is added to missing array.
*/
public function calculateTextWidth(string $text, array &$missing = null): ?float
public function calculateTextWidth(string $text, ?array &$missing = null): ?float
{
$index_map = array_flip($this->table);
$details = $this->getDetails();
$widths = $details['Widths'];
// Usually, Widths key is set in $details array, but if it isn't use an empty array instead.
$widths = $details['Widths'] ?? [];
// Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar
$width_map = array_flip(range($details['FirstChar'], $details['LastChar']));
@ -312,12 +321,12 @@ class Font extends PDFObject
}
$text = '';
$parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
$parts = preg_split('/(<[a-f0-9\s]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
foreach ($parts as $part) {
if (preg_match('/^<.*>$/s', $part) && false === stripos($part, '<?xml')) {
// strip line breaks
$part = preg_replace("/[\r\n]/", '', $part);
if (preg_match('/^<[a-f0-9\s]+>$/si', $part)) {
// strip whitespace
$part = preg_replace("/\s/", '', $part);
$part = trim($part, '<>');
if ($add_braces) {
$text .= '(';
@ -342,18 +351,20 @@ class Font extends PDFObject
*/
public static function decodeOctal(string $text): string
{
$parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
$text = '';
// Replace all double backslashes \\ with a special string
$text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']);
foreach ($parts as $part) {
if (preg_match('/^\\\\[0-7]{3}$/', $part)) {
$text .= \chr(octdec(trim($part, '\\')));
} else {
$text .= $part;
}
}
// Now we can replace all octal codes without worrying about
// escaped backslashes
$text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) {
return \chr(octdec($m[1]));
}, $text);
return $text;
// Unescape any parentheses
$text = str_replace(['\\(', '\\)'], ['(', ')'], $text);
// Replace instances of the special string with a single backslash
return str_replace('[**pdfparserdblslsh**]', '\\', $text);
}
/**
@ -361,18 +372,9 @@ class Font extends PDFObject
*/
public static function decodeEntities(string $text): string
{
$parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
$text = '';
foreach ($parts as $part) {
if (preg_match('/^#\d{2}$/', $part)) {
$text .= \chr(hexdec(trim($part, '#')));
} else {
$text .= $part;
}
}
return $text;
return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) {
return \chr(hexdec($m[1]));
}, $text);
}
/**
@ -384,7 +386,7 @@ class Font extends PDFObject
*/
public static function decodeUnicode(string $text): string
{
if (preg_match('/^\xFE\xFF/i', $text)) {
if ("\xFE\xFF" === substr($text, 0, 2)) {
// Strip U+FEFF byte order marker.
$decode = substr($text, 2);
$text = '';
@ -409,16 +411,17 @@ class Font extends PDFObject
/**
* Decode text by commands array.
*/
public function decodeText(array $commands): string
public function decodeText(array $commands, float $fontFactor = 4): string
{
$word_position = 0;
$words = [];
$font_space = $this->getFontSpaceLimit();
$font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4;
foreach ($commands as $command) {
switch ($command[PDFObject::TYPE]) {
case 'n':
if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
$offset = (float) trim($command[PDFObject::COMMAND]);
if ($offset - (float) $font_space < 0) {
$word_position = \count($words);
}
continue 2;
@ -434,8 +437,8 @@ class Font extends PDFObject
// replace escaped chars
$text = str_replace(
['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ ', '\b'],
[\chr(92), \chr(40), \chr(41), \chr(10), \chr(13), \chr(9), \chr(12), \chr(32), \chr(8)],
$text
);
@ -449,9 +452,32 @@ class Font extends PDFObject
foreach ($words as &$word) {
$word = $this->decodeContent($word);
$word = str_replace("\t", ' ', $word);
}
return implode(' ', $words);
// Remove internal "words" that are just spaces, but leave them
// if they are at either end of the array of words. This fixes,
// for example, lines that are justified to fill
// a whole row.
for ($x = \count($words) - 2; $x >= 1; --$x) {
if ('' === trim($words[$x], ' ')) {
unset($words[$x]);
}
}
$words = array_values($words);
// Cut down on the number of unnecessary internal spaces by
// imploding the string on the null byte, and checking if the
// text includes extra spaces on either side. If so, merge
// where appropriate.
$words = implode("\x00\x00", $words);
$words = str_replace(
[" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"],
[' ', ' ', ' ', ' '],
$words
);
return $words;
}
/**
@ -459,8 +485,14 @@ class Font extends PDFObject
*
* @param bool $unicode This parameter is deprecated and might be removed in a future release
*/
public function decodeContent(string $text, bool &$unicode = null): string
public function decodeContent(string $text, ?bool &$unicode = null): string
{
// If this string begins with a UTF-16BE BOM, then decode it
// directly as Unicode
if ("\xFE\xFF" === substr($text, 0, 2)) {
return $this->decodeUnicode($text);
}
if ($this->has('ToUnicode')) {
return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
}

View file

@ -43,7 +43,7 @@ use Smalot\PdfParser\Element\ElementXRef;
class Header
{
/**
* @var Document
* @var Document|null
*/
protected $document;
@ -56,7 +56,7 @@ class Header
* @param Element[] $elements list of elements
* @param Document|null $document document
*/
public function __construct(array $elements = [], Document $document = null)
public function __construct(array $elements = [], ?Document $document = null)
{
$this->elements = $elements;
$this->document = $document;

File diff suppressed because it is too large Load diff

View file

@ -176,7 +176,7 @@ class Page extends PDFObject
}*/
}
public function getText(self $page = null): string
public function getText(?self $page = null): string
{
if ($contents = $this->get('Contents')) {
if ($contents instanceof ElementMissing) {
@ -312,7 +312,7 @@ class Page extends PDFObject
return new self($pdfObject->document, $header, $new_content, $config);
}
public function getTextArray(self $page = null): array
public function getTextArray(?self $page = null): array
{
if ($this->isFpdf()) {
$pdfObject = $this->getPDFObjectForFpdf();
@ -400,8 +400,6 @@ class Page extends PDFObject
}
$sectionsText = $content->getSectionsText($content->getContent());
foreach ($sectionsText as $sectionText) {
$extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
$commandsText = $content->getCommandsText($sectionText);
foreach ($commandsText as $command) {
$extractedData[] = $command;
@ -420,7 +418,7 @@ class Page extends PDFObject
*
* @return array An array with the data and the internal representation
*/
public function extractDecodedRawData(array $extractedRawData = null): array
public function extractDecodedRawData(?array $extractedRawData = null): array
{
if (!isset($extractedRawData) || !$extractedRawData) {
$extractedRawData = $this->extractRawData();
@ -500,7 +498,7 @@ class Page extends PDFObject
*
* @return array An array with the text command of the page
*/
public function getDataCommands(array $extractedDecodedRawData = null): array
public function getDataCommands(?array $extractedDecodedRawData = null): array
{
if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
$extractedDecodedRawData = $this->extractDecodedRawData();
@ -651,7 +649,7 @@ class Page extends PDFObject
* @return array an array with the data of the page including the Tm information
* of any text in the page
*/
public function getDataTm(array $dataCommands = null): array
public function getDataTm(?array $dataCommands = null): array
{
if (!isset($dataCommands) || !$dataCommands) {
$dataCommands = $this->getDataCommands();
@ -701,6 +699,12 @@ class Page extends PDFObject
$extractedTexts = $this->getTextArray();
$extractedData = [];
foreach ($dataCommands as $command) {
// If we've used up all the texts from getTextArray(), exit
// so we aren't accessing non-existent array indices
// Fixes 'undefined array key' errors in Issues #575, #576
if (\count($extractedTexts) <= \count($extractedData)) {
break;
}
$currentText = $extractedTexts[\count($extractedData)];
switch ($command['o']) {
/*
@ -712,21 +716,13 @@ class Page extends PDFObject
$Tl = $defaultTl;
$Tx = 0;
$Ty = 0;
$fontId = $defaultFontId;
$fontSize = $defaultFontSize;
break;
/*
* ET
* End a text object, discarding the text matrix
* End a text object
*/
case 'ET':
$Tm = $defaultTm;
$Tl = $defaultTl;
$Tx = 0;
$Ty = 0;
$fontId = $defaultFontId;
$fontSize = $defaultFontSize;
break;
/*
@ -741,7 +737,7 @@ class Page extends PDFObject
/*
* tx ty Td
* Move to the start of the next line, offset form the start of the
* Move to the start of the next line, offset from the start of the
* current line by tx, ty.
*/
case 'Td':
@ -898,7 +894,7 @@ class Page extends PDFObject
* "near" the x,y coordinate, an empty array is returned. If Both, x
* and y coordinates are null, null is returned.
*/
public function getTextXY(float $x = null, float $y = null, float $xError = 0, float $yError = 0): array
public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
{
if (!isset($this->dataTm) || !$this->dataTm) {
$this->getDataTm();

View file

@ -60,7 +60,7 @@ class Parser
protected $rawDataParser;
public function __construct($cfg = [], Config $config = null)
public function __construct($cfg = [], ?Config $config = null)
{
$this->config = $config ?: new Config();
$this->rawDataParser = new RawDataParser($cfg, $this->config);
@ -77,6 +77,7 @@ class Parser
public function parseFile(string $filename): Document
{
$content = file_get_contents($filename);
/*
* 2018/06/20 @doganoo as multiple times a
* users have complained that the parseFile()
@ -101,7 +102,7 @@ class Parser
// Create structure from raw data.
list($xref, $data) = $this->rawDataParser->parseData($content);
if (isset($xref['trailer']['encrypt'])) {
if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) {
throw new \Exception('Secured pdf file are currently not supported.');
}

View file

@ -233,32 +233,32 @@ class FilterHelper
*/
protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string
{
/*
* gzuncompress may throw a not catchable E_WARNING in case of an error (like $data is empty)
* the following set_error_handler changes an E_WARNING to an E_ERROR, which is catchable.
*/
set_error_handler(function ($errNo, $errStr) {
if (\E_WARNING === $errNo) {
throw new \Exception($errStr);
} else {
// fallback to default php error handler
return false;
}
});
// Uncatchable E_WARNING for "data error" is @ suppressed
// so execution may proceed with an alternate decompression
// method.
$decoded = @gzuncompress($data, $decodeMemoryLimit);
$decoded = null;
// initialize string to return
try {
$decoded = gzuncompress($data, $decodeMemoryLimit);
if (false === $decoded) {
throw new \Exception('decodeFilterFlateDecode: invalid code');
if (false === $decoded) {
// If gzuncompress() failed, try again using the compress.zlib://
// wrapper to decode it in a file-based context.
// See: https://www.php.net/manual/en/function.gzuncompress.php#79042
// Issue: https://github.com/smalot/pdfparser/issues/592
$ztmp = tmpfile();
if (false != $ztmp) {
fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data);
$file = stream_get_meta_data($ztmp)['uri'];
if (0 === $decodeMemoryLimit) {
$decoded = file_get_contents('compress.zlib://'.$file);
} else {
$decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit);
}
fclose($ztmp);
}
} catch (\Exception $e) {
throw $e;
} finally {
// Restore old handler just in case it was customized outside of PDFParser.
restore_error_handler();
}
if (false === \is_string($decoded) || '' === $decoded) {
// If the decoded string is empty, that means decoding failed.
throw new \Exception('decodeFilterFlateDecode: invalid data');
}
return $decoded;

View file

@ -47,12 +47,14 @@ use Smalot\PdfParser\Config;
class RawDataParser
{
/**
* @var \Smalot\PdfParser\Config
* @var Config
*/
private $config;
/**
* Configuration array.
*
* @var array<string,bool>
*/
protected $cfg = [
// if `true` ignore filter decoding errors
@ -67,7 +69,7 @@ class RawDataParser
/**
* @param array $cfg Configuration array, default is []
*/
public function __construct($cfg = [], Config $config = null)
public function __construct($cfg = [], ?Config $config = null)
{
// merge given array with default values
$this->cfg = array_merge($this->cfg, $cfg);
@ -125,7 +127,7 @@ class RawDataParser
// decode the stream
$remaining_filters = [];
foreach ($filters as $filter) {
if (\in_array($filter, $this->filterHelper->getAvailableFilters())) {
if (\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
try {
$stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
} catch (\Exception $e) {
@ -402,14 +404,19 @@ class RawDataParser
}
$prev_row = $ddata[$k];
} // end for each row
// complete decoding
// complete decoding
} else {
// number of bytes in a row
$rowlen = array_sum($wb);
// convert the stream into an array of integers
$sdata = unpack('C*', $xrefcrs[1][3][0]);
// split the rows
$ddata = array_chunk($sdata, $rowlen);
if (0 < $rowlen) {
// convert the stream into an array of integers
$sdata = unpack('C*', $xrefcrs[1][3][0]);
// split the rows
$ddata = array_chunk($sdata, $rowlen);
} else {
// if the row length is zero, $ddata should be an empty array as well
$ddata = [];
}
}
$sdata = [];
@ -609,7 +616,7 @@ class RawDataParser
*
* @return array containing object type, raw value and offset to next object
*/
protected function getRawObject(string $pdfData, int $offset = 0, array $headerDic = null): array
protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
{
$objtype = ''; // object type to be returned
$objval = ''; // object value to be returned
@ -756,7 +763,7 @@ class RawDataParser
// start stream object
$objtype = 'stream';
$offset += 6;
if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
$offset += \strlen($matches[0]);
// we get stream length here to later help preg_match test less data
@ -857,39 +864,39 @@ class RawDataParser
*/
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
{
$startxrefPreg = preg_match(
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
// If the $offset is currently pointed at whitespace, bump it
// forward until it isn't; affects loosely targeted offsets
// for the 'xref' keyword
// See: https://github.com/smalot/pdfparser/issues/673
$bumpOffset = $offset;
while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
++$bumpOffset;
}
// Find all startxref tables from this $offset forward
$startxrefPreg = preg_match_all(
'/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
$pdfData,
$matches,
\PREG_OFFSET_CAPTURE,
$startxrefMatches,
\PREG_SET_ORDER,
$offset
);
if (0 == $offset) {
// find last startxref
$pregResult = preg_match_all(
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
$pdfData,
$matches,
\PREG_SET_ORDER,
$offset
);
if (0 == $pregResult) {
throw new \Exception('Unable to find startxref');
}
$matches = array_pop($matches);
$startxref = $matches[1];
} elseif (strpos($pdfData, 'xref', $offset) == $offset) {
// Already pointing at the xref table
$startxref = $offset;
} elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
// Cross-Reference Stream object
$startxref = $offset;
} elseif ($startxrefPreg) {
// startxref found
$startxref = $matches[1][0];
} else {
if (0 == $startxrefPreg) {
// No startxref tables were found
throw new \Exception('Unable to find startxref');
} elseif (0 == $offset) {
// Use the last startxref in the document
$startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
} elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) {
// Already pointing at the xref table
$startxref = $bumpOffset;
} elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
// Cross-Reference Stream object
$startxref = $bumpOffset;
} else {
// Use the next startxref from this $offset
$startxref = (int) $startxrefMatches[0][1];
}
if ($startxref > \strlen($pdfData)) {
@ -901,8 +908,15 @@ class RawDataParser
// Cross-Reference
$xref = $this->decodeXref($pdfData, $startxref, $xref);
} else {
// Cross-Reference Stream
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
// Check if the $pdfData might have the wrong line-endings
$pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) {
// Return Unix-line-ending flag
$xref = ['Unix' => true];
} else {
// Cross-Reference Stream
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
}
}
if (empty($xref)) {
throw new \Exception('Unable to find xref');
@ -937,6 +951,12 @@ class RawDataParser
// get xref and trailer data
$xref = $this->getXrefData($pdfData);
// If we found Unix line-endings
if (isset($xref['Unix'])) {
$pdfData = str_replace("\r\n", "\n", $pdfData);
$xref = $this->getXrefData($pdfData);
}
// parse all document objects
$objects = [];
foreach ($xref['xref'] as $obj => $offset) {

View file

@ -41,7 +41,7 @@ use Smalot\PdfParser\PDFObject;
*/
class Form extends Page
{
public function getText(Page $page = null): string
public function getText(?Page $page = null): string
{
$header = new Header([], $this->document);
$contents = new PDFObject($this->document, $header, $this->content, $this->config);

View file

@ -40,7 +40,7 @@ use Smalot\PdfParser\PDFObject;
*/
class Image extends PDFObject
{
public function getText(Page $page = null): string
public function getText(?Page $page = null): string
{
return '';
}