Update PdfParser to 2.10.0
This commit is contained in:
parent
4f679114c3
commit
fb7e295490
|
@ -82,6 +82,13 @@ class Config
|
|||
*/
|
||||
private $dataTmFontInfoHasToBeIncluded = false;
|
||||
|
||||
/**
|
||||
* Whether to attempt to read PDFs even if they are marked as encrypted.
|
||||
*
|
||||
* @var bool
|
||||
*/
|
||||
private $ignoreEncryption = false;
|
||||
|
||||
public function getFontSpaceLimit()
|
||||
{
|
||||
return $this->fontSpaceLimit;
|
||||
|
@ -151,4 +158,18 @@ class Config
|
|||
{
|
||||
$this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded;
|
||||
}
|
||||
|
||||
public function getIgnoreEncryption(): bool
|
||||
{
|
||||
return $this->ignoreEncryption;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated this is a temporary workaround, don't rely on it
|
||||
* @see https://github.com/smalot/pdfparser/pull/653
|
||||
*/
|
||||
public function setIgnoreEncryption(bool $ignoreEncryption): void
|
||||
{
|
||||
$this->ignoreEncryption = $ignoreEncryption;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -263,12 +263,20 @@ class Document
|
|||
break;
|
||||
|
||||
case 'close':
|
||||
// If the value of this property is a single-
|
||||
// element array where the element is of type
|
||||
// string, use the value of the first list item
|
||||
// as the value for this property
|
||||
if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
|
||||
// If the value of this property is an array
|
||||
if (\is_array($metadata)) {
|
||||
// If the value is a single element array
|
||||
// where the element is of type string, use
|
||||
// the value of the first list item as the
|
||||
// value for this property
|
||||
if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
|
||||
$metadata = $metadata[0];
|
||||
} elseif (0 == \count($metadata)) {
|
||||
// if the value is an empty array, set
|
||||
// the value of this property to the empty
|
||||
// string
|
||||
$metadata = '';
|
||||
}
|
||||
}
|
||||
|
||||
// Move down one level in the stack
|
||||
|
@ -328,12 +336,12 @@ class Document
|
|||
return null;
|
||||
}
|
||||
|
||||
public function hasObjectsByType(string $type, string $subtype = null): bool
|
||||
public function hasObjectsByType(string $type, ?string $subtype = null): bool
|
||||
{
|
||||
return 0 < \count($this->getObjectsByType($type, $subtype));
|
||||
}
|
||||
|
||||
public function getObjectsByType(string $type, string $subtype = null): array
|
||||
public function getObjectsByType(string $type, ?string $subtype = null): array
|
||||
{
|
||||
if (!isset($this->dictionary[$type])) {
|
||||
return [];
|
||||
|
@ -410,7 +418,7 @@ class Document
|
|||
throw new \Exception('Missing catalog.');
|
||||
}
|
||||
|
||||
public function getText(int $pageLimit = null): string
|
||||
public function getText(?int $pageLimit = null): string
|
||||
{
|
||||
$texts = [];
|
||||
$pages = $this->getPages();
|
||||
|
|
|
@ -49,13 +49,13 @@ use Smalot\PdfParser\Element\ElementXRef;
|
|||
class Element
|
||||
{
|
||||
/**
|
||||
* @var Document
|
||||
* @var Document|null
|
||||
*/
|
||||
protected $document;
|
||||
|
||||
protected $value;
|
||||
|
||||
public function __construct($value, Document $document = null)
|
||||
public function __construct($value, ?Document $document = null)
|
||||
{
|
||||
$this->value = $value;
|
||||
$this->document = $document;
|
||||
|
@ -96,7 +96,7 @@ class Element
|
|||
return (string) $this->value;
|
||||
}
|
||||
|
||||
public static function parse(string $content, Document $document = null, int &$position = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$position = 0)
|
||||
{
|
||||
$args = \func_get_args();
|
||||
$only_values = isset($args[3]) ? $args[3] : false;
|
||||
|
|
|
@ -42,7 +42,7 @@ use Smalot\PdfParser\PDFObject;
|
|||
*/
|
||||
class ElementArray extends Element
|
||||
{
|
||||
public function __construct($value, Document $document = null)
|
||||
public function __construct($value, ?Document $document = null)
|
||||
{
|
||||
parent::__construct($value, $document);
|
||||
}
|
||||
|
@ -107,7 +107,7 @@ class ElementArray extends Element
|
|||
*
|
||||
* @return bool|ElementArray
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*\[(?P<array>.*)/is', $content, $match)) {
|
||||
preg_match_all('/(.*?)(\[|\])/s', trim($content), $matches);
|
||||
|
|
|
@ -61,7 +61,7 @@ class ElementBoolean extends Element
|
|||
/**
|
||||
* @return bool|ElementBoolean
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*(?P<value>true|false)/is', $content, $match)) {
|
||||
$value = $match['value'];
|
||||
|
|
|
@ -40,7 +40,7 @@ use Smalot\PdfParser\Document;
|
|||
class ElementDate extends ElementString
|
||||
{
|
||||
/**
|
||||
* @var array
|
||||
* @var array<int,string>
|
||||
*/
|
||||
protected static $formats = [
|
||||
4 => 'Y',
|
||||
|
@ -98,7 +98,7 @@ class ElementDate extends ElementString
|
|||
/**
|
||||
* @return bool|ElementDate
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*\(D\:(?P<name>.*?)\)/s', $content, $match)) {
|
||||
$name = $match['name'];
|
||||
|
|
|
@ -42,7 +42,7 @@ class ElementHexa extends ElementString
|
|||
/**
|
||||
* @return bool|ElementHexa|ElementDate
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*\<(?P<name>[A-F0-9]+)\>/is', $content, $match)) {
|
||||
$name = $match['name'];
|
||||
|
@ -64,15 +64,21 @@ class ElementHexa extends ElementString
|
|||
public static function decode(string $value): string
|
||||
{
|
||||
$text = '';
|
||||
$length = \strlen($value);
|
||||
|
||||
if ('00' === substr($value, 0, 2)) {
|
||||
for ($i = 0; $i < $length; $i += 4) {
|
||||
// Filter $value of non-hexadecimal characters
|
||||
$value = (string) preg_replace('/[^0-9a-f]/i', '', $value);
|
||||
|
||||
// Check for leading zeros (4-byte hexadecimal indicator), or
|
||||
// the BE BOM
|
||||
if ('00' === substr($value, 0, 2) || 'feff' === strtolower(substr($value, 0, 4))) {
|
||||
$value = (string) preg_replace('/^feff/i', '', $value);
|
||||
for ($i = 0, $length = \strlen($value); $i < $length; $i += 4) {
|
||||
$hex = substr($value, $i, 4);
|
||||
$text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';';
|
||||
}
|
||||
} else {
|
||||
for ($i = 0; $i < $length; $i += 2) {
|
||||
// Otherwise decode this as 2-byte hexadecimal
|
||||
for ($i = 0, $length = \strlen($value); $i < $length; $i += 2) {
|
||||
$hex = substr($value, $i, 2);
|
||||
$text .= \chr(hexdec($hex));
|
||||
}
|
||||
|
|
|
@ -54,7 +54,7 @@ class ElementName extends Element
|
|||
/**
|
||||
* @return bool|ElementName
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*\/([A-Z0-9\-\+,#\.]+)/is', $content, $match)) {
|
||||
$name = $match[1];
|
||||
|
|
|
@ -58,7 +58,7 @@ class ElementNull extends Element
|
|||
/**
|
||||
* @return bool|ElementNull
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*(null)/s', $content, $match)) {
|
||||
$offset += strpos($content, 'null') + \strlen('null');
|
||||
|
|
|
@ -48,7 +48,7 @@ class ElementNumeric extends Element
|
|||
/**
|
||||
* @return bool|ElementNumeric
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*(?P<value>\-?[0-9\.]+)/s', $content, $match)) {
|
||||
$value = $match['value'];
|
||||
|
|
|
@ -54,7 +54,7 @@ class ElementString extends Element
|
|||
/**
|
||||
* @return bool|ElementString
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*\((?P<name>.*)/s', $content, $match)) {
|
||||
$name = $match['name'];
|
||||
|
|
|
@ -44,7 +44,7 @@ class ElementStruct extends Element
|
|||
/**
|
||||
* @return false|Header
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*<<(?P<struct>.*)/is', $content)) {
|
||||
preg_match_all('/(.*?)(<<|>>)/s', trim($content), $matches);
|
||||
|
|
|
@ -83,7 +83,7 @@ class ElementXRef extends Element
|
|||
/**
|
||||
* @return bool|ElementXRef
|
||||
*/
|
||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
||||
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||
{
|
||||
if (preg_match('/^\s*(?P<id>[0-9]+\s+[0-9]+\s+R)/s', $content, $match)) {
|
||||
$id = $match['id'];
|
||||
|
|
|
@ -145,6 +145,12 @@ class Encoding extends PDFObject
|
|||
{
|
||||
// Load reference table charset.
|
||||
$baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent());
|
||||
|
||||
// Check for empty BaseEncoding field value
|
||||
if (!\is_string($baseEncoding) || 0 == \strlen($baseEncoding)) {
|
||||
$baseEncoding = 'StandardEncoding';
|
||||
}
|
||||
|
||||
$className = '\\Smalot\\PdfParser\\Encoding\\'.$baseEncoding;
|
||||
|
||||
if (!class_exists($className)) {
|
||||
|
|
|
@ -134,9 +134,16 @@ class Font extends PDFObject
|
|||
|
||||
/**
|
||||
* Convert unicode character code to "utf-8" encoded string.
|
||||
*
|
||||
* @param int|float $code Unicode character code. Will be cast to int internally!
|
||||
*/
|
||||
public static function uchr(int $code): string
|
||||
public static function uchr($code): string
|
||||
{
|
||||
// note:
|
||||
// $code was typed as int before, but changed in https://github.com/smalot/pdfparser/pull/623
|
||||
// because in some cases uchr was called with a float instead of an integer.
|
||||
$code = (int) $code;
|
||||
|
||||
if (!isset(self::$uchrCache[$code])) {
|
||||
// html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
|
||||
// therefore, we use mb_convert_encoding() instead
|
||||
|
@ -272,11 +279,13 @@ class Font extends PDFObject
|
|||
/**
|
||||
* Calculate text width with data from header 'Widths'. If width of character is not found then character is added to missing array.
|
||||
*/
|
||||
public function calculateTextWidth(string $text, array &$missing = null): ?float
|
||||
public function calculateTextWidth(string $text, ?array &$missing = null): ?float
|
||||
{
|
||||
$index_map = array_flip($this->table);
|
||||
$details = $this->getDetails();
|
||||
$widths = $details['Widths'];
|
||||
|
||||
// Usually, Widths key is set in $details array, but if it isn't use an empty array instead.
|
||||
$widths = $details['Widths'] ?? [];
|
||||
|
||||
// Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar
|
||||
$width_map = array_flip(range($details['FirstChar'], $details['LastChar']));
|
||||
|
@ -312,12 +321,12 @@ class Font extends PDFObject
|
|||
}
|
||||
|
||||
$text = '';
|
||||
$parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
|
||||
$parts = preg_split('/(<[a-f0-9\s]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
|
||||
|
||||
foreach ($parts as $part) {
|
||||
if (preg_match('/^<.*>$/s', $part) && false === stripos($part, '<?xml')) {
|
||||
// strip line breaks
|
||||
$part = preg_replace("/[\r\n]/", '', $part);
|
||||
if (preg_match('/^<[a-f0-9\s]+>$/si', $part)) {
|
||||
// strip whitespace
|
||||
$part = preg_replace("/\s/", '', $part);
|
||||
$part = trim($part, '<>');
|
||||
if ($add_braces) {
|
||||
$text .= '(';
|
||||
|
@ -342,18 +351,20 @@ class Font extends PDFObject
|
|||
*/
|
||||
public static function decodeOctal(string $text): string
|
||||
{
|
||||
$parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
|
||||
$text = '';
|
||||
// Replace all double backslashes \\ with a special string
|
||||
$text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']);
|
||||
|
||||
foreach ($parts as $part) {
|
||||
if (preg_match('/^\\\\[0-7]{3}$/', $part)) {
|
||||
$text .= \chr(octdec(trim($part, '\\')));
|
||||
} else {
|
||||
$text .= $part;
|
||||
}
|
||||
}
|
||||
// Now we can replace all octal codes without worrying about
|
||||
// escaped backslashes
|
||||
$text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) {
|
||||
return \chr(octdec($m[1]));
|
||||
}, $text);
|
||||
|
||||
return $text;
|
||||
// Unescape any parentheses
|
||||
$text = str_replace(['\\(', '\\)'], ['(', ')'], $text);
|
||||
|
||||
// Replace instances of the special string with a single backslash
|
||||
return str_replace('[**pdfparserdblslsh**]', '\\', $text);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -361,18 +372,9 @@ class Font extends PDFObject
|
|||
*/
|
||||
public static function decodeEntities(string $text): string
|
||||
{
|
||||
$parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
|
||||
$text = '';
|
||||
|
||||
foreach ($parts as $part) {
|
||||
if (preg_match('/^#\d{2}$/', $part)) {
|
||||
$text .= \chr(hexdec(trim($part, '#')));
|
||||
} else {
|
||||
$text .= $part;
|
||||
}
|
||||
}
|
||||
|
||||
return $text;
|
||||
return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) {
|
||||
return \chr(hexdec($m[1]));
|
||||
}, $text);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -384,7 +386,7 @@ class Font extends PDFObject
|
|||
*/
|
||||
public static function decodeUnicode(string $text): string
|
||||
{
|
||||
if (preg_match('/^\xFE\xFF/i', $text)) {
|
||||
if ("\xFE\xFF" === substr($text, 0, 2)) {
|
||||
// Strip U+FEFF byte order marker.
|
||||
$decode = substr($text, 2);
|
||||
$text = '';
|
||||
|
@ -409,16 +411,17 @@ class Font extends PDFObject
|
|||
/**
|
||||
* Decode text by commands array.
|
||||
*/
|
||||
public function decodeText(array $commands): string
|
||||
public function decodeText(array $commands, float $fontFactor = 4): string
|
||||
{
|
||||
$word_position = 0;
|
||||
$words = [];
|
||||
$font_space = $this->getFontSpaceLimit();
|
||||
$font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4;
|
||||
|
||||
foreach ($commands as $command) {
|
||||
switch ($command[PDFObject::TYPE]) {
|
||||
case 'n':
|
||||
if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
|
||||
$offset = (float) trim($command[PDFObject::COMMAND]);
|
||||
if ($offset - (float) $font_space < 0) {
|
||||
$word_position = \count($words);
|
||||
}
|
||||
continue 2;
|
||||
|
@ -434,8 +437,8 @@ class Font extends PDFObject
|
|||
|
||||
// replace escaped chars
|
||||
$text = str_replace(
|
||||
['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
|
||||
['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
|
||||
['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ ', '\b'],
|
||||
[\chr(92), \chr(40), \chr(41), \chr(10), \chr(13), \chr(9), \chr(12), \chr(32), \chr(8)],
|
||||
$text
|
||||
);
|
||||
|
||||
|
@ -449,9 +452,32 @@ class Font extends PDFObject
|
|||
|
||||
foreach ($words as &$word) {
|
||||
$word = $this->decodeContent($word);
|
||||
$word = str_replace("\t", ' ', $word);
|
||||
}
|
||||
|
||||
return implode(' ', $words);
|
||||
// Remove internal "words" that are just spaces, but leave them
|
||||
// if they are at either end of the array of words. This fixes,
|
||||
// for example, lines that are justified to fill
|
||||
// a whole row.
|
||||
for ($x = \count($words) - 2; $x >= 1; --$x) {
|
||||
if ('' === trim($words[$x], ' ')) {
|
||||
unset($words[$x]);
|
||||
}
|
||||
}
|
||||
$words = array_values($words);
|
||||
|
||||
// Cut down on the number of unnecessary internal spaces by
|
||||
// imploding the string on the null byte, and checking if the
|
||||
// text includes extra spaces on either side. If so, merge
|
||||
// where appropriate.
|
||||
$words = implode("\x00\x00", $words);
|
||||
$words = str_replace(
|
||||
[" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"],
|
||||
[' ', ' ', ' ', ' '],
|
||||
$words
|
||||
);
|
||||
|
||||
return $words;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -459,8 +485,14 @@ class Font extends PDFObject
|
|||
*
|
||||
* @param bool $unicode This parameter is deprecated and might be removed in a future release
|
||||
*/
|
||||
public function decodeContent(string $text, bool &$unicode = null): string
|
||||
public function decodeContent(string $text, ?bool &$unicode = null): string
|
||||
{
|
||||
// If this string begins with a UTF-16BE BOM, then decode it
|
||||
// directly as Unicode
|
||||
if ("\xFE\xFF" === substr($text, 0, 2)) {
|
||||
return $this->decodeUnicode($text);
|
||||
}
|
||||
|
||||
if ($this->has('ToUnicode')) {
|
||||
return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
|
||||
}
|
||||
|
|
|
@ -43,7 +43,7 @@ use Smalot\PdfParser\Element\ElementXRef;
|
|||
class Header
|
||||
{
|
||||
/**
|
||||
* @var Document
|
||||
* @var Document|null
|
||||
*/
|
||||
protected $document;
|
||||
|
||||
|
@ -56,7 +56,7 @@ class Header
|
|||
* @param Element[] $elements list of elements
|
||||
* @param Document $document document
|
||||
*/
|
||||
public function __construct(array $elements = [], Document $document = null)
|
||||
public function __construct(array $elements = [], ?Document $document = null)
|
||||
{
|
||||
$this->elements = $elements;
|
||||
$this->document = $document;
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -176,7 +176,7 @@ class Page extends PDFObject
|
|||
}*/
|
||||
}
|
||||
|
||||
public function getText(self $page = null): string
|
||||
public function getText(?self $page = null): string
|
||||
{
|
||||
if ($contents = $this->get('Contents')) {
|
||||
if ($contents instanceof ElementMissing) {
|
||||
|
@ -312,7 +312,7 @@ class Page extends PDFObject
|
|||
return new self($pdfObject->document, $header, $new_content, $config);
|
||||
}
|
||||
|
||||
public function getTextArray(self $page = null): array
|
||||
public function getTextArray(?self $page = null): array
|
||||
{
|
||||
if ($this->isFpdf()) {
|
||||
$pdfObject = $this->getPDFObjectForFpdf();
|
||||
|
@ -400,8 +400,6 @@ class Page extends PDFObject
|
|||
}
|
||||
$sectionsText = $content->getSectionsText($content->getContent());
|
||||
foreach ($sectionsText as $sectionText) {
|
||||
$extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
|
||||
|
||||
$commandsText = $content->getCommandsText($sectionText);
|
||||
foreach ($commandsText as $command) {
|
||||
$extractedData[] = $command;
|
||||
|
@ -420,7 +418,7 @@ class Page extends PDFObject
|
|||
*
|
||||
* @return array An array with the data and the internal representation
|
||||
*/
|
||||
public function extractDecodedRawData(array $extractedRawData = null): array
|
||||
public function extractDecodedRawData(?array $extractedRawData = null): array
|
||||
{
|
||||
if (!isset($extractedRawData) || !$extractedRawData) {
|
||||
$extractedRawData = $this->extractRawData();
|
||||
|
@ -500,7 +498,7 @@ class Page extends PDFObject
|
|||
*
|
||||
* @return array An array with the text command of the page
|
||||
*/
|
||||
public function getDataCommands(array $extractedDecodedRawData = null): array
|
||||
public function getDataCommands(?array $extractedDecodedRawData = null): array
|
||||
{
|
||||
if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
|
||||
$extractedDecodedRawData = $this->extractDecodedRawData();
|
||||
|
@ -651,7 +649,7 @@ class Page extends PDFObject
|
|||
* @return array an array with the data of the page including the Tm information
|
||||
* of any text in the page
|
||||
*/
|
||||
public function getDataTm(array $dataCommands = null): array
|
||||
public function getDataTm(?array $dataCommands = null): array
|
||||
{
|
||||
if (!isset($dataCommands) || !$dataCommands) {
|
||||
$dataCommands = $this->getDataCommands();
|
||||
|
@ -701,6 +699,12 @@ class Page extends PDFObject
|
|||
$extractedTexts = $this->getTextArray();
|
||||
$extractedData = [];
|
||||
foreach ($dataCommands as $command) {
|
||||
// If we've used up all the texts from getTextArray(), exit
|
||||
// so we aren't accessing non-existent array indices
|
||||
// Fixes 'undefined array key' errors in Issues #575, #576
|
||||
if (\count($extractedTexts) <= \count($extractedData)) {
|
||||
break;
|
||||
}
|
||||
$currentText = $extractedTexts[\count($extractedData)];
|
||||
switch ($command['o']) {
|
||||
/*
|
||||
|
@ -712,21 +716,13 @@ class Page extends PDFObject
|
|||
$Tl = $defaultTl;
|
||||
$Tx = 0;
|
||||
$Ty = 0;
|
||||
$fontId = $defaultFontId;
|
||||
$fontSize = $defaultFontSize;
|
||||
break;
|
||||
|
||||
/*
|
||||
* ET
|
||||
* End a text object, discarding the text matrix
|
||||
* End a text object
|
||||
*/
|
||||
case 'ET':
|
||||
$Tm = $defaultTm;
|
||||
$Tl = $defaultTl;
|
||||
$Tx = 0;
|
||||
$Ty = 0;
|
||||
$fontId = $defaultFontId;
|
||||
$fontSize = $defaultFontSize;
|
||||
break;
|
||||
|
||||
/*
|
||||
|
@ -741,7 +737,7 @@ class Page extends PDFObject
|
|||
|
||||
/*
|
||||
* tx ty Td
|
||||
* Move to the start of the next line, offset form the start of the
|
||||
* Move to the start of the next line, offset from the start of the
|
||||
* current line by tx, ty.
|
||||
*/
|
||||
case 'Td':
|
||||
|
@ -898,7 +894,7 @@ class Page extends PDFObject
|
|||
* "near" the x,y coordinate, an empty array is returned. If both x
|
||||
* and y coordinates are null, null is returned.
|
||||
*/
|
||||
public function getTextXY(float $x = null, float $y = null, float $xError = 0, float $yError = 0): array
|
||||
public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
|
||||
{
|
||||
if (!isset($this->dataTm) || !$this->dataTm) {
|
||||
$this->getDataTm();
|
||||
|
|
|
@ -60,7 +60,7 @@ class Parser
|
|||
|
||||
protected $rawDataParser;
|
||||
|
||||
public function __construct($cfg = [], Config $config = null)
|
||||
public function __construct($cfg = [], ?Config $config = null)
|
||||
{
|
||||
$this->config = $config ?: new Config();
|
||||
$this->rawDataParser = new RawDataParser($cfg, $this->config);
|
||||
|
@ -77,6 +77,7 @@ class Parser
|
|||
public function parseFile(string $filename): Document
|
||||
{
|
||||
$content = file_get_contents($filename);
|
||||
|
||||
/*
|
||||
* 2018/06/20 @doganoo as multiple times a
|
||||
* users have complained that the parseFile()
|
||||
|
@ -101,7 +102,7 @@ class Parser
|
|||
// Create structure from raw data.
|
||||
list($xref, $data) = $this->rawDataParser->parseData($content);
|
||||
|
||||
if (isset($xref['trailer']['encrypt'])) {
|
||||
if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) {
|
||||
throw new \Exception('Secured pdf file are currently not supported.');
|
||||
}
|
||||
|
||||
|
|
|
@ -233,32 +233,32 @@ class FilterHelper
|
|||
*/
|
||||
protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string
|
||||
{
|
||||
/*
|
||||
* gzuncompress may throw a not catchable E_WARNING in case of an error (like $data is empty)
|
||||
* the following set_error_handler changes an E_WARNING to an E_ERROR, which is catchable.
|
||||
*/
|
||||
set_error_handler(function ($errNo, $errStr) {
|
||||
if (\E_WARNING === $errNo) {
|
||||
throw new \Exception($errStr);
|
||||
} else {
|
||||
// fallback to default php error handler
|
||||
return false;
|
||||
}
|
||||
});
|
||||
// Uncatchable E_WARNING for "data error" is @ suppressed
|
||||
// so execution may proceed with an alternate decompression
|
||||
// method.
|
||||
$decoded = @gzuncompress($data, $decodeMemoryLimit);
|
||||
|
||||
$decoded = null;
|
||||
|
||||
// initialize string to return
|
||||
try {
|
||||
$decoded = gzuncompress($data, $decodeMemoryLimit);
|
||||
if (false === $decoded) {
|
||||
throw new \Exception('decodeFilterFlateDecode: invalid code');
|
||||
// If gzuncompress() failed, try again using the compress.zlib://
|
||||
// wrapper to decode it in a file-based context.
|
||||
// See: https://www.php.net/manual/en/function.gzuncompress.php#79042
|
||||
// Issue: https://github.com/smalot/pdfparser/issues/592
|
||||
$ztmp = tmpfile();
|
||||
if (false != $ztmp) {
|
||||
fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data);
|
||||
$file = stream_get_meta_data($ztmp)['uri'];
|
||||
if (0 === $decodeMemoryLimit) {
|
||||
$decoded = file_get_contents('compress.zlib://'.$file);
|
||||
} else {
|
||||
$decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit);
|
||||
}
|
||||
} catch (\Exception $e) {
|
||||
throw $e;
|
||||
} finally {
|
||||
// Restore old handler just in case it was customized outside of PDFParser.
|
||||
restore_error_handler();
|
||||
fclose($ztmp);
|
||||
}
|
||||
}
|
||||
|
||||
if (false === \is_string($decoded) || '' === $decoded) {
|
||||
// If the decoded string is empty, that means decoding failed.
|
||||
throw new \Exception('decodeFilterFlateDecode: invalid data');
|
||||
}
|
||||
|
||||
return $decoded;
|
||||
|
|
|
@ -47,12 +47,14 @@ use Smalot\PdfParser\Config;
|
|||
class RawDataParser
|
||||
{
|
||||
/**
|
||||
* @var \Smalot\PdfParser\Config
|
||||
* @var Config
|
||||
*/
|
||||
private $config;
|
||||
|
||||
/**
|
||||
* Configuration array.
|
||||
*
|
||||
* @var array<string,bool>
|
||||
*/
|
||||
protected $cfg = [
|
||||
// if `true` ignore filter decoding errors
|
||||
|
@ -67,7 +69,7 @@ class RawDataParser
|
|||
/**
|
||||
* @param array $cfg Configuration array, default is []
|
||||
*/
|
||||
public function __construct($cfg = [], Config $config = null)
|
||||
public function __construct($cfg = [], ?Config $config = null)
|
||||
{
|
||||
// merge given array with default values
|
||||
$this->cfg = array_merge($this->cfg, $cfg);
|
||||
|
@ -125,7 +127,7 @@ class RawDataParser
|
|||
// decode the stream
|
||||
$remaining_filters = [];
|
||||
foreach ($filters as $filter) {
|
||||
if (\in_array($filter, $this->filterHelper->getAvailableFilters())) {
|
||||
if (\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
|
||||
try {
|
||||
$stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
|
||||
} catch (\Exception $e) {
|
||||
|
@ -406,10 +408,15 @@ class RawDataParser
|
|||
} else {
|
||||
// number of bytes in a row
|
||||
$rowlen = array_sum($wb);
|
||||
if (0 < $rowlen) {
|
||||
// convert the stream into an array of integers
|
||||
$sdata = unpack('C*', $xrefcrs[1][3][0]);
|
||||
// split the rows
|
||||
$ddata = array_chunk($sdata, $rowlen);
|
||||
} else {
|
||||
// if the row length is zero, $ddata should be an empty array as well
|
||||
$ddata = [];
|
||||
}
|
||||
}
|
||||
|
||||
$sdata = [];
|
||||
|
@ -609,7 +616,7 @@ class RawDataParser
|
|||
*
|
||||
* @return array containing object type, raw value and offset to next object
|
||||
*/
|
||||
protected function getRawObject(string $pdfData, int $offset = 0, array $headerDic = null): array
|
||||
protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
|
||||
{
|
||||
$objtype = ''; // object type to be returned
|
||||
$objval = ''; // object value to be returned
|
||||
|
@ -756,7 +763,7 @@ class RawDataParser
|
|||
// start stream object
|
||||
$objtype = 'stream';
|
||||
$offset += 6;
|
||||
if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
|
||||
if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
|
||||
$offset += \strlen($matches[0]);
|
||||
|
||||
// we get stream length here to later help preg_match test less data
|
||||
|
@ -857,39 +864,39 @@ class RawDataParser
|
|||
*/
|
||||
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
|
||||
{
|
||||
$startxrefPreg = preg_match(
|
||||
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
|
||||
$pdfData,
|
||||
$matches,
|
||||
\PREG_OFFSET_CAPTURE,
|
||||
$offset
|
||||
);
|
||||
// If the $offset is currently pointed at whitespace, bump it
|
||||
// forward until it isn't; affects loosely targeted offsets
|
||||
// for the 'xref' keyword
|
||||
// See: https://github.com/smalot/pdfparser/issues/673
|
||||
$bumpOffset = $offset;
|
||||
while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
|
||||
++$bumpOffset;
|
||||
}
|
||||
|
||||
if (0 == $offset) {
|
||||
// find last startxref
|
||||
$pregResult = preg_match_all(
|
||||
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
|
||||
// Find all startxref tables from this $offset forward
|
||||
$startxrefPreg = preg_match_all(
|
||||
'/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
|
||||
$pdfData,
|
||||
$matches,
|
||||
$startxrefMatches,
|
||||
\PREG_SET_ORDER,
|
||||
$offset
|
||||
);
|
||||
if (0 == $pregResult) {
|
||||
|
||||
if (0 == $startxrefPreg) {
|
||||
// No startxref tables were found
|
||||
throw new \Exception('Unable to find startxref');
|
||||
}
|
||||
$matches = array_pop($matches);
|
||||
$startxref = $matches[1];
|
||||
} elseif (strpos($pdfData, 'xref', $offset) == $offset) {
|
||||
} elseif (0 == $offset) {
|
||||
// Use the last startxref in the document
|
||||
$startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
|
||||
} elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) {
|
||||
// Already pointing at the xref table
|
||||
$startxref = $offset;
|
||||
} elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
|
||||
$startxref = $bumpOffset;
|
||||
} elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
|
||||
// Cross-Reference Stream object
|
||||
$startxref = $offset;
|
||||
} elseif ($startxrefPreg) {
|
||||
// startxref found
|
||||
$startxref = $matches[1][0];
|
||||
$startxref = $bumpOffset;
|
||||
} else {
|
||||
throw new \Exception('Unable to find startxref');
|
||||
// Use the next startxref from this $offset
|
||||
$startxref = (int) $startxrefMatches[0][1];
|
||||
}
|
||||
|
||||
if ($startxref > \strlen($pdfData)) {
|
||||
|
@ -900,10 +907,17 @@ class RawDataParser
|
|||
if (strpos($pdfData, 'xref', $startxref) == $startxref) {
|
||||
// Cross-Reference
|
||||
$xref = $this->decodeXref($pdfData, $startxref, $xref);
|
||||
} else {
|
||||
// Check if the $pdfData might have the wrong line-endings
|
||||
$pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
|
||||
if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) {
|
||||
// Return Unix-line-ending flag
|
||||
$xref = ['Unix' => true];
|
||||
} else {
|
||||
// Cross-Reference Stream
|
||||
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
|
||||
}
|
||||
}
|
||||
if (empty($xref)) {
|
||||
throw new \Exception('Unable to find xref');
|
||||
}
|
||||
|
@ -937,6 +951,12 @@ class RawDataParser
|
|||
// get xref and trailer data
|
||||
$xref = $this->getXrefData($pdfData);
|
||||
|
||||
// If we found Unix line-endings
|
||||
if (isset($xref['Unix'])) {
|
||||
$pdfData = str_replace("\r\n", "\n", $pdfData);
|
||||
$xref = $this->getXrefData($pdfData);
|
||||
}
|
||||
|
||||
// parse all document objects
|
||||
$objects = [];
|
||||
foreach ($xref['xref'] as $obj => $offset) {
|
||||
|
|
|
@ -41,7 +41,7 @@ use Smalot\PdfParser\PDFObject;
|
|||
*/
|
||||
class Form extends Page
|
||||
{
|
||||
public function getText(Page $page = null): string
|
||||
public function getText(?Page $page = null): string
|
||||
{
|
||||
$header = new Header([], $this->document);
|
||||
$contents = new PDFObject($this->document, $header, $this->content, $this->config);
|
||||
|
|
|
@ -40,7 +40,7 @@ use Smalot\PdfParser\PDFObject;
|
|||
*/
|
||||
class Image extends PDFObject
|
||||
{
|
||||
public function getText(Page $page = null): string
|
||||
public function getText(?Page $page = null): string
|
||||
{
|
||||
return '';
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue