Update PdfParser to 2.10.0
This commit is contained in:
parent
4f679114c3
commit
fb7e295490
|
@ -82,6 +82,13 @@ class Config
|
||||||
*/
|
*/
|
||||||
private $dataTmFontInfoHasToBeIncluded = false;
|
private $dataTmFontInfoHasToBeIncluded = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether to attempt to read PDFs even if they are marked as encrypted.
|
||||||
|
*
|
||||||
|
* @var bool
|
||||||
|
*/
|
||||||
|
private $ignoreEncryption = false;
|
||||||
|
|
||||||
public function getFontSpaceLimit()
|
public function getFontSpaceLimit()
|
||||||
{
|
{
|
||||||
return $this->fontSpaceLimit;
|
return $this->fontSpaceLimit;
|
||||||
|
@ -151,4 +158,18 @@ class Config
|
||||||
{
|
{
|
||||||
$this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded;
|
$this->dataTmFontInfoHasToBeIncluded = $dataTmFontInfoHasToBeIncluded;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function getIgnoreEncryption(): bool
|
||||||
|
{
|
||||||
|
return $this->ignoreEncryption;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated this is a temporary workaround, don't rely on it
|
||||||
|
* @see https://github.com/smalot/pdfparser/pull/653
|
||||||
|
*/
|
||||||
|
public function setIgnoreEncryption(bool $ignoreEncryption): void
|
||||||
|
{
|
||||||
|
$this->ignoreEncryption = $ignoreEncryption;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -255,7 +255,7 @@ class Document
|
||||||
if ('rdf:li' == $val['tag']) {
|
if ('rdf:li' == $val['tag']) {
|
||||||
$metadata[] = $val['value'];
|
$metadata[] = $val['value'];
|
||||||
|
|
||||||
// Else assign a value to this property
|
// Else assign a value to this property
|
||||||
} else {
|
} else {
|
||||||
$metadata[$val['tag']] = $val['value'];
|
$metadata[$val['tag']] = $val['value'];
|
||||||
}
|
}
|
||||||
|
@ -263,12 +263,20 @@ class Document
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'close':
|
case 'close':
|
||||||
// If the value of this property is a single-
|
// If the value of this property is an array
|
||||||
// element array where the element is of type
|
if (\is_array($metadata)) {
|
||||||
// string, use the value of the first list item
|
// If the value is a single element array
|
||||||
// as the value for this property
|
// where the element is of type string, use
|
||||||
if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
|
// the value of the first list item as the
|
||||||
$metadata = $metadata[0];
|
// value for this property
|
||||||
|
if (1 == \count($metadata) && isset($metadata[0]) && \is_string($metadata[0])) {
|
||||||
|
$metadata = $metadata[0];
|
||||||
|
} elseif (0 == \count($metadata)) {
|
||||||
|
// if the value is an empty array, set
|
||||||
|
// the value of this property to the empty
|
||||||
|
// string
|
||||||
|
$metadata = '';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Move down one level in the stack
|
// Move down one level in the stack
|
||||||
|
@ -328,12 +336,12 @@ class Document
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function hasObjectsByType(string $type, string $subtype = null): bool
|
public function hasObjectsByType(string $type, ?string $subtype = null): bool
|
||||||
{
|
{
|
||||||
return 0 < \count($this->getObjectsByType($type, $subtype));
|
return 0 < \count($this->getObjectsByType($type, $subtype));
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getObjectsByType(string $type, string $subtype = null): array
|
public function getObjectsByType(string $type, ?string $subtype = null): array
|
||||||
{
|
{
|
||||||
if (!isset($this->dictionary[$type])) {
|
if (!isset($this->dictionary[$type])) {
|
||||||
return [];
|
return [];
|
||||||
|
@ -410,7 +418,7 @@ class Document
|
||||||
throw new \Exception('Missing catalog.');
|
throw new \Exception('Missing catalog.');
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getText(int $pageLimit = null): string
|
public function getText(?int $pageLimit = null): string
|
||||||
{
|
{
|
||||||
$texts = [];
|
$texts = [];
|
||||||
$pages = $this->getPages();
|
$pages = $this->getPages();
|
||||||
|
|
|
@ -49,13 +49,13 @@ use Smalot\PdfParser\Element\ElementXRef;
|
||||||
class Element
|
class Element
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* @var Document
|
* @var Document|null
|
||||||
*/
|
*/
|
||||||
protected $document;
|
protected $document;
|
||||||
|
|
||||||
protected $value;
|
protected $value;
|
||||||
|
|
||||||
public function __construct($value, Document $document = null)
|
public function __construct($value, ?Document $document = null)
|
||||||
{
|
{
|
||||||
$this->value = $value;
|
$this->value = $value;
|
||||||
$this->document = $document;
|
$this->document = $document;
|
||||||
|
@ -96,7 +96,7 @@ class Element
|
||||||
return (string) $this->value;
|
return (string) $this->value;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static function parse(string $content, Document $document = null, int &$position = 0)
|
public static function parse(string $content, ?Document $document = null, int &$position = 0)
|
||||||
{
|
{
|
||||||
$args = \func_get_args();
|
$args = \func_get_args();
|
||||||
$only_values = isset($args[3]) ? $args[3] : false;
|
$only_values = isset($args[3]) ? $args[3] : false;
|
||||||
|
|
|
@ -42,7 +42,7 @@ use Smalot\PdfParser\PDFObject;
|
||||||
*/
|
*/
|
||||||
class ElementArray extends Element
|
class ElementArray extends Element
|
||||||
{
|
{
|
||||||
public function __construct($value, Document $document = null)
|
public function __construct($value, ?Document $document = null)
|
||||||
{
|
{
|
||||||
parent::__construct($value, $document);
|
parent::__construct($value, $document);
|
||||||
}
|
}
|
||||||
|
@ -107,7 +107,7 @@ class ElementArray extends Element
|
||||||
*
|
*
|
||||||
* @return bool|ElementArray
|
* @return bool|ElementArray
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*\[(?P<array>.*)/is', $content, $match)) {
|
if (preg_match('/^\s*\[(?P<array>.*)/is', $content, $match)) {
|
||||||
preg_match_all('/(.*?)(\[|\])/s', trim($content), $matches);
|
preg_match_all('/(.*?)(\[|\])/s', trim($content), $matches);
|
||||||
|
|
|
@ -61,7 +61,7 @@ class ElementBoolean extends Element
|
||||||
/**
|
/**
|
||||||
* @return bool|ElementBoolean
|
* @return bool|ElementBoolean
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*(?P<value>true|false)/is', $content, $match)) {
|
if (preg_match('/^\s*(?P<value>true|false)/is', $content, $match)) {
|
||||||
$value = $match['value'];
|
$value = $match['value'];
|
||||||
|
|
|
@ -40,7 +40,7 @@ use Smalot\PdfParser\Document;
|
||||||
class ElementDate extends ElementString
|
class ElementDate extends ElementString
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* @var array
|
* @var array<int,string>
|
||||||
*/
|
*/
|
||||||
protected static $formats = [
|
protected static $formats = [
|
||||||
4 => 'Y',
|
4 => 'Y',
|
||||||
|
@ -98,7 +98,7 @@ class ElementDate extends ElementString
|
||||||
/**
|
/**
|
||||||
* @return bool|ElementDate
|
* @return bool|ElementDate
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*\(D\:(?P<name>.*?)\)/s', $content, $match)) {
|
if (preg_match('/^\s*\(D\:(?P<name>.*?)\)/s', $content, $match)) {
|
||||||
$name = $match['name'];
|
$name = $match['name'];
|
||||||
|
|
|
@ -42,7 +42,7 @@ class ElementHexa extends ElementString
|
||||||
/**
|
/**
|
||||||
* @return bool|ElementHexa|ElementDate
|
* @return bool|ElementHexa|ElementDate
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*\<(?P<name>[A-F0-9]+)\>/is', $content, $match)) {
|
if (preg_match('/^\s*\<(?P<name>[A-F0-9]+)\>/is', $content, $match)) {
|
||||||
$name = $match['name'];
|
$name = $match['name'];
|
||||||
|
@ -64,15 +64,21 @@ class ElementHexa extends ElementString
|
||||||
public static function decode(string $value): string
|
public static function decode(string $value): string
|
||||||
{
|
{
|
||||||
$text = '';
|
$text = '';
|
||||||
$length = \strlen($value);
|
|
||||||
|
|
||||||
if ('00' === substr($value, 0, 2)) {
|
// Filter $value of non-hexadecimal characters
|
||||||
for ($i = 0; $i < $length; $i += 4) {
|
$value = (string) preg_replace('/[^0-9a-f]/i', '', $value);
|
||||||
|
|
||||||
|
// Check for leading zeros (4-byte hexadecimal indicator), or
|
||||||
|
// the BE BOM
|
||||||
|
if ('00' === substr($value, 0, 2) || 'feff' === strtolower(substr($value, 0, 4))) {
|
||||||
|
$value = (string) preg_replace('/^feff/i', '', $value);
|
||||||
|
for ($i = 0, $length = \strlen($value); $i < $length; $i += 4) {
|
||||||
$hex = substr($value, $i, 4);
|
$hex = substr($value, $i, 4);
|
||||||
$text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';';
|
$text .= '&#'.str_pad(hexdec($hex), 4, '0', \STR_PAD_LEFT).';';
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for ($i = 0; $i < $length; $i += 2) {
|
// Otherwise decode this as 2-byte hexadecimal
|
||||||
|
for ($i = 0, $length = \strlen($value); $i < $length; $i += 2) {
|
||||||
$hex = substr($value, $i, 2);
|
$hex = substr($value, $i, 2);
|
||||||
$text .= \chr(hexdec($hex));
|
$text .= \chr(hexdec($hex));
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,7 +54,7 @@ class ElementName extends Element
|
||||||
/**
|
/**
|
||||||
* @return bool|ElementName
|
* @return bool|ElementName
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*\/([A-Z0-9\-\+,#\.]+)/is', $content, $match)) {
|
if (preg_match('/^\s*\/([A-Z0-9\-\+,#\.]+)/is', $content, $match)) {
|
||||||
$name = $match[1];
|
$name = $match[1];
|
||||||
|
|
|
@ -58,7 +58,7 @@ class ElementNull extends Element
|
||||||
/**
|
/**
|
||||||
* @return bool|ElementNull
|
* @return bool|ElementNull
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*(null)/s', $content, $match)) {
|
if (preg_match('/^\s*(null)/s', $content, $match)) {
|
||||||
$offset += strpos($content, 'null') + \strlen('null');
|
$offset += strpos($content, 'null') + \strlen('null');
|
||||||
|
|
|
@ -48,7 +48,7 @@ class ElementNumeric extends Element
|
||||||
/**
|
/**
|
||||||
* @return bool|ElementNumeric
|
* @return bool|ElementNumeric
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*(?P<value>\-?[0-9\.]+)/s', $content, $match)) {
|
if (preg_match('/^\s*(?P<value>\-?[0-9\.]+)/s', $content, $match)) {
|
||||||
$value = $match['value'];
|
$value = $match['value'];
|
||||||
|
|
|
@ -54,7 +54,7 @@ class ElementString extends Element
|
||||||
/**
|
/**
|
||||||
* @return bool|ElementString
|
* @return bool|ElementString
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*\((?P<name>.*)/s', $content, $match)) {
|
if (preg_match('/^\s*\((?P<name>.*)/s', $content, $match)) {
|
||||||
$name = $match['name'];
|
$name = $match['name'];
|
||||||
|
|
|
@ -44,7 +44,7 @@ class ElementStruct extends Element
|
||||||
/**
|
/**
|
||||||
* @return false|Header
|
* @return false|Header
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*<<(?P<struct>.*)/is', $content)) {
|
if (preg_match('/^\s*<<(?P<struct>.*)/is', $content)) {
|
||||||
preg_match_all('/(.*?)(<<|>>)/s', trim($content), $matches);
|
preg_match_all('/(.*?)(<<|>>)/s', trim($content), $matches);
|
||||||
|
|
|
@ -83,7 +83,7 @@ class ElementXRef extends Element
|
||||||
/**
|
/**
|
||||||
* @return bool|ElementXRef
|
* @return bool|ElementXRef
|
||||||
*/
|
*/
|
||||||
public static function parse(string $content, Document $document = null, int &$offset = 0)
|
public static function parse(string $content, ?Document $document = null, int &$offset = 0)
|
||||||
{
|
{
|
||||||
if (preg_match('/^\s*(?P<id>[0-9]+\s+[0-9]+\s+R)/s', $content, $match)) {
|
if (preg_match('/^\s*(?P<id>[0-9]+\s+[0-9]+\s+R)/s', $content, $match)) {
|
||||||
$id = $match['id'];
|
$id = $match['id'];
|
||||||
|
|
|
@ -145,6 +145,12 @@ class Encoding extends PDFObject
|
||||||
{
|
{
|
||||||
// Load reference table charset.
|
// Load reference table charset.
|
||||||
$baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent());
|
$baseEncoding = preg_replace('/[^A-Z0-9]/is', '', $this->get('BaseEncoding')->getContent());
|
||||||
|
|
||||||
|
// Check for empty BaseEncoding field value
|
||||||
|
if (!\is_string($baseEncoding) || 0 == \strlen($baseEncoding)) {
|
||||||
|
$baseEncoding = 'StandardEncoding';
|
||||||
|
}
|
||||||
|
|
||||||
$className = '\\Smalot\\PdfParser\\Encoding\\'.$baseEncoding;
|
$className = '\\Smalot\\PdfParser\\Encoding\\'.$baseEncoding;
|
||||||
|
|
||||||
if (!class_exists($className)) {
|
if (!class_exists($className)) {
|
||||||
|
|
|
@ -178,7 +178,7 @@ class PDFDocEncoding
|
||||||
"\xfc" => "\u{00fc}", // udieresis
|
"\xfc" => "\u{00fc}", // udieresis
|
||||||
"\xfd" => "\u{00fd}", // yacute
|
"\xfd" => "\u{00fd}", // yacute
|
||||||
"\xfe" => "\u{00fe}", // thorn
|
"\xfe" => "\u{00fe}", // thorn
|
||||||
"\xff" => "\u{00ff}", // ydieresis
|
"\xff" => "\u{00ff}", // ydieresis
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -134,9 +134,16 @@ class Font extends PDFObject
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert unicode character code to "utf-8" encoded string.
|
* Convert unicode character code to "utf-8" encoded string.
|
||||||
|
*
|
||||||
|
* @param int|float $code Unicode character code. Will be casted to int internally!
|
||||||
*/
|
*/
|
||||||
public static function uchr(int $code): string
|
public static function uchr($code): string
|
||||||
{
|
{
|
||||||
|
// note:
|
||||||
|
// $code was typed as int before, but changed in https://github.com/smalot/pdfparser/pull/623
|
||||||
|
// because in some cases uchr was called with a float instead of an integer.
|
||||||
|
$code = (int) $code;
|
||||||
|
|
||||||
if (!isset(self::$uchrCache[$code])) {
|
if (!isset(self::$uchrCache[$code])) {
|
||||||
// html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
|
// html_entity_decode() will not work with UTF-16 or UTF-32 char entities,
|
||||||
// therefore, we use mb_convert_encoding() instead
|
// therefore, we use mb_convert_encoding() instead
|
||||||
|
@ -272,11 +279,13 @@ class Font extends PDFObject
|
||||||
/**
|
/**
|
||||||
* Calculate text width with data from header 'Widths'. If width of character is not found then character is added to missing array.
|
* Calculate text width with data from header 'Widths'. If width of character is not found then character is added to missing array.
|
||||||
*/
|
*/
|
||||||
public function calculateTextWidth(string $text, array &$missing = null): ?float
|
public function calculateTextWidth(string $text, ?array &$missing = null): ?float
|
||||||
{
|
{
|
||||||
$index_map = array_flip($this->table);
|
$index_map = array_flip($this->table);
|
||||||
$details = $this->getDetails();
|
$details = $this->getDetails();
|
||||||
$widths = $details['Widths'];
|
|
||||||
|
// Usually, Widths key is set in $details array, but if it isn't use an empty array instead.
|
||||||
|
$widths = $details['Widths'] ?? [];
|
||||||
|
|
||||||
// Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar
|
// Widths array is zero indexed but table is not. We must map them based on FirstChar and LastChar
|
||||||
$width_map = array_flip(range($details['FirstChar'], $details['LastChar']));
|
$width_map = array_flip(range($details['FirstChar'], $details['LastChar']));
|
||||||
|
@ -312,12 +321,12 @@ class Font extends PDFObject
|
||||||
}
|
}
|
||||||
|
|
||||||
$text = '';
|
$text = '';
|
||||||
$parts = preg_split('/(<[a-f0-9]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
|
$parts = preg_split('/(<[a-f0-9\s]+>)/si', $hexa, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
|
||||||
|
|
||||||
foreach ($parts as $part) {
|
foreach ($parts as $part) {
|
||||||
if (preg_match('/^<.*>$/s', $part) && false === stripos($part, '<?xml')) {
|
if (preg_match('/^<[a-f0-9\s]+>$/si', $part)) {
|
||||||
// strip line breaks
|
// strip whitespace
|
||||||
$part = preg_replace("/[\r\n]/", '', $part);
|
$part = preg_replace("/\s/", '', $part);
|
||||||
$part = trim($part, '<>');
|
$part = trim($part, '<>');
|
||||||
if ($add_braces) {
|
if ($add_braces) {
|
||||||
$text .= '(';
|
$text .= '(';
|
||||||
|
@ -342,18 +351,20 @@ class Font extends PDFObject
|
||||||
*/
|
*/
|
||||||
public static function decodeOctal(string $text): string
|
public static function decodeOctal(string $text): string
|
||||||
{
|
{
|
||||||
$parts = preg_split('/(\\\\[0-7]{3})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
|
// Replace all double backslashes \\ with a special string
|
||||||
$text = '';
|
$text = strtr($text, ['\\\\' => '[**pdfparserdblslsh**]']);
|
||||||
|
|
||||||
foreach ($parts as $part) {
|
// Now we can replace all octal codes without worrying about
|
||||||
if (preg_match('/^\\\\[0-7]{3}$/', $part)) {
|
// escaped backslashes
|
||||||
$text .= \chr(octdec(trim($part, '\\')));
|
$text = preg_replace_callback('/\\\\([0-7]{1,3})/', function ($m) {
|
||||||
} else {
|
return \chr(octdec($m[1]));
|
||||||
$text .= $part;
|
}, $text);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $text;
|
// Unescape any parentheses
|
||||||
|
$text = str_replace(['\\(', '\\)'], ['(', ')'], $text);
|
||||||
|
|
||||||
|
// Replace instances of the special string with a single backslash
|
||||||
|
return str_replace('[**pdfparserdblslsh**]', '\\', $text);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -361,18 +372,9 @@ class Font extends PDFObject
|
||||||
*/
|
*/
|
||||||
public static function decodeEntities(string $text): string
|
public static function decodeEntities(string $text): string
|
||||||
{
|
{
|
||||||
$parts = preg_split('/(#\d{2})/s', $text, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
|
return preg_replace_callback('/#([0-9a-f]{2})/i', function ($m) {
|
||||||
$text = '';
|
return \chr(hexdec($m[1]));
|
||||||
|
}, $text);
|
||||||
foreach ($parts as $part) {
|
|
||||||
if (preg_match('/^#\d{2}$/', $part)) {
|
|
||||||
$text .= \chr(hexdec(trim($part, '#')));
|
|
||||||
} else {
|
|
||||||
$text .= $part;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return $text;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -384,7 +386,7 @@ class Font extends PDFObject
|
||||||
*/
|
*/
|
||||||
public static function decodeUnicode(string $text): string
|
public static function decodeUnicode(string $text): string
|
||||||
{
|
{
|
||||||
if (preg_match('/^\xFE\xFF/i', $text)) {
|
if ("\xFE\xFF" === substr($text, 0, 2)) {
|
||||||
// Strip U+FEFF byte order marker.
|
// Strip U+FEFF byte order marker.
|
||||||
$decode = substr($text, 2);
|
$decode = substr($text, 2);
|
||||||
$text = '';
|
$text = '';
|
||||||
|
@ -409,16 +411,17 @@ class Font extends PDFObject
|
||||||
/**
|
/**
|
||||||
* Decode text by commands array.
|
* Decode text by commands array.
|
||||||
*/
|
*/
|
||||||
public function decodeText(array $commands): string
|
public function decodeText(array $commands, float $fontFactor = 4): string
|
||||||
{
|
{
|
||||||
$word_position = 0;
|
$word_position = 0;
|
||||||
$words = [];
|
$words = [];
|
||||||
$font_space = $this->getFontSpaceLimit();
|
$font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4;
|
||||||
|
|
||||||
foreach ($commands as $command) {
|
foreach ($commands as $command) {
|
||||||
switch ($command[PDFObject::TYPE]) {
|
switch ($command[PDFObject::TYPE]) {
|
||||||
case 'n':
|
case 'n':
|
||||||
if ((float) trim($command[PDFObject::COMMAND]) < $font_space) {
|
$offset = (float) trim($command[PDFObject::COMMAND]);
|
||||||
|
if ($offset - (float) $font_space < 0) {
|
||||||
$word_position = \count($words);
|
$word_position = \count($words);
|
||||||
}
|
}
|
||||||
continue 2;
|
continue 2;
|
||||||
|
@ -434,8 +437,8 @@ class Font extends PDFObject
|
||||||
|
|
||||||
// replace escaped chars
|
// replace escaped chars
|
||||||
$text = str_replace(
|
$text = str_replace(
|
||||||
['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ '],
|
['\\\\', '\(', '\)', '\n', '\r', '\t', '\f', '\ ', '\b'],
|
||||||
['\\', '(', ')', "\n", "\r", "\t", "\f", ' '],
|
[\chr(92), \chr(40), \chr(41), \chr(10), \chr(13), \chr(9), \chr(12), \chr(32), \chr(8)],
|
||||||
$text
|
$text
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -449,9 +452,32 @@ class Font extends PDFObject
|
||||||
|
|
||||||
foreach ($words as &$word) {
|
foreach ($words as &$word) {
|
||||||
$word = $this->decodeContent($word);
|
$word = $this->decodeContent($word);
|
||||||
|
$word = str_replace("\t", ' ', $word);
|
||||||
}
|
}
|
||||||
|
|
||||||
return implode(' ', $words);
|
// Remove internal "words" that are just spaces, but leave them
|
||||||
|
// if they are at either end of the array of words. This fixes,
|
||||||
|
// for example, lines that are justified to fill
|
||||||
|
// a whole row.
|
||||||
|
for ($x = \count($words) - 2; $x >= 1; --$x) {
|
||||||
|
if ('' === trim($words[$x], ' ')) {
|
||||||
|
unset($words[$x]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$words = array_values($words);
|
||||||
|
|
||||||
|
// Cut down on the number of unnecessary internal spaces by
|
||||||
|
// imploding the string on the null byte, and checking if the
|
||||||
|
// text includes extra spaces on either side. If so, merge
|
||||||
|
// where appropriate.
|
||||||
|
$words = implode("\x00\x00", $words);
|
||||||
|
$words = str_replace(
|
||||||
|
[" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"],
|
||||||
|
[' ', ' ', ' ', ' '],
|
||||||
|
$words
|
||||||
|
);
|
||||||
|
|
||||||
|
return $words;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -459,8 +485,14 @@ class Font extends PDFObject
|
||||||
*
|
*
|
||||||
* @param bool $unicode This parameter is deprecated and might be removed in a future release
|
* @param bool $unicode This parameter is deprecated and might be removed in a future release
|
||||||
*/
|
*/
|
||||||
public function decodeContent(string $text, bool &$unicode = null): string
|
public function decodeContent(string $text, ?bool &$unicode = null): string
|
||||||
{
|
{
|
||||||
|
// If this string begins with a UTF-16BE BOM, then decode it
|
||||||
|
// directly as Unicode
|
||||||
|
if ("\xFE\xFF" === substr($text, 0, 2)) {
|
||||||
|
return $this->decodeUnicode($text);
|
||||||
|
}
|
||||||
|
|
||||||
if ($this->has('ToUnicode')) {
|
if ($this->has('ToUnicode')) {
|
||||||
return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
|
return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,7 +43,7 @@ use Smalot\PdfParser\Element\ElementXRef;
|
||||||
class Header
|
class Header
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* @var Document
|
* @var Document|null
|
||||||
*/
|
*/
|
||||||
protected $document;
|
protected $document;
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ class Header
|
||||||
* @param Element[] $elements list of elements
|
* @param Element[] $elements list of elements
|
||||||
* @param Document $document document
|
* @param Document $document document
|
||||||
*/
|
*/
|
||||||
public function __construct(array $elements = [], Document $document = null)
|
public function __construct(array $elements = [], ?Document $document = null)
|
||||||
{
|
{
|
||||||
$this->elements = $elements;
|
$this->elements = $elements;
|
||||||
$this->document = $document;
|
$this->document = $document;
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -176,7 +176,7 @@ class Page extends PDFObject
|
||||||
}*/
|
}*/
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getText(self $page = null): string
|
public function getText(?self $page = null): string
|
||||||
{
|
{
|
||||||
if ($contents = $this->get('Contents')) {
|
if ($contents = $this->get('Contents')) {
|
||||||
if ($contents instanceof ElementMissing) {
|
if ($contents instanceof ElementMissing) {
|
||||||
|
@ -312,7 +312,7 @@ class Page extends PDFObject
|
||||||
return new self($pdfObject->document, $header, $new_content, $config);
|
return new self($pdfObject->document, $header, $new_content, $config);
|
||||||
}
|
}
|
||||||
|
|
||||||
public function getTextArray(self $page = null): array
|
public function getTextArray(?self $page = null): array
|
||||||
{
|
{
|
||||||
if ($this->isFpdf()) {
|
if ($this->isFpdf()) {
|
||||||
$pdfObject = $this->getPDFObjectForFpdf();
|
$pdfObject = $this->getPDFObjectForFpdf();
|
||||||
|
@ -400,8 +400,6 @@ class Page extends PDFObject
|
||||||
}
|
}
|
||||||
$sectionsText = $content->getSectionsText($content->getContent());
|
$sectionsText = $content->getSectionsText($content->getContent());
|
||||||
foreach ($sectionsText as $sectionText) {
|
foreach ($sectionsText as $sectionText) {
|
||||||
$extractedData[] = ['t' => '', 'o' => 'BT', 'c' => ''];
|
|
||||||
|
|
||||||
$commandsText = $content->getCommandsText($sectionText);
|
$commandsText = $content->getCommandsText($sectionText);
|
||||||
foreach ($commandsText as $command) {
|
foreach ($commandsText as $command) {
|
||||||
$extractedData[] = $command;
|
$extractedData[] = $command;
|
||||||
|
@ -420,7 +418,7 @@ class Page extends PDFObject
|
||||||
*
|
*
|
||||||
* @return array An array with the data and the internal representation
|
* @return array An array with the data and the internal representation
|
||||||
*/
|
*/
|
||||||
public function extractDecodedRawData(array $extractedRawData = null): array
|
public function extractDecodedRawData(?array $extractedRawData = null): array
|
||||||
{
|
{
|
||||||
if (!isset($extractedRawData) || !$extractedRawData) {
|
if (!isset($extractedRawData) || !$extractedRawData) {
|
||||||
$extractedRawData = $this->extractRawData();
|
$extractedRawData = $this->extractRawData();
|
||||||
|
@ -500,7 +498,7 @@ class Page extends PDFObject
|
||||||
*
|
*
|
||||||
* @return array An array with the text command of the page
|
* @return array An array with the text command of the page
|
||||||
*/
|
*/
|
||||||
public function getDataCommands(array $extractedDecodedRawData = null): array
|
public function getDataCommands(?array $extractedDecodedRawData = null): array
|
||||||
{
|
{
|
||||||
if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
|
if (!isset($extractedDecodedRawData) || !$extractedDecodedRawData) {
|
||||||
$extractedDecodedRawData = $this->extractDecodedRawData();
|
$extractedDecodedRawData = $this->extractDecodedRawData();
|
||||||
|
@ -651,7 +649,7 @@ class Page extends PDFObject
|
||||||
* @return array an array with the data of the page including the Tm information
|
* @return array an array with the data of the page including the Tm information
|
||||||
* of any text in the page
|
* of any text in the page
|
||||||
*/
|
*/
|
||||||
public function getDataTm(array $dataCommands = null): array
|
public function getDataTm(?array $dataCommands = null): array
|
||||||
{
|
{
|
||||||
if (!isset($dataCommands) || !$dataCommands) {
|
if (!isset($dataCommands) || !$dataCommands) {
|
||||||
$dataCommands = $this->getDataCommands();
|
$dataCommands = $this->getDataCommands();
|
||||||
|
@ -701,6 +699,12 @@ class Page extends PDFObject
|
||||||
$extractedTexts = $this->getTextArray();
|
$extractedTexts = $this->getTextArray();
|
||||||
$extractedData = [];
|
$extractedData = [];
|
||||||
foreach ($dataCommands as $command) {
|
foreach ($dataCommands as $command) {
|
||||||
|
// If we've used up all the texts from getTextArray(), exit
|
||||||
|
// so we aren't accessing non-existent array indices
|
||||||
|
// Fixes 'undefined array key' errors in Issues #575, #576
|
||||||
|
if (\count($extractedTexts) <= \count($extractedData)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
$currentText = $extractedTexts[\count($extractedData)];
|
$currentText = $extractedTexts[\count($extractedData)];
|
||||||
switch ($command['o']) {
|
switch ($command['o']) {
|
||||||
/*
|
/*
|
||||||
|
@ -712,21 +716,13 @@ class Page extends PDFObject
|
||||||
$Tl = $defaultTl;
|
$Tl = $defaultTl;
|
||||||
$Tx = 0;
|
$Tx = 0;
|
||||||
$Ty = 0;
|
$Ty = 0;
|
||||||
$fontId = $defaultFontId;
|
|
||||||
$fontSize = $defaultFontSize;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ET
|
* ET
|
||||||
* End a text object, discarding the text matrix
|
* End a text object
|
||||||
*/
|
*/
|
||||||
case 'ET':
|
case 'ET':
|
||||||
$Tm = $defaultTm;
|
|
||||||
$Tl = $defaultTl;
|
|
||||||
$Tx = 0;
|
|
||||||
$Ty = 0;
|
|
||||||
$fontId = $defaultFontId;
|
|
||||||
$fontSize = $defaultFontSize;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -741,7 +737,7 @@ class Page extends PDFObject
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* tx ty Td
|
* tx ty Td
|
||||||
* Move to the start of the next line, offset form the start of the
|
* Move to the start of the next line, offset from the start of the
|
||||||
* current line by tx, ty.
|
* current line by tx, ty.
|
||||||
*/
|
*/
|
||||||
case 'Td':
|
case 'Td':
|
||||||
|
@ -898,7 +894,7 @@ class Page extends PDFObject
|
||||||
* "near" the x,y coordinate, an empty array is returned. If Both, x
|
* "near" the x,y coordinate, an empty array is returned. If Both, x
|
||||||
* and y coordinates are null, null is returned.
|
* and y coordinates are null, null is returned.
|
||||||
*/
|
*/
|
||||||
public function getTextXY(float $x = null, float $y = null, float $xError = 0, float $yError = 0): array
|
public function getTextXY(?float $x = null, ?float $y = null, float $xError = 0, float $yError = 0): array
|
||||||
{
|
{
|
||||||
if (!isset($this->dataTm) || !$this->dataTm) {
|
if (!isset($this->dataTm) || !$this->dataTm) {
|
||||||
$this->getDataTm();
|
$this->getDataTm();
|
||||||
|
|
|
@ -60,7 +60,7 @@ class Parser
|
||||||
|
|
||||||
protected $rawDataParser;
|
protected $rawDataParser;
|
||||||
|
|
||||||
public function __construct($cfg = [], Config $config = null)
|
public function __construct($cfg = [], ?Config $config = null)
|
||||||
{
|
{
|
||||||
$this->config = $config ?: new Config();
|
$this->config = $config ?: new Config();
|
||||||
$this->rawDataParser = new RawDataParser($cfg, $this->config);
|
$this->rawDataParser = new RawDataParser($cfg, $this->config);
|
||||||
|
@ -77,6 +77,7 @@ class Parser
|
||||||
public function parseFile(string $filename): Document
|
public function parseFile(string $filename): Document
|
||||||
{
|
{
|
||||||
$content = file_get_contents($filename);
|
$content = file_get_contents($filename);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* 2018/06/20 @doganoo as multiple times a
|
* 2018/06/20 @doganoo as multiple times a
|
||||||
* users have complained that the parseFile()
|
* users have complained that the parseFile()
|
||||||
|
@ -101,7 +102,7 @@ class Parser
|
||||||
// Create structure from raw data.
|
// Create structure from raw data.
|
||||||
list($xref, $data) = $this->rawDataParser->parseData($content);
|
list($xref, $data) = $this->rawDataParser->parseData($content);
|
||||||
|
|
||||||
if (isset($xref['trailer']['encrypt'])) {
|
if (isset($xref['trailer']['encrypt']) && false === $this->config->getIgnoreEncryption()) {
|
||||||
throw new \Exception('Secured pdf file are currently not supported.');
|
throw new \Exception('Secured pdf file are currently not supported.');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -233,32 +233,32 @@ class FilterHelper
|
||||||
*/
|
*/
|
||||||
protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string
|
protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string
|
||||||
{
|
{
|
||||||
/*
|
// Uncatchable E_WARNING for "data error" is @ suppressed
|
||||||
* gzuncompress may throw a not catchable E_WARNING in case of an error (like $data is empty)
|
// so execution may proceed with an alternate decompression
|
||||||
* the following set_error_handler changes an E_WARNING to an E_ERROR, which is catchable.
|
// method.
|
||||||
*/
|
$decoded = @gzuncompress($data, $decodeMemoryLimit);
|
||||||
set_error_handler(function ($errNo, $errStr) {
|
|
||||||
if (\E_WARNING === $errNo) {
|
|
||||||
throw new \Exception($errStr);
|
|
||||||
} else {
|
|
||||||
// fallback to default php error handler
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
$decoded = null;
|
if (false === $decoded) {
|
||||||
|
// If gzuncompress() failed, try again using the compress.zlib://
|
||||||
// initialize string to return
|
// wrapper to decode it in a file-based context.
|
||||||
try {
|
// See: https://www.php.net/manual/en/function.gzuncompress.php#79042
|
||||||
$decoded = gzuncompress($data, $decodeMemoryLimit);
|
// Issue: https://github.com/smalot/pdfparser/issues/592
|
||||||
if (false === $decoded) {
|
$ztmp = tmpfile();
|
||||||
throw new \Exception('decodeFilterFlateDecode: invalid code');
|
if (false != $ztmp) {
|
||||||
|
fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data);
|
||||||
|
$file = stream_get_meta_data($ztmp)['uri'];
|
||||||
|
if (0 === $decodeMemoryLimit) {
|
||||||
|
$decoded = file_get_contents('compress.zlib://'.$file);
|
||||||
|
} else {
|
||||||
|
$decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit);
|
||||||
|
}
|
||||||
|
fclose($ztmp);
|
||||||
}
|
}
|
||||||
} catch (\Exception $e) {
|
}
|
||||||
throw $e;
|
|
||||||
} finally {
|
if (false === \is_string($decoded) || '' === $decoded) {
|
||||||
// Restore old handler just in case it was customized outside of PDFParser.
|
// If the decoded string is empty, that means decoding failed.
|
||||||
restore_error_handler();
|
throw new \Exception('decodeFilterFlateDecode: invalid data');
|
||||||
}
|
}
|
||||||
|
|
||||||
return $decoded;
|
return $decoded;
|
||||||
|
|
|
@ -47,12 +47,14 @@ use Smalot\PdfParser\Config;
|
||||||
class RawDataParser
|
class RawDataParser
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* @var \Smalot\PdfParser\Config
|
* @var Config
|
||||||
*/
|
*/
|
||||||
private $config;
|
private $config;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Configuration array.
|
* Configuration array.
|
||||||
|
*
|
||||||
|
* @var array<string,bool>
|
||||||
*/
|
*/
|
||||||
protected $cfg = [
|
protected $cfg = [
|
||||||
// if `true` ignore filter decoding errors
|
// if `true` ignore filter decoding errors
|
||||||
|
@ -67,7 +69,7 @@ class RawDataParser
|
||||||
/**
|
/**
|
||||||
* @param array $cfg Configuration array, default is []
|
* @param array $cfg Configuration array, default is []
|
||||||
*/
|
*/
|
||||||
public function __construct($cfg = [], Config $config = null)
|
public function __construct($cfg = [], ?Config $config = null)
|
||||||
{
|
{
|
||||||
// merge given array with default values
|
// merge given array with default values
|
||||||
$this->cfg = array_merge($this->cfg, $cfg);
|
$this->cfg = array_merge($this->cfg, $cfg);
|
||||||
|
@ -125,7 +127,7 @@ class RawDataParser
|
||||||
// decode the stream
|
// decode the stream
|
||||||
$remaining_filters = [];
|
$remaining_filters = [];
|
||||||
foreach ($filters as $filter) {
|
foreach ($filters as $filter) {
|
||||||
if (\in_array($filter, $this->filterHelper->getAvailableFilters())) {
|
if (\in_array($filter, $this->filterHelper->getAvailableFilters(), true)) {
|
||||||
try {
|
try {
|
||||||
$stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
|
$stream = $this->filterHelper->decodeFilter($filter, $stream, $this->config->getDecodeMemoryLimit());
|
||||||
} catch (\Exception $e) {
|
} catch (\Exception $e) {
|
||||||
|
@ -402,14 +404,19 @@ class RawDataParser
|
||||||
}
|
}
|
||||||
$prev_row = $ddata[$k];
|
$prev_row = $ddata[$k];
|
||||||
} // end for each row
|
} // end for each row
|
||||||
// complete decoding
|
// complete decoding
|
||||||
} else {
|
} else {
|
||||||
// number of bytes in a row
|
// number of bytes in a row
|
||||||
$rowlen = array_sum($wb);
|
$rowlen = array_sum($wb);
|
||||||
// convert the stream into an array of integers
|
if (0 < $rowlen) {
|
||||||
$sdata = unpack('C*', $xrefcrs[1][3][0]);
|
// convert the stream into an array of integers
|
||||||
// split the rows
|
$sdata = unpack('C*', $xrefcrs[1][3][0]);
|
||||||
$ddata = array_chunk($sdata, $rowlen);
|
// split the rows
|
||||||
|
$ddata = array_chunk($sdata, $rowlen);
|
||||||
|
} else {
|
||||||
|
// if the row length is zero, $ddata should be an empty array as well
|
||||||
|
$ddata = [];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$sdata = [];
|
$sdata = [];
|
||||||
|
@ -609,7 +616,7 @@ class RawDataParser
|
||||||
*
|
*
|
||||||
* @return array containing object type, raw value and offset to next object
|
* @return array containing object type, raw value and offset to next object
|
||||||
*/
|
*/
|
||||||
protected function getRawObject(string $pdfData, int $offset = 0, array $headerDic = null): array
|
protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
|
||||||
{
|
{
|
||||||
$objtype = ''; // object type to be returned
|
$objtype = ''; // object type to be returned
|
||||||
$objval = ''; // object value to be returned
|
$objval = ''; // object value to be returned
|
||||||
|
@ -756,7 +763,7 @@ class RawDataParser
|
||||||
// start stream object
|
// start stream object
|
||||||
$objtype = 'stream';
|
$objtype = 'stream';
|
||||||
$offset += 6;
|
$offset += 6;
|
||||||
if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
|
if (1 == preg_match('/^( *[\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
|
||||||
$offset += \strlen($matches[0]);
|
$offset += \strlen($matches[0]);
|
||||||
|
|
||||||
// we get stream length here to later help preg_match test less data
|
// we get stream length here to later help preg_match test less data
|
||||||
|
@ -857,39 +864,39 @@ class RawDataParser
|
||||||
*/
|
*/
|
||||||
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
|
protected function getXrefData(string $pdfData, int $offset = 0, array $xref = []): array
|
||||||
{
|
{
|
||||||
$startxrefPreg = preg_match(
|
// If the $offset is currently pointed at whitespace, bump it
|
||||||
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
|
// forward until it isn't; affects loosely targeted offsets
|
||||||
|
// for the 'xref' keyword
|
||||||
|
// See: https://github.com/smalot/pdfparser/issues/673
|
||||||
|
$bumpOffset = $offset;
|
||||||
|
while (preg_match('/\s/', substr($pdfData, $bumpOffset, 1))) {
|
||||||
|
++$bumpOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find all startxref tables from this $offset forward
|
||||||
|
$startxrefPreg = preg_match_all(
|
||||||
|
'/(?<=[\r\n])startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
|
||||||
$pdfData,
|
$pdfData,
|
||||||
$matches,
|
$startxrefMatches,
|
||||||
\PREG_OFFSET_CAPTURE,
|
\PREG_SET_ORDER,
|
||||||
$offset
|
$offset
|
||||||
);
|
);
|
||||||
|
|
||||||
if (0 == $offset) {
|
if (0 == $startxrefPreg) {
|
||||||
// find last startxref
|
// No startxref tables were found
|
||||||
$pregResult = preg_match_all(
|
|
||||||
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
|
|
||||||
$pdfData,
|
|
||||||
$matches,
|
|
||||||
\PREG_SET_ORDER,
|
|
||||||
$offset
|
|
||||||
);
|
|
||||||
if (0 == $pregResult) {
|
|
||||||
throw new \Exception('Unable to find startxref');
|
|
||||||
}
|
|
||||||
$matches = array_pop($matches);
|
|
||||||
$startxref = $matches[1];
|
|
||||||
} elseif (strpos($pdfData, 'xref', $offset) == $offset) {
|
|
||||||
// Already pointing at the xref table
|
|
||||||
$startxref = $offset;
|
|
||||||
} elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
|
|
||||||
// Cross-Reference Stream object
|
|
||||||
$startxref = $offset;
|
|
||||||
} elseif ($startxrefPreg) {
|
|
||||||
// startxref found
|
|
||||||
$startxref = $matches[1][0];
|
|
||||||
} else {
|
|
||||||
throw new \Exception('Unable to find startxref');
|
throw new \Exception('Unable to find startxref');
|
||||||
|
} elseif (0 == $offset) {
|
||||||
|
// Use the last startxref in the document
|
||||||
|
$startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1];
|
||||||
|
} elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) {
|
||||||
|
// Already pointing at the xref table
|
||||||
|
$startxref = $bumpOffset;
|
||||||
|
} elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) {
|
||||||
|
// Cross-Reference Stream object
|
||||||
|
$startxref = $bumpOffset;
|
||||||
|
} else {
|
||||||
|
// Use the next startxref from this $offset
|
||||||
|
$startxref = (int) $startxrefMatches[0][1];
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($startxref > \strlen($pdfData)) {
|
if ($startxref > \strlen($pdfData)) {
|
||||||
|
@ -901,8 +908,15 @@ class RawDataParser
|
||||||
// Cross-Reference
|
// Cross-Reference
|
||||||
$xref = $this->decodeXref($pdfData, $startxref, $xref);
|
$xref = $this->decodeXref($pdfData, $startxref, $xref);
|
||||||
} else {
|
} else {
|
||||||
// Cross-Reference Stream
|
// Check if the $pdfData might have the wrong line-endings
|
||||||
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
|
$pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
|
||||||
|
if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) {
|
||||||
|
// Return Unix-line-ending flag
|
||||||
|
$xref = ['Unix' => true];
|
||||||
|
} else {
|
||||||
|
// Cross-Reference Stream
|
||||||
|
$xref = $this->decodeXrefStream($pdfData, $startxref, $xref);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (empty($xref)) {
|
if (empty($xref)) {
|
||||||
throw new \Exception('Unable to find xref');
|
throw new \Exception('Unable to find xref');
|
||||||
|
@ -937,6 +951,12 @@ class RawDataParser
|
||||||
// get xref and trailer data
|
// get xref and trailer data
|
||||||
$xref = $this->getXrefData($pdfData);
|
$xref = $this->getXrefData($pdfData);
|
||||||
|
|
||||||
|
// If we found Unix line-endings
|
||||||
|
if (isset($xref['Unix'])) {
|
||||||
|
$pdfData = str_replace("\r\n", "\n", $pdfData);
|
||||||
|
$xref = $this->getXrefData($pdfData);
|
||||||
|
}
|
||||||
|
|
||||||
// parse all document objects
|
// parse all document objects
|
||||||
$objects = [];
|
$objects = [];
|
||||||
foreach ($xref['xref'] as $obj => $offset) {
|
foreach ($xref['xref'] as $obj => $offset) {
|
||||||
|
|
|
@ -41,7 +41,7 @@ use Smalot\PdfParser\PDFObject;
|
||||||
*/
|
*/
|
||||||
class Form extends Page
|
class Form extends Page
|
||||||
{
|
{
|
||||||
public function getText(Page $page = null): string
|
public function getText(?Page $page = null): string
|
||||||
{
|
{
|
||||||
$header = new Header([], $this->document);
|
$header = new Header([], $this->document);
|
||||||
$contents = new PDFObject($this->document, $header, $this->content, $this->config);
|
$contents = new PDFObject($this->document, $header, $this->content, $this->config);
|
||||||
|
|
|
@ -40,7 +40,7 @@ use Smalot\PdfParser\PDFObject;
|
||||||
*/
|
*/
|
||||||
class Image extends PDFObject
|
class Image extends PDFObject
|
||||||
{
|
{
|
||||||
public function getText(Page $page = null): string
|
public function getText(?Page $page = null): string
|
||||||
{
|
{
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue