Update pdfparser to 2.5.0
This commit is contained in:
parent
5cfeb0a414
commit
6d9d897784
|
@ -454,7 +454,7 @@ if (class_exists('PHPMailer\PHPMailer\PHPMailer')) {
|
||||||
// ***** Load the default Search Result Template
|
// ***** Load the default Search Result Template
|
||||||
if (!$_ODATA['s_result_template']) {
|
if (!$_ODATA['s_result_template']) {
|
||||||
OS_setValue('s_result_template', '<section id="os_results">
|
OS_setValue('s_result_template', '<section id="os_results">
|
||||||
<!-- Orcinus Site Search {{version}} - HTML Template -->
|
<!-- Orcinus Site Search v{{version}} - HTML Template -->
|
||||||
|
|
||||||
{{#errors}}
|
{{#errors}}
|
||||||
<ul>
|
<ul>
|
||||||
|
|
|
@ -171,3 +171,22 @@ $font = reset($pdf->getFonts());
|
||||||
// get width
|
// get width
|
||||||
$width = $font->calculateTextWidth('Some text', $missing);
|
$width = $font->calculateTextWidth('Some text', $missing);
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Get pages width and height
|
||||||
|
|
||||||
|
Ref: [#472](https://github.com/smalot/pdfparser/issues/427#issuecomment-973416786)
|
||||||
|
|
||||||
|
```php
|
||||||
|
$parser = new \Smalot\PdfParser\Parser();
|
||||||
|
$pdf = $parser->parseFile('document.pdf');
|
||||||
|
$pages = $pdf->getPages();
|
||||||
|
// this variable will contain the height and width of each page of the given PDF
|
||||||
|
$mediaBox = [];
|
||||||
|
foreach ($pages as $page) {
|
||||||
|
$details = $page->getDetails();
|
||||||
|
$mediaBox[] = [
|
||||||
|
'width' => $details['MediaBox'][2],
|
||||||
|
'height' => $details['MediaBox'][3]
|
||||||
|
];
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
|
@ -603,7 +603,7 @@ class Font extends PDFObject
|
||||||
// so we use iconv() here
|
// so we use iconv() here
|
||||||
$iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);
|
$iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);
|
||||||
|
|
||||||
return $iconvEncodingName ? iconv($iconvEncodingName, 'UTF-8', $text) : null;
|
return $iconvEncodingName ? iconv($iconvEncodingName, 'UTF-8//TRANSLIT//IGNORE', $text) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -262,8 +262,7 @@ class RawDataParser
|
||||||
if (
|
if (
|
||||||
('/' == $v[0])
|
('/' == $v[0])
|
||||||
&& ('Type' == $v[1])
|
&& ('Type' == $v[1])
|
||||||
&& (
|
&& (isset($sarr[$k + 1])
|
||||||
isset($sarr[$k + 1])
|
|
||||||
&& '/' == $sarr[$k + 1][0]
|
&& '/' == $sarr[$k + 1][0]
|
||||||
&& 'XRef' == $sarr[$k + 1][1]
|
&& 'XRef' == $sarr[$k + 1][1]
|
||||||
)
|
)
|
||||||
|
@ -289,8 +288,7 @@ class RawDataParser
|
||||||
if (
|
if (
|
||||||
'/' == $vdc[0]
|
'/' == $vdc[0]
|
||||||
&& 'Columns' == $vdc[1]
|
&& 'Columns' == $vdc[1]
|
||||||
&& (
|
&& (isset($decpar[$kdc + 1])
|
||||||
isset($decpar[$kdc + 1])
|
|
||||||
&& 'numeric' == $decpar[$kdc + 1][0]
|
&& 'numeric' == $decpar[$kdc + 1][0]
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
|
@ -298,8 +296,7 @@ class RawDataParser
|
||||||
} elseif (
|
} elseif (
|
||||||
'/' == $vdc[0]
|
'/' == $vdc[0]
|
||||||
&& 'Predictor' == $vdc[1]
|
&& 'Predictor' == $vdc[1]
|
||||||
&& (
|
&& (isset($decpar[$kdc + 1])
|
||||||
isset($decpar[$kdc + 1])
|
|
||||||
&& 'numeric' == $decpar[$kdc + 1][0]
|
&& 'numeric' == $decpar[$kdc + 1][0]
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
|
@ -553,16 +550,18 @@ class RawDataParser
|
||||||
$offset += $objHeaderLen;
|
$offset += $objHeaderLen;
|
||||||
$objContentArr = [];
|
$objContentArr = [];
|
||||||
$i = 0; // object main index
|
$i = 0; // object main index
|
||||||
|
$header = null;
|
||||||
do {
|
do {
|
||||||
$oldOffset = $offset;
|
$oldOffset = $offset;
|
||||||
// get element
|
// get element
|
||||||
$element = $this->getRawObject($pdfData, $offset);
|
$element = $this->getRawObject($pdfData, $offset, null != $header ? $header[1] : null);
|
||||||
$offset = $element[2];
|
$offset = $element[2];
|
||||||
// decode stream using stream's dictionary information
|
// decode stream using stream's dictionary information
|
||||||
if ($decoding && ('stream' === $element[0]) && (isset($objContentArr[$i - 1][0])) && ('<<' === $objContentArr[$i - 1][0])) {
|
if ($decoding && ('stream' === $element[0]) && null != $header) {
|
||||||
$element[3] = $this->decodeStream($pdfData, $xref, $objContentArr[$i - 1][1], $element[1]);
|
$element[3] = $this->decodeStream($pdfData, $xref, $header[1], $element[1]);
|
||||||
}
|
}
|
||||||
$objContentArr[$i] = $element;
|
$objContentArr[$i] = $element;
|
||||||
|
$header = isset($element[0]) && '<<' === $element[0] ? $element : null;
|
||||||
++$i;
|
++$i;
|
||||||
} while (('endobj' !== $element[0]) && ($offset !== $oldOffset));
|
} while (('endobj' !== $element[0]) && ($offset !== $oldOffset));
|
||||||
// remove closing delimiter
|
// remove closing delimiter
|
||||||
|
@ -606,10 +605,11 @@ class RawDataParser
|
||||||
* Get object type, raw value and offset to next object
|
* Get object type, raw value and offset to next object
|
||||||
*
|
*
|
||||||
* @param int $offset Object offset
|
* @param int $offset Object offset
|
||||||
|
* @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
|
||||||
*
|
*
|
||||||
* @return array containing object type, raw value and offset to next object
|
* @return array containing object type, raw value and offset to next object
|
||||||
*/
|
*/
|
||||||
protected function getRawObject(string $pdfData, int $offset = 0): array
|
protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
|
||||||
{
|
{
|
||||||
$objtype = ''; // object type to be returned
|
$objtype = ''; // object type to be returned
|
||||||
$objval = ''; // object value to be returned
|
$objval = ''; // object value to be returned
|
||||||
|
@ -758,15 +758,21 @@ class RawDataParser
|
||||||
$offset += 6;
|
$offset += 6;
|
||||||
if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
|
if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
|
||||||
$offset += \strlen($matches[0]);
|
$offset += \strlen($matches[0]);
|
||||||
|
|
||||||
|
// we get stream length here to later help preg_match test less data
|
||||||
|
$streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0);
|
||||||
|
$skip = false === $this->config->getRetainImageContent() && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/');
|
||||||
|
|
||||||
$pregResult = preg_match(
|
$pregResult = preg_match(
|
||||||
'/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
|
'/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
|
||||||
$pdfData,
|
$pdfData,
|
||||||
$matches,
|
$matches,
|
||||||
\PREG_OFFSET_CAPTURE,
|
\PREG_OFFSET_CAPTURE,
|
||||||
$offset
|
$offset + $streamLen
|
||||||
);
|
);
|
||||||
|
|
||||||
if (1 == $pregResult) {
|
if (1 == $pregResult) {
|
||||||
$objval = substr($pdfData, $offset, $matches[0][1] - $offset);
|
$objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
|
||||||
$offset = $matches[1][1];
|
$offset = $matches[1][1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -796,6 +802,48 @@ class RawDataParser
|
||||||
return [$objtype, $objval, $offset];
|
return [$objtype, $objval, $offset];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the value of an object header's section (the obj << YYY >> part).
|
||||||
|
*
|
||||||
|
* It is similar to Header::get('...')->getContent(), the only difference is it can be used during the parsing process,
|
||||||
|
* when no Smalot\PdfParser\Header objects are created yet.
|
||||||
|
*
|
||||||
|
* @param string $key header's section name
|
||||||
|
* @param string $type type of the section (i.e. 'numeric', '/', '<<', etc.)
|
||||||
|
* @param string|array|null $default default value for header's section
|
||||||
|
*
|
||||||
|
* @return string|array|null value of obj header's section, or default value if none found, or its type doesn't match $type param
|
||||||
|
*/
|
||||||
|
private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
|
||||||
|
{
|
||||||
|
if (false === \is_array($headerDic)) {
|
||||||
|
return $default;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* It receives the dictionary of header fields, as returned by RawDataParser::getRawObject,
|
||||||
|
* iterates over it, searching for a section of type '/' with the requested key.
|
||||||
|
* If such a section is found, it tries to read its value (the next object in the dictionary),
|
||||||
|
* returning it, if it matches requested type, or default value otherwise.
|
||||||
|
*/
|
||||||
|
foreach ($headerDic as $i => $val) {
|
||||||
|
$isSectionName = \is_array($val) && 3 == \count($val) && '/' == $val[0];
|
||||||
|
if (
|
||||||
|
$isSectionName
|
||||||
|
&& $val[1] == $key
|
||||||
|
&& isset($headerDic[$i + 1])
|
||||||
|
) {
|
||||||
|
$isSectionValue = \is_array($headerDic[$i + 1]) && 1 < \count($headerDic[$i + 1]);
|
||||||
|
|
||||||
|
return $isSectionValue && $type == $headerDic[$i + 1][0]
|
||||||
|
? $headerDic[$i + 1][1]
|
||||||
|
: $default;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return $default;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get Cross-Reference (xref) table and trailer data from PDF document data.
|
* Get Cross-Reference (xref) table and trailer data from PDF document data.
|
||||||
*
|
*
|
||||||
|
@ -821,7 +869,8 @@ class RawDataParser
|
||||||
// find last startxref
|
// find last startxref
|
||||||
$pregResult = preg_match_all(
|
$pregResult = preg_match_all(
|
||||||
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
|
'/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i',
|
||||||
$pdfData, $matches,
|
$pdfData,
|
||||||
|
$matches,
|
||||||
\PREG_SET_ORDER,
|
\PREG_SET_ORDER,
|
||||||
$offset
|
$offset
|
||||||
);
|
);
|
||||||
|
|
Loading…
Reference in a new issue