PdfParser update

This update adds XMP metadata and PDFDocEncoding support for regular metadata.
2023-07-04 13:08:42 -04:00 · 2023-07-04 13:08:42 -04:00 · 06cc7fe325
parent 30630c6c60
commit 06cc7fe325
4 changed files with 334 additions and 80 deletions
--- a/orcinus/pdfparser/doc/Usage.md
+++ b/orcinus/pdfparser/doc/Usage.md
@ -140,9 +140,49 @@ Array
    [Producer] => Adobe Acrobat
    [CreatedOn] => 2022-01-28T16:36:11+00:00
    [Pages] => 35
+    ...
 )
 ```

+If the PDF contains Extensible Metadata Platform (XMP) XML metadata, its values will be appended to the data returned by `getDetails()`, keyed by property name including the XMP namespace prefix (for example `dc:creator`). You can read more about which properties and namespaces are commonly used in the [XMP Specifications](https://github.com/adobe/XMP-Toolkit-SDK/tree/main/docs).
+
+```php
+Array
+(
+    ...
+    [Pages] => 35
+    [dc:creator] => My Name
+    [pdf:producer] => Adobe Acrobat
+    [dc:title] => My Document Title
+    ...
+)
+```
+
+Some XMP metadata properties may have multiple values, or even named children with their own values. In these cases, the value will be an array. The XMP metadata follows the structure of the XML, so it is possible to have multiple levels of nested values.
+
+```php
+Array
+(
+    ...
+    [dc:title] => My Document Title
+    [xmptpg:maxpagesize] => Array
+    (
+        [stdim:w] => 21.500000
+        [stdim:h] => 6.222222
+        [stdim:unit] => Inches
+    )
+    [xmptpg:platenames] => Array
+    (
+        [0] => Cyan
+        [1] => Magenta
+        [2] => Yellow
+        [3] => Black
+    )
+    ...
+)
+```
+
+
 ## Read Base64 encoded PDFs

 If working with [Base64](https://en.wikipedia.org/wiki/Base64) encoded PDFs, you might want to parse the PDF without saving the file to disk.
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Document.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Document.php
@ -32,6 +32,8 @@

 namespace Smalot\PdfParser;

+use Smalot\PdfParser\Encoding\PDFDocEncoding;
+
 /**
 * Technical references :
 * - http://www.mactech.com/articles/mactech/Vol.15/15.09/PDFIntro/index.html
@ -62,7 +64,7 @@ class Document
    protected $trailer;

    /**
-     * @var Metadata
+     * @var array<mixed>
     */
    protected $metadata = [];

@ -149,6 +151,43 @@ class Document
            $details['Pages'] = 0;
        }

+        // Decode and repair encoded document properties
+        foreach ($details as $key => $value) {
+            if (\is_string($value)) {
+
+                // If the string is already UTF-8 encoded, that means we only
+                // need to repair Adobe's ham-fisted insertion of line-feeds
+                // every ~127 characters, which doesn't seem to be multi-byte
+                // safe
+                if (mb_check_encoding($value, 'UTF-8')) {
+
+                    $value = str_replace("\x5c\x0d", '', $value);
+
+                    while (preg_match("/\x5c\x5c\xe0([\xb4-\xb8])(.)/", $value, $match)) {
+                        $diff = (\ord($match[1]) - 182) * 64;
+                        $newbyte = PDFDocEncoding::convertPDFDoc2UTF8(\chr(\ord($match[2]) + $diff));
+                        $value = preg_replace("/\x5c\x5c\xe0".$match[1].$match[2]."/", $newbyte, $value);
+                    }
+
+                    while (preg_match("/(.)\x9c\xe0([\xb3-\xb7])/", $value, $match)) {
+                        $diff = \ord($match[2]) - 181;
+                        $newbyte = \chr(\ord($match[1]) + $diff);
+                        $value = preg_replace("/".$match[1]."\x9c\xe0".$match[2]."/", $newbyte, $value);
+                    }
+
+                    $value = str_replace("\xe5\xb0\x8d", '', $value);
+
+                    $details[$key] = $value;
+
+                // If the string is just PDFDocEncoding, remove any line-feeds
+                // and decode the whole thing.
+                } else {
+                    $value = str_replace("\\\r", '', $value);
+                    $details[$key] = PDFDocEncoding::convertPDFDoc2UTF8($value);
+                }
+            }
+        }
+
        $details = array_merge($details, $this->metadata);

        $this->details = $details;
@ -162,101 +201,92 @@ class Document
        $xml = xml_parser_create();
        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);

-        if (xml_parse_into_struct($xml, $content, $values, $index)) {
-            $detail = '';
-
+        if (1 === xml_parse_into_struct($xml, $content, $values, $index)) {
+            /*
+             * short overview about the following code parts:
+             *
+             * The output of xml_parse_into_struct is a single-dimensional array (= $values). While iterating through it,
+             * $stack is used as a last-in, first-out array of references to positions in $metadata, which turns the flat
+             * results into a more intuitive multi-dimensional array. When an "open" XML tag is encountered,
+             * we save the current $metadata context in the $stack, then create a child array of $metadata and
+             * make that the current $metadata context. When a "close" XML tag is encountered, the operations are
+             * reversed: the most recently added $metadata context from $stack (IOW, the parent of the current
+             * element) is set as the current $metadata context.
+             */
+            $metadata = [];
+            $stack = [];
            foreach ($values as $val) {
-                switch ($val['tag']) {
-                    case 'DC:CREATOR':
-                        $detail = ('open' == $val['type']) ? 'Author' : '';
-                        break;
+                // Standardize to lowercase
+                $val['tag'] = strtolower($val['tag']);

-                    case 'DC:DESCRIPTION':
-                        $detail = ('open' == $val['type']) ? 'Description' : '';
-                        break;
+                // Ignore structural x: and rdf: XML elements
+                if (0 === strpos($val['tag'], 'x:')) {
+                    continue;
+                } elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
+                    continue;
+                }

-                    case 'DC:TITLE':
-                        $detail = ('open' == $val['type']) ? 'Title' : '';
-                        break;
+                switch ($val['type']) {
+                    case 'open':
+                        // Create an array of list items
+                        if ('rdf:li' == $val['tag']) {
+                            $metadata[] = [];

-                    case 'DC:SUBJECT':
-                        $detail = ('open' == $val['type']) ? 'Subject' : '';
-                        break;
+                            // Move up one level in the stack
+                            $stack[\count($stack)] = &$metadata;
+                            $metadata = &$metadata[\count($metadata) - 1];
+                        } else {
+                            // Else create an array of named values
+                            $metadata[$val['tag']] = [];

-                    case 'RDF:LI':
-                        if ($detail && 'complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata[$detail] = $val['value'];
+                            // Move up one level in the stack
+                            $stack[\count($stack)] = &$metadata;
+                            $metadata = &$metadata[$val['tag']];
                        }
                        break;

-                    case 'DC:FORMAT':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Format'] = $val['value'];
+                    case 'complete':
+                        if (isset($val['value'])) {
+                            // Assign a value to this list item
+                            if ('rdf:li' == $val['tag']) {
+                                $metadata[] = $val['value'];
+
+                                // Else assign a value to this property
+                            } else {
+                                $metadata[$val['tag']] = $val['value'];
+                            }
                        }
                        break;

-                    case 'PDF:KEYWORDS':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Keywords'] = $val['value'];
+                    case 'close':
+                        // If the value of this property is a single-
+                        // element array where the element is of type
+                        // string, use the value of the first list item
+                        // as the value for this property
+                        if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
+                            $metadata = $metadata[0];
                        }
-                        break;

-                    case 'PDF:PRODUCER':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Producer'] = $val['value'];
-                        }
-                        break;
-
-                    case 'PDFX:SOURCEMODIFIED':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['SourceModified'] = $val['value'];
-                        }
-                        break;
-
-                    case 'PDFX:COMPANY':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Company'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMP:CREATEDATE':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['CreationDate'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMP:CREATORTOOL':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Creator'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMP:MODIFYDATE':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['ModDate'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMP:METADATADATE':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['MetadataDate'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMPMM:DOCUMENTID':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['DocumentUUID'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMPMM:INSTANCEID':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['InstanceUUID'] = $val['value'];
-                        }
+                        // Move down one level in the stack
+                        $metadata = &$stack[\count($stack) - 1];
+                        unset($stack[\count($stack) - 1]);
                        break;
                }
            }
+
+            // Only use this metadata if it's referring to a PDF
+            if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
+                // According to the XMP specifications: 'Conflict resolution
+                // for separate packets that describe the same resource is
+                // beyond the scope of this document.' - Section 6.1
+                // Source: https://www.adobe.com/devnet/xmp.html
+                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
+                // So if there are multiple XMP blocks, just merge the values
+                // of each found block over top of the existing values
+                $this->metadata = array_merge($this->metadata, $metadata);
+            }
        }
+        xml_parser_free($xml);
    }

    public function getDictionary(): array
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding.php
@ -32,7 +32,6 @@

 namespace Smalot\PdfParser;

-use Exception;
 use Smalot\PdfParser\Element\ElementNumeric;
 use Smalot\PdfParser\Encoding\EncodingLocator;
 use Smalot\PdfParser\Encoding\PostScriptGlyphs;
--- a/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
+++ b/orcinus/pdfparser/src/Smalot/PdfParser/Encoding/PDFDocEncoding.php
@ -0,0 +1,185 @@
+<?php
+
+/**
+ * @file
+ *          This file is part of the PdfParser library.
+ *
+ * @author  Sébastien MALOT <sebastien@malot.fr>
+ *
+ * @date    2017-01-03
+ *
+ * @license LGPLv3
+ *
+ * @url     <https://github.com/smalot/pdfparser>
+ *
+ *  PdfParser is a pdf library written in PHP, extraction oriented.
+ *  Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.
+ *  If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
+ */
+
+// Source : https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/pdfreference1.2.pdf
+// Source : https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf
+
+namespace Smalot\PdfParser\Encoding;
+
+/**
+ * Class PDFDocEncoding
+ *
+ * Translates strings stored in the single-byte PDFDocEncoding (PDF Reference,
+ * Appendix D) into UTF-8.
+ */
+abstract class PDFDocEncoding extends AbstractEncoding
+{
+    /**
+     * Converts a PDFDocEncoding byte string to UTF-8.
+     *
+     * Each byte listed in the table below is replaced by the UTF-8 sequence
+     * of its Unicode code point. Bytes that are not listed (such as printable
+     * ASCII) are passed through unchanged, because strtr() only touches the
+     * keys it is given. The bytes 0x7f, 0x9f and 0xad are mapped to an empty
+     * string, i.e. they are removed from the result.
+     *
+     * @param string $content raw PDFDocEncoding bytes
+     *
+     * @return string the UTF-8 representation of $content
+     */
+    public static function convertPDFDoc2UTF8(string $content): string
+    {
+        return strtr($content, [
+            "\x18" => "\u{02d8}", // breve
+            "\x19" => "\u{02c7}", // caron
+            "\x1a" => "\u{02c6}", // circumflex
+            "\x1b" => "\u{02d9}", // dotaccent
+            "\x1c" => "\u{02dd}", // hungarumlaut
+            "\x1d" => "\u{02db}", // ogonek
+            // U+02DA is RING ABOVE; U+02DE (used previously) is the
+            // unrelated MODIFIER LETTER RHOTIC HOOK
+            "\x1e" => "\u{02da}", // ring
+            "\x1f" => "\u{02dc}", // tilde
+            "\x7f" => '', // removed from output
+            "\x80" => "\u{2022}", // bullet
+            "\x81" => "\u{2020}", // dagger
+            "\x82" => "\u{2021}", // daggerdbl
+            "\x83" => "\u{2026}", // ellipsis
+            "\x84" => "\u{2014}", // emdash
+            "\x85" => "\u{2013}", // endash
+            "\x86" => "\u{0192}", // florin
+            "\x87" => "\u{2044}", // fraction
+            "\x88" => "\u{2039}", // guilsinglleft
+            "\x89" => "\u{203a}", // guilsinglright
+            "\x8a" => "\u{2212}", // minus
+            "\x8b" => "\u{2030}", // perthousand
+            "\x8c" => "\u{201e}", // quotedblbase
+            "\x8d" => "\u{201c}", // quotedblleft
+            "\x8e" => "\u{201d}", // quotedblright
+            "\x8f" => "\u{2018}", // quoteleft
+            "\x90" => "\u{2019}", // quoteright
+            "\x91" => "\u{201a}", // quotesinglbase
+            "\x92" => "\u{2122}", // trademark
+            "\x93" => "\u{fb01}", // fi
+            "\x94" => "\u{fb02}", // fl
+            "\x95" => "\u{0141}", // Lslash
+            "\x96" => "\u{0152}", // OE
+            "\x97" => "\u{0160}", // Scaron
+            "\x98" => "\u{0178}", // Ydieresis
+            "\x99" => "\u{017d}", // Zcaron
+            "\x9a" => "\u{0131}", // dotlessi
+            "\x9b" => "\u{0142}", // lslash
+            "\x9c" => "\u{0153}", // oe
+            "\x9d" => "\u{0161}", // scaron
+            "\x9e" => "\u{017e}", // zcaron
+            "\x9f" => '', // removed from output
+            "\xa0" => "\u{20ac}", // Euro
+            "\xa1" => "\u{00a1}", // exclamdown
+            "\xa2" => "\u{00a2}", // cent
+            "\xa3" => "\u{00a3}", // sterling
+            "\xa4" => "\u{00a4}", // currency
+            "\xa5" => "\u{00a5}", // yen
+            "\xa6" => "\u{00a6}", // brokenbar
+            "\xa7" => "\u{00a7}", // section
+            "\xa8" => "\u{00a8}", // dieresis
+            "\xa9" => "\u{00a9}", // copyright
+            "\xaa" => "\u{00aa}", // ordfeminine
+            "\xab" => "\u{00ab}", // guillemotleft
+            "\xac" => "\u{00ac}", // logicalnot
+            "\xad" => '', // removed from output
+            "\xae" => "\u{00ae}", // registered
+            "\xaf" => "\u{00af}", // macron
+            "\xb0" => "\u{00b0}", // degree
+            "\xb1" => "\u{00b1}", // plusminus
+            "\xb2" => "\u{00b2}", // twosuperior
+            "\xb3" => "\u{00b3}", // threesuperior
+            "\xb4" => "\u{00b4}", // acute
+            "\xb5" => "\u{00b5}", // mu
+            "\xb6" => "\u{00b6}", // paragraph
+            "\xb7" => "\u{00b7}", // periodcentered
+            "\xb8" => "\u{00b8}", // cedilla
+            "\xb9" => "\u{00b9}", // onesuperior
+            "\xba" => "\u{00ba}", // ordmasculine
+            "\xbb" => "\u{00bb}", // guillemotright
+            "\xbc" => "\u{00bc}", // onequarter
+            "\xbd" => "\u{00bd}", // onehalf
+            "\xbe" => "\u{00be}", // threequarters
+            "\xbf" => "\u{00bf}", // questiondown
+            "\xc0" => "\u{00c0}", // Agrave
+            "\xc1" => "\u{00c1}", // Aacute
+            "\xc2" => "\u{00c2}", // Acircumflex
+            "\xc3" => "\u{00c3}", // Atilde
+            "\xc4" => "\u{00c4}", // Adieresis
+            "\xc5" => "\u{00c5}", // Aring
+            "\xc6" => "\u{00c6}", // AE
+            "\xc7" => "\u{00c7}", // Ccedilla
+            "\xc8" => "\u{00c8}", // Egrave
+            "\xc9" => "\u{00c9}", // Eacute
+            "\xca" => "\u{00ca}", // Ecircumflex
+            "\xcb" => "\u{00cb}", // Edieresis
+            "\xcc" => "\u{00cc}", // Igrave
+            "\xcd" => "\u{00cd}", // Iacute
+            "\xce" => "\u{00ce}", // Icircumflex
+            "\xcf" => "\u{00cf}", // Idieresis
+            "\xd0" => "\u{00d0}", // Eth
+            "\xd1" => "\u{00d1}", // Ntilde
+            "\xd2" => "\u{00d2}", // Ograve
+            "\xd3" => "\u{00d3}", // Oacute
+            "\xd4" => "\u{00d4}", // Ocircumflex
+            "\xd5" => "\u{00d5}", // Otilde
+            "\xd6" => "\u{00d6}", // Odieresis
+            "\xd7" => "\u{00d7}", // multiply
+            "\xd8" => "\u{00d8}", // Oslash
+            "\xd9" => "\u{00d9}", // Ugrave
+            "\xda" => "\u{00da}", // Uacute
+            "\xdb" => "\u{00db}", // Ucircumflex
+            "\xdc" => "\u{00dc}", // Udieresis
+            "\xdd" => "\u{00dd}", // Yacute
+            "\xde" => "\u{00de}", // Thorn
+            "\xdf" => "\u{00df}", // germandbls
+            "\xe0" => "\u{00e0}", // agrave
+            "\xe1" => "\u{00e1}", // aacute
+            "\xe2" => "\u{00e2}", // acircumflex
+            "\xe3" => "\u{00e3}", // atilde
+            "\xe4" => "\u{00e4}", // adieresis
+            "\xe5" => "\u{00e5}", // aring
+            "\xe6" => "\u{00e6}", // ae
+            "\xe7" => "\u{00e7}", // ccedilla
+            "\xe8" => "\u{00e8}", // egrave
+            "\xe9" => "\u{00e9}", // eacute
+            "\xea" => "\u{00ea}", // ecircumflex
+            "\xeb" => "\u{00eb}", // edieresis
+            "\xec" => "\u{00ec}", // igrave
+            "\xed" => "\u{00ed}", // iacute
+            "\xee" => "\u{00ee}", // icircumflex
+            "\xef" => "\u{00ef}", // idieresis
+            "\xf0" => "\u{00f0}", // eth
+            "\xf1" => "\u{00f1}", // ntilde
+            "\xf2" => "\u{00f2}", // ograve
+            "\xf3" => "\u{00f3}", // oacute
+            "\xf4" => "\u{00f4}", // ocircumflex
+            "\xf5" => "\u{00f5}", // otilde
+            "\xf6" => "\u{00f6}", // odieresis
+            "\xf7" => "\u{00f7}", // divide
+            "\xf8" => "\u{00f8}", // oslash
+            "\xf9" => "\u{00f9}", // ugrave
+            "\xfa" => "\u{00fa}", // uacute
+            "\xfb" => "\u{00fb}", // ucircumflex
+            "\xfc" => "\u{00fc}", // udieresis
+            "\xfd" => "\u{00fd}", // yacute
+            "\xfe" => "\u{00fe}", // thorn
+            "\xff" => "\u{00ff}", // ydieresis
+        ]);
+    }
+}