Skip to content

Commit

Permalink
Fix encoding for encoding dictionary without Type item. (#500)
Browse files Browse the repository at this point in the history
* Font::decodeContent() fixes to support Encoding dictionaries without Type header.

* Pass $unicode by reference in Font::decodeContentByEncoding().

* Add encoding initialization.

* Font's initialized encoding in private property.

* Add "ext-iconv" to "require" in composer.json

* Delete unnecessary unicode string test for #95 test.

* Fix misprint.

* Add comments + small refactoring.

* Run dev-tools/vendor/bin/php-cs-fixer fix

Fixed all files in 0.018 seconds, 12.000 MB memory used

* Fixes by @k00ni suggestions.

* Add test (with pdf file) for issue in PR #500.

* Add pdf-file for test.

* Add comments to Font class methods.

* Apply suggestions from code review

Co-authored-by: Konrad Abicht <[email protected]>

* Delete throws in phpDoc for test.

* Fixes according to @k00ni suggestions.

- Add return types to test methods.
- Fix todos in phpDocs.

* Apply suggestions from code review

Co-authored-by: Konrad Abicht <[email protected]>

* Update test file (which opens without error in Adobe Acrobat Reader).

* CS-fixer fix.

* Avoid throwing error when encoding isn't found (previous behavior)

Co-authored-by: Konrad Abicht <[email protected]>
Co-authored-by: Jeremy Benoist <[email protected]>
  • Loading branch information
3 people authored Feb 3, 2022
1 parent 43ca68f commit 4551cd0
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 76 deletions.
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
"require": {
"php": ">=7.1",
"symfony/polyfill-mbstring": "^1.18",
"ext-zlib": "*"
"ext-zlib": "*",
"ext-iconv": "*"
},
"autoload": {
"psr-0": {
Expand Down
Binary file added samples/bugs/PullRequest500.pdf
Binary file not shown.
262 changes: 197 additions & 65 deletions src/Smalot/PdfParser/Font.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,19 @@ class Font extends PDFObject
*/
private static $uchrCache = [];

/**
* In some PDF-files encoding could be referenced by object id but object itself does not contain
* `/Type /Encoding` in its dictionary. These objects wouldn't be initialized as Encoding in
* \Smalot\PdfParser\PDFObject::factory() during file parsing (they would be just PDFObject).
*
* Therefore, we create an instance of Encoding from them during decoding and cache this value in this property.
*
* @var Encoding
*
* @see https://github.com/smalot/pdfparser/pull/500
*/
private $initializedEncodingByPdfObject;

public function init()
{
// Load translate table.
Expand Down Expand Up @@ -408,91 +421,210 @@ public function decodeText(array $commands): string
}

/**
* Decode given $text to "utf-8" encoded string.
*
* @param bool $unicode This parameter is deprecated and might be removed in a future release
*/
public function decodeContent(string $text, ?bool &$unicode = null): string
{
if ($this->has('ToUnicode')) {
$bytes = $this->tableSizes['from'];
return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text);
}

if ($bytes) {
$result = '';
$length = \strlen($text);
if ($this->has('Encoding')) {
$result = $this->decodeContentByEncoding($text);

for ($i = 0; $i < $length; $i += $bytes) {
$char = substr($text, $i, $bytes);
if (null !== $result) {
return $result;
}
}

if (false !== ($decoded = $this->translateChar($char, false))) {
$char = $decoded;
} elseif ($this->has('DescendantFonts')) {
if ($this->get('DescendantFonts') instanceof PDFObject) {
$fonts = $this->get('DescendantFonts')->getHeader()->getElements();
} else {
$fonts = $this->get('DescendantFonts')->getContent();
}
$decoded = false;

foreach ($fonts as $font) {
if ($font instanceof self) {
if (false !== ($decoded = $font->translateChar($char, false))) {
$decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
break;
}
return $this->decodeContentByAutodetectIfNecessary($text);
}

/**
* First tries to decode $text by the ToUnicode CMap.
* If a char translation is not found in the ToUnicode CMap:
* - If DescendantFonts exists, tries to decode the char by one of those fonts.
* - If decoding by DescendantFonts does not succeed, interprets the char as a "Windows-1252" encoded string.
* - If DescendantFonts does not exist, just returns "?" as the decoded char.
*
* @todo Seems this is invalid algorithm that do not follow pdf-format specification. Must be rewritten.
*/
private function decodeContentByToUnicodeCMapOrDescendantFonts(string $text): string
{
$bytes = $this->tableSizes['from'];

if ($bytes) {
$result = '';
$length = \strlen($text);

for ($i = 0; $i < $length; $i += $bytes) {
$char = substr($text, $i, $bytes);

if (false !== ($decoded = $this->translateChar($char, false))) {
$char = $decoded;
} elseif ($this->has('DescendantFonts')) {
if ($this->get('DescendantFonts') instanceof PDFObject) {
$fonts = $this->get('DescendantFonts')->getHeader()->getElements();
} else {
$fonts = $this->get('DescendantFonts')->getContent();
}
$decoded = false;

foreach ($fonts as $font) {
if ($font instanceof self) {
if (false !== ($decoded = $font->translateChar($char, false))) {
$decoded = mb_convert_encoding($decoded, 'UTF-8', 'Windows-1252');
break;
}
}
}

if (false !== $decoded) {
$char = $decoded;
} else {
$char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
}
if (false !== $decoded) {
$char = $decoded;
} else {
$char = self::MISSING;
$char = mb_convert_encoding($char, 'UTF-8', 'Windows-1252');
}

$result .= $char;
} else {
$char = self::MISSING;
}

$text = $result;
$result .= $char;
}
} elseif ($this->has('Encoding') && $this->get('Encoding') instanceof Encoding) {
/** @var Encoding $encoding */
$encoding = $this->get('Encoding');
$unicode = mb_check_encoding($text, 'UTF-8');
$result = '';
if ($unicode) {
$chars = preg_split(
'//s'.($unicode ? 'u' : ''),
$text,
-1,
\PREG_SPLIT_DELIM_CAPTURE | \PREG_SPLIT_NO_EMPTY
);

foreach ($chars as $char) {
$dec_av = hexdec(bin2hex($char));
$dec_ap = $encoding->translateChar($dec_av);
$result .= self::uchr($dec_ap ?? $dec_av);
}
} else {
$length = \strlen($text);

for ($i = 0; $i < $length; ++$i) {
$dec_av = hexdec(bin2hex($text[$i]));
$dec_ap = $encoding->translateChar($dec_av);
$result .= self::uchr($dec_ap ?? $dec_av);
}
}
$text = $result;
} elseif ($this->get('Encoding') instanceof Element &&
$this->get('Encoding')->equals('MacRomanEncoding')) {
// mb_convert_encoding does not support MacRoman/macintosh,
// so we use iconv() here
$text = iconv('macintosh', 'UTF-8', $text);
} elseif (!mb_check_encoding($text, 'UTF-8')) {
// don't double-encode strings already in UTF-8
$text = mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
}

return $text;
}

/**
 * Decode $text according to the font's `Encoding` dictionary item, whatever form that item takes.
 *
 * The item is inspected in a fixed order: PDFObject, then Encoding, then Element.
 * If it is none of those, the text is returned unchanged when it is already valid UTF-8
 * and is converted from "Windows-1252" otherwise.
 *
 * @return string|null the decoded text; null is passed through from decodeContentByEncodingElement()
 */
private function decodeContentByEncoding(string $text): ?string
{
    $fontEncoding = $this->get('Encoding');

    // Encoding referenced by object id (/Encoding 520 0 R) whose object does not carry
    // `/Type /Encoding` in its dictionary: obtain an initialized Encoding built from it.
    if ($fontEncoding instanceof PDFObject) {
        $fontEncoding = $this->getInitializedEncodingByPdfObject($fontEncoding);
    }

    // Encoding referenced by object id (/Encoding 520 0 R) whose object does carry
    // `/Type /Encoding` in its dictionary (or the instance obtained just above).
    if ($fontEncoding instanceof Encoding) {
        return $this->decodeContentByEncodingEncoding($text, $fontEncoding);
    }

    // Encoding given as a plain name (/Encoding /WinAnsiEncoding).
    // todo: should the ElementString class be checked here instead?
    if ($fontEncoding instanceof Element) {
        return $this->decodeContentByEncodingElement($text, $fontEncoding);
    }

    // Don't double-encode strings that are already UTF-8.
    return mb_check_encoding($text, 'UTF-8')
        ? $text
        : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
}

/**
 * Return the Encoding cached in $this->initializedEncodingByPdfObject, creating and caching it
 * from $PDFObject on the first call.
 *
 * Once a value is cached, $PDFObject is not consulted again on later calls.
 */
private function getInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
{
    $cached = $this->initializedEncodingByPdfObject;

    if ($cached) {
        return $cached;
    }

    $this->initializedEncodingByPdfObject = $this->createInitializedEncodingByPdfObject($PDFObject);

    return $this->initializedEncodingByPdfObject;
}

/**
 * Decode content when $encoding (given by $this->get('Encoding')) is an instance of Encoding.
 *
 * $text is processed one byte at a time. Each byte value is looked up with
 * $encoding->translateChar(); when the lookup yields null the byte value itself is used.
 * The resulting number is handed to self::uchr() and the pieces are concatenated.
 */
private function decodeContentByEncodingEncoding(string $text, Encoding $encoding): string
{
    $decoded = '';
    $byteCount = \strlen($text);

    for ($offset = 0; $offset < $byteCount; ++$offset) {
        // Numeric value (0-255) of the current byte.
        $sourceCode = \ord($text[$offset]);
        $mappedCode = $encoding->translateChar($sourceCode);

        // NOTE(review): self::uchr() presumably turns a code point into a UTF-8 string — confirm.
        $decoded .= self::uchr($mappedCode ?? $sourceCode);
    }

    return $decoded;
}

/**
 * Decode content when $encoding (given by $this->get('Encoding')) is an instance of Element,
 * i.e. the encoding is given by name (for example `/Encoding /WinAnsiEncoding`).
 *
 * @param string  $text     raw string to decode
 * @param Element $encoding element whose content is the PDF encoding name
 *
 * @return string|null "utf-8" encoded text, or null when the encoding name has no known iconv
 *                     equivalent or when iconv() was unable to convert $text
 */
private function decodeContentByEncodingElement(string $text, Element $encoding): ?string
{
    $pdfEncodingName = $encoding->getContent();

    // mb_convert_encoding does not support MacRoman/macintosh,
    // so we use iconv() here
    $iconvEncodingName = $this->getIconvEncodingNameOrNullByPdfEncodingName($pdfEncodingName);

    if (null === $iconvEncodingName) {
        return null;
    }

    $converted = iconv($iconvEncodingName, 'UTF-8', $text);

    // iconv() returns false when it cannot convert the input (e.g. a byte that is not defined
    // in the source charset). Returning that false from a `?string` method would either be
    // coerced to an empty string or raise a TypeError under strict types, so a failed
    // conversion is reported as null - the same value used for an unknown encoding name.
    return false === $converted ? null : $converted;
}

/**
 * Map a PDF encoding name to the encoding name passed to iconv().
 *
 * @return string|null the iconv encoding name, or null when $pdfEncodingName is not in the map
 */
private function getIconvEncodingNameOrNullByPdfEncodingName(string $pdfEncodingName): ?string
{
    $iconvNamesByPdfName = [
        'StandardEncoding' => 'ISO-8859-1',
        'MacRomanEncoding' => 'MACINTOSH',
        'WinAnsiEncoding' => 'CP1252',
    ];

    // Every value in the map is a non-null string, so `??` only yields null for unknown names.
    return $iconvNamesByPdfName[$pdfEncodingName] ?? null;
}

/**
 * Return $text untouched when it already passes the "utf-8" validity check.
 * Otherwise, interpret it as a "Windows-1252" encoded string and convert it to "utf-8".
 *
 * @return string|false
 */
private function decodeContentByAutodetectIfNecessary(string $text)
{
    // todo: Why exactly `Windows-1252` used?
    return mb_check_encoding($text, 'UTF-8')
        ? $text
        : mb_convert_encoding($text, 'UTF-8', 'Windows-1252');
}

/**
 * Build an Encoding from the given PDFObject, call init() on it and return it.
 */
private function createInitializedEncodingByPdfObject(PDFObject $PDFObject): Encoding
{
    $initializedEncoding = $this->createEncodingByPdfObject($PDFObject);

    // The freshly built instance is not initialized yet; do that before handing it out.
    $initializedEncoding->init();

    return $initializedEncoding;
}

/**
 * Build an Encoding from the document, header, content and config of the given PDFObject.
 * init() is not called on the new instance.
 */
private function createEncodingByPdfObject(PDFObject $PDFObject): Encoding
{
    return new Encoding(
        $PDFObject->getDocument(),
        $PDFObject->getHeader(),
        $PDFObject->getContent(),
        $PDFObject->getConfig()
    );
}
}
10 changes: 10 additions & 0 deletions src/Smalot/PdfParser/PDFObject.php
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,21 @@ public function init()
{
}

/**
 * Returns the value held in $this->document.
 */
public function getDocument(): Document
{
return $this->document;
}

/**
 * Returns the value held in $this->header; the declared return type allows null.
 */
public function getHeader(): ?Header
{
return $this->header;
}

/**
 * Returns the value held in $this->config; the declared return type allows null.
 */
public function getConfig(): ?Config
{
return $this->config;
}

/**
* @return Element|PDFObject|Header
*/
Expand Down
32 changes: 25 additions & 7 deletions tests/Integration/FontTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -340,14 +340,32 @@ public function testDecodeText(): void
],
];
$this->assertEquals('æöü', $font->decodeText($commands));
}

$commands = [
[
't' => '<',
'c' => 'C3A6C3B6C3BC', //Unicode encoded string
],
];
$this->assertEquals('æöü', $font->decodeText($commands));
/**
* Font could have indirect encoding without `/Type /Encoding`
* which would be instance of PDFObject class (but not Encoding or ElementString).
*
* Parses samples/bugs/PullRequest500.pdf and compares the trimmed text of its first page
* with the expected text.
*
* @see https://github.com/smalot/pdfparser/pull/500
*/
public function testDecodeTextForFontWithIndirectEncodingWithoutTypeEncoding(): void
{
// Parse the sample file and pick the first page of the resulting document.
$filename = $this->rootDir.'/samples/bugs/PullRequest500.pdf';
$parser = $this->getParserInstance();
$document = $parser->parseFile($filename);
$pages = $document->getPages();
$page1 = reset($pages);
$page1Text = $page1->getText();
// Heredoc: each \u{a0} escape below stands for the code point U+00A0.
$expectedText = <<<TEXT
Export\u{a0}transakční\u{a0}historie
Typ\u{a0}produktu:\u{a0}Podnikatelský\u{a0}účet\u{a0}Maxi
Číslo\u{a0}účtu:\u{a0}0000000000/0000
Počáteční\u{a0}zůstatek: 000\u{a0}000,00\u{a0}
Konečný\u{a0}zůstatek: 000\u{a0}000,00\u{a0}
Cena\u{a0}za\u{a0}služby
TEXT;

// The extracted text is trimmed before the comparison.
$this->assertEquals($expectedText, trim($page1Text));
}

/**
Expand Down
10 changes: 7 additions & 3 deletions tests/Performance/runPerformanceTests.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@

require __DIR__.'/../../vendor/autoload.php';

use Tests\Smalot\PdfParser\Performance\Exception\PerformanceFailException;
use Tests\Smalot\PdfParser\Performance\Test\AbstractPerformanceTest;
use Tests\Smalot\PdfParser\Performance\Test\DocumentDictionaryCacheTest;

$tests = [
new \Tests\Smalot\PdfParser\Performance\Test\DocumentDictionaryCacheTest(),
new DocumentDictionaryCacheTest(),
];

foreach ($tests as $test) { /* @var $test \Tests\Smalot\PdfParser\Performance\Test\AbstractPerformanceTest */
foreach ($tests as $test) { /* @var $test AbstractPerformanceTest */
$test->init();

$startTime = microtime(true);
Expand All @@ -16,6 +20,6 @@
$time = $endTime - $startTime;

if ($test->getMaxEstimatedTime() <= $time) {
throw new \Tests\Smalot\PdfParser\Performance\Exception\PerformanceFailException(sprintf('Performance failed on test "%s". Time taken was %.2f seconds, expected less than %d seconds.', get_class($test), $time, $test->getMaxEstimatedTime()));
throw new PerformanceFailException(sprintf('Performance failed on test "%s". Time taken was %.2f seconds, expected less than %d seconds.', get_class($test), $time, $test->getMaxEstimatedTime()));
}
}

0 comments on commit 4551cd0

Please sign in to comment.