Skip to content

Commit

Permalink
Fix infinite loop when xref table is corrupted (#377)
Browse files Browse the repository at this point in the history
* Fix infinite loop when xref table is corrupted

Issue #372

* added a test which triggers the infinite loop

It proves that the fix in RawDataParser prevents the
infinite loop.

thanks to @partulaj:
#372 (comment)

* fixed coding style issues

Co-authored-by: Konrad Abicht <[email protected]>
  • Loading branch information
partulaj and k00ni authored Jan 5, 2021
1 parent 8b8a157 commit b47f264
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 37 deletions.
76 changes: 39 additions & 37 deletions src/Smalot/PdfParser/RawData/RawDataParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -631,58 +631,60 @@ protected function getRawObject($pdfData, $offset = 0)

case '[': // \x5B LEFT SQUARE BRACKET
case ']': // \x5D RIGHT SQUARE BRACKET
// array object
$objtype = $char;
++$offset;
if ('[' == $char) {
// array object
$objtype = $char;
++$offset;
if ('[' == $char) {
// get array content
$objval = [];
do {
$oldOffset = $offset;
// get element
$element = $this->getRawObject($pdfData, $offset);
$offset = $element[2];
$objval[] = $element;
} while ((']' != $element[0]) && ($offset != $oldOffset));
// remove closing delimiter
array_pop($objval);
}
break;

case '<': // \x3C LESS-THAN SIGN
case '>': // \x3E GREATER-THAN SIGN
if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
// dictionary object
$objtype = $char.$char;
$offset += 2;
if ('<' == $char) {
// get array content
$objval = [];
do {
$oldOffset = $offset;
// get element
$element = $this->getRawObject($pdfData, $offset);
$offset = $element[2];
$objval[] = $element;
} while (']' != $element[0]);
} while (('>>' != $element[0]) && ($offset != $oldOffset));
// remove closing delimiter
array_pop($objval);
}
break;

case '<': // \x3C LESS-THAN SIGN
case '>': // \x3E GREATER-THAN SIGN
if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
// dictionary object
$objtype = $char.$char;
$offset += 2;
if ('<' == $char) {
// get array content
$objval = [];
do {
// get element
$element = $this->getRawObject($pdfData, $offset);
$offset = $element[2];
$objval[] = $element;
} while ('>>' != $element[0]);
// remove closing delimiter
array_pop($objval);
}
} else {
// hexadecimal string object
$objtype = $char;
++$offset;
$pregResult = preg_match(
} else {
// hexadecimal string object
$objtype = $char;
++$offset;
$pregResult = preg_match(
'/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
substr($pdfData, $offset),
$matches
);
if (('<' == $char) && 1 == $pregResult) {
// remove white space characters
$objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
$offset += \strlen($matches[0]);
} elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
$offset = $endpos + 1;
}
if (('<' == $char) && 1 == $pregResult) {
// remove white space characters
$objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
$offset += \strlen($matches[0]);
} elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
$offset = $endpos + 1;
}
}
break;

default:
Expand Down
87 changes: 87 additions & 0 deletions tests/Integration/RawData/RawDataParserTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
<?php

/**
* @file This file is part of the PdfParser library.
*
* @author Konrad Abicht <[email protected]>
* @date 2020-06-01
*
* @author Sébastien MALOT <[email protected]>
* @date 2017-01-03
*
* @license LGPLv3
* @url <https://github.com/smalot/pdfparser>
*
* PdfParser is a pdf library written in PHP, extraction oriented.
* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program.
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
*/

namespace Tests\Smalot\PdfParser\Integration\RawData;

use Smalot\PdfParser\RawData\RawDataParser;
use Tests\Smalot\PdfParser\TestCase;

/**
 * Test-only subclass of RawDataParser that makes the protected
 * method "getRawObject" reachable from test cases.
 */
class RawDataParserHelper extends RawDataParser
{
    /**
     * Public pass-through to the protected method "getRawObject".
     *
     * Both arguments are forwarded unchanged and the result is handed
     * back untouched.
     *
     * @param mixed $pdfData data forwarded to getRawObject
     * @param int   $offset  offset forwarded to getRawObject (defaults to 0)
     *
     * @return mixed whatever getRawObject returns
     */
    public function exposeGetRawObject($pdfData, $offset = 0)
    {
        $rawObject = $this->getRawObject($pdfData, $offset);

        return $rawObject;
    }
}

/**
 * Integration tests for RawDataParser, driven through RawDataParserHelper.
 */
class RawDataParserTest extends TestCase
{
    /**
     * Prepares the fixture: a RawDataParserHelper instance stored in $this->fixture.
     */
    protected function setUp()
    {
        parent::setUp();

        $this->fixture = new RawDataParserHelper();
    }

    /**
     * Regression test for buggy behavior of getRawObject.
     *
     * When a PDF has a corrupted xref table, getRawObject may run into an infinite loop.
     *
     * @see https://github.com/smalot/pdfparser/issues/372
     * @see https://github.com/smalot/pdfparser/pull/377
     */
    public function testGetRawObjectIssue372()
    {
        // Minimal input that triggers the infinite loop.
        $corruptedDictionary = '<</Producer (eDkºãa˜þõ‚LÅòÕ�PïÙ��)©)>>';

        // Structure the parser is expected to hand back: the object type,
        // the parsed elements and, as third entry, the reported offset.
        $expected = [
            '<<',
            [
                ['/', 'Producer', 11],
                ['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52],
            ],
            52,
        ];

        // Without the fix, calling "getRawObject" via "exposeGetRawObject"
        // would never return.
        $parsed = $this->fixture->exposeGetRawObject($corruptedDictionary);

        $this->assertEquals($expected, $parsed);
    }
}

0 comments on commit b47f264

Please sign in to comment.