Skip to content

Commit b47f264

Browse files
partulaj and k00ni authored
Fix infinite loop when xref table corrupted (#377)
* Fix infinite loop when xref table corrupted Issue #372 * added a test which triggers the infinite loop; it proves that the fix in RawDataParser prevents the infinite loop. thanks to @partulaj: #372 (comment) * fixed coding style issues Co-authored-by: Konrad Abicht <[email protected]>
1 parent 8b8a157 commit b47f264

File tree

2 files changed

+126
-37
lines changed

2 files changed

+126
-37
lines changed

src/Smalot/PdfParser/RawData/RawDataParser.php

Lines changed: 39 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -631,58 +631,60 @@ protected function getRawObject($pdfData, $offset = 0)
631631

632632
case '[': // \x5B LEFT SQUARE BRACKET
633633
case ']': // \x5D RIGHT SQUARE BRACKET
634-
// array object
635-
$objtype = $char;
636-
++$offset;
637-
if ('[' == $char) {
634+
// array object
635+
$objtype = $char;
636+
++$offset;
637+
if ('[' == $char) {
638+
// get array content
639+
$objval = [];
640+
do {
641+
$oldOffset = $offset;
642+
// get element
643+
$element = $this->getRawObject($pdfData, $offset);
644+
$offset = $element[2];
645+
$objval[] = $element;
646+
} while ((']' != $element[0]) && ($offset != $oldOffset));
647+
// remove closing delimiter
648+
array_pop($objval);
649+
}
650+
break;
651+
652+
case '<': // \x3C LESS-THAN SIGN
653+
case '>': // \x3E GREATER-THAN SIGN
654+
if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
655+
// dictionary object
656+
$objtype = $char.$char;
657+
$offset += 2;
658+
if ('<' == $char) {
638659
// get array content
639660
$objval = [];
640661
do {
662+
$oldOffset = $offset;
641663
// get element
642664
$element = $this->getRawObject($pdfData, $offset);
643665
$offset = $element[2];
644666
$objval[] = $element;
645-
} while (']' != $element[0]);
667+
} while (('>>' != $element[0]) && ($offset != $oldOffset));
646668
// remove closing delimiter
647669
array_pop($objval);
648670
}
649-
break;
650-
651-
case '<': // \x3C LESS-THAN SIGN
652-
case '>': // \x3E GREATER-THAN SIGN
653-
if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
654-
// dictionary object
655-
$objtype = $char.$char;
656-
$offset += 2;
657-
if ('<' == $char) {
658-
// get array content
659-
$objval = [];
660-
do {
661-
// get element
662-
$element = $this->getRawObject($pdfData, $offset);
663-
$offset = $element[2];
664-
$objval[] = $element;
665-
} while ('>>' != $element[0]);
666-
// remove closing delimiter
667-
array_pop($objval);
668-
}
669-
} else {
670-
// hexadecimal string object
671-
$objtype = $char;
672-
++$offset;
673-
$pregResult = preg_match(
671+
} else {
672+
// hexadecimal string object
673+
$objtype = $char;
674+
++$offset;
675+
$pregResult = preg_match(
674676
'/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',
675677
substr($pdfData, $offset),
676678
$matches
677679
);
678-
if (('<' == $char) && 1 == $pregResult) {
679-
// remove white space characters
680-
$objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
681-
$offset += \strlen($matches[0]);
682-
} elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
683-
$offset = $endpos + 1;
684-
}
680+
if (('<' == $char) && 1 == $pregResult) {
681+
// remove white space characters
682+
$objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", '');
683+
$offset += \strlen($matches[0]);
684+
} elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
685+
$offset = $endpos + 1;
685686
}
687+
}
686688
break;
687689

688690
default:
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
<?php
2+
3+
/**
4+
* @file This file is part of the PdfParser library.
5+
*
6+
* @author Konrad Abicht <[email protected]>
7+
* @date 2020-06-01
8+
*
9+
* @author Sébastien MALOT <[email protected]>
10+
* @date 2017-01-03
11+
*
12+
* @license LGPLv3
13+
* @url <https://github.com/smalot/pdfparser>
14+
*
15+
* PdfParser is a pdf library written in PHP, extraction oriented.
16+
* Copyright (C) 2017 - Sébastien MALOT <[email protected]>
17+
*
18+
* This program is free software: you can redistribute it and/or modify
19+
* it under the terms of the GNU Lesser General Public License as published by
20+
* the Free Software Foundation, either version 3 of the License, or
21+
* (at your option) any later version.
22+
*
23+
* This program is distributed in the hope that it will be useful,
24+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
25+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26+
* GNU Lesser General Public License for more details.
27+
*
28+
* You should have received a copy of the GNU Lesser General Public License
29+
* along with this program.
30+
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
31+
*/
32+
33+
namespace Tests\Smalot\PdfParser\Integration\RawData;
34+
35+
use Smalot\PdfParser\RawData\RawDataParser;
36+
use Tests\Smalot\PdfParser\TestCase;
37+
38+
class RawDataParserHelper extends RawDataParser
39+
{
40+
/**
41+
* Expose protected function "getRawObject".
42+
*/
43+
public function exposeGetRawObject($pdfData, $offset = 0)
44+
{
45+
return $this->getRawObject($pdfData, $offset);
46+
}
47+
}
48+
49+
class RawDataParserTest extends TestCase
50+
{
51+
protected function setUp()
52+
{
53+
parent::setUp();
54+
55+
$this->fixture = new RawDataParserHelper();
56+
}
57+
58+
/**
59+
* Tests buggy behavior of getRawObject.
60+
*
61+
* When PDF has corrupted xref table getRawObject may run into an infinite loop.
62+
*
63+
* @see https://github.com/smalot/pdfparser/issues/372
64+
* @see https://github.com/smalot/pdfparser/pull/377
65+
*/
66+
public function testGetRawObjectIssue372()
67+
{
68+
// The following $data content is a minimal example to trigger the infinite loop
69+
$data = '<</Producer (eDkºãa˜þõ‚LÅòÕ�PïÙ��)©)>>';
70+
71+
// calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop
72+
// if the fix is not there.
73+
$result = $this->fixture->exposeGetRawObject($data);
74+
75+
$this->assertEquals(
76+
[
77+
'<<',
78+
[
79+
['/', 'Producer', 11],
80+
['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52],
81+
],
82+
52,
83+
],
84+
$result
85+
);
86+
}
87+
}

0 commit comments

Comments
 (0)