From 768d1d6859bdf9ef0da44724ac84079f920b7599 Mon Sep 17 00:00:00 2001 From: Wolfgang Mattis Date: Mon, 22 Nov 2021 12:57:24 +0100 Subject: [PATCH] Fixes #478 (/Index problem) (#479) * Add files via upload Fixing problem of incomplete analysis of the /Index entry. * Delete RawDataParser.php Wrong subdirectory. * Add files via upload Fix problem of uncomplete analysis of /Index entry. * Update RawDataParser.php optical changes * Update RawDataParser.php optical changes * Update RawDataParser.php optical changes * Add files via upload After adding a description to the file, the valid /Index entry now contains two entries (consisting of 2 values: first object number, number of objects): /Index[2 1 21 2] * Update RawDataParserTest.php Adding test for issue 479 * Update RawDataParserTest.php Forgot a { * Update RawDataParser.php Code style update * Update RawDataParserTest.php Added more description and more checks. * Update PageTest.php Issue #331 is fixed by issue #479: test updated * Update RawDataParserTest.php optical fix * Update PageTest.php optical changes * Update RawDataParser.php change to remove the native_function_invocation message * Update tests/Integration/PageTest.php Co-authored-by: Konrad Abicht * Update RawDataParser.php Added comments... 
* Update RawDataParser.php Changes for CS fixer * Update PageTest.php Comment update * Update tests/Integration/PageTest.php Co-authored-by: Konrad Abicht Co-authored-by: Konrad Abicht --- samples/bugs/Issue479.pdf | Bin 0 -> 10803 bytes .../PdfParser/RawData/RawDataParser.php | 27 ++++++++++++--- tests/Integration/PageTest.php | 17 ++++++---- .../Integration/RawData/RawDataParserTest.php | 31 ++++++++++++++++++ 4 files changed, 65 insertions(+), 10 deletions(-) create mode 100644 samples/bugs/Issue479.pdf diff --git a/samples/bugs/Issue479.pdf b/samples/bugs/Issue479.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b304d677fdd41d1b22a8805b8c88f31ff448da7b GIT binary patch literal 10803 zcmeHN3v?4z8ZH!CECjH4N?j1f@UW##=8>5sleU3051`vXX&Q=Dpp(p`4J4V6Olc_9 z}CfIM|U#DX9qXu%>+QP8dySip+FB2W+(SRS%>@@P{Ct?c62J-f-t$(h{w z@Bjbz`~Um;?(LPO(TPwoA?Q{6<5%AZVo(G^Al5WmkeUi-S{)4KvX(G3gu){=^E#kVA7F!Mu%ThhEX`z5F1s68 zRI|1syTc9PaF&aqt!9pO0eea%th3rU#s%wa6vz0gzymdTsp%RWDwU9GomM6zBwB(% zaUCv0F_~1WMo<-|OqHwEa;X|kM>G-%p}}P;sXCn?)ag=`K+|=EMmYi==%5)-9%wj< z@_#HWL<0Y^;Y`L+z!gHcoOfm%-%b?HW?2r%dEXlVG7ttJ81H1@VcCoY&doz13_<|{ zZqCI}c0o+xDU*rrn4fRTceRRb_gcTn70+r;wut>xpZLY`&s!$~rm$a}d8#RDsxE5K)DPwevdVT+Sps{$z8yC0 z#ZukSi4leSsU2d!M~1s)4YL zEojIfl&|U6re+-+yaL zs%SW3Rp0f+-Q(R9(r@1CrM+h+t-Nnd&-}wwCVtiSJ`27$Hh<}*_IbE*VVkYGH^!D{ zkIh+zd`q%zZ=W5X^r0kj@|zdB_Lfu*UcBl?&KoOq@2*cdwS3m`%=s~sm8u>i`?me~ zL{ep8vSMfd8PjUtsvDj^yBhC0bYIa|)mv`F-<)K6ui*3VPY;uxD?2>6!=IBiZQa*e zA6kFrr`125+u3nHw)BJTWu2D~^4#z5jmWr5&gG8oTam(Roy! 
zEp1~}hfC7A6~ERz*!8@*a`LaeHlq=)9_cYdSl})OwFmA{&vOnilPl*(}7VS;kX1M%OU90N_TdZrQN5++( zzGpc#;r5-LUvED6TQI%9v*g0o^Y87i_=@cG7W?9;*x9|Qx30%-&XE52 zH};nv_Z<`NZ|?KGvW(n;bQ^Qxc()^-cW-O&5`9jPm$!Df(Z4!p#FF+-9uFN*gb*!IJPDW=;O#va>M zw(ZKvT%*xxsD69b^Lq=I=8AsUztBmmoj?5~1Sx|JMJCQW%jh$GMgt>jLuRJ-^*AZ>7>0vP zOo7#rCanA9Ga+Q9(}cN%9??71Ordo^iHk9m49YQ=j4_k6u&**E)uZs(opvxzLms=$ z;Z}IkgcO*&83lOei(w(;72(FD303?HXqY|=QnM}wBE*=;j36jj7>H2`iA%_22;&z6 z2nk~{5h_tgWC~0gSOW-^{H&GgqAiMSjm~cl{7VxSa-35E!^OqL;$n%IbrrxUNs=&v z!5Ah25+e6_2S<5C4tEcql7Jo!<2JjjPR`0YAYLzJVvD#mp)fE#dQJP=Wp~yaa=69b zHj2%x9rjR87!@OMy=GoHz(k=2GdNg(f36gwBhu0P6x9A9x^y^54d1Qi+ z@`mAGK#(@q1L!Pr*}QS0%`jtQ?EI_@KtvmZNShTF)@7$SCFOM5tY(Uj6YQidsc^%q zpq7F{&dS-CfP>!X`4UED;~JL+{I^;8L{iu&M?sp*m8Ar2;R_zRY+?0}r!-8QZwD!B1r_YqJypKbvx( zaec1?4;g!E;RJE0k7uI}0bYPuN*c!!6e_Y@EL~(~TuK$d)&_nKyOgpCUNqG9*mt}*oN!J?ML#>*KMBiZ$pEhvf^NM zVI#q4H2WNet67MG1;TIzqtWbh7_Md^3Kj^%6^ur+&tbTlg(z5{DY#-9c7A{{OcNG^ zeO-v1FMd!K4kQ_e4`nK*Ux;fr(Y2CUlc`T4}kMK!Za54DIsvAOfOWj~)R zefgrkV)+=!HzhZ>th?g8l(2JqmRKjBkYRYH?4VJ+C2hxpd66r-qzqnlrOhVM6u}I2 z`SQv_X8y~&wK(OxJ*acy7Q>kfdzG2v_HO$0m6S9 z3m$^_euVA@yZh$o?ueGXb>jv_^^TeTi%Xm zKys$F>JkE#Ai5hxrS*h&2x3Hzhl!E@MPdY#DhT=SPmF-^)ko0y4=F;>E01~-EbAYX zpW2Q*OtA1%-hY~4@w-4bP>YRs*j@s&F;T?3?PoS0$49AX8l0lie)a(h(%uvu)Yy!d z4hpIn@aw51V3-p6Jt-j^Wf-}Ml#6g0M?|=c#ziK%NhUHgh?%0y;4OxSDIs59s{d_j z;ae~C?>>Z$JQ0FnxWaHf5du#n9L}yVTu+3+6AAx+WY=$`gfRXdoB#fcs)#FOT?v6h z%0$?Ek_--P`E5~xM!}z5dp|frdXKGNLXemIQlQ@B56BZ?aIXD`5=`pd;`x*W4i*z0 zf3WyU;AHWY#6wqKL$>0xZ;HcaSNfZBTAmybwpGy*W#E= zi)ghP62(<=xk|0VH8K>@qB;b}>#rk%YlPr#Ap~x;d2b|oLsfqz5yN~}5&=OH;zpw6 n5sC(R=tkoFVY{m;yYH#0tZLseKjN8$7I-s641$wnJr(m$_p5#q literal 0 HcmV?d00001 diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 9a647837..86b157da 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -269,8 +269,11 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref ) { 
$valid_crs = true; } elseif (('/' == $v[0]) && ('Index' == $v[1]) && (isset($sarr[($k + 1)]))) { - // first object number in the subsection - $index_first = (int) ($sarr[($k + 1)][1][0][1]); + // initialize list for: first object number in the subsection / number of objects + $index_blocks = []; + for ($m = 0; $m < \count($sarr[($k + 1)][1]); $m += 2) { + $index_blocks[] = [$sarr[($k + 1)][1][$m][1], $sarr[($k + 1)][1][$m + 1][1]]; + } } elseif (('/' == $v[0]) && ('Prev' == $v[1]) && (isset($sarr[($k + 1)]) && ('numeric' == $sarr[($k + 1)][0]))) { // get previous xref offset $prevxref = (int) ($sarr[($k + 1)][1]); @@ -432,8 +435,9 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref } // fill xref - if (isset($index_first)) { - $obj_num = $index_first; + if (isset($index_blocks)) { + // load the first object number of the first /Index entry + $obj_num = $index_blocks[0][0]; } else { $obj_num = 0; } @@ -463,6 +467,21 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref break; } ++$obj_num; + if (isset($index_blocks)) { + // reduce the number of remaining objects + --$index_blocks[0][1]; + if (0 == $index_blocks[0][1]) { + // remove the actual used /Index entry + array_shift($index_blocks); + if (0 < \count($index_blocks)) { + // load the first object number of the following /Index entry + $obj_num = $index_blocks[0][0]; + } else { + // if there are no more entries, remove $index_blocks to avoid actions on an empty array + unset($index_blocks); + } + } + } } } // end decoding data if (isset($prevxref)) { diff --git a/tests/Integration/PageTest.php b/tests/Integration/PageTest.php index 3f136f57..8d0670ce 100644 --- a/tests/Integration/PageTest.php +++ b/tests/Integration/PageTest.php @@ -486,12 +486,17 @@ public function testGetPages() $document = $this->getParserInstance()->parseFile($filename); $pages = $document->getPages(); - // This should actually be 3 pages, but as long as the cause for issue #331 - 
// has not been found and the issue is not fixed, we'll settle for 2 here. - // We still test for the count, so in case the bug should be fixed - // unknowingly, we don't forget to resolve the issue as well and make sure - // this assertion is present. - $this->assertCount(2, $pages); + /* + * The problem of issue #331 is fixed by the pull request of the issue #479. + * The original Issue331.pdf was modified so for the updated version (current + * version) a new xref was added and now the valid /Index has the following value: + * [1 1 3 1 7 1 175 1 178 1 219 2] + * This means that there are 6 pairs containing the values for 'first object id' + * and 'number of objects'. Till now only the first entry was used and so the + * objects of all following entries got a wrong id. + * By the fix of issue #479 now the expected number of pages is counted. + */ + $this->assertCount(3, $pages); foreach ($pages as $page) { $this->assertTrue($page instanceof Page); diff --git a/tests/Integration/RawData/RawDataParserTest.php b/tests/Integration/RawData/RawDataParserTest.php index 7f15870f..f5b16fa9 100644 --- a/tests/Integration/RawData/RawDataParserTest.php +++ b/tests/Integration/RawData/RawDataParserTest.php @@ -119,4 +119,35 @@ public function testDecodeObjectHeaderIssue405() $this->assertStringContainsString('Bug fix: PR #405', $pages[0]->getText()); } + + /** + * Tests buggy behavior of decodeXrefStream. + * + * When a PDF has more than one entry in the /Index area (for example by changing + * the document description), only the first entry is used. + * If the fix is not used the array returned by getDetails() contains only the entry + * with the key 'Pages'. All other entries like 'Author', 'Creator', 'Title', + * 'Subject' (which come from the 'Info' object) are not listed, because the + * 'Info' object gets a wrong object id while parsing the data into the xref structure. 
+ * So the object id listed at the /Info entry is not valid and the data of the info object + * cannot be loaded while executing Document::buildDetails(). + * + * @see https://github.com/smalot/pdfparser/pull/479 + */ + public function testDecodeXrefStreamIssue479() + { + $filename = $this->rootDir.'/samples/bugs/Issue479.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $details = $document->getDetails(); + + $this->assertArrayHasKey('Author', $details); + $this->assertArrayHasKey('CreationDate', $details); + $this->assertArrayHasKey('Creator', $details); + $this->assertArrayHasKey('ModDate', $details); + $this->assertArrayHasKey('Producer', $details); + $this->assertArrayHasKey('Subject', $details); + $this->assertArrayHasKey('Title', $details); + } }