From 12591b47b877b18cc066e730d364f25f745a0f39 Mon Sep 17 00:00:00 2001 From: Vitali Prudnikovich Date: Mon, 1 Jul 2024 11:12:02 +0000 Subject: [PATCH] Fix SO error while iterating struct tree root DEVSIX-8373 Autoported commit. Original commit hash: [bba9f55da] --- .../itext/kernel/pdf/PdfStructTreeRootTest.cs | 7 ++ .../pdf/tagutils/TagTreeIteratorTest.cs | 69 ++++++++++++++++ .../pdf/tagutils/TagTreePointerUnitTest.cs | 51 ++++++++++++ .../kernel/utils/TaggedPdfReaderToolTest.cs | 23 ++++++ .../cyclicReferences.pdf | Bin 0 -> 9616 bytes .../cmp_cyclicReferences.xml | 4 + .../kernel/pdf/tagging/PdfStructTreeRoot.cs | 15 ++-- .../kernel/pdf/tagutils/TagTreeIterator.cs | 74 +++++++++++++++--- .../TagTreeIteratorAvoidDuplicatesApprover.cs | 68 ++++++++++++++++ .../TagTreeIteratorElementApprover.cs | 56 +++++++++++++ .../pdf/tagutils/TagTreeIteratorFlusher.cs | 46 +++++++++++ .../kernel/pdf/tagutils/TagTreePointer.cs | 73 +++++++++++++---- .../kernel/pdf/tagutils/WaitingTagsManager.cs | 24 ++++-- .../itext/kernel/utils/TaggedPdfReaderTool.cs | 6 ++ port-hash | 2 +- 15 files changed, 479 insertions(+), 39 deletions(-) create mode 100644 itext.tests/itext.kernel.tests/resources/itext/kernel/pdf/PdfStructTreeRootTest/cyclicReferences.pdf create mode 100644 itext.tests/itext.kernel.tests/resources/itext/kernel/utils/TaggedPdfReaderToolTest/cmp_cyclicReferences.xml create mode 100644 itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorAvoidDuplicatesApprover.cs create mode 100644 itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorElementApprover.cs create mode 100644 itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorFlusher.cs diff --git a/itext.tests/itext.kernel.tests/itext/kernel/pdf/PdfStructTreeRootTest.cs b/itext.tests/itext.kernel.tests/itext/kernel/pdf/PdfStructTreeRootTest.cs index ac49763b6b..7caebd4d0f 100644 --- a/itext.tests/itext.kernel.tests/itext/kernel/pdf/PdfStructTreeRootTest.cs +++ b/itext.tests/itext.kernel.tests/itext/kernel/pdf/PdfStructTreeRootTest.cs @@ -88,5 +88,12 @@ public virtual void IdTreeIsLazyTest() { PdfDocument readPdfDoc = new PdfDocument(r); NUnit.Framework.Assert.IsFalse(readPdfDoc.GetStructTreeRoot().GetPdfObject().ContainsKey(PdfName.IDTree)); } + + [NUnit.Framework.Test] + public virtual void CyclicReferencesTest() { + String inFile = sourceFolder + "cyclicReferences.pdf"; + PdfDocument pdfDoc = new PdfDocument(new PdfReader(inFile), new PdfWriter(new MemoryStream())); + NUnit.Framework.Assert.DoesNotThrow(() => pdfDoc.Close()); + } } } diff --git a/itext.tests/itext.kernel.tests/itext/kernel/pdf/tagutils/TagTreeIteratorTest.cs b/itext.tests/itext.kernel.tests/itext/kernel/pdf/tagutils/TagTreeIteratorTest.cs index d431109a25..92325f73d6 100644 --- a/itext.tests/itext.kernel.tests/itext/kernel/pdf/tagutils/TagTreeIteratorTest.cs +++ b/itext.tests/itext.kernel.tests/itext/kernel/pdf/tagutils/TagTreeIteratorTest.cs @@ -40,6 +40,28 @@ public virtual void TagTreeIteratorTagPointerNull() { NUnit.Framework.Assert.AreEqual(e.Message, errorMessage); } + [NUnit.Framework.Test] + public virtual void TagTreeIteratorApproverNull() { + String errorMessage = MessageFormatUtil.Format(KernelExceptionMessageConstant.ARG_SHOULD_NOT_BE_NULL, "approver" + ); + PdfDocument doc = new PdfDocument(new PdfWriter(new ByteArrayOutputStream(), new WriterProperties())); + doc.SetTagged(); + Exception e = NUnit.Framework.Assert.Catch(typeof(ArgumentException), () => new TagTreeIterator(doc.GetStructTreeRoot + (), null, TagTreeIterator.TreeTraversalOrder.PRE_ORDER)); + NUnit.Framework.Assert.AreEqual(e.Message, errorMessage); + } + + [NUnit.Framework.Test] + public virtual void TagTreeIteratorHandlerNull() { + String errorMessage = MessageFormatUtil.Format(KernelExceptionMessageConstant.ARG_SHOULD_NOT_BE_NULL, "handler" + ); + PdfDocument doc = new PdfDocument(new PdfWriter(new ByteArrayOutputStream(), new WriterProperties())); + doc.SetTagged(); + TagTreeIterator it = new TagTreeIterator(doc.GetStructTreeRoot()); + Exception e = NUnit.Framework.Assert.Catch(typeof(ArgumentException), () => it.AddHandler(null)); + NUnit.Framework.Assert.AreEqual(e.Message, errorMessage); + } + [NUnit.Framework.Test] public virtual void TraversalWithoutElements() { PdfDocument doc = new PdfDocument(new PdfWriter(new ByteArrayOutputStream(), new WriterProperties())); @@ -76,6 +98,53 @@ public virtual void TraversalWithSomeElements() { NUnit.Framework.Assert.AreEqual(PdfName.Code, handler.nodes[6].GetRole()); } + [NUnit.Framework.Test] + public virtual void PostOrderTraversal() { + PdfDocument doc = new PdfDocument(new PdfWriter(new ByteArrayOutputStream(), new WriterProperties())); + doc.SetTagged(); + TagTreePointer tp = new TagTreePointer(doc); + tp.AddTag(StandardRoles.DIV); + tp.AddTag(StandardRoles.P); + tp.AddTag(StandardRoles.FIGURE); + tp.MoveToParent(); + tp.AddTag(StandardRoles.DIV); + tp.AddTag(StandardRoles.CODE); + TagTreeIterator iterator = new TagTreeIterator(doc.GetStructTreeRoot(), new TagTreeIteratorElementApprover + (), TagTreeIterator.TreeTraversalOrder.POST_ORDER); + TagTreeIteratorTest.TestHandler handler = new TagTreeIteratorTest.TestHandler(); + iterator.AddHandler(handler); + iterator.Traverse(); + NUnit.Framework.Assert.AreEqual(7, handler.nodes.Count); + NUnit.Framework.Assert.AreEqual(PdfName.Figure, handler.nodes[0].GetRole()); + NUnit.Framework.Assert.AreEqual(PdfName.Code, handler.nodes[1].GetRole()); + NUnit.Framework.Assert.AreEqual(PdfName.Div, handler.nodes[2].GetRole()); + NUnit.Framework.Assert.AreEqual(PdfName.P, handler.nodes[3].GetRole()); + NUnit.Framework.Assert.AreEqual(PdfName.Div, handler.nodes[4].GetRole()); + NUnit.Framework.Assert.AreEqual(PdfName.Document, handler.nodes[5].GetRole()); + NUnit.Framework.Assert.IsNull(handler.nodes[6].GetRole()); + } + + [NUnit.Framework.Test] + public virtual void CyclicReferencesTraversal() { + PdfDocument doc = new PdfDocument(new PdfWriter(new ByteArrayOutputStream(), new WriterProperties())); + doc.SetTagged(); + PdfStructElem kid1 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.P)); + PdfStructElem kid2 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.DIV)); + doc.GetStructTreeRoot().AddKid(kid1); + doc.GetStructTreeRoot().AddKid(kid2); + kid1.AddKid(kid2); + kid2.AddKid(kid1); + TagTreeIterator iterator = new TagTreeIterator(doc.GetStructTreeRoot(), new TagTreeIteratorAvoidDuplicatesApprover + (), TagTreeIterator.TreeTraversalOrder.POST_ORDER); + TagTreeIteratorTest.TestHandler handler = new TagTreeIteratorTest.TestHandler(); + iterator.AddHandler(handler); + iterator.Traverse(); + NUnit.Framework.Assert.AreEqual(3, handler.nodes.Count); + NUnit.Framework.Assert.AreEqual(PdfName.Div, handler.nodes[0].GetRole()); + NUnit.Framework.Assert.AreEqual(PdfName.P, handler.nodes[1].GetRole()); + NUnit.Framework.Assert.IsNull(handler.nodes[2].GetRole()); + } + //\cond DO_NOT_DOCUMENT internal class TestHandler : ITagTreeIteratorHandler { //\cond DO_NOT_DOCUMENT diff --git a/itext.tests/itext.kernel.tests/itext/kernel/pdf/tagutils/TagTreePointerUnitTest.cs b/itext.tests/itext.kernel.tests/itext/kernel/pdf/tagutils/TagTreePointerUnitTest.cs index ad8a3e9680..5078be6d57 100644 --- a/itext.tests/itext.kernel.tests/itext/kernel/pdf/tagutils/TagTreePointerUnitTest.cs +++ b/itext.tests/itext.kernel.tests/itext/kernel/pdf/tagutils/TagTreePointerUnitTest.cs @@ -130,6 +130,57 @@ public virtual void CannotFlushAlreadyFlushedPageTest() { NUnit.Framework.Assert.AreEqual(KernelExceptionMessageConstant.PAGE_ALREADY_FLUSHED, exception.Message); } + [NUnit.Framework.Test] + public virtual void CyclicReferencesWhileLookingForRoleTest() { + PdfDocument doc = CreateTestDocument(); + PdfStructElem kid1 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.P)); + PdfStructElem kid2 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.DIV)); + doc.GetStructTreeRoot().AddKid(kid1); + doc.GetStructTreeRoot().AddKid(kid2); + kid1.AddKid(kid2); + kid2.AddKid(kid1); + TagTreePointer pointer = new TagTreePointer(doc); + Exception exception = NUnit.Framework.Assert.Catch(typeof(PdfException), () => pointer.MoveToKid(StandardRoles + .FIGURE)); + NUnit.Framework.Assert.AreEqual(KernelExceptionMessageConstant.NO_KID_WITH_SUCH_ROLE, exception.Message); + } + + [NUnit.Framework.Test] + public virtual void CyclicReferencesWhileFlushingTest() { + PdfDocument doc = CreateTestDocument(); + PdfStructElem kid1 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.P)); + PdfStructElem kid2 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.DIV)); + doc.GetStructTreeRoot().AddKid(kid1); + doc.GetStructTreeRoot().AddKid(kid2); + kid1.AddKid(kid2); + kid2.AddKid(kid1); + TagTreePointer pointer = new TagTreePointer(doc); + pointer.MoveToKid(StandardRoles.P); + NUnit.Framework.Assert.DoesNotThrow(() => pointer.FlushTag()); + NUnit.Framework.Assert.IsTrue(kid1.IsFlushed()); + NUnit.Framework.Assert.IsTrue(kid2.IsFlushed()); + } + + [NUnit.Framework.Test] + public virtual void CyclicReferencesWithWaitingObjectsWhileFlushingTest() { + PdfDocument doc = CreateTestDocument(); + PdfStructElem kid1 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.P)); + PdfStructElem kid2 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.DIV)); + doc.GetStructTreeRoot().AddKid(kid1); + doc.GetStructTreeRoot().AddKid(kid2); + kid1.AddKid(kid2); + kid2.AddKid(kid1); + TagTreePointer pointer = new TagTreePointer(doc); + pointer.MoveToKid(StandardRoles.P); + WaitingTagsManager waitingTagsManager = pointer.GetContext().GetWaitingTagsManager(); + Object pWaitingTagObj = new Object(); + waitingTagsManager.AssignWaitingState(pointer, pWaitingTagObj); + pointer.MoveToParent().MoveToKid(StandardRoles.DIV); + NUnit.Framework.Assert.DoesNotThrow(() => pointer.FlushTag()); + NUnit.Framework.Assert.IsFalse(kid1.IsFlushed()); + NUnit.Framework.Assert.IsTrue(kid2.IsFlushed()); + } + private static PdfDocument CreateTestDocument() { PdfDocument pdfDoc = new PdfDocument(new PdfWriter(new ByteArrayOutputStream())); pdfDoc.SetTagged(); diff --git a/itext.tests/itext.kernel.tests/itext/kernel/utils/TaggedPdfReaderToolTest.cs b/itext.tests/itext.kernel.tests/itext/kernel/utils/TaggedPdfReaderToolTest.cs index 4e9ab1ddb7..02da1d4d86 100644 --- a/itext.tests/itext.kernel.tests/itext/kernel/utils/TaggedPdfReaderToolTest.cs +++ b/itext.tests/itext.kernel.tests/itext/kernel/utils/TaggedPdfReaderToolTest.cs @@ -25,6 +25,7 @@ You should have received a copy of the GNU Affero General Public License using iText.Commons.Utils; using iText.Kernel.Exceptions; using iText.Kernel.Pdf; +using iText.Kernel.Pdf.Tagging; using iText.Test; namespace iText.Kernel.Utils { @@ -77,5 +78,27 @@ public virtual void NoStructTreeRootInDocTest() { NUnit.Framework.Assert.Fail("IOException is not expected to be triggered"); } } + + [NUnit.Framework.Test] + public virtual void CyclicReferencesTest() { + String outXmlPath = DESTINATION_FOLDER + "cyclicReferences.xml"; + String cmpXmlPath = SOURCE_FOLDER + "cmp_cyclicReferences.xml"; + PdfDocument doc = new PdfDocument(new PdfWriter(new MemoryStream())); + doc.SetTagged(); + PdfStructElem kid1 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.P)); + PdfStructElem kid2 = new PdfStructElem(doc, PdfStructTreeRoot.ConvertRoleToPdfName(StandardRoles.DIV)); + doc.GetStructTreeRoot().AddKid(kid1); + doc.GetStructTreeRoot().AddKid(kid2); + kid1.AddKid(kid2); + kid2.AddKid(kid1); + TaggedPdfReaderTool tool = new TaggedPdfReaderTool(doc); + using (Stream outXml = FileUtil.GetFileOutputStream(outXmlPath)) { + tool.ConvertToXml(outXml, "UTF-8"); + } + CompareTool compareTool = new CompareTool(); + if (!compareTool.CompareXmls(outXmlPath, cmpXmlPath)) { + NUnit.Framework.Assert.Fail("Resultant xml is different."); + } + } } } diff --git a/itext.tests/itext.kernel.tests/resources/itext/kernel/pdf/PdfStructTreeRootTest/cyclicReferences.pdf b/itext.tests/itext.kernel.tests/resources/itext/kernel/pdf/PdfStructTreeRootTest/cyclicReferences.pdf new file mode 100644 index 0000000000000000000000000000000000000000..5b6965718840d493fd2cc59f9514e476524e3aaa GIT binary patch literal 9616 zcmeHt2T)U6*S6g&*c;M=(h<@N9YlikF47GoK!6Yop*OLBA^}8-(lJt%F1=kv0jbiZ z2ntf9DpgA4J3;U3E$=(?eKY^e|Iau7nSt!H_S$Pdd#$~nlasXsbS`4Wp%Q2ofsxU+ zP8K8x0wP=4v&hJRu|!8Ifda-l;;4j+1Zy&$0M;Nl*;4I57>Jyl3zb5^kyt$9JKdW! zz~X9#dHL1)f%7E0Qo%+jSZ}b)^u%+sJ;bLD`W&`h|}<^4}l;X5L1CfuR07 z2p~V#1t8d7;Qs^x`WFKAKRfA{Nz~7^TqIk&l7JBY=hZMZf38N|lxbM+kFXn12m}yp zJDxu`0#l+z@Sv&_JV8*f9@&wgiF1~b0X+IAXEBxZ$Ykm-hcQk3oI(lM=u{%v=^}6r zgfB|LATS674ueAxNCX5X27wDfAVMNwO)~zM0s?}9NkSyyXt?CR6m%$LysI^VB1|*@ z;N*bN5>N>!R9s(6QAhuhwgD&;Fbff9S_f@wfK(`CS7(scWssS$qKb}&xEq1u0>Cg6 zkp_{-c%luFfCss{5S?s)=>N0``9EMtiA*7YFcJ_6sB$uD(^~c`*BOIU~TVgvV=9#WTIL$=TS&!}D?QeAWi6eTjrne4$BVKqSrOvd0fD z2SJ%ksrP5DQQc-#FyIMmf-}_>=lBQ5!y%Fwu)eDml}UE0B*2kBT>6tm|IAH4`P#!` z_Wg$Y$uaKwuQRQ4&4O{71!a!Eggd`EpS+N4k*u4fOxHb@D%^dIYoOSn zD0b3tQ;0#*3Bvkb7LA%x4i0!18Em0kv`iD5e36vUb}@1}U{WtcIoB#IY&L_oAcorACe(g>Ep6x|PpX}jG2?{+nsl%6iOKdq^8ii4Ar&X;#TgP*6B#4B;o z-LU)>!gef2y0?CGSiAr8I$MYpEi|;_sD}rC1>^Q3rA)oXYYFQ@ZY^_StFq6ZBTepE zgqc6}w>vyD%}&fMo=@_1(jFIy9ky0+qt0)3BB1*?#;lq;qUjInQ4_q$Yof*D#j?+r zj%qev$IjQZ+D}xM4bQ}RqsHqd>49z%Gmr1Hzn%15nH;euKk=_1d{&HAzhfk=!V+g^ zwFBWbeDG9n4Q#o8?sk1Ga%_ujU7dk|?ck_7@{;sXc2S!n2P55(roVBv3{B9@lVRCIT+nL?5H%VQMqNj+)S^CXKQ1710WH< z)49?sXlZF_rtQ*Ja5);gPk|obeYVhcc#ynVh2-ZnesX{3Tf6IAy~k9Nqzk0&=8_&w z9;6+-j=gz2*R2}kdnx{P=b$|6{o5km!Gg@elQ~Ow6gC77IS6#<4x^!oc2CNu(+3{Sz_mvd^ z+sCo2Xg7q4qRw=}tP{_QJ>ok9I2+^2>vv4F&v!-Pufc7>hsg7 zW##1K6GVQY6OU>*DlVK@OxE3dZcQ^cefZtntXx{y174D_VXUee>Q}ufW_>| zbd|O?o-YfW5uB8^B?f~sQ!qv7YMPB9;`tH%gT;fjx#?k^GP0t#Sku?FEkZR9M(_K4 zPIAP6C@7_Q;#^8Q_H0o@b(yWdY)x-ddwRH=Q#Jg{#L_~R`%UE{EE&^F#ET-zvkZt| zy5G=Eql@2~_1^w=`vJq>d~@yd;jdxYNBG}6JH2f%u%i-8T?Rrms%F0Oyj9CsP+b^p z^Oj3pTOe#1h$x?s6ijOGO=+)?%vo$6l$%!z<%1)8mX~U7`sYsOhnkr zMszyG^bL)sX zVkh3j&&e<6w$9sCvve0xB{)|$C+wR^OLv7vCAt_%b8nO&V=dybdX4c1BXlvw5RC zwb0!1ZcF+mw`!K{QsD43Rr)4Jk-06v2K#LedzmEzdwVpy>n{1k@LRLcjy84ul`a{# zC3y!1wzeqmT{ds5A91E`zGS?(J-xhoPHMH}`vaGD<5+e5^+!X`PbF|Te8u*j3L3gp zBcc@+7_5XNZ&|-KZkOKeFPZdBsKt<9^Z-|gy~7w5u)ve(msuE_TqbcXn4!B++sQ~z zn^9fAq?_BQ+0wZZnIG1hok8GabibUdT5NlJ(Wqamrq_FXw(IVe+ol39QP)dqKI^9X zs=Hka)*bIIQ~XUm*6xkTJ>I2if651L%fpAFis`*y7CRD5^4c$n7dYV1n$I_dzR!=| z^k2BPoQpSJ5y`kvdbNwA1qNu`-xLIk4fymiUdwyQtzPSnwG$Wh*YI(P+R)YLxk0m+ zFPCIX@GhCBb|$?y6sdxqC(+w0imuX;+3vsl_O8tWu~Fn=6*S~Xp!#`99C7NsLCgAn z;aqF5){{{GZiYY$Bl=kb%7i5*#4D5bmd_>O%MUz$?eGv;B^S^o43DR zzJa}YwuZ||5T_}m0ZW$kfYxea-V`o zx5aF-D*U!Hj)zSpPLVD*kWoOd#~J{f{>X}6Dv&FQm`cx4GqN;Z*(r!SPE%8$`*BDd z?7Vcim=1fJk%V1{Bl0WmnHpt@5I;y;7t~d4kpG?k>KK=*T^E`&n8jB80!I_3QITG) zD@A9^x&?KOe{5AgU%tL$Q-mwbpSQhS?m1j}N-pWd4vLT5otbuX}2y$rR8Z?#s* z-9z(2J}#x#OVP95W*D8LG`%0b@l*?F3$>dVEKdj^vRG(duy~`2^RY#z^DicfBBt(6 z`SQQJW3NC@db-m3c@anh|37#&n`-eshkK8#4aCF z1_DnSuv=2P7)vyj)1d>EY;;k;URqbA=*@)xk`L5FyTHhuH!b=^?3!9LsjgG=e8ET zDyx{zwtz!q9iUC!5#b3b=PNlQ(Xr=biY|B7fUe`kDh0Zff%eoQ=FIXA+LX@wUTf*8 zy&UGOdl6GRIlAu`ohI_zLLaaAmU44+r+7)rI@PT1qD@^IAg*!FzC!0eD`m4Y=v;f% zi?GLjkIMsaeO8-kEXyfG7$MEPcHxeP3*t-CWeNf{^xIE zo{n7I-21;IRJr@kt(#fi2~s|V9|@mj5>Zpe*EpXhL3yd-Qt_RlImA-dcBi6OGYQ~wzHVSuU5!5)x4K$ zH)!5!M@#XfW+xxrrKz~i zuk7h!y?dhRY%GlZfzNzIMs7UUqKS8XfoHZSrl&0+!AY*D@Iiv_!g}MoQriX+@56@J zFUDt~yE%6sn3)&~$4_Kps~R{mvzKsm;e_A6js_Wt11m%acH)*F5WxT2D z=*mxhvMsiOE}ki*2D`K(!ce-q>nB@6ppA*G|M$FlxtaV!sl_YD!f5^SKRT`S+B@CIp_jf>Oog~ZeLh2^&7`V(C2=uh-^V0<4POUVeBH_=FOf`NE(Mh==gqmOzZd4T zTr`iEdr>K0Srtm_@vP#1r>)HIS616vq|(HtXkquT2fnUw&k-Rz+w45YJ8Us=B+jGi zR%=*8IY~w9Rte%@{kp~s*HY>cj*gAORK%%B#??=K(IexhubH$>4$))Cp@r9$0^E9m z?_N_%KINI==joE_l-<#oY)kK{N%tj-W**9byo||ONmvh$wB?yAp-P(qnqo3f&Gjqa zey;xPn%KShUXW|rRi*V*HDj#NdVQK=+Cc=g%w^L{kD&f{_0B0wV{YxXR$*a5PW2Gh+kJ&$XR;V$?--fzx7Wg z3C|C(3-AwcSo2z367;#etrQMRpAktvkhC}E3p+kCob%HWWSGZHWh$vbn2()Zw#05; zON14srW`6>oNsf3Ey88NRu#$0o@%9{E>BjysEg^F_;9=f=u%7izO3nyK zB2+EBeavX?;CCw(A+U{4$Hx)z7A0&T3N!L%`>nOaXcOPDPdSMvR+LKoP>+f!@z%}b+~$riG84rRD7#yoHKi<#0wKDv5J53-0(_%M!S z^OS9xNLTy*wLKP+Sf0cRD=+Pd_AWsmG0U9C%lI|b$mg~`q#f+wqu+ZBL09JYSEV{p zHT!&AhG(TOzt|NV6FnXjVu3!b|2V%<-x6wU^f_Cix-rsy=EL{7UPH#Hab=*G61L{4 zVQO`tn)WD*wtiI5d~B~#`n@Zwci)iN4zn|&Q3e%p4t1vXg7K`0{c}1-Z~O;DzaTn` zE?{$MpgY3@kkX=Sc)q8h8Yh%LDpWXZeWh^q)o|mfDN8rLDJ=fps>v`iq2P<*wPitV zC%I*e%g^gS%Gw9>ay@NrWx5Drot!a@v{4U!iDruzlE7%0IUgP+giY^cPtXPeNgDmV zn0C|co}+!>!Ti!hT0r^!#qxcNT8BOfMP#PF{NSaCorrb_o3mqoHzad45`6oClTYKi zvimjfT;Q}0!{1^I{vcx z@>S!ox#O1mPSOH1qv=gIVf6w5SJ%3=4V(7U5q-Z}|wPe%G5EaZ)TPZ}myP8NMN zDX2>IPaG%?4h#|PxYf4Mzjb->LWIM;*@q4ZE~gZ*QB$w>PSh`ysOB!m=IDXwEXDKh z#%{Mf+I(hL`*J8#?=;@v#d=+VF}aad^`4KQohvY8HqOEr+vkx~Vz7tZuvlx2JFW4h z!j!7Lt0Fdv^y*y2uGe`F7otr-^v2Vn?1O%+V19`p$2xOhwo`3v5mjdrtW77McB=)1 zh+16ve&uS8X7NY{3y2;Z19c!4liG)OnprhujJ!TkK(^9zMbF;Xeckksa(4M}O*x>% zrrmJik-_`(B7n;0F&D#&qxKt#XPJYOSYRxlKy-zKK4H!`^`B~ewkNbw`x4C`=yx(B zzhOg{YdV0d!qClHB)QzP1+ZJ#B&4v+M^|9sgbyWgENryzMd;y zt7W87A2D7rlRAwuYYQ=Q=AFx}D#k{QH=0;az418qeBv<+o0owtmVxVh;l>8$Z=Jcb z9{MeHEhPj!qjh9#Av!ptnAdmo>TJW4nS?juu5_c=L&hn^U11A72Mx6n()&ZOBZ26{ zjW)AZeg(@KbqZJ7opL!pimUc2RHN<2ZwF_r`7C*aEmqYUR)#+eQSIjX+6PU+fSXz0 zu%!oks+$|g+4L_OxAfuqMEcjG=7OB2#%@P2iQV+L?#b#&EH>)%^39@3|y(mo4(ubxHE<0-OJ-l>ElUSB&?S z;d#{RZx%|3oXVP}3wBm=o1IFYMXA+#Yy5BtmC2qWw)6|*)z<{XYm6g!*IVWupbBRm zwaW(COKosh>TqX2>vLW*``Evb`Pi%vRBNN^Xd!E%d-Tj=zs%0PgX3;^mBgVwU5T2l zKDX+tE2j=3+tsk6s*ITQd-iIZ&v-{FYz$8BnKwo^k4R4yX&$R~(>I^-9)Gctb+%90 zdu6}h$MN!vR&+c&zcO6nmyreea3l^EJ(K()C-8o{ zo9nnOp%uL{vT=y5%VP_k5D1gX&Q8BhgY@K57fo_1K^`iM$yrzW`Nn6{317sw&T(`L zRP(+cT5J$!3khsiiM3cyPE4r%a-j-QZed5jeaNq9MVXUqoKREqJk6ZTCVdB{-x=7? znyfrxiM?Xkwni(BPyN<2wJ+`A^|Rx*-ZP&?1<$UXwxcT|rc%Wnrj>6#7b>y@bI?=^ z`G_b%&AL4UHX3{=8m;^+^rk$1DM)k1%;oCKA^F5^*@wIlZLYbR4U>8&s8Z1;sLR%e z6fY$Z>PB2%pSH=^>XWg*!euOMoXZ}{eg)00tdP5peeYDbZt~&(1s|H}8}byPiA(={ z)89_U|9sQeawWL{#Twhi5_&*#^N;$7?X)CQ=g-9^e{_0K2sSLhAOs7LAK$*92oMU1 zgd;&Vpr0}*l$lOtULdD`$zTv5<@#3{1T6_%sqt8H4$$w;eYs1xIuwP?%*hsuw|~ zG8h-$6*k7YaGrBhL)2-!6ji0$Bynh%4HAQdL(mWm8ZZWA>JbEpl{Ll&fdEQ|B&{J<1gH%TZH2+0Ffb?t zfxti!aBCYYj5Sc?VS|Ih@zyW|T+#-LfCA+l)=0dhH5BN=A*}HzG?1QWmNbBV6r3 +
+
+

diff --git a/itext/itext.kernel/itext/kernel/pdf/tagging/PdfStructTreeRoot.cs b/itext/itext.kernel/itext/kernel/pdf/tagging/PdfStructTreeRoot.cs index 7a8d440996..7a6d371016 100644 --- a/itext/itext.kernel/itext/kernel/pdf/tagging/PdfStructTreeRoot.cs +++ b/itext/itext.kernel/itext/kernel/pdf/tagging/PdfStructTreeRoot.cs @@ -29,6 +29,7 @@ You should have received a copy of the GNU Affero General Public License using iText.Kernel.Exceptions; using iText.Kernel.Pdf; using iText.Kernel.Pdf.Filespec; +using iText.Kernel.Pdf.Tagutils; namespace iText.Kernel.Pdf.Tagging { /// Represents a wrapper-class for structure tree root dictionary. @@ -399,7 +400,7 @@ public override void Flush() { GetPdfObject().Put(PdfName.IDTree, this.idTree.BuildTree().MakeIndirect(GetDocument())); } if (!GetDocument().IsAppendMode()) { - FlushAllKids(this); + iText.Kernel.Pdf.Tagging.PdfStructTreeRoot.FlushAllKids(this); } base.Flush(); } @@ -597,13 +598,11 @@ protected internal override bool IsWrappedObjectMustBeIndirect() { return true; } - private void FlushAllKids(IStructureNode elem) { - foreach (IStructureNode kid in elem.GetKids()) { - if (kid is PdfStructElem && !((PdfStructElem)kid).IsFlushed()) { - FlushAllKids(kid); - ((PdfStructElem)kid).Flush(); - } - } + private static void FlushAllKids(iText.Kernel.Pdf.Tagging.PdfStructTreeRoot elem) { + TagTreeIterator iterator = new TagTreeIterator(elem, new TagTreeIteratorAvoidDuplicatesApprover(), TagTreeIterator.TreeTraversalOrder + .POST_ORDER); + iterator.AddHandler(new TagTreeIteratorFlusher()); + iterator.Traverse(); } private void IfKidIsStructElementAddToList(PdfObject kid, IList kids) { diff --git a/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIterator.cs b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIterator.cs index d9ad00de10..1ccf6c8faa 100644 --- a/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIterator.cs +++ b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIterator.cs @@ -31,28 +31,65 @@ namespace iText.Kernel.Pdf.Tagutils { /// /// This class is used to traverse the tag tree. /// - /// There is a possibility to add a handler that will be called for specific events during the traversal. + /// There is a possibility to add a handler that will be called for the elements during the traversal. /// public class TagTreeIterator { private readonly IStructureNode pointer; private readonly ICollection handlerList; + private readonly TagTreeIteratorElementApprover approver; + + private readonly TagTreeIterator.TreeTraversalOrder traversalOrder; + + /// + /// Creates a new instance of + /// . + /// + /// + /// Creates a new instance of + /// + /// . It will use + /// + /// to filter + /// elements and TreeTraversalOrder.PRE_ORDER for tree traversal. + /// + /// the tag tree pointer. + public TagTreeIterator(IStructureNode tagTreePointer) + : this(tagTreePointer, new TagTreeIteratorElementApprover(), TagTreeIterator.TreeTraversalOrder.PRE_ORDER) { + } + /// /// Creates a new instance of /// . /// /// the tag tree pointer. - public TagTreeIterator(IStructureNode tagTreePointer) { + /// + /// a filter that will be called to let iterator know whether some particular element + /// should be traversed or not. + /// + /// an order in which the tree will be traversed. + public TagTreeIterator(IStructureNode tagTreePointer, TagTreeIteratorElementApprover approver, TagTreeIterator.TreeTraversalOrder + traversalOrder) { if (tagTreePointer == null) { throw new ArgumentException(MessageFormatUtil.Format(KernelExceptionMessageConstant.ARG_SHOULD_NOT_BE_NULL , "tagTreepointer")); } + if (approver == null) { + throw new ArgumentException(MessageFormatUtil.Format(KernelExceptionMessageConstant.ARG_SHOULD_NOT_BE_NULL + , "approver")); + } + if (traversalOrder == null) { + throw new ArgumentException(MessageFormatUtil.Format(KernelExceptionMessageConstant.ARG_SHOULD_NOT_BE_NULL + , "traversalOrder")); + } this.pointer = tagTreePointer; + this.traversalOrder = traversalOrder; handlerList = new HashSet(); + this.approver = approver; } - /// Adds a handler that will be called for specific events during the traversal. + /// Adds a handler that will be called for the elements during the traversal. /// the handler. /// /// this @@ -60,6 +97,10 @@ public TagTreeIterator(IStructureNode tagTreePointer) { /// instance. /// public virtual iText.Kernel.Pdf.Tagutils.TagTreeIterator AddHandler(ITagTreeIteratorHandler handler) { + if (handler == null) { + throw new ArgumentException(MessageFormatUtil.Format(KernelExceptionMessageConstant.ARG_SHOULD_NOT_BE_NULL + , "handler")); + } this.handlerList.Add(handler); return this; } @@ -71,22 +112,37 @@ public virtual iText.Kernel.Pdf.Tagutils.TagTreeIterator AddHandler(ITagTreeIter /// Make sure the correct handlers are added before calling this method. /// public virtual void Traverse() { - Traverse(this.pointer, this.handlerList); + Traverse(this.pointer); } - private static void Traverse(IStructureNode elem, ICollection handlerList) { - if (elem == null) { + private void Traverse(IStructureNode elem) { + if (!approver.Approve(elem)) { return; } - foreach (ITagTreeIteratorHandler handler in handlerList) { - handler.NextElement(elem); + if (traversalOrder == TagTreeIterator.TreeTraversalOrder.PRE_ORDER) { + foreach (ITagTreeIteratorHandler handler in handlerList) { + handler.NextElement(elem); + } } IList kids = elem.GetKids(); if (kids != null) { foreach (IStructureNode kid in kids) { - Traverse(kid, handlerList); + Traverse(kid); + } + } + if (traversalOrder == TagTreeIterator.TreeTraversalOrder.POST_ORDER) { + foreach (ITagTreeIteratorHandler handler in handlerList) { + handler.NextElement(elem); } } } + + /// Tree traversal order enum. + public enum TreeTraversalOrder { + /// Preorder traversal. + PRE_ORDER, + /// Postorder traversal. + POST_ORDER + } } } diff --git a/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorAvoidDuplicatesApprover.cs b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorAvoidDuplicatesApprover.cs new file mode 100644 index 0000000000..b3688c6bd5 --- /dev/null +++ b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorAvoidDuplicatesApprover.cs @@ -0,0 +1,68 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2024 Apryse Group NV +Authors: Apryse Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using System.Collections.Generic; +using iText.Kernel.Pdf; +using iText.Kernel.Pdf.Tagging; + +namespace iText.Kernel.Pdf.Tagutils { + /// + /// Element checker for + /// . + /// + /// + /// Element checker for + /// . + /// It is used to check whether specific element should be traversed. + /// It doesn't approve elements which have been traversed before. + /// + public class TagTreeIteratorAvoidDuplicatesApprover : TagTreeIteratorElementApprover { + private readonly ICollection processedObjects = new HashSet(); + + /// + /// Creates a new instance of + /// + /// + public TagTreeIteratorAvoidDuplicatesApprover() + : base() { + } + + /// + public override bool Approve(IStructureNode elem) { + if (elem is PdfStructTreeRoot) { + return true; + } + if (!base.Approve(elem) || !(elem is PdfStructElem)) { + return false; + } + PdfObject obj = ((PdfStructElem)elem).GetPdfObject(); + bool isProcessed = processedObjects.Contains(obj); + if (isProcessed) { + return false; + } + else { + processedObjects.Add(obj); + return true; + } + } + } +} diff --git a/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorElementApprover.cs b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorElementApprover.cs new file mode 100644 index 0000000000..5ff992fd1e --- /dev/null +++ b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorElementApprover.cs @@ -0,0 +1,56 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2024 Apryse Group NV +Authors: Apryse Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using iText.Kernel.Pdf.Tagging; + +namespace iText.Kernel.Pdf.Tagutils { + /// + /// Element checker for + /// . + /// + /// + /// Element checker for + /// . + /// It is used to check whether specific element should be traversed. + /// + public class TagTreeIteratorElementApprover { + /// + /// Creates a new instance of + /// + /// + public TagTreeIteratorElementApprover() { + } + + // Empty constructor + /// Checks whether the element should be traversed. + /// the element to check + /// + /// + /// + /// if the element should be traversed, + /// false otherwise + /// + public virtual bool Approve(IStructureNode elem) { + return elem != null; + } + } +} diff --git a/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorFlusher.cs b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorFlusher.cs new file mode 100644 index 0000000000..faac173d70 --- /dev/null +++ b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreeIteratorFlusher.cs @@ -0,0 +1,46 @@ +/* +This file is part of the iText (R) project. +Copyright (c) 1998-2024 Apryse Group NV +Authors: Apryse Software. + +This program is offered under a commercial and under the AGPL license. +For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below. + +AGPL licensing: +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +*/ +using iText.Kernel.Pdf.Tagging; + +namespace iText.Kernel.Pdf.Tagutils { + /// + /// Class that flushes struct elements while iterating over struct tree root with + /// . + /// + public class TagTreeIteratorFlusher : ITagTreeIteratorHandler { + /// + /// Creates a new instance of + /// + /// + public TagTreeIteratorFlusher() { + } + + // Empty constructor + /// + public virtual void NextElement(IStructureNode elem) { + if (elem is PdfStructElem && !((PdfStructElem)elem).IsFlushed()) { + ((PdfStructElem)elem).Flush(); + } + } + } +} diff --git a/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreePointer.cs b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreePointer.cs index 4a02187854..b22458e775 100644 --- a/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreePointer.cs +++ b/itext/itext.kernel/itext/kernel/pdf/tagutils/TagTreePointer.cs @@ -761,22 +761,19 @@ public virtual iText.Kernel.Pdf.Tagutils.TagTreePointer MoveToKid(int n, String if (MCR_MARKER.Equals(role)) { throw new PdfException(KernelExceptionMessageConstant.CANNOT_MOVE_TO_MARKED_CONTENT_REFERENCE); } - IList descendants = new List(GetCurrentStructElem().GetKids()); - int k = 0; - for (int i = 0; i < descendants.Count; ++i) { - if (descendants[i] == null || descendants[i] is PdfMcr) { - continue; - } - String descendantRole = descendants[i].GetRole().GetValue(); - if (descendantRole.Equals(role) && k++ == n) { - SetCurrentStructElem((PdfStructElem)descendants[i]); - return this; - } - else { - descendants.AddAll(descendants[i].GetKids()); - } + TagTreePointer.RoleFinderHandler handler = new TagTreePointer.RoleFinderHandler(n, role); + TagTreePointer.TagTreeIteratorApproverWithStop approver = new TagTreePointer.TagTreeIteratorApproverWithStop + (handler); + TagTreeIterator iterator = new TagTreeIterator(GetCurrentStructElem(), approver, TagTreeIterator.TreeTraversalOrder + .PRE_ORDER); + iterator.AddHandler(handler); + iterator.Traverse(); + PdfStructElem elem = handler.GetFoundElement(); + if (elem == null) { + throw new PdfException(KernelExceptionMessageConstant.NO_KID_WITH_SUCH_ROLE); } - throw new PdfException(KernelExceptionMessageConstant.NO_KID_WITH_SUCH_ROLE); + SetCurrentStructElem(elem); + return this; } /// Gets current tag kids roles. @@ -1131,5 +1128,51 @@ private void ThrowExceptionIfCurrentPageIsNotInited() { throw new PdfException(KernelExceptionMessageConstant.PAGE_IS_NOT_SET_FOR_THE_PDF_TAG_STRUCTURE); } } + + private class RoleFinderHandler : ITagTreeIteratorHandler { + private readonly int n; + + private readonly String role; + + private int foundIdx = 0; + + private PdfStructElem foundElem; + +//\cond DO_NOT_DOCUMENT + internal RoleFinderHandler(int n, String role) { + this.n = n; + this.role = role; + } +//\endcond + + public virtual void NextElement(IStructureNode elem) { + if (foundElem != null) { + return; + } + String descendantRole = elem.GetRole().GetValue(); + if (descendantRole.Equals(role) && foundIdx++ == n) { + foundElem = (PdfStructElem)elem; + } + } + + public virtual PdfStructElem GetFoundElement() { + return foundElem; + } + } + + [System.ObsoleteAttribute(@"change ITagTreeIteratorHandler#nextElement to return boolean showing whether the iteration should be continued. It will allow to get rid of this ugly workaround." + )] + private class TagTreeIteratorApproverWithStop : TagTreeIteratorAvoidDuplicatesApprover { + private readonly TagTreePointer.RoleFinderHandler handler; + + public TagTreeIteratorApproverWithStop(TagTreePointer.RoleFinderHandler handler) + : base() { + this.handler = handler; + } + + public override bool Approve(IStructureNode elem) { + return base.Approve(elem) && handler.GetFoundElement() == null; + } + } } } diff --git a/itext/itext.kernel/itext/kernel/pdf/tagutils/WaitingTagsManager.cs b/itext/itext.kernel/itext/kernel/pdf/tagutils/WaitingTagsManager.cs index 1797fff24c..18afe508f4 100644 --- a/itext/itext.kernel/itext/kernel/pdf/tagutils/WaitingTagsManager.cs +++ b/itext/itext.kernel/itext/kernel/pdf/tagutils/WaitingTagsManager.cs @@ -222,12 +222,10 @@ private void FlushStructElementAndItKids(PdfStructElem elem) { if (waitingTagToAssociatedObj.ContainsKey(elem.GetPdfObject())) { return; } - foreach (IStructureNode kid in elem.GetKids()) { - if (kid is PdfStructElem) { - FlushStructElementAndItKids((PdfStructElem)kid); - } - } - elem.Flush(); + TagTreeIterator iterator = new TagTreeIterator(elem, new WaitingTagsManager.WaitingTagsApprover(waitingTagToAssociatedObj + .Keys), TagTreeIterator.TreeTraversalOrder.POST_ORDER); + iterator.AddHandler(new TagTreeIteratorFlusher()); + iterator.Traverse(); } private void RemoveWaitingStateAndFlushIfParentFlushed(PdfStructElem structElem) { @@ -239,5 +237,19 @@ private void RemoveWaitingStateAndFlushIfParentFlushed(PdfStructElem structElem) } } } + + private class WaitingTagsApprover : TagTreeIteratorAvoidDuplicatesApprover { + private readonly ICollection waitingTags; + + public WaitingTagsApprover(ICollection waitingTags) + : base() { + this.waitingTags = waitingTags; + } + + public override bool Approve(IStructureNode elem) { + return base.Approve(elem) && elem is PdfStructElem && (waitingTags == null || !waitingTags.Contains(((PdfStructElem + )elem).GetPdfObject())); + } + } } } diff --git a/itext/itext.kernel/itext/kernel/utils/TaggedPdfReaderTool.cs b/itext/itext.kernel/itext/kernel/utils/TaggedPdfReaderTool.cs index c65335e8e6..ba4c328e15 100644 --- a/itext/itext.kernel/itext/kernel/utils/TaggedPdfReaderTool.cs +++ b/itext/itext.kernel/itext/kernel/utils/TaggedPdfReaderTool.cs @@ -46,6 +46,8 @@ public class TaggedPdfReaderTool { protected internal IDictionary> parsedTags = new Dictionary>(); + private readonly ICollection inspectedStructTreeElems = new HashSet(); + /// /// Constructs a /// @@ -118,6 +120,10 @@ protected internal virtual void InspectKid(IStructureNode kid) { try { if (kid is PdfStructElem) { PdfStructElem structElemKid = (PdfStructElem)kid; + if (inspectedStructTreeElems.Contains(structElemKid.GetPdfObject())) { + return; + } + inspectedStructTreeElems.Add(structElemKid.GetPdfObject()); PdfName s = structElemKid.GetRole(); String tagN = s.GetValue(); String tag = FixTagName(tagN); diff --git a/port-hash b/port-hash index 3f601ca095..69ab66ba16 100644 --- a/port-hash +++ b/port-hash @@ -1 +1 @@ -1f62986b9a995607ee9143809bba30e9ab4cf893 +bba9f55da2ea35bf2b2f5bc3da9e49c2414a7285