Browse Source

[Bug-61354] Fix text extraction from Word documents that contain more than one body element.

pull/1597/head
Antony Liu 4 weeks ago
parent
commit
2705868e6d
  1. 33
      OpenXmlFormats/Wordprocessing/Document.cs
  2. 41
      ooxml/XWPF/Usermodel/XWPFDocument.cs
  3. 11
      testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs
  4. 4
      testcases/test-data/document/MultipleBodyBug.docx

33
OpenXmlFormats/Wordprocessing/Document.cs

@ -78,10 +78,24 @@ namespace NPOI.OpenXmlFormats.Wordprocessing
if(node == null)
return null;
CT_Document ctObj = new CT_Document();
bool firstBody = true;
foreach(XmlNode childNode in node.ChildNodes)
{
if(childNode.LocalName == "body")
ctObj.body = CT_Body.Parse(childNode, namespaceManager);
{
if(firstBody)
{
ctObj.body = CT_Body.Parse(childNode, namespaceManager);
firstBody = false;
}
else
{
if(ctObj.bodyListField == null)
ctObj.bodyListField = new List<CT_Body> ();
ctObj.bodyListField.Add(CT_Body.Parse(childNode, namespaceManager));
}
}
else if(childNode.LocalName == "background")
ctObj.background = CT_Background.Parse(childNode, namespaceManager);
}
@ -111,6 +125,13 @@ namespace NPOI.OpenXmlFormats.Wordprocessing
sw.Write("mc:Ignorable=\"w14 w15 w16se w16cid w16 w16cex w16sdtdh wp14\">");
if(this.body != null)
this.body.Write(sw, "body");
if(this.bodyList != null)
{
foreach(var b in this.bodyList)
{
b.Write(sw, "body");
}
}
if(this.background != null)
this.background.Write(sw, "background");
sw.Write("</w:document>");
@ -123,6 +144,8 @@ namespace NPOI.OpenXmlFormats.Wordprocessing
this.bodyField = new CT_Body();
}
private List<CT_Body> bodyListField;
[XmlElement(Order = 0)]
public CT_Body body
{
@ -136,6 +159,14 @@ namespace NPOI.OpenXmlFormats.Wordprocessing
}
}
/// <summary>
/// Read-only view of the extra body elements stored in <c>bodyListField</c>.
/// The backing field is returned as-is; callers in this file null-check the
/// result before using it.
/// </summary>
public List<CT_Body> bodyList => bodyListField;
public void AddNewBody()
{
this.bodyField = new CT_Body();

41
ooxml/XWPF/Usermodel/XWPFDocument.cs

@ -103,26 +103,31 @@ namespace NPOI.XWPF.UserModel
InitFootnotes();
// parse the document body items and add
// each XmlObject to its matching list
foreach (object o in ctDocument.body.Items)
List<CT_Body> allBody = [ctDocument.body];
if(ctDocument.bodyList != null)
allBody.AddRange(ctDocument.bodyList);
foreach(CT_Body body in allBody)
{
if (o is CT_P ctP)
{
XWPFParagraph p = new XWPFParagraph(ctP, this);
bodyElements.Add(p);
paragraphs.Add(p);
}
else if (o is CT_Tbl tbl)
foreach (object o in body.Items)
{
XWPFTable t = new XWPFTable(tbl, this);
bodyElements.Add(t);
tables.Add(t);
}
else if (o is CT_SdtBlock block)
{
XWPFSDT c = new XWPFSDT(block, this);
bodyElements.Add(c);
contentControls.Add(c);
if (o is CT_P ctP)
{
XWPFParagraph p = new XWPFParagraph(ctP, this);
bodyElements.Add(p);
paragraphs.Add(p);
}
else if (o is CT_Tbl tbl)
{
XWPFTable t = new XWPFTable(tbl, this);
bodyElements.Add(t);
tables.Add(t);
}
else if (o is CT_SdtBlock block)
{
XWPFSDT c = new XWPFSDT(block, this);
bodyElements.Add(c);
contentControls.Add(c);
}
}
}
// Sort out headers and footers

11
testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs

@ -425,5 +425,16 @@ namespace TestCases.XWPF.Extractor
extractor.Close();
}
/// <summary>
/// Regression test for Bug-61354: opens the sample document
/// "MultipleBodyBug.docx" and checks that the extractor returns the text of
/// all three bodies, in order, each terminated by a newline.
/// </summary>
[Test]
public void TestMultipleBodyBug()
{
    XWPFDocument doc = XWPFTestDataSamples.OpenSampleDocument("MultipleBodyBug.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
    try
    {
        ClassicAssert.AreEqual("START BODY 1 The quick, brown fox jumps over a lazy dog. END BODY 1.\n"
            + "START BODY 2 The quick, brown fox jumps over a lazy dog. END BODY 2.\n"
            + "START BODY 3 The quick, brown fox jumps over a lazy dog. END BODY 3.\n",
            extractor.Text);
    }
    finally
    {
        // Close the extractor even when the assertion throws, so a failing
        // run does not leave it open.
        extractor.Close();
    }
}
}
}

4
testcases/test-data/document/MultipleBodyBug.docx

@ -0,0 +1,4 @@

START BODY 1 The quick, brown fox jumps over a lazy dog. END BODY 1.
START BODY 2 The quick, brown fox jumps over a lazy dog. END BODY 2.
START BODY 3 The quick, brown fox jumps over a lazy dog. END BODY 3.
Loading…
Cancel
Save