diff --git a/ooxml/XWPF/Extractor/XWPFWordExtractor.cs b/ooxml/XWPF/Extractor/XWPFWordExtractor.cs
index 0dba72320..bc9649f44 100644
--- a/ooxml/XWPF/Extractor/XWPFWordExtractor.cs
+++ b/ooxml/XWPF/Extractor/XWPFWordExtractor.cs
@@ -32,13 +32,14 @@ namespace NPOI.XWPF.Extractor
public class XWPFWordExtractor : POIXMLTextExtractor
{
public static XWPFRelation[] SUPPORTED_TYPES = new XWPFRelation[] {
- XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE,
- XWPFRelation.MACRO_DOCUMENT,
- XWPFRelation.MACRO_TEMPLATE_DOCUMENT
- };
+ XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE,
+ XWPFRelation.MACRO_DOCUMENT,
+ XWPFRelation.MACRO_TEMPLATE_DOCUMENT
+ };
private XWPFDocument document;
private bool fetchHyperlinks = false;
+ private bool concatenatePhoneticRuns = true;
public XWPFWordExtractor(OPCPackage Container)
: this(new XWPFDocument(Container))
@@ -62,6 +63,13 @@ namespace NPOI.XWPF.Extractor
fetchHyperlinks = fetch;
}
+ /// <summary>
+ /// Sets whether phonetic (ruby) runs are concatenated into the extracted
+ /// text. The extractor starts with this flag set to <c>true</c>.
+ /// </summary>
+ /// <param name="concatenatePhoneticRuns">new value of the flag</param>
+ public void SetConcatenatePhoneticRuns(bool concatenatePhoneticRuns)
+     => this.concatenatePhoneticRuns = concatenatePhoneticRuns;
public override String Text
{
@@ -124,7 +132,14 @@ namespace NPOI.XWPF.Extractor
foreach (IRunElement run in paragraph.Runs)
{
- text.Append(run.ToString());
+ if (!concatenatePhoneticRuns && run is XWPFRun xr)
+ {
+ text.Append(xr.GetText());
+ }
+ else
+ {
+ text.Append(run.ToString());
+ }
if (run is XWPFHyperlinkRun hyperlinkRun && fetchHyperlinks)
{
XWPFHyperlink link = hyperlinkRun.GetHyperlink(document);
diff --git a/ooxml/XWPF/Usermodel/XWPFRun.cs b/ooxml/XWPF/Usermodel/XWPFRun.cs
index 4ce46bb04..b8b2efcf9 100644
--- a/ooxml/XWPF/Usermodel/XWPFRun.cs
+++ b/ooxml/XWPF/Usermodel/XWPFRun.cs
@@ -16,18 +16,19 @@
==================================================================== */
namespace NPOI.XWPF.UserModel
{
- using System;
+using Cysharp.Text;
+ using NPOI.OpenXmlFormats.Dml;
+ using NPOI.OpenXmlFormats.Dml.WordProcessing;
using NPOI.OpenXmlFormats.Wordprocessing;
+ using NPOI.Util;
+ using NPOI.WP.UserModel;
+ using System;
using System.Collections.Generic;
+ using System.IO;
using System.Text;
-using Cysharp.Text;
using System.Xml;
- using System.IO;
- using NPOI.Util;
- using NPOI.OpenXmlFormats.Dml;
+ using System.Xml.Linq;
using System.Xml.Serialization;
- using NPOI.OpenXmlFormats.Dml.WordProcessing;
- using NPOI.WP.UserModel;
/**
* @see [MS-OI29500] Run Fonts
@@ -473,84 +474,214 @@ using Cysharp.Text;
get
{
StringBuilder text = new StringBuilder();
- for (int i = 0; i < run.Items.Count; i++)
+ HandleRun(run, text);
+ return text.ToString();
+ }
+ }
+
+ /// <summary>
+ /// Returns the string version of the text; when <see cref="GetPhonetic"/>
+ /// yields a non-empty string it is appended after a space, in parentheses.
+ /// </summary>
+ /// <returns>the run text, optionally followed by " (phonetic)"</returns>
+ public override string ToString()
+ {
+     string rubyText = GetPhonetic();
+     string baseText = HandleRun(run, new StringBuilder());
+     // Only decorate the text when there is something phonetic to show
+     return rubyText.Length > 0
+         ? baseText + " (" + rubyText + ")"
+         : baseText;
+ }
+ /// <summary>
+ /// Returns the text of this run as built by <c>HandleRun</c> into a fresh
+ /// buffer, without the phonetic suffix that <see cref="ToString"/> adds.
+ /// </summary>
+ /// <returns>the extracted run text</returns>
+ public string GetText() => HandleRun(run, new StringBuilder());
+ /// <summary>
+ /// Walks the items of <paramref name="run"/> in order and appends their
+ /// text to <paramref name="text"/>. Ruby items contribute their base
+ /// content only; every other item is handed to <c>_getText</c>.
+ /// </summary>
+ /// <param name="run">run whose items are walked</param>
+ /// <param name="text">buffer that receives the extracted content</param>
+ /// <returns>the full contents of <paramref name="text"/> after the walk</returns>
+ private string HandleRun(CT_R run, StringBuilder text)
+ {
+     // Index-based walk: each item is paired with the element name at the same position
+     for (int index = 0; index < run.Items.Count; index++)
+     {
+         object item = run.Items[index];
+         if (item is CT_Ruby ruby)
+         {
+             // false => take the base text, not the phonetic (rt) part
+             HandleRuby(ruby, text, false, run.ItemsElementName[index]);
+         }
+         else
+         {
+             _getText(item, text, run.ItemsElementName[index]);
+         }
+     }
+     return text.ToString();
+ }
+
+ /// <summary>
+ /// Collects the phonetic (rt) content of every ruby item of this run, in
+ /// item order. Note that non-empty picture text is appended as well,
+ /// wrapped in newlines.
+ /// </summary>
+ /// <returns>the phonetic (ruby) string associated with this run or an empty string if none exists</returns>
+ public string GetPhonetic()
+ {
+     StringBuilder phonetic = new StringBuilder();
+     for (int index = 0; index < run.Items.Count; index++)
+     {
+         // Items that are not ruby objects contribute nothing here
+         if (run.Items[index] is CT_Ruby ruby)
+         {
+             HandleRuby(ruby, phonetic, true, run.ItemsElementName[index]);
+         }
+     }
+     // Any picture text?
+     bool hasPictureText = pictureText != null && pictureText.Length > 0;
+     if (hasPictureText)
+     {
+         phonetic.Append("\n");
+         phonetic.Append(pictureText);
+         phonetic.Append("\n");
+     }
+     return phonetic.ToString();
+ }
+
+ /// <summary>
+ /// Appends one component of a ruby object to <paramref name="text"/>:
+ /// either the phonetic (rt) part or the base part. A component that is
+ /// null is skipped.
+ /// </summary>
+ /// <param name="rubyObj">rubyobject</param>
+ /// <param name="text">buffer to which to append the content</param>
+ /// <param name="extractPhonetic">extract the phonetic (rt) component or the base component</param>
+ /// <param name="itemType">element name of the ruby item; not read by this method</param>
+ private void HandleRuby(CT_Ruby rubyObj, StringBuilder text,
+     bool extractPhonetic, RunItemsChoiceType itemType)
+ {
+     // Original author's note: according to the spec, a ruby object has
+     // the phonetic (rt) first, then the actual text (base) second.
+     if (extractPhonetic)
+     {
+         if (rubyObj.rt != null)
+         {
+             handleRubyContent(rubyObj.rt, text);
+         }
+     }
+     else if (rubyObj.rubyBase != null)
+     {
+         handleRubyContent(rubyObj.rubyBase, text);
+     }
+ }
+
+ private void handleRubyContent(CT_RubyContent rbc, StringBuilder text)
+ {
+ for(int i= 0 ; i 0)
+ if (o is CT_PTab)
+ {
+ text.Append("\t");
+ }
+ if (o is CT_Br)
+ {
+ text.Append("\n");
+ }
+ if (o is CT_Empty) {
+ // Some inline text elements Get returned not as
+ // themselves, but as CTEmpty, owing to some odd
+ // definitions around line 5642 of the XSDs
+ // This bit works around it, and replicates the above
+ // rules for that case
+ if (itemType == RunItemsChoiceType.tab)
{
- text.Append("\n").Append(pictureText);
+ text.Append("\t");
+ }
+ if (itemType == RunItemsChoiceType.br)
+ {
+ text.Append("\n");
+ }
+ if (itemType == RunItemsChoiceType.cr)
+ {
+ text.Append("\n");
}
-
- return text.ToString();
+ }
+ if (o is CT_FtnEdnRef ftn)
+ {
+ string footnoteRef = ftn.DomNode.LocalName.Equals("footnoteReference") ?
+ "[footnoteRef:" + ftn.id + "]" : "[endnoteRef:" + ftn.id + "]";
+ text.Append(footnoteRef);
}
}
-
/**
* Specifies that the contents of this run.shall be displayed with a single
* horizontal line through the center of the line.
@@ -1308,14 +1439,14 @@ using Cysharp.Text;
}
return pr;
}
- /**
- * Returns the string version of the text, with tabs and
- * carriage returns in place of their xml equivalents.
- */
- public override String ToString()
- {
- return Text;
- }
+ ///**
+ // * Returns the string version of the text, with tabs and
+ // * carriage returns in place of their xml equivalents.
+ // */
+ //public override String ToString()
+ //{
+ // return Text;
+ //}
}
}
diff --git a/testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs b/testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs
index b70fb0089..83e3b3250 100644
--- a/testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs
+++ b/testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs
@@ -436,5 +436,18 @@ namespace TestCases.XWPF.Extractor
extractor.Text);
extractor.Close();
}
+
+ /// <summary>
+ /// Checks extraction of a document containing a ruby (phonetic) run, both
+ /// with phonetic concatenation left at its default and with it disabled.
+ /// </summary>
+ [Test]
+ public void TestPhonetic()
+ {
+     XWPFDocument doc = XWPFTestDataSamples.OpenSampleDocument("61470.docx");
+     XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
+     //expect: baseText (phoneticText)
+     ClassicAssert.AreEqual("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", extractor.Text.Trim());
+     extractor.Close();
+     // With concatenation switched off only the base text is expected
+     extractor = new XWPFWordExtractor(doc);
+     extractor.SetConcatenatePhoneticRuns(false);
+     ClassicAssert.AreEqual("\u6771\u4EAC", extractor.Text.Trim());
+     // Close the second extractor too, as is done for the first one
+     extractor.Close();
+ }
}
}
diff --git a/testcases/test-data/document/61470.docx b/testcases/test-data/document/61470.docx
new file mode 100644
index 000000000..6fc1afe45
--- /dev/null
+++ b/testcases/test-data/document/61470.docx
@@ -0,0 +1,2 @@
+とうきょう東京
+