diff --git a/ooxml/XWPF/Extractor/XWPFWordExtractor.cs b/ooxml/XWPF/Extractor/XWPFWordExtractor.cs index 0dba72320..bc9649f44 100644 --- a/ooxml/XWPF/Extractor/XWPFWordExtractor.cs +++ b/ooxml/XWPF/Extractor/XWPFWordExtractor.cs @@ -32,13 +32,14 @@ namespace NPOI.XWPF.Extractor public class XWPFWordExtractor : POIXMLTextExtractor { public static XWPFRelation[] SUPPORTED_TYPES = new XWPFRelation[] { - XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE, - XWPFRelation.MACRO_DOCUMENT, - XWPFRelation.MACRO_TEMPLATE_DOCUMENT - }; + XWPFRelation.DOCUMENT, XWPFRelation.TEMPLATE, + XWPFRelation.MACRO_DOCUMENT, + XWPFRelation.MACRO_TEMPLATE_DOCUMENT + }; private XWPFDocument document; private bool fetchHyperlinks = false; + private bool concatenatePhoneticRuns = true; public XWPFWordExtractor(OPCPackage Container) : this(new XWPFDocument(Container)) @@ -62,6 +63,13 @@ namespace NPOI.XWPF.Extractor fetchHyperlinks = fetch; } + /** + * Should we concatenate phonetic runs in extraction. Default is true + * @param concatenatePhoneticRuns + */ + public void SetConcatenatePhoneticRuns(bool concatenatePhoneticRuns) { + this.concatenatePhoneticRuns = concatenatePhoneticRuns; + } public override String Text { @@ -124,7 +132,14 @@ namespace NPOI.XWPF.Extractor foreach (IRunElement run in paragraph.Runs) { - text.Append(run.ToString()); + if (!concatenatePhoneticRuns && run is XWPFRun xr) + { + text.Append(xr.GetText()); + } + else + { + text.Append(run.ToString()); + } if (run is XWPFHyperlinkRun hyperlinkRun && fetchHyperlinks) { XWPFHyperlink link = hyperlinkRun.GetHyperlink(document); diff --git a/ooxml/XWPF/Usermodel/XWPFRun.cs b/ooxml/XWPF/Usermodel/XWPFRun.cs index 4ce46bb04..b8b2efcf9 100644 --- a/ooxml/XWPF/Usermodel/XWPFRun.cs +++ b/ooxml/XWPF/Usermodel/XWPFRun.cs @@ -16,18 +16,19 @@ ==================================================================== */ namespace NPOI.XWPF.UserModel { - using System; +using Cysharp.Text; + using NPOI.OpenXmlFormats.Dml; + using NPOI.OpenXmlFormats.Dml.WordProcessing; using NPOI.OpenXmlFormats.Wordprocessing; + using NPOI.Util; + using NPOI.WP.UserModel; + using System; using System.Collections.Generic; + using System.IO; using System.Text; -using Cysharp.Text; using System.Xml; - using System.IO; - using NPOI.Util; - using NPOI.OpenXmlFormats.Dml; + using System.Xml.Linq; using System.Xml.Serialization; - using NPOI.OpenXmlFormats.Dml.WordProcessing; - using NPOI.WP.UserModel; /** * @see [MS-OI29500] Run Fonts @@ -473,84 +474,214 @@ using Cysharp.Text; get { StringBuilder text = new StringBuilder(); - for (int i = 0; i < run.Items.Count; i++) + HandleRun(run, text); + return text.ToString(); + } + } + + /// + /// Returns the string version of the text and the phonetic string + /// + public override string ToString() + { + string phonetic = GetPhonetic(); + StringBuilder text = new StringBuilder(); + if (phonetic.Length > 0) + { + return HandleRun(run, text) +" ("+phonetic+")"; + } + else + { + return HandleRun(run, text); + } + } + public string GetText() + { + StringBuilder text = new StringBuilder(); + return HandleRun(run, text); + } + /// + /// Returns the string version of the text, with tabs and + /// carriage returns in place of their xml equivalents. + /// + private string HandleRun(CT_R run, StringBuilder text) + { + // Grab the text and tabs of the text run + // Do so in a way that preserves the ordering + + for (int i = 0; i < run.Items.Count; i++) + { + object o = run.Items[i]; + if (o is CT_Ruby) + { + HandleRuby(o as CT_Ruby, text, false, run.ItemsElementName[i]); + continue; + } + _getText(o, text, run.ItemsElementName[i]); + } + + return text.ToString(); + + } + + /// + /// + /// the phonetic (ruby) string associated with this run or an empty string if none exists + public string GetPhonetic() + { + StringBuilder text = new StringBuilder(); + + // Grab the text and tabs of the text run + // Do so in a way that preserves the ordering + for (int i = 0; i < run.Items.Count; i++) + { + object o = run.Items[i]; + if (o is CT_Ruby) + { + HandleRuby(o as CT_Ruby, text, true, run.ItemsElementName[i]); + } + } + // Any picture text? + if (pictureText != null && pictureText.Length > 0) + { + text.Append("\n").Append(pictureText).Append("\n"); + } + + return text.ToString(); + } + + /// + /// + /// rubyobject + /// buffer to which to append the content + /// extract the phonetic (rt) component or the base component + private void HandleRuby(CT_Ruby rubyObj, StringBuilder text, + bool extractPhonetic, RunItemsChoiceType itemType) + { + //according to the spec, a ruby object + //has the phonetic (rt) first, then the actual text (base) + //second. + if(extractPhonetic && rubyObj.rt!=null) + { + handleRubyContent(rubyObj.rt, text); + } + if(!extractPhonetic && rubyObj.rubyBase!=null) + { + handleRubyContent(rubyObj.rubyBase, text); + } + } + + private void handleRubyContent(CT_RubyContent rbc, StringBuilder text) + { + for(int i= 0 ; i 0) + if (o is CT_PTab) + { + text.Append("\t"); + } + if (o is CT_Br) + { + text.Append("\n"); + } + if (o is CT_Empty) { + // Some inline text elements Get returned not as + // themselves, but as CTEmpty, owing to some odd + // definitions around line 5642 of the XSDs + // This bit works around it, and replicates the above + // rules for that case + if (itemType == RunItemsChoiceType.tab) { - text.Append("\n").Append(pictureText); + text.Append("\t"); + } + if (itemType == RunItemsChoiceType.br) + { + text.Append("\n"); + } + if (itemType == RunItemsChoiceType.cr) + { + text.Append("\n"); } - - return text.ToString(); + } + if (o is CT_FtnEdnRef ftn) + { + string footnoteRef = ftn.DomNode.LocalName.Equals("footnoteReference") ? + "[footnoteRef:" + ftn.id + "]" : "[endnoteRef:" + ftn.id + "]"; + text.Append(footnoteRef); } } - /** * Specifies that the contents of this run.shall be displayed with a single * horizontal line through the center of the line. @@ -1308,14 +1439,14 @@ using Cysharp.Text; } return pr; } - /** - * Returns the string version of the text, with tabs and - * carriage returns in place of their xml equivalents. - */ - public override String ToString() - { - return Text; - } + ///** + // * Returns the string version of the text, with tabs and + // * carriage returns in place of their xml equivalents. + // */ + //public override String ToString() + //{ + // return Text; + //} } } diff --git a/testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs b/testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs index b70fb0089..83e3b3250 100644 --- a/testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs +++ b/testcases/ooxml/XWPF/Extractor/TestXWPFWordExtractor.cs @@ -436,5 +436,18 @@ namespace TestCases.XWPF.Extractor extractor.Text); extractor.Close(); } + + [Test] + public void TestPhonetic() + { + XWPFDocument doc = XWPFTestDataSamples.OpenSampleDocument("61470.docx"); + XWPFWordExtractor extractor = new XWPFWordExtractor(doc); + //expect: baseText (phoneticText) + ClassicAssert.AreEqual("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)", extractor.Text.Trim()); + extractor.Close(); + extractor = new XWPFWordExtractor(doc); + extractor.SetConcatenatePhoneticRuns(false); + ClassicAssert.AreEqual("\u6771\u4EAC", extractor.Text.Trim()); + } } } diff --git a/testcases/test-data/document/61470.docx b/testcases/test-data/document/61470.docx new file mode 100644 index 000000000..6fc1afe45 --- /dev/null +++ b/testcases/test-data/document/61470.docx @@ -0,0 +1,2 @@ +とうきょう東京 +