// WmlComparer.Private.Methods.PreProcessMarkup.cs
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. using System;
  4. using System.Collections.Generic;
  5. using System.Diagnostics.CodeAnalysis;
  6. using System.IO;
  7. using System.Linq;
  8. using System.Xml.Linq;
  9. using DocumentFormat.OpenXml;
  10. using DocumentFormat.OpenXml.Packaging;
  11. namespace OpenXmlPowerTools
  12. {
  13. public static partial class WmlComparer
  14. {
  /// <summary>
  /// Produces a normalized copy of <paramref name="source"/> in two passes over an in-memory
  /// copy of the package: first the package is opened and closed with markup-compatibility
  /// processing enabled, then the result is validated, simplified, and annotated.
  /// </summary>
  /// <param name="source">The document to pre-process; its byte array is copied, not modified.</param>
  /// <param name="startingIdForFootnotesEndnotes">
  /// First id handed out when footnote and endnote references are renumbered.
  /// </param>
  /// <returns>A new <see cref="WmlDocument"/> holding the pre-processed package bytes.</returns>
  private static WmlDocument PreProcessMarkup(WmlDocument source, int startingIdForFootnotesEndnotes)
  {
      // Both passes open the package with identical markup-compatibility settings.
      OpenSettings CreateOpenSettings()
      {
          var mcSettings = new MarkupCompatibilityProcessSettings(
              MarkupCompatibilityProcessMode.ProcessAllParts,
              FileFormatVersions.Office2007);

          return new OpenSettings { MarkupCompatibilityProcessSettings = mcSettings };
      }

      // Pass 1: open and close the package so that MC content is processed away.
      WmlDocument mcProcessed;
      using (var firstPass = new MemoryStream())
      {
          byte[] originalBytes = source.DocumentByteArray;
          firstPass.Write(originalBytes, 0, originalBytes.Length);

          using (WordprocessingDocument wDoc = WordprocessingDocument.Open(firstPass, true, CreateOpenSettings()))
          {
              MainDocumentPart mainPart = wDoc.MainDocumentPart;

              // Contrary to what the API suggests, the root element of each part has to be
              // accessed to make the SDK process that part's MC markup; the values themselves
              // are not needed.
              _ = mainPart.RootElement;

              if (mainPart.FootnotesPart != null)
              {
                  _ = mainPart.FootnotesPart.RootElement;
              }

              if (mainPart.EndnotesPart != null)
              {
                  _ = mainPart.EndnotesPart.RootElement;
              }
          }

          mcProcessed = new WmlDocument(source.FileName, firstPass.ToArray());
      }

      // Pass 2: validate, simplify, and annotate the MC-free package.
      using (var secondPass = new MemoryStream())
      {
          byte[] mcFreeBytes = mcProcessed.DocumentByteArray;
          secondPass.Write(mcFreeBytes, 0, mcFreeBytes.Length);

          using (WordprocessingDocument wDoc = WordprocessingDocument.Open(secondPass, true, CreateOpenSettings()))
          {
              TestForInvalidContent(wDoc);
              RemoveExistingPowerToolsMarkup(wDoc);

              // Removing content controls, field codes, and bookmarks is a no-no for many use
              // cases: content controls are needed, e.g., on the title page, and field codes are
              // required for automatic cross-references, which in turn require bookmarks.
              // TODO: Revisit
              var simplifySettings = new SimplifyMarkupSettings
              {
                  RemoveBookmarks = true,
                  AcceptRevisions = false,
                  RemoveComments = true,
                  RemoveContentControls = true,
                  RemoveFieldCodes = true,
                  RemoveGoBackBookmark = true,
                  RemoveLastRenderedPageBreak = true,
                  RemovePermissions = true,
                  RemoveProof = true,
                  RemoveSmartTags = true,
                  RemoveSoftHyphens = true,
                  RemoveHyperlinks = true
              };
              MarkupSimplifier.SimplifyMarkup(wDoc, simplifySettings);

              ChangeFootnoteEndnoteReferencesToUniqueRange(wDoc, startingIdForFootnotesEndnotes);
              AddUnidsToMarkupInContentParts(wDoc);
              AddFootnotesEndnotesParts(wDoc);
              FillInEmptyFootnotesEndnotes(wDoc);
          }

          // The stream is read only after the document has been disposed.
          return new WmlDocument(mcProcessed.FileName, secondPass.ToArray());
      }
  }
  /// <summary>
  /// Rejects documents whose content parts contain <c>w:altChunk</c>, <c>w:subDoc</c>, or
  /// <c>w:contentPart</c> elements.
  /// </summary>
  /// <param name="wDoc">The open document whose content parts are inspected.</param>
  /// <exception cref="OpenXmlPowerToolsException">
  /// Thrown for the first unsupported element kind found, checked per part in the order
  /// altChunk, subDoc, contentPart.
  /// </exception>
  private static void TestForInvalidContent(WordprocessingDocument wDoc)
  {
      // Throws with the given message when the part contains at least one such element.
      void ThrowIfPresent(XDocument xml, XName unsupportedElement, string message)
      {
          if (xml.Descendants(unsupportedElement).Any())
          {
              throw new OpenXmlPowerToolsException(message);
          }
      }

      foreach (OpenXmlPart contentPart in wDoc.ContentParts())
      {
          XDocument partXml = contentPart.GetXDocument();

          ThrowIfPresent(partXml, W.altChunk, "Unsupported document, contains w:altChunk");
          ThrowIfPresent(partXml, W.subDoc, "Unsupported document, contains w:subDoc");
          ThrowIfPresent(partXml, W.contentPart, "Unsupported document, contains w:contentPart");
      }
  }
  /// <summary>
  /// Removes every attribute in the PowerTools namespace, except <c>PtOpenXml.Unid</c>, from
  /// the descendants of the root element of the main document part and, when present, the
  /// footnotes and endnotes parts.
  /// </summary>
  /// <param name="wDoc">The open document to clean; each touched part is written back.</param>
  private static void RemoveExistingPowerToolsMarkup(WordprocessingDocument wDoc)
  {
      // Strips the attributes from one part and writes the part back. The root element's own
      // attributes are not visited because only its descendants are enumerated.
      void StripPowerToolsAttributes(OpenXmlPart part)
      {
          XElement root = part.GetXDocument().Root;
          if (root != null)
          {
              IEnumerable<XAttribute> staleAttributes = root
                  .Descendants()
                  .Attributes()
                  .Where(a => a.Name.Namespace == PtOpenXml.pt && a.Name != PtOpenXml.Unid);

              staleAttributes.Remove();
          }

          // Written back even when the document has no root.
          part.PutXDocument();
      }

      MainDocumentPart mainPart = wDoc.MainDocumentPart;
      StripPowerToolsAttributes(mainPart);

      FootnotesPart footnotes = mainPart.FootnotesPart;
      if (footnotes != null)
      {
          StripPowerToolsAttributes(footnotes);
      }

      EndnotesPart endnotes = mainPart.EndnotesPart;
      if (endnotes != null)
      {
          StripPowerToolsAttributes(endnotes);
      }
  }
  /// <summary>
  /// Renumbers every footnote and endnote reference in the main document part, in document
  /// order, with consecutive ids starting at <paramref name="startingIdForFootnotesEndnotes"/>,
  /// and gives the note each reference points to the same new id.
  /// </summary>
  /// <remarks>
  /// Notes are matched by the id they had before this method started. Matching on the current
  /// id would be wrong when the new id range overlaps ids already present in the document:
  /// a later reference could then pick up a note that had already been renumbered.
  /// </remarks>
  /// <param name="wDoc">The open document; the main, footnotes, and endnotes parts are written back.</param>
  /// <param name="startingIdForFootnotesEndnotes">First id to assign; footnotes and endnotes share the counter.</param>
  /// <exception cref="OpenXmlPowerToolsException">
  /// Thrown when the main document part has no root element, or when a reference has no
  /// matching note that is still unclaimed.
  /// </exception>
  private static void ChangeFootnoteEndnoteReferencesToUniqueRange(
      WordprocessingDocument wDoc,
      int startingIdForFootnotesEndnotes)
  {
      // Groups the notes under a root by their original w:id, keeping document order within
      // each group. A missing part yields an empty lookup, so every reference into it fails.
      ILookup<string, XElement> IndexByOriginalId(XElement notesRoot)
      {
          IEnumerable<XElement> notes = notesRoot?.Elements() ?? Enumerable.Empty<XElement>();
          return notes.ToLookup(e => (string) e.Attribute(W.id));
      }

      MainDocumentPart mainDocPart = wDoc.MainDocumentPart;
      FootnotesPart footnotesPart = mainDocPart.FootnotesPart;
      EndnotesPart endnotesPart = mainDocPart.EndnotesPart;

      XElement document =
          mainDocPart.GetXDocument().Root ?? throw new OpenXmlPowerToolsException("Invalid document.");

      // Snapshot the original ids before any attribute is rewritten.
      ILookup<string, XElement> footnotesByOriginalId = IndexByOriginalId(footnotesPart?.GetXDocument().Root);
      ILookup<string, XElement> endnotesByOriginalId = IndexByOriginalId(endnotesPart?.GetXDocument().Root);

      // Notes that already received a new id; each note may be claimed by one reference only.
      var renumberedNotes = new HashSet<XElement>();

      IEnumerable<XElement> references = document
          .Descendants()
          .Where(d => d.Name == W.footnoteReference || d.Name == W.endnoteReference);

      foreach (XElement reference in references)
      {
          var oldId = (string) reference.Attribute(W.id);

          // Ids end up in XML, so format them independently of the current culture.
          string newId = startingIdForFootnotesEndnotes.ToString(
              System.Globalization.CultureInfo.InvariantCulture);
          startingIdForFootnotesEndnotes++;

          reference.SetAttributeValue(W.id, newId);

          ILookup<string, XElement> candidates = reference.Name == W.footnoteReference
              ? footnotesByOriginalId
              : endnotesByOriginalId;

          // First note, in document order, that originally carried this id and is still unclaimed.
          XElement note = candidates[oldId].FirstOrDefault(e => !renumberedNotes.Contains(e));
          if (note == null)
          {
              throw new OpenXmlPowerToolsException("Invalid document");
          }

          note.SetAttributeValue(W.id, newId);
          renumberedNotes.Add(note);
      }

      mainDocPart.PutXDocument();
      footnotesPart?.PutXDocument();
      endnotesPart?.PutXDocument();
  }
  /// <summary>
  /// Runs <c>AssignUnidToAllElements</c> and <c>IgnorePt14Namespace</c> on the root element of
  /// the main document part and, when present, the footnotes and endnotes parts, writing each
  /// part back afterwards.
  /// </summary>
  /// <param name="wDoc">The open document whose parts are annotated.</param>
  private static void AddUnidsToMarkupInContentParts(WordprocessingDocument wDoc)
  {
      // Annotates a single part and persists the change.
      void AnnotatePart(OpenXmlPart part)
      {
          XElement root = part.GetXDocument().Root;
          AssignUnidToAllElements(root);
          IgnorePt14Namespace(root);
          part.PutXDocument();
      }

      MainDocumentPart mainPart = wDoc.MainDocumentPart;
      AnnotatePart(mainPart);

      FootnotesPart footnotes = mainPart.FootnotesPart;
      if (footnotes != null)
      {
          AnnotatePart(footnotes);
      }

      EndnotesPart endnotes = mainPart.EndnotesPart;
      if (endnotes != null)
      {
          AnnotatePart(endnotes);
      }
  }
  /// <summary>
  /// Adds a <c>PtOpenXml.Unid</c> attribute holding a fresh 32-digit hexadecimal GUID to every
  /// descendant of <paramref name="contentParent"/> that does not have one yet.
  /// </summary>
  /// <param name="contentParent">
  /// Element whose descendants are tagged; the element itself is not touched.
  /// </param>
  private static void AssignUnidToAllElements(XElement contentParent)
  {
      IEnumerable<XElement> untagged = contentParent
          .Descendants()
          .Where(e => e.Attribute(PtOpenXml.Unid) == null);

      foreach (XElement element in untagged)
      {
          // The "N" format is the GUID's 32 hex digits without hyphens.
          element.Add(new XAttribute(PtOpenXml.Unid, Guid.NewGuid().ToString("N")));
      }
  }
  /// <summary>
  /// Ensures the main document part has both a footnotes part and an endnotes part, creating
  /// each missing one with an empty <c>w:footnotes</c> / <c>w:endnotes</c> root element that
  /// carries <c>NamespaceAttributes</c>.
  /// </summary>
  /// <param name="wDoc">The open document to which missing parts are added.</param>
  [SuppressMessage("ReSharper", "CoVariantArrayConversion")]
  private static void AddFootnotesEndnotesParts(WordprocessingDocument wDoc)
  {
      // Sets the XML declaration of a freshly added part, gives it a root element, and saves it.
      void InitializeNewPart(OpenXmlPart newPart, XName rootName)
      {
          XDocument xml = newPart.GetXDocument();
          xml.Declaration.Standalone = "yes";
          xml.Declaration.Encoding = "UTF-8";
          xml.Add(new XElement(rootName, NamespaceAttributes));
          newPart.PutXDocument();
      }

      MainDocumentPart mainPart = wDoc.MainDocumentPart;

      if (mainPart.FootnotesPart == null)
      {
          mainPart.AddNewPart<FootnotesPart>();
          InitializeNewPart(mainPart.FootnotesPart, W.footnotes);
      }

      if (mainPart.EndnotesPart == null)
      {
          mainPart.AddNewPart<EndnotesPart>();
          InitializeNewPart(mainPart.EndnotesPart, W.endnotes);
      }
  }
  /// <summary>
  /// Gives every <c>w:footnote</c> / <c>w:endnote</c> element that has no child elements a
  /// single placeholder paragraph containing the corresponding reference mark.
  /// </summary>
  /// <param name="wDoc">
  /// The open document; the footnotes and endnotes parts, when present, are written back.
  /// </param>
  /// <exception cref="ArgumentException">Thrown when an existing notes part has no root element.</exception>
  private static void FillInEmptyFootnotesEndnotes(WordprocessingDocument wDoc)
  {
      // Whitespace between the elements below is layout only; XElement.Parse does not
      // preserve insignificant whitespace by default.
      XElement footnotePlaceholder = XElement.Parse(
          @"<w:p xmlns:w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'>
              <w:pPr>
                <w:pStyle w:val='FootnoteText'/>
              </w:pPr>
              <w:r>
                <w:rPr>
                  <w:rStyle w:val='FootnoteReference'/>
                </w:rPr>
                <w:footnoteRef/>
              </w:r>
            </w:p>");

      XElement endnotePlaceholder = XElement.Parse(
          @"<w:p xmlns:w='http://schemas.openxmlformats.org/wordprocessingml/2006/main'>
              <w:pPr>
                <w:pStyle w:val='EndnoteText'/>
              </w:pPr>
              <w:r>
                <w:rPr>
                  <w:rStyle w:val='EndnoteReference'/>
                </w:rPr>
                <w:endnoteRef/>
              </w:r>
            </w:p>");

      // Adds the placeholder to each childless note of the given kind, then saves the part.
      void FillEmptyNotes(OpenXmlPart notesPart, XName noteName, XElement placeholder)
      {
          XElement notesRoot = notesPart.GetXDocument().Root ?? throw new ArgumentException();

          foreach (XElement emptyNote in notesRoot.Elements(noteName).Where(n => !n.HasElements))
          {
              emptyNote.Add(placeholder);
          }

          notesPart.PutXDocument();
      }

      FootnotesPart footnotesPart = wDoc.MainDocumentPart.FootnotesPart;
      if (footnotesPart != null)
      {
          FillEmptyNotes(footnotesPart, W.footnote, footnotePlaceholder);
      }

      EndnotesPart endnotesPart = wDoc.MainDocumentPart.EndnotesPart;
      if (endnotesPart != null)
      {
          FillEmptyNotes(endnotesPart, W.endnote, endnotePlaceholder);
      }
  }
  289. }
  290. }