WmlComparer.Public.Methods.Compare.cs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. using System.IO;
  4. using System.Linq;
  5. using System.Xml.Linq;
  6. using DocumentFormat.OpenXml.Packaging;
  7. using OpenXmlPowerTools.Previous;
  8. // It is possible to optimize DescendantContentAtoms
  9. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  10. // Currently, the unid is set at the beginning of the algorithm. It is used by the code that establishes correlation
  11. // based on first rejecting tracked revisions, then correlating paragraphs/tables. It is required for this algorithm
  12. // - after finding a correlated sequence in the document with rejected revisions, it uses the unid to find the same
  13. // paragraph in the document without rejected revisions, then sets the correlated sha1 hash in that document.
  14. //
  15. // But then when accepting tracked revisions, for certain paragraphs (where there are deleted paragraph marks) it is
  16. // going to lose the unids. But this isn't a problem because when paragraph marks are deleted, the correlation is
  17. // definitely no longer possible. Any paragraphs that are in a range of paragraphs that are coalesced can't be
  18. // correlated to paragraphs in the other document via their hash. At that point we no longer care what their unids
  19. // are.
  20. //
  21. // But after that it is only used to reconstruct the tree. It is also used in the debugging code that
  22. // prints the various correlated sequences and comparison units - this is display for debugging purposes only.
  23. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  24. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  25. // The key idea here is that a given paragraph will always have the same ancestors, and it doesn't matter whether the
  26. // content was deleted from the old document, inserted into the new document, or set as equal. At this point, we
  27. // identify a paragraph as a sequential list of content atoms, terminated by a paragraph mark. This entire list belongs
  28. // to a single paragraph, regardless of whether the paragraph is a child of the body, or if the paragraph is in a cell
  29. // in a table, or if the paragraph is in a text box. The list of ancestors, from the paragraph to the root of the XML
  30. // tree will be the same for all content atoms in the paragraph.
  31. //
  32. // Therefore:
  33. //
  34. // Iterate through the list of content atoms backwards. When the loop sees a paragraph mark, it gets the ancestor
  35. // unids from the paragraph mark to the top of the tree, and sets this as the same for all content atoms in the
  36. // paragraph. For descendants of the paragraph mark, it doesn't really matter if content is put into separate runs
  37. // or what not. We don't need to be concerned about what the unids are for descendants of the paragraph.
  38. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  39. namespace OpenXmlPowerTools
  40. {
  41. public static partial class WmlComparer
  42. {
  43. public static WmlDocument Compare(WmlDocument source1, WmlDocument source2, WmlComparerSettings settings)
  44. {
  45. return CompareInternal(source1, source2, settings, true);
  46. }
  47. private static WmlDocument CompareInternal(
  48. WmlDocument source1,
  49. WmlDocument source2,
  50. WmlComparerSettings settings,
  51. bool preProcessMarkupInOriginal)
  52. {
  53. if (preProcessMarkupInOriginal)
  54. {
  55. source1 = PreProcessMarkup(source1, settings.StartingIdForFootnotesEndnotes + 1000);
  56. }
  57. source2 = PreProcessMarkup(source2, settings.StartingIdForFootnotesEndnotes + 2000);
  58. SaveDocumentIfDesired(source1, "Source1-Step1-PreProcess.docx", settings);
  59. SaveDocumentIfDesired(source2, "Source2-Step1-PreProcess.docx", settings);
  60. // at this point, both source1 and source2 have unid on every element. These are the values that will
  61. // enable reassembly of the XML tree. But we need other values.
  62. // In source1:
  63. // - accept tracked revisions
  64. // - determine hash code for every block-level element
  65. // - save as attribute on every element
  66. // - accept tracked revisions and reject tracked revisions leave the unids alone, where possible.
  67. // - after accepting and calculating the hash, then can use the unids to find the right block-level
  68. // element in the unmodified source1, and install the hash
  69. // In source2:
  70. // - reject tracked revisions
  71. // - determine hash code for every block-level element
  72. // - save as an attribute on every element
  73. // - after rejecting and calculating the hash, then can use the unids to find the right block-level element
  74. // in the unmodified source2, and install the hash
  75. // - sometimes after accepting or rejecting tracked revisions, several paragraphs will get coalesced into a
  76. // single paragraph due to paragraph marks being inserted / deleted.
  77. // - in this case, some paragraphs will not get a hash injected onto them.
  78. // - if a paragraph doesn't have a hash, then it will never correspond to another paragraph, and such
  79. // issues will need to be resolved in the normal execution of the LCS algorithm.
  80. // - note that when we do propagate the unid through for the first paragraph.
  81. // Establish correlation between the two.
  82. // Find the longest common sequence of block-level elements where hash codes are the same.
  83. // this sometimes will be every block level element in the document. Or sometimes will be just a fair
  84. // number of them.
  85. // at the start of doing the LCS algorithm, we will match up content, and put them in corresponding unknown
  86. // correlated comparison units. Those paragraphs will only ever be matched to their corresponding paragraph.
  87. // then the algorithm can proceed as usual.
  88. // need to call ChangeFootnoteEndnoteReferencesToUniqueRange before creating the wmlResult document, so that
  89. // the same GUID ids are used for footnote and endnote references in both the 'after' document, and in the
  90. // result document.
  91. WmlDocument source1AfterAccepting = RevisionProcessor.AcceptRevisions(source1);
  92. WmlDocument source2AfterRejecting = RevisionProcessor.RejectRevisions(source2);
  93. SaveDocumentIfDesired(source1AfterAccepting, "Source1-Step2-AfterAccepting.docx", settings);
  94. SaveDocumentIfDesired(source2AfterRejecting, "Source2-Step2-AfterRejecting.docx", settings);
  95. // this creates the correlated hash codes that enable us to match up ranges of paragraphs based on
  96. // accepting in source1, rejecting in source2
  97. source1 = HashBlockLevelContent(source1, source1AfterAccepting, settings);
  98. source2 = HashBlockLevelContent(source2, source2AfterRejecting, settings);
  99. SaveDocumentIfDesired(source1, "Source1-Step3-AfterHashing.docx", settings);
  100. SaveDocumentIfDesired(source2, "Source2-Step3-AfterHashing.docx", settings);
  101. // Accept revisions in before, and after
  102. source1 = RevisionProcessor.AcceptRevisions(source1);
  103. source2 = RevisionProcessor.AcceptRevisions(source2);
  104. SaveDocumentIfDesired(source1, "Source1-Step4-AfterAccepting.docx", settings);
  105. SaveDocumentIfDesired(source2, "Source2-Step4-AfterAccepting.docx", settings);
  106. // after accepting revisions, some unids may have been removed by revision accepter, along with the
  107. // correlatedSHA1Hash codes, this is as it should be.
  108. // but need to go back in and add guids to paragraphs that have had them removed.
  109. using (var ms = new MemoryStream())
  110. {
  111. ms.Write(source2.DocumentByteArray, 0, source2.DocumentByteArray.Length);
  112. using (WordprocessingDocument wDoc = WordprocessingDocument.Open(ms, true))
  113. {
  114. AddUnidsToMarkupInContentParts(wDoc);
  115. }
  116. }
  117. var wmlResult = new WmlDocument(source1);
  118. using (var ms1 = new MemoryStream())
  119. using (var ms2 = new MemoryStream())
  120. {
  121. ms1.Write(source1.DocumentByteArray, 0, source1.DocumentByteArray.Length);
  122. ms2.Write(source2.DocumentByteArray, 0, source2.DocumentByteArray.Length);
  123. WmlDocument producedDocument;
  124. using (WordprocessingDocument wDoc1 = WordprocessingDocument.Open(ms1, true))
  125. using (WordprocessingDocument wDoc2 = WordprocessingDocument.Open(ms2, true))
  126. {
  127. producedDocument = ProduceDocumentWithTrackedRevisions(settings, wmlResult, wDoc1, wDoc2);
  128. }
  129. SaveDocumentsAfterProducingDocument(ms1, ms2, settings);
  130. SaveCleanedDocuments(source1, producedDocument, settings);
  131. return producedDocument;
  132. }
  133. }
  134. private static void SaveDocumentIfDesired(WmlDocument source, string name, WmlComparerSettings settings)
  135. {
  136. if (SaveIntermediateFilesForDebugging && settings.DebugTempFileDi != null)
  137. {
  138. var fileInfo = new FileInfo(Path.Combine(settings.DebugTempFileDi.FullName, name));
  139. source.SaveAs(fileInfo.FullName);
  140. }
  141. }
  142. private static void SaveDocumentsAfterProducingDocument(MemoryStream ms1, MemoryStream ms2, WmlComparerSettings settings)
  143. {
  144. if (SaveIntermediateFilesForDebugging && settings.DebugTempFileDi != null)
  145. {
  146. SaveDocumentIfDesired(new WmlDocument("after1.docx", ms1), "Source1-Step5-AfterProducingDocument.docx", settings);
  147. SaveDocumentIfDesired(new WmlDocument("after2.docx", ms2), "Source2-Step5-AfterProducingDocument.docx", settings);
  148. }
  149. }
  150. private static void SaveCleanedDocuments(WmlDocument source1, WmlDocument producedDocument, WmlComparerSettings settings)
  151. {
  152. if (SaveIntermediateFilesForDebugging && settings.DebugTempFileDi != null)
  153. {
  154. WmlDocument cleanedSource = CleanPowerToolsAndRsid(source1);
  155. SaveDocumentIfDesired(cleanedSource, "Cleaned-Source.docx", settings);
  156. WmlDocument cleanedProduced = CleanPowerToolsAndRsid(producedDocument);
  157. SaveDocumentIfDesired(cleanedProduced, "Cleaned-Produced.docx", settings);
  158. }
  159. }
  160. private static WmlDocument CleanPowerToolsAndRsid(WmlDocument producedDocument)
  161. {
  162. using (var ms = new MemoryStream())
  163. {
  164. ms.Write(producedDocument.DocumentByteArray, 0, producedDocument.DocumentByteArray.Length);
  165. using (WordprocessingDocument wDoc = WordprocessingDocument.Open(ms, true))
  166. {
  167. foreach (OpenXmlPart cp in wDoc.ContentParts())
  168. {
  169. XDocument xd = cp.GetXDocument();
  170. object newRoot = CleanPartTransform(xd.Root);
  171. xd.Root?.ReplaceWith(newRoot);
  172. cp.PutXDocument();
  173. }
  174. }
  175. var cleaned = new WmlDocument("cleaned.docx", ms.ToArray());
  176. return cleaned;
  177. }
  178. }
  179. private static object CleanPartTransform(XNode node)
  180. {
  181. if (node is XElement element)
  182. {
  183. return new XElement(element.Name,
  184. element.Attributes().Where(a => a.Name.Namespace != PtOpenXml.pt &&
  185. !a.Name.LocalName.ToLower().Contains("rsid")),
  186. element.Nodes().Select(CleanPartTransform));
  187. }
  188. return node;
  189. }
  190. }
  191. }