WmlComparer.Public.Methods.Compare.cs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. using System.IO;
  4. using System.Linq;
  5. using System.Xml.Linq;
  6. using DocumentFormat.OpenXml.Packaging;
  7. // It is possible to optimize DescendantContentAtoms
  8. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  9. // Currently, the unid is set at the beginning of the algorithm. It is used by the code that establishes correlation
  10. // based on first rejecting tracked revisions, then correlating paragraphs/tables. It is required for this algorithm
  11. // - after finding a correlated sequence in the document with rejected revisions, it uses the unid to find the same
  12. // paragraph in the document without rejected revisions, then sets the correlated sha1 hash in that document.
  13. //
  14. // But then when accepting tracked revisions, for certain paragraphs (where there are deleted paragraph marks) it is
  15. // going to lose the unids. But this isn't a problem because when paragraph marks are deleted, the correlation is
  16. // definitely no longer possible. Any paragraphs that are in a range of paragraphs that are coalesced can't be
  17. // correlated to paragraphs in the other document via their hash. At that point we no longer care what their unids
  18. // are.
  19. //
  20. // But after that it is only used to reconstruct the tree. It is also used in the debugging code that
  21. // prints the various correlated sequences and comparison units - this is display for debugging purposes only.
  22. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  23. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  24. // The key idea here is that a given paragraph will always have the same ancestors, and it doesn't matter whether the
  25. // content was deleted from the old document, inserted into the new document, or set as equal. At this point, we
  26. // identify a paragraph as a sequential list of content atoms, terminated by a paragraph mark. This entire list will be
  27. // for a single paragraph, regardless of whether the paragraph is a child of the body, or if the paragraph is in a cell
  28. // in a table, or if the paragraph is in a text box. The list of ancestors, from the paragraph to the root of the XML
  29. // tree will be the same for all content atoms in the paragraph.
  30. //
  31. // Therefore:
  32. //
  33. // Iterate through the list of content atoms backwards. When the loop sees a paragraph mark, it gets the ancestor
  34. // unids from the paragraph mark to the top of the tree, and sets this as the same for all content atoms in the
  35. // paragraph. For descendants of the paragraph mark, it doesn't really matter if content is put into separate runs
  36. // or what not. We don't need to be concerned about what the unids are for descendants of the paragraph.
  37. ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  38. namespace OpenXmlPowerTools
  39. {
  40. public static partial class WmlComparer
  41. {
  /// <summary>
  /// Compares <paramref name="source1"/> with <paramref name="source2"/> by delegating to
  /// <c>CompareInternal</c>, asking it to pre-process the markup of the original document
  /// (<paramref name="source1"/>) in addition to that of <paramref name="source2"/>.
  /// </summary>
  /// <param name="source1">The first (original) document.</param>
  /// <param name="source2">The second document.</param>
  /// <param name="settings">Comparer settings; read for the footnote/endnote starting id and the debug directory.</param>
  /// <returns>The document returned by <c>CompareInternal</c>.</returns>
  /// <exception cref="System.ArgumentNullException">Any argument is <c>null</c>.</exception>
  public static WmlDocument Compare(WmlDocument source1, WmlDocument source2, WmlComparerSettings settings)
  {
      // Fail fast with a descriptive exception instead of a NullReferenceException from deep
      // inside the pipeline (settings is dereferenced on the very first step).
      if (source1 == null)
      {
          throw new System.ArgumentNullException(nameof(source1));
      }

      if (source2 == null)
      {
          throw new System.ArgumentNullException(nameof(source2));
      }

      if (settings == null)
      {
          throw new System.ArgumentNullException(nameof(settings));
      }

      return CompareInternal(source1, source2, settings, true);
  }
  /// <summary>
  /// Runs the comparison pipeline: pre-processes the markup, computes correlated hashes for
  /// block-level content (accepting revisions in source1, rejecting them in source2), accepts
  /// revisions in both documents, and finally calls <c>ProduceDocumentWithTrackedRevisions</c>.
  /// Intermediate documents are handed to the <c>Save…</c> helpers after each step; those helpers
  /// write them out only when debugging output is enabled.
  /// </summary>
  /// <param name="source1">The first (original) document.</param>
  /// <param name="source2">The second document.</param>
  /// <param name="settings">
  /// Comparer settings. <c>StartingIdForFootnotesEndnotes</c> is read here; the settings object is
  /// also passed through to the hashing, producing and saving steps.
  /// </param>
  /// <param name="preProcessMarkupInOriginal">
  /// When <c>true</c>, <paramref name="source1"/> is run through <c>PreProcessMarkup</c>;
  /// <paramref name="source2"/> always is.
  /// </param>
  /// <returns>The document returned by <c>ProduceDocumentWithTrackedRevisions</c>.</returns>
  private static WmlDocument CompareInternal(
      WmlDocument source1,
      WmlDocument source2,
      WmlComparerSettings settings,
      bool preProcessMarkupInOriginal)
  {
      // NOTE(review): the +1000 / +2000 offsets presumably keep the footnote/endnote id ranges of
      // the two documents apart - confirm against PreProcessMarkup.
      if (preProcessMarkupInOriginal)
      {
          source1 = PreProcessMarkup(source1, settings.StartingIdForFootnotesEndnotes + 1000);
      }

      source2 = PreProcessMarkup(source2, settings.StartingIdForFootnotesEndnotes + 2000);

      SaveDocumentIfDesired(source1, "Source1-Step1-PreProcess.docx", settings);
      SaveDocumentIfDesired(source2, "Source2-Step1-PreProcess.docx", settings);

      // At this point, both source1 and source2 have a unid on every element. These are the values
      // that will enable reassembly of the XML tree. But we need other values.
      //
      // In source1:
      // - accept tracked revisions
      // - determine hash code for every block-level element
      // - save as attribute on every element
      // - accept tracked revisions and reject tracked revisions leave the unids alone, where possible.
      // - after accepting and calculating the hash, then can use the unids to find the right
      //   block-level element in the unmodified source1, and install the hash
      //
      // In source2:
      // - reject tracked revisions
      // - determine hash code for every block-level element
      // - save as an attribute on every element
      // - after rejecting and calculating the hash, then can use the unids to find the right
      //   block-level element in the unmodified source2, and install the hash
      //
      // - sometimes after accepting or rejecting tracked revisions, several paragraphs will get
      //   coalesced into a single paragraph due to paragraph marks being inserted / deleted.
      // - in this case, some paragraphs will not get a hash injected onto them.
      // - if a paragraph doesn't have a hash, then it will never correspond to another paragraph,
      //   and such issues will need to be resolved in the normal execution of the LCS algorithm.
      // - note that we do propagate the unid through for the first paragraph.
      //
      // Establish correlation between the two.
      // Find the longest common sequence of block-level elements where hash codes are the same.
      // This sometimes will be every block-level element in the document. Or sometimes will be
      // just a fair number of them.
      //
      // At the start of doing the LCS algorithm, we will match up content, and put them in
      // corresponding unknown correlated comparison units. Those paragraphs will only ever be
      // matched to their corresponding paragraph. Then the algorithm can proceed as usual.
      //
      // Need to call ChangeFootnoteEndnoteReferencesToUniqueRange before creating the wmlResult
      // document, so that the same GUID ids are used for footnote and endnote references in both
      // the 'after' document, and in the result document.

      WmlDocument source1AfterAccepting = RevisionProcessor.AcceptRevisions(source1);
      WmlDocument source2AfterRejecting = RevisionProcessor.RejectRevisions(source2);

      SaveDocumentIfDesired(source1AfterAccepting, "Source1-Step2-AfterAccepting.docx", settings);
      SaveDocumentIfDesired(source2AfterRejecting, "Source2-Step2-AfterRejecting.docx", settings);

      // This creates the correlated hash codes that enable us to match up ranges of paragraphs
      // based on accepting in source1, rejecting in source2.
      source1 = HashBlockLevelContent(source1, source1AfterAccepting, settings);
      source2 = HashBlockLevelContent(source2, source2AfterRejecting, settings);

      SaveDocumentIfDesired(source1, "Source1-Step3-AfterHashing.docx", settings);
      SaveDocumentIfDesired(source2, "Source2-Step3-AfterHashing.docx", settings);

      // Accept revisions in before, and after.
      source1 = RevisionProcessor.AcceptRevisions(source1);
      source2 = RevisionProcessor.AcceptRevisions(source2);

      SaveDocumentIfDesired(source1, "Source1-Step4-AfterAccepting.docx", settings);
      SaveDocumentIfDesired(source2, "Source2-Step4-AfterAccepting.docx", settings);

      // After accepting revisions, some unids may have been removed by the revision accepter,
      // along with the correlatedSHA1Hash codes; this is as it should be. But we need to go back
      // in and add guids to paragraphs that have had them removed.
      using (var ms = new MemoryStream())
      {
          ms.Write(source2.DocumentByteArray, 0, source2.DocumentByteArray.Length);

          using (WordprocessingDocument wDoc = WordprocessingDocument.Open(ms, true))
          {
              AddUnidsToMarkupInContentParts(wDoc);
          }

          // Keep the result: previously the updated package was thrown away together with the
          // stream, so the unids added above never reached source2. The package is disposed
          // before the stream is read so that its changes have been written to the stream.
          source2 = new WmlDocument(source2.FileName, ms.ToArray());
      }

      var wmlResult = new WmlDocument(source1);

      using (var ms1 = new MemoryStream())
      using (var ms2 = new MemoryStream())
      {
          ms1.Write(source1.DocumentByteArray, 0, source1.DocumentByteArray.Length);
          ms2.Write(source2.DocumentByteArray, 0, source2.DocumentByteArray.Length);

          WmlDocument producedDocument;

          using (WordprocessingDocument wDoc1 = WordprocessingDocument.Open(ms1, true))
          using (WordprocessingDocument wDoc2 = WordprocessingDocument.Open(ms2, true))
          {
              producedDocument = ProduceDocumentWithTrackedRevisions(settings, wmlResult, wDoc1, wDoc2);
          }

          // Both packages are closed here while ms1/ms2 are still open, so the debug helpers
          // can read the streams.
          SaveDocumentsAfterProducingDocument(ms1, ms2, settings);
          SaveCleanedDocuments(source1, producedDocument, settings);

          return producedDocument;
      }
  }
  /// <summary>
  /// Saves <paramref name="source"/> under <paramref name="name"/> in the debug directory, but only
  /// when <c>SaveIntermediateFilesForDebugging</c> is set and <c>settings.DebugTempFileDi</c> is not null.
  /// </summary>
  /// <param name="source">The document to save.</param>
  /// <param name="name">File name (relative to the debug directory) to save the document as.</param>
  /// <param name="settings">Comparer settings supplying <c>DebugTempFileDi</c>.</param>
  private static void SaveDocumentIfDesired(WmlDocument source, string name, WmlComparerSettings settings)
  {
      // Nothing to do unless debugging output is switched on and a target directory is configured.
      if (!SaveIntermediateFilesForDebugging || settings.DebugTempFileDi == null)
      {
          return;
      }

      string debugDirectory = settings.DebugTempFileDi.FullName;
      var target = new FileInfo(Path.Combine(debugDirectory, name));

      source.SaveAs(target.FullName);
  }
  /// <summary>
  /// When debugging output is enabled, wraps the two working streams in <c>WmlDocument</c>
  /// instances and passes them to <c>SaveDocumentIfDesired</c> as the "Step5" files.
  /// </summary>
  /// <param name="ms1">Stream holding the first document after the result was produced.</param>
  /// <param name="ms2">Stream holding the second document after the result was produced.</param>
  /// <param name="settings">Comparer settings supplying <c>DebugTempFileDi</c>.</param>
  private static void SaveDocumentsAfterProducingDocument(MemoryStream ms1, MemoryStream ms2, WmlComparerSettings settings)
  {
      // Checked up front so that no WmlDocument is built from the streams when nothing is saved.
      if (!SaveIntermediateFilesForDebugging || settings.DebugTempFileDi == null)
      {
          return;
      }

      var afterFirst = new WmlDocument("after1.docx", ms1);
      SaveDocumentIfDesired(afterFirst, "Source1-Step5-AfterProducingDocument.docx", settings);

      var afterSecond = new WmlDocument("after2.docx", ms2);
      SaveDocumentIfDesired(afterSecond, "Source2-Step5-AfterProducingDocument.docx", settings);
  }
  /// <summary>
  /// When debugging output is enabled, runs both the source and the produced document through
  /// <c>CleanPowerToolsAndRsid</c> and saves the results as "Cleaned-Source.docx" and
  /// "Cleaned-Produced.docx".
  /// </summary>
  /// <param name="source1">The source document to clean and save.</param>
  /// <param name="producedDocument">The produced document to clean and save.</param>
  /// <param name="settings">Comparer settings supplying <c>DebugTempFileDi</c>.</param>
  private static void SaveCleanedDocuments(WmlDocument source1, WmlDocument producedDocument, WmlComparerSettings settings)
  {
      // Skip the cleaning work entirely when nothing would be written.
      if (!SaveIntermediateFilesForDebugging || settings.DebugTempFileDi == null)
      {
          return;
      }

      SaveDocumentIfDesired(CleanPowerToolsAndRsid(source1), "Cleaned-Source.docx", settings);
      SaveDocumentIfDesired(CleanPowerToolsAndRsid(producedDocument), "Cleaned-Produced.docx", settings);
  }
  /// <summary>
  /// Returns a copy of <paramref name="producedDocument"/>, named "cleaned.docx", in which the
  /// root of every content part has been rebuilt by <c>CleanPartTransform</c>.
  /// </summary>
  /// <param name="producedDocument">The document to copy and clean; its byte array is only read.</param>
  /// <returns>A new <c>WmlDocument</c> built from the cleaned package bytes.</returns>
  private static WmlDocument CleanPowerToolsAndRsid(WmlDocument producedDocument)
  {
      byte[] originalBytes = producedDocument.DocumentByteArray;

      using (var buffer = new MemoryStream())
      {
          // Work on a copy in an expandable stream so the package can be opened for editing.
          buffer.Write(originalBytes, 0, originalBytes.Length);

          using (WordprocessingDocument package = WordprocessingDocument.Open(buffer, true))
          {
              foreach (OpenXmlPart part in package.ContentParts())
              {
                  XDocument partXml = part.GetXDocument();
                  XElement root = partXml.Root;

                  object cleanedRoot = CleanPartTransform(root);
                  root?.ReplaceWith(cleanedRoot);

                  part.PutXDocument();
              }
          }

          // Read the bytes only after the package has been disposed.
          return new WmlDocument("cleaned.docx", buffer.ToArray());
      }
  }
  /// <summary>
  /// Recursively rebuilds an XML tree, dropping every attribute that is in the <c>PtOpenXml.pt</c>
  /// namespace or whose local name contains "rsid" (compared case-insensitively).
  /// </summary>
  /// <param name="node">The node to transform.</param>
  /// <returns>
  /// A new <see cref="XElement"/> with the filtered attributes and transformed child nodes when
  /// <paramref name="node"/> is an element; otherwise <paramref name="node"/> itself.
  /// </returns>
  private static object CleanPartTransform(XNode node)
  {
      if (node is XElement element)
      {
          return new XElement(
              element.Name,
              element.Attributes().Where(a =>
                  a.Name.Namespace != PtOpenXml.pt &&
                  // Ordinal, case-insensitive match: ToLower() depends on the current culture
                  // (e.g. Turkish maps 'I' to a dotless 'ı') and allocates a string per attribute.
                  a.Name.LocalName.IndexOf("rsid", System.StringComparison.OrdinalIgnoreCase) < 0),
              element.Nodes().Select(CleanPartTransform));
      }

      // Non-element nodes are passed through unchanged.
      return node;
  }
  189. }
  190. }