WmlComparer.Private.Methods.Hashing.cs 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. using System;
  4. using System.Collections.Generic;
  5. using System.IO;
  6. using System.Linq;
  7. using System.Xml.Linq;
  8. using DocumentFormat.OpenXml.Packaging;
  9. using OpenXmlPowerTools.Previous;
  10. namespace OpenXmlPowerTools
  11. {
  12. public static partial class WmlComparer
  13. {
  /// <summary>
  /// Returns a copy of <paramref name="source"/> in which block-level elements are stamped with the
  /// SHA-1 hash of their correlated element in <paramref name="sourceAfterProc"/>.
  /// </summary>
  /// <remarks>
  /// Block-level elements are w:p, w:tbl and w:tr. Elements of the two documents are correlated through
  /// their <c>PtOpenXml.Unid</c> attribute. For every block-level element of the processed document
  /// that has a Unid also present in the source document, the hash of its normalized clone (see
  /// <c>CloneBlockLevelContentForHashing</c>) is added to the source element as a
  /// <c>PtOpenXml.CorrelatedSHA1Hash</c> attribute.
  /// </remarks>
  /// <param name="source">Document that receives the correlated hash attributes.</param>
  /// <param name="sourceAfterProc">Processed document whose block-level content is hashed.</param>
  /// <param name="settings">Comparer settings forwarded to the cloning step.</param>
  /// <returns>A new <c>WmlDocument</c> with the file name of <paramref name="source"/>.</returns>
  /// <exception cref="ArgumentException">
  /// A main document part has no root element. Also raised by <c>ToDictionary</c> when two
  /// block-level elements of <paramref name="source"/> share a Unid (a missing Unid raises
  /// <see cref="ArgumentNullException"/>).
  /// </exception>
  private static WmlDocument HashBlockLevelContent(
      WmlDocument source,
      WmlDocument sourceAfterProc,
      WmlComparerSettings settings)
  {
      // Single definition of "block-level" shared by both documents.
      bool IsBlockLevel(XElement e) => e.Name == W.p || e.Name == W.tbl || e.Name == W.tr;

      using (var msSource = new MemoryStream())
      using (var msAfterProc = new MemoryStream())
      {
          msSource.Write(source.DocumentByteArray, 0, source.DocumentByteArray.Length);
          msAfterProc.Write(sourceAfterProc.DocumentByteArray, 0, sourceAfterProc.DocumentByteArray.Length);

          using (WordprocessingDocument wDocSource = WordprocessingDocument.Open(msSource, true))
          using (WordprocessingDocument wDocAfterProc = WordprocessingDocument.Open(msAfterProc, true))
          {
              // Index the block-level elements of the source document by Unid.
              XDocument sourceMainXDoc = wDocSource.MainDocumentPart.GetXDocument();
              XElement sourceMainRoot = sourceMainXDoc.Root
                  ?? throw new ArgumentException("The main document part has no root element.", nameof(source));

              Dictionary<string, XElement> sourceUnidDict = sourceMainRoot
                  .Descendants()
                  .Where(IsBlockLevel)
                  .ToDictionary(d => (string) d.Attribute(PtOpenXml.Unid));

              XDocument afterProcMainXDoc = wDocAfterProc.MainDocumentPart.GetXDocument();
              XElement afterProcMainRoot = afterProcMainXDoc.Root
                  ?? throw new ArgumentException("The main document part has no root element.", nameof(sourceAfterProc));

              foreach (XElement blockLevelContent in afterProcMainRoot.Descendants().Where(IsBlockLevel))
              {
                  // Resolve the correlation first: cloning and hashing are only worth doing
                  // when there is a source element to receive the result.
                  var thisUnid = (string) blockLevelContent.Attribute(PtOpenXml.Unid);
                  if (thisUnid == null ||
                      !sourceUnidDict.TryGetValue(thisUnid, out XElement correlatedBlockLevelContent))
                  {
                      continue;
                  }

                  var cloneBlockLevelContentForHashing = (XElement) CloneBlockLevelContentForHashing(
                      wDocAfterProc.MainDocumentPart,
                      blockLevelContent,
                      true,
                      settings);

                  // The default WordprocessingML namespace declaration is removed from the serialized
                  // form before hashing.
                  string shaString = cloneBlockLevelContentForHashing
                      .ToString(SaveOptions.DisableFormatting)
                      .Replace(" xmlns=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\"", "");
                  string sha1Hash = PtUtils.SHA1HashStringForUTF8String(shaString);

                  correlatedBlockLevelContent.Add(new XAttribute(PtOpenXml.CorrelatedSHA1Hash, sha1Hash));
              }

              // Write the annotated XDocument back into the source package.
              wDocSource.MainDocumentPart.PutXDocument();
          }

          // The package must be closed (disposed above) before the stream bytes are captured.
          return new WmlDocument(source.FileName, msSource.ToArray());
      }
  }
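  // Prohibit (not handled by the method below):
  // - altChunk
  // - subDoc
  // - contentPart
  // The clone keeps the text content (upper-cased when settings.CaseInsensitive is set); it drops
  // bookmarks and paragraph/run properties so that they do not influence the hash.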
  /// <summary>
  /// Produces a normalized copy of <paramref name="node"/> whose serialized form is used as hash input.
  /// </summary>
  /// <remarks>
  /// <para>Rules, in the order they are tested:</para>
  /// <para>- w:bookmarkStart, w:bookmarkEnd, w:pPr and w:rPr are dropped (null is returned).</para>
  /// <para>- w:p: children are cloned, adjacent runs that contain a single w:t are merged into one run
  /// (upper-cased when <c>settings.CaseInsensitive</c> is set); the result carries no attributes.</para>
  /// <para>- w:r: every child other than w:rPr is wrapped in its own w:r (a sequence is returned).</para>
  /// <para>- w:tbl / w:tr / w:tc / w:tcPr / w:gridSpan / w:txbxContent: only the listed structural
  /// children are kept.</para>
  /// <para>- Elements listed in <c>ComparisonUnitWord.ElementsWithRelationshipIds</c> (when
  /// <paramref name="includeRelatedParts"/> is true): relationship ids are replaced by a value derived
  /// from the relationship target.</para>
  /// <para>- v:shape, o:OLEObject, w:object, wp:docPr and all other elements: cloned with PowerTools
  /// attributes and a branch-specific set of attributes removed.</para>
  /// <para>- Non-element nodes are returned as-is, except text nodes, which are upper-cased when
  /// <c>settings.CaseInsensitive</c> is set.</para>
  /// </remarks>
  /// <param name="mainDocumentPart">Part used to resolve relationship ids.</param>
  /// <param name="node">Node to clone.</param>
  /// <param name="includeRelatedParts">Whether relationship ids are resolved to their targets.</param>
  /// <param name="settings">Supplies <c>CaseInsensitive</c> and <c>CultureInfo</c>.</param>
  /// <returns>
  /// An <see cref="XElement"/>, a sequence of <see cref="XElement"/> (for w:r), an <see cref="XNode"/>,
  /// or null when the node is excluded from the hash.
  /// </returns>
  private static object CloneBlockLevelContentForHashing(
      OpenXmlPart mainDocumentPart,
      XNode node,
      bool includeRelatedParts,
      WmlComparerSettings settings)
  {
      // Recursively clones a sequence of child nodes (lazily, like the LINQ query it wraps).
      IEnumerable<object> CloneAll(IEnumerable<XNode> nodes) =>
          nodes.Select(n => CloneBlockLevelContentForHashing(mainDocumentPart, n, includeRelatedParts, settings));

      if (node is XElement element)
      {
          if (element.Name == W.bookmarkStart ||
              element.Name == W.bookmarkEnd ||
              element.Name == W.pPr ||
              element.Name == W.rPr)
          {
              return null;
          }

          if (element.Name == W.p)
          {
              // Intermediate clone: only its child elements feed the result below, so no attributes
              // are copied and the returned paragraph carries none.
              var clonedPara = new XElement(element.Name, CloneAll(element.Nodes()));

              // Key is true for runs of adjacent "simple" w:r elements (exactly one child, a w:t).
              IEnumerable<IGrouping<bool, XElement>> groupedRuns = clonedPara
                  .Elements()
                  .GroupAdjacent(e => e.Name == W.r &&
                                      e.Elements().Count() == 1 &&
                                      e.Element(W.t) != null);

              return new XElement(element.Name,
                  groupedRuns.Select(g =>
                  {
                      if (!g.Key)
                      {
                          // Anything else is passed through unchanged.
                          return (object) g;
                      }

                      // Merge the text of adjacent simple runs so run boundaries do not affect the hash.
                      string text = g.Select(t => t.Value).StringConcatenate();
                      if (settings.CaseInsensitive)
                      {
                          text = text.ToUpper(settings.CultureInfo);
                      }

                      return new XElement(W.r, new XElement(W.t, text));
                  }));
          }

          if (element.Name == W.r)
          {
              // One w:r per child, so that a run with several children and several single-child
              // runs produce the same sequence.
              IEnumerable<XElement> clonedRuns = element
                  .Elements()
                  .Where(e => e.Name != W.rPr)
                  .Select(rc => new XElement(W.r,
                      CloneBlockLevelContentForHashing(mainDocumentPart, rc, includeRelatedParts, settings)));
              return clonedRuns;
          }

          if (element.Name == W.tbl)
          {
              return new XElement(W.tbl, CloneAll(element.Elements(W.tr)));
          }

          if (element.Name == W.tr)
          {
              return new XElement(W.tr, CloneAll(element.Elements(W.tc)));
          }

          if (element.Name == W.tc)
          {
              return new XElement(W.tc, CloneAll(element.Elements()));
          }

          if (element.Name == W.tcPr)
          {
              // Of the cell properties only w:gridSpan is kept.
              return new XElement(W.tcPr, CloneAll(element.Elements(W.gridSpan)));
          }

          if (element.Name == W.gridSpan)
          {
              // The attribute is written with the unqualified name "val"; kept so the hashed string
              // is unchanged. A missing w:val is omitted instead of failing in the XAttribute constructor.
              var span = (string) element.Attribute(W.val);
              return new XElement(W.gridSpan,
                  span != null ? new XAttribute("val", span) : null);
          }

          if (element.Name == W.txbxContent)
          {
              return new XElement(W.txbxContent, CloneAll(element.Elements()));
          }

          if (includeRelatedParts &&
              ComparisonUnitWord.ElementsWithRelationshipIds.Contains(element.Name))
          {
              return new XElement(element.Name,
                  element.Attributes()
                      .Where(a => a.Name.Namespace != PtOpenXml.pt)
                      .Where(a => !AttributesToTrimWhenCloning.Contains(a.Name))
                      .Select(a => CloneRelationshipAttributeForHashing(mainDocumentPart, a)),
                  CloneAll(element.Nodes()));
          }

          if (element.Name == VML.shape)
          {
              return new XElement(element.Name,
                  element.Attributes()
                      .Where(a => a.Name.Namespace != PtOpenXml.pt)
                      .Where(a => a.Name != "style" && a.Name != "id" && a.Name != "type"),
                  CloneAll(element.Nodes()));
          }

          if (element.Name == O.OLEObject)
          {
              return new XElement(element.Name,
                  element.Attributes()
                      .Where(a => a.Name.Namespace != PtOpenXml.pt)
                      .Where(a => a.Name != "ObjectID" && a.Name != R.id),
                  CloneAll(element.Nodes()));
          }

          if (element.Name == W._object)
          {
              return new XElement(element.Name,
                  element.Attributes()
                      .Where(a => a.Name.Namespace != PtOpenXml.pt),
                  CloneAll(element.Nodes()));
          }

          if (element.Name == WP.docPr)
          {
              return new XElement(element.Name,
                  element.Attributes()
                      .Where(a => a.Name.Namespace != PtOpenXml.pt && a.Name != "id"),
                  CloneAll(element.Nodes()));
          }

          // Default: clone the element without PowerTools attributes and the trimmed attribute set.
          return new XElement(element.Name,
              element.Attributes()
                  .Where(a => a.Name.Namespace != PtOpenXml.pt)
                  .Where(a => !AttributesToTrimWhenCloning.Contains(a.Name)),
              CloneAll(element.Nodes()));
      }

      if (settings.CaseInsensitive && node is XText xt)
      {
          return new XText(xt.Value.ToUpper(settings.CultureInfo));
      }

      return node;
  }

  /// <summary>
  /// Maps an attribute of a relationship-bearing element to the attribute used for hashing.
  /// </summary>
  /// <remarks>
  /// Attributes that are not relationship ids are returned unchanged. A relationship id is replaced by
  /// the SHA-1 hash of the target part (cached on the part as a <c>PartSHA1HashAnnotation</c>), by the
  /// URI of a hyperlink or external relationship, or by the literal "NULL Relationship" when the id
  /// cannot be resolved. For a part whose content type ends with "xml" and that has no cached hash,
  /// null is returned, so the attribute is left out of the clone.
  /// </remarks>
  private static XAttribute CloneRelationshipAttributeForHashing(OpenXmlPart mainDocumentPart, XAttribute a)
  {
      if (!ComparisonUnitWord.RelationshipAttributeNames.Contains(a.Name))
      {
          return a;
      }

      var rId = (string) a;
      try
      {
          OpenXmlPart oxp = mainDocumentPart.GetPartById(rId);
          if (oxp == null)
          {
              throw new FileFormatException("Invalid WordprocessingML Document");
          }

          var anno = oxp.Annotation<PartSHA1HashAnnotation>();
          if (anno != null)
          {
              return new XAttribute(a.Name, anno.Hash);
          }

          // Content types are protocol strings: compare ordinally, not by the current culture.
          if (!oxp.ContentType.EndsWith("xml", StringComparison.Ordinal))
          {
              using (Stream str = oxp.GetStream())
              {
                  byte[] ba;
                  using (var br = new BinaryReader(str))
                  {
                      ba = br.ReadBytes((int) str.Length);
                  }

                  string sha1 = PtUtils.SHA1HashStringForByteArray(ba);
                  oxp.AddAnnotation(new PartSHA1HashAnnotation(sha1));
                  return new XAttribute(a.Name, sha1);
              }
          }
      }
      catch (ArgumentOutOfRangeException)
      {
          // The id does not resolve to a part: it could be a hyperlink relationship...
          HyperlinkRelationship hr =
              mainDocumentPart.HyperlinkRelationships.FirstOrDefault(z => z.Id == rId);
          if (hr != null)
          {
              return new XAttribute(a.Name, hr.Uri.ToString());
          }

          // ...or an external relationship.
          ExternalRelationship er =
              mainDocumentPart.ExternalRelationships.FirstOrDefault(z => z.Id == rId);
          if (er != null)
          {
              return new XAttribute(a.Name, er.Uri.ToString());
          }

          return new XAttribute(a.Name, "NULL Relationship");
      }

      return null;
  }
  288. }
  289. }