123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329 |
- // Copyright (c) Microsoft. All rights reserved.
- // Licensed under the MIT license. See LICENSE file in the project root for full license information.
- using System;
- using System.Collections.Generic;
- using System.IO;
- using System.Linq;
- using System.Xml.Linq;
- using DocumentFormat.OpenXml.Packaging;
- using OpenXmlPowerTools.Previous;
- namespace OpenXmlPowerTools
- {
- public static partial class WmlComparer
- {
/// <summary>
/// Returns a copy of <paramref name="source"/> in which block-level elements carry the SHA-1 hash
/// of their counterparts in <paramref name="sourceAfterProc"/>.
/// </summary>
/// <remarks>
/// Paragraphs (<c>w:p</c>), tables (<c>w:tbl</c>) and rows (<c>w:tr</c>) of the two main document
/// parts are paired through their <c>PtOpenXml.Unid</c> attribute. For every pair, the element from
/// <paramref name="sourceAfterProc"/> is normalized with
/// <see cref="CloneBlockLevelContentForHashing"/>, serialized without formatting and hashed; the
/// hash is then added to the matching element of the source copy as a
/// <c>PtOpenXml.CorrelatedSHA1Hash</c> attribute.
/// </remarks>
/// <param name="source">Document whose copy receives the hash attributes. The instance passed in is only read.</param>
/// <param name="sourceAfterProc">Document whose block-level content is hashed.</param>
/// <param name="settings">Comparer settings forwarded to <see cref="CloneBlockLevelContentForHashing"/>.</param>
/// <returns>A new <see cref="WmlDocument"/> with the file name of <paramref name="source"/> and the annotated content.</returns>
/// <exception cref="ArgumentException">
/// The main document part of either document has no root element. <c>ToDictionary</c> also throws
/// (an <see cref="ArgumentNullException"/> or <see cref="ArgumentException"/>) when a block-level
/// element of <paramref name="source"/> has no Unid or when two of them share the same Unid.
/// </exception>
private static WmlDocument HashBlockLevelContent(
    WmlDocument source,
    WmlDocument sourceAfterProc,
    WmlComparerSettings settings)
{
    using (var msSource = new MemoryStream())
    using (var msAfterProc = new MemoryStream())
    {
        // Work on in-memory copies so that the byte arrays of the input documents stay untouched.
        msSource.Write(source.DocumentByteArray, 0, source.DocumentByteArray.Length);
        msAfterProc.Write(sourceAfterProc.DocumentByteArray, 0, sourceAfterProc.DocumentByteArray.Length);

        using (WordprocessingDocument wDocSource = WordprocessingDocument.Open(msSource, true))
        using (WordprocessingDocument wDocAfterProc = WordprocessingDocument.Open(msAfterProc, true))
        {
            // Index the block-level elements of the source document by Unid.
            XDocument sourceMainXDoc = wDocSource.MainDocumentPart.GetXDocument();
            XElement sourceMainRoot = sourceMainXDoc.Root
                ?? throw new ArgumentException("The main document part has no root element.", nameof(source));

            Dictionary<string, XElement> sourceUnidDict = sourceMainRoot
                .Descendants()
                .Where(d => d.Name == W.p || d.Name == W.tbl || d.Name == W.tr)
                .ToDictionary(d => (string) d.Attribute(PtOpenXml.Unid));

            XDocument afterProcMainXDoc = wDocAfterProc.MainDocumentPart.GetXDocument();
            XElement afterProcMainRoot = afterProcMainXDoc.Root
                ?? throw new ArgumentException("The main document part has no root element.", nameof(sourceAfterProc));

            IEnumerable<XElement> blockLevelElements = afterProcMainRoot
                .Descendants()
                .Where(d => d.Name == W.p || d.Name == W.tbl || d.Name == W.tr);

            foreach (XElement blockLevelContent in blockLevelElements)
            {
                // Resolve the correlation first: cloning and hashing are the expensive steps and
                // their result is only used when a matching source element exists.
                var thisUnid = (string) blockLevelContent.Attribute(PtOpenXml.Unid);
                if (thisUnid == null ||
                    !sourceUnidDict.TryGetValue(thisUnid, out XElement correlatedBlockLevelContent))
                {
                    continue;
                }

                var cloneBlockLevelContentForHashing = (XElement) CloneBlockLevelContentForHashing(
                    wDocAfterProc.MainDocumentPart,
                    blockLevelContent,
                    true,
                    settings);

                // The default WordprocessingML namespace declaration is removed from the serialized
                // text before hashing.
                string shaString = cloneBlockLevelContentForHashing
                    .ToString(SaveOptions.DisableFormatting)
                    .Replace(" xmlns=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\"", "");

                string sha1Hash = PtUtils.SHA1HashStringForUTF8String(shaString);
                correlatedBlockLevelContent.Add(new XAttribute(PtOpenXml.CorrelatedSHA1Hash, sha1Hash));
            }

            // Only the source copy is written back; the processed copy is used for reading.
            wDocSource.MainDocumentPart.PutXDocument();
        }

        // The package has been disposed at this point, so the stream holds the final content.
        return new WmlDocument(source.FileName, msSource.ToArray());
    }
}
- // prohibit
- // - altChunk
- // - subDoc
- // - contentPart
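- // This clones block-level content into a normalized form for hashing: bookmarks and paragraph/run properties are dropped, while text nodes are kept (upper-cased when the comparison is case-insensitive).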
/// <summary>
/// Builds the normalized copy of <paramref name="node"/> that is serialized and hashed for
/// block-level content.
/// </summary>
/// <remarks>
/// <para>Rules applied to elements, in order:</para>
/// <list type="bullet">
/// <item><description><c>w:bookmarkStart</c>, <c>w:bookmarkEnd</c>, <c>w:pPr</c> and <c>w:rPr</c> are dropped (the result is <c>null</c>).</description></item>
/// <item><description><c>w:p</c> is rebuilt from its cloned child elements; adjacent text-only runs are merged into a single run.</description></item>
/// <item><description><c>w:r</c> is split into one run per child element (excluding <c>w:rPr</c>).</description></item>
/// <item><description><c>w:tbl</c>, <c>w:tr</c>, <c>w:tc</c>, <c>w:tcPr</c>, <c>w:gridSpan</c> and <c>w:txbxContent</c> keep only the structure listed in the code below, without attributes (except the value of <c>w:gridSpan</c>).</description></item>
/// <item><description>When <paramref name="includeRelatedParts"/> is set, relationship ids on elements listed in <c>ComparisonUnitWord.ElementsWithRelationshipIds</c> are replaced by a value derived from the relationship target.</description></item>
/// <item><description>All remaining elements are copied without attributes in the <c>PtOpenXml.pt</c> namespace and without an element-specific set of further attributes.</description></item>
/// </list>
/// <para>Non-element nodes are returned as they are, except that text is upper-cased with
/// <c>settings.CultureInfo</c> when <c>settings.CaseInsensitive</c> is set.</para>
/// </remarks>
/// <param name="mainDocumentPart">Part used to resolve relationship ids found in the content.</param>
/// <param name="node">Node to clone.</param>
/// <param name="includeRelatedParts">Whether relationship id attributes are replaced by a value derived from their target.</param>
/// <param name="settings">Comparer settings; <c>CaseInsensitive</c> and <c>CultureInfo</c> are read.</param>
/// <returns>
/// An <see cref="XElement"/>, a sequence of <see cref="XElement"/> (for runs), the original node,
/// a new <see cref="XText"/>, or <c>null</c> when the node is excluded from hashing.
/// </returns>
private static object CloneBlockLevelContentForHashing(
    OpenXmlPart mainDocumentPart,
    XNode node,
    bool includeRelatedParts,
    WmlComparerSettings settings)
{
    if (!(node is XElement element))
    {
        if (settings.CaseInsensitive && node is XText xt)
        {
            return new XText(xt.Value.ToUpper(settings.CultureInfo));
        }

        // The node itself is returned; XElement copies a node that already has a parent when it
        // is added as content.
        return node;
    }

    // Recursion with the unchanged context arguments.
    object CloneChild(XNode child) =>
        CloneBlockLevelContentForHashing(mainDocumentPart, child, includeRelatedParts, settings);

    // Copies the element with all child nodes cloned, keeping the attributes that are outside
    // the PtOpenXml.pt namespace and accepted by keepAttribute.
    XElement CloneKeeping(Func<XAttribute, bool> keepAttribute) =>
        new XElement(element.Name,
            element.Attributes()
                .Where(a => a.Name.Namespace != PtOpenXml.pt)
                .Where(keepAttribute),
            element.Nodes().Select(n => CloneChild(n)));

    XName name = element.Name;

    if (name == W.bookmarkStart ||
        name == W.bookmarkEnd ||
        name == W.pPr ||
        name == W.rPr)
    {
        return null;
    }

    if (name == W.p)
    {
        return CloneParagraphForHashing(mainDocumentPart, element, includeRelatedParts, settings);
    }

    if (name == W.r)
    {
        // Every child of the run ends up in a run of its own. The sequence is lazy and is
        // consumed when the caller adds it to the parent element.
        return element
            .Elements()
            .Where(e => e.Name != W.rPr)
            .Select(rc => new XElement(W.r, CloneChild(rc)));
    }

    if (name == W.tbl)
    {
        return new XElement(W.tbl, element.Elements(W.tr).Select(n => CloneChild(n)));
    }

    if (name == W.tr)
    {
        return new XElement(W.tr, element.Elements(W.tc).Select(n => CloneChild(n)));
    }

    if (name == W.tc)
    {
        return new XElement(W.tc, element.Elements().Select(n => CloneChild(n)));
    }

    if (name == W.tcPr)
    {
        // Of the cell properties, only w:gridSpan is kept.
        return new XElement(W.tcPr, element.Elements(W.gridSpan).Select(n => CloneChild(n)));
    }

    if (name == W.gridSpan)
    {
        // The value is written to an unqualified "val" attribute; this is part of the hashed
        // text and must not change.
        return new XElement(W.gridSpan,
            new XAttribute("val", (string) element.Attribute(W.val)));
    }

    if (name == W.txbxContent)
    {
        return new XElement(W.txbxContent, element.Elements().Select(n => CloneChild(n)));
    }

    if (includeRelatedParts && ComparisonUnitWord.ElementsWithRelationshipIds.Contains(name))
    {
        return new XElement(name,
            element.Attributes()
                .Where(a => a.Name.Namespace != PtOpenXml.pt)
                .Where(a => !AttributesToTrimWhenCloning.Contains(a.Name))
                .Select(a => CloneRelationshipAttributeForHashing(mainDocumentPart, a)),
            element.Nodes().Select(n => CloneChild(n)));
    }

    if (name == VML.shape)
    {
        return CloneKeeping(a => a.Name != "style" && a.Name != "id" && a.Name != "type");
    }

    if (name == O.OLEObject)
    {
        return CloneKeeping(a => a.Name != "ObjectID" && a.Name != R.id);
    }

    if (name == W._object)
    {
        return CloneKeeping(a => true);
    }

    if (name == WP.docPr)
    {
        return CloneKeeping(a => a.Name != "id");
    }

    return CloneKeeping(a => !AttributesToTrimWhenCloning.Contains(a.Name));
}

// Clones a w:p element for hashing: children are cloned recursively and adjacent text-only runs are merged.
private static XElement CloneParagraphForHashing(
    OpenXmlPart mainDocumentPart,
    XElement paragraph,
    bool includeRelatedParts,
    WmlComparerSettings settings)
{
    // Attributes of the paragraph are not copied: the element returned below is rebuilt from
    // child elements only, so paragraph attributes are not part of the hashed form.
    var clonedPara = new XElement(paragraph.Name,
        paragraph.Nodes().Select(n =>
            CloneBlockLevelContentForHashing(mainDocumentPart, n, includeRelatedParts, settings)));

    // A run takes part in merging when its only child element is w:t.
    IEnumerable<IGrouping<bool, XElement>> groupedRuns = clonedPara
        .Elements()
        .GroupAdjacent(e => e.Name == W.r &&
                            e.Elements().Count() == 1 &&
                            e.Element(W.t) != null);

    return new XElement(paragraph.Name,
        groupedRuns.Select(g =>
        {
            if (!g.Key)
            {
                // Anything other than text-only runs is carried over unchanged.
                return (object) g;
            }

            string text = g.Select(t => t.Value).StringConcatenate();
            if (settings.CaseInsensitive)
            {
                text = text.ToUpper(settings.CultureInfo);
            }

            return new XElement(W.r, new XElement(W.t, text));
        }));
}

// Maps a relationship id attribute to a value derived from its target; other attributes are returned unchanged.
private static XAttribute CloneRelationshipAttributeForHashing(
    OpenXmlPart mainDocumentPart,
    XAttribute attribute)
{
    if (!ComparisonUnitWord.RelationshipAttributeNames.Contains(attribute.Name))
    {
        return attribute;
    }

    var rId = (string) attribute;

    // The id may refer to a hyperlink or external relationship instead of a part; that case is
    // handled in the catch block below.
    try
    {
        OpenXmlPart oxp = mainDocumentPart.GetPartById(rId);
        if (oxp == null)
        {
            throw new FileFormatException("Invalid WordprocessingML Document");
        }

        // Reuse the hash if it was already computed for this part.
        var anno = oxp.Annotation<PartSHA1HashAnnotation>();
        if (anno != null)
        {
            return new XAttribute(attribute.Name, anno.Hash);
        }

        // Content types are not linguistic text, so the suffix is compared ordinally.
        if (!oxp.ContentType.EndsWith("xml", StringComparison.Ordinal))
        {
            using (Stream str = oxp.GetStream())
            {
                byte[] ba;
                using (var br = new BinaryReader(str))
                {
                    ba = br.ReadBytes((int) str.Length);
                }

                string sha1 = PtUtils.SHA1HashStringForByteArray(ba);
                oxp.AddAnnotation(new PartSHA1HashAnnotation(sha1));
                return new XAttribute(attribute.Name, sha1);
            }
        }
    }
    catch (ArgumentOutOfRangeException)
    {
        HyperlinkRelationship hr =
            mainDocumentPart.HyperlinkRelationships.FirstOrDefault(z => z.Id == rId);
        if (hr != null)
        {
            return new XAttribute(attribute.Name, hr.Uri.ToString());
        }

        ExternalRelationship er =
            mainDocumentPart.ExternalRelationships.FirstOrDefault(z => z.Id == rId);
        if (er != null)
        {
            return new XAttribute(attribute.Name, er.Uri.ToString());
        }

        return new XAttribute(attribute.Name, "NULL Relationship");
    }

    // Reached for a part with an XML content type and no cached hash: returning null leaves the
    // attribute out of the clone.
    return null;
}
- }
- }
|