huanghb
/
TEAMModelOS


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
							// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System.IO;
using System.Linq;
using System.Xml.Linq;
using DocumentFormat.OpenXml.Packaging;

// It is possible to optimize DescendantContentAtoms

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Currently, the unid is set at the beginning of the algorithm.  It is used by the code that establishes correlation
// based on first rejecting// tracked revisions, then correlating paragraphs/tables.  It is requred for this algorithm
// - after finding a correlated sequence in the document with rejected revisions, it uses the unid to find the same
// paragraph in the document without rejected revisions, then sets the correlated sha1 hash in that document.
//
// But then when accepting tracked revisions, for certain paragraphs (where there are deleted paragraph marks) it is
// going to lose the unids.  But this isn't a problem because when paragraph marks are deleted, the correlation is
// definitely no longer possible.  Any paragraphs that are in a range of paragraphs that are coalesced can't be
// correlated to paragraphs in the other document via their hash.  At that point we no longer care what their unids
// are.
//
// But after that it is only used to reconstruct the tree.  It is also used in the debugging code that
// prints the various correlated sequences and comparison units - this is display for debugging purposes only.
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The key idea here is that a given paragraph will always have the same ancestors, and it doesn't matter whether the
// content was deleted from the old document, inserted into the new document, or set as equal.  At this point, we
// identify a paragraph as a sequential list of content atoms, terminated by a paragraph mark.  This entire list will
// for a single paragraph, regardless of whether the paragraph is a child of the body, or if the paragraph is in a cell
// in a table, or if the paragraph is in a text box.  The list of ancestors, from the paragraph to the root of the XML
// tree will be the same for all content atoms in the paragraph.
//
// Therefore:
//
// Iterate through the list of content atoms backwards.  When the loop sees a paragraph mark, it gets the ancestor
// unids from the paragraph mark to the top of the tree, and sets this as the same for all content atoms in the
// paragraph.  For descendants of the paragraph mark, it doesn't really matter if content is put into separate runs
// or what not.  We don't need to be concerned about what the unids are for descendants of the paragraph.
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace OpenXmlPowerTools
{
    public static partial class WmlComparer
    {
        public static WmlDocument Compare(WmlDocument source1, WmlDocument source2, WmlComparerSettings settings)
        {
            return CompareInternal(source1, source2, settings, true);
        }

        private static WmlDocument CompareInternal(
            WmlDocument source1,
            WmlDocument source2,
            WmlComparerSettings settings,
            bool preProcessMarkupInOriginal)
        {
            if (preProcessMarkupInOriginal)
            {
                source1 = PreProcessMarkup(source1, settings.StartingIdForFootnotesEndnotes + 1000);
            }

            source2 = PreProcessMarkup(source2, settings.StartingIdForFootnotesEndnotes + 2000);

            SaveDocumentIfDesired(source1, "Source1-Step1-PreProcess.docx", settings);
            SaveDocumentIfDesired(source2, "Source2-Step1-PreProcess.docx", settings);

            // at this point, both source1 and source2 have unid on every element.  These are the values that will
            // enable reassembly of the XML tree.  But we need other values.

            // In source1:
            // - accept tracked revisions
            // - determine hash code for every block-level element
            // - save as attribute on every element

            // - accept tracked revisions and reject tracked revisions leave the unids alone, where possible.
            // - after accepting and calculating the hash, then can use the unids to find the right block-level
            //   element in the unmodified source1, and install the hash

            // In source2:
            // - reject tracked revisions
            // - determine hash code for every block-level element
            // - save as an attribute on every element

            // - after rejecting and calculating the hash, then can use the unids to find the right block-level element
            //   in the unmodified source2, and install the hash

            // - sometimes after accepting or rejecting tracked revisions, several paragraphs will get coalesced into a
            //   single paragraph due to paragraph marks being inserted / deleted.
            // - in this case, some paragraphs will not get a hash injected onto them.
            // - if a paragraph doesn't have a hash, then it will never correspond to another paragraph, and such
            //   issues will need to be resolved in the normal execution of the LCS algorithm.
            // - note that when we do propagate the unid through for the first paragraph.

            // Establish correlation between the two.
            // Find the longest common sequence of block-level elements where hash codes are the same.
            // this sometimes will be every block level element in the document.  Or sometimes will be just a fair
            // number of them.

            // at the start of doing the LCS algorithm, we will match up content, and put them in corresponding unknown
            // correlated comparison units.  Those paragraphs will only ever be matched to their corresponding paragraph.
            // then the algorithm can proceed as usual.

            // need to call ChangeFootnoteEndnoteReferencesToUniqueRange before creating the wmlResult document, so that
            // the same GUID ids are used for footnote and endnote references in both the 'after' document, and in the
            // result document.

            WmlDocument source1AfterAccepting = RevisionProcessor.AcceptRevisions(source1);
            WmlDocument source2AfterRejecting = RevisionProcessor.RejectRevisions(source2);

            SaveDocumentIfDesired(source1AfterAccepting, "Source1-Step2-AfterAccepting.docx", settings);
            SaveDocumentIfDesired(source2AfterRejecting, "Source2-Step2-AfterRejecting.docx", settings);

            // this creates the correlated hash codes that enable us to match up ranges of paragraphs based on
            // accepting in source1, rejecting in source2
            source1 = HashBlockLevelContent(source1, source1AfterAccepting, settings);
            source2 = HashBlockLevelContent(source2, source2AfterRejecting, settings);

            SaveDocumentIfDesired(source1, "Source1-Step3-AfterHashing.docx", settings);
            SaveDocumentIfDesired(source2, "Source2-Step3-AfterHashing.docx", settings);

            // Accept revisions in before, and after
            source1 = RevisionProcessor.AcceptRevisions(source1);
            source2 = RevisionProcessor.AcceptRevisions(source2);

            SaveDocumentIfDesired(source1, "Source1-Step4-AfterAccepting.docx", settings);
            SaveDocumentIfDesired(source2, "Source2-Step4-AfterAccepting.docx", settings);

            // after accepting revisions, some unids may have been removed by revision accepter, along with the
            // correlatedSHA1Hash codes, this is as it should be.
            // but need to go back in and add guids to paragraphs that have had them removed.

            using (var ms = new MemoryStream())
            {
                ms.Write(source2.DocumentByteArray, 0, source2.DocumentByteArray.Length);
                using (WordprocessingDocument wDoc = WordprocessingDocument.Open(ms, true))
                {
                    AddUnidsToMarkupInContentParts(wDoc);
                }
            }

            var wmlResult = new WmlDocument(source1);
            using (var ms1 = new MemoryStream())
            using (var ms2 = new MemoryStream())
            {
                ms1.Write(source1.DocumentByteArray, 0, source1.DocumentByteArray.Length);
                ms2.Write(source2.DocumentByteArray, 0, source2.DocumentByteArray.Length);
                WmlDocument producedDocument;

                using (WordprocessingDocument wDoc1 = WordprocessingDocument.Open(ms1, true))
                using (WordprocessingDocument wDoc2 = WordprocessingDocument.Open(ms2, true))
                {
                    producedDocument = ProduceDocumentWithTrackedRevisions(settings, wmlResult, wDoc1, wDoc2);
                }

                SaveDocumentsAfterProducingDocument(ms1, ms2, settings);
                SaveCleanedDocuments(source1, producedDocument, settings);

                return producedDocument;
            }
        }

        private static void SaveDocumentIfDesired(WmlDocument source, string name, WmlComparerSettings settings)
        {
            if (SaveIntermediateFilesForDebugging && settings.DebugTempFileDi != null)
            {
                var fileInfo = new FileInfo(Path.Combine(settings.DebugTempFileDi.FullName, name));
                source.SaveAs(fileInfo.FullName);
            }
        }

        private static void SaveDocumentsAfterProducingDocument(MemoryStream ms1, MemoryStream ms2, WmlComparerSettings settings)
        {
            if (SaveIntermediateFilesForDebugging && settings.DebugTempFileDi != null)
            {
                SaveDocumentIfDesired(new WmlDocument("after1.docx", ms1), "Source1-Step5-AfterProducingDocument.docx", settings);
                SaveDocumentIfDesired(new WmlDocument("after2.docx", ms2), "Source2-Step5-AfterProducingDocument.docx", settings);
            }
        }

        private static void SaveCleanedDocuments(WmlDocument source1, WmlDocument producedDocument, WmlComparerSettings settings)
        {
            if (SaveIntermediateFilesForDebugging && settings.DebugTempFileDi != null)
            {
                WmlDocument cleanedSource = CleanPowerToolsAndRsid(source1);
                SaveDocumentIfDesired(cleanedSource, "Cleaned-Source.docx", settings);

                WmlDocument cleanedProduced = CleanPowerToolsAndRsid(producedDocument);
                SaveDocumentIfDesired(cleanedProduced, "Cleaned-Produced.docx", settings);
            }
        }

        private static WmlDocument CleanPowerToolsAndRsid(WmlDocument producedDocument)
        {
            using (var ms = new MemoryStream())
            {
                ms.Write(producedDocument.DocumentByteArray, 0, producedDocument.DocumentByteArray.Length);
                using (WordprocessingDocument wDoc = WordprocessingDocument.Open(ms, true))
                {
                    foreach (OpenXmlPart cp in wDoc.ContentParts())
                    {
                        XDocument xd = cp.GetXDocument();
                        object newRoot = CleanPartTransform(xd.Root);
                        xd.Root?.ReplaceWith(newRoot);
                        cp.PutXDocument();
                    }
                }

                var cleaned = new WmlDocument("cleaned.docx", ms.ToArray());
                return cleaned;
            }
        }

        private static object CleanPartTransform(XNode node)
        {
            if (node is XElement element)
            {
                return new XElement(element.Name,
                    element.Attributes().Where(a => a.Name.Namespace != PtOpenXml.pt &&
                                                    !a.Name.LocalName.ToLower().Contains("rsid")),
                    element.Nodes().Select(CleanPartTransform));
            }

            return node;
        }
    }
}