WmlComparer.Private.Methods.Lcs.cs 61 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. using System;
  4. using System.Collections.Generic;
  5. using System.Linq;
  6. using System.Text;
  7. using System.Xml.Linq;
  8. namespace OpenXmlPowerTools
  9. {
  10. public static partial class WmlComparer
  11. {
  12. private static List<CorrelatedSequence> DoLcsAlgorithm(CorrelatedSequence unknown, WmlComparerSettings settings)
  13. {
  14. var newListOfCorrelatedSequence = new List<CorrelatedSequence>();
  15. ComparisonUnit[] cul1 = unknown.ComparisonUnitArray1;
  16. ComparisonUnit[] cul2 = unknown.ComparisonUnitArray2;
  17. // first thing to do - if we have an unknown with zero length on left or right side, create appropriate
  18. // this is a code optimization that enables easier processing of cases elsewhere.
  19. if (cul1.Length > 0 && cul2.Length == 0)
  20. {
  21. var deletedCorrelatedSequence = new CorrelatedSequence
  22. {
  23. CorrelationStatus = CorrelationStatus.Deleted,
  24. ComparisonUnitArray1 = cul1,
  25. ComparisonUnitArray2 = null
  26. };
  27. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  28. return newListOfCorrelatedSequence;
  29. }
  30. if (cul1.Length == 0 && cul2.Length > 0)
  31. {
  32. var insertedCorrelatedSequence = new CorrelatedSequence
  33. {
  34. CorrelationStatus = CorrelationStatus.Inserted,
  35. ComparisonUnitArray1 = null,
  36. ComparisonUnitArray2 = cul2
  37. };
  38. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  39. return newListOfCorrelatedSequence;
  40. }
  41. if (cul1.Length == 0 && cul2.Length == 0)
  42. {
  43. // this will effectively remove the unknown with no data on either side from the current data model.
  44. return newListOfCorrelatedSequence;
  45. }
  46. var currentLongestCommonSequenceLength = 0;
  47. int currentI1 = -1;
  48. int currentI2 = -1;
  49. for (var i1 = 0; i1 < cul1.Length - currentLongestCommonSequenceLength; i1++)
  50. {
  51. for (var i2 = 0; i2 < cul2.Length - currentLongestCommonSequenceLength; i2++)
  52. {
  53. var thisSequenceLength = 0;
  54. int thisI1 = i1;
  55. int thisI2 = i2;
  56. while (true)
  57. {
  58. if (cul1[thisI1].SHA1Hash == cul2[thisI2].SHA1Hash)
  59. {
  60. thisI1++;
  61. thisI2++;
  62. thisSequenceLength++;
  63. if (thisI1 == cul1.Length || thisI2 == cul2.Length)
  64. {
  65. if (thisSequenceLength > currentLongestCommonSequenceLength)
  66. {
  67. currentLongestCommonSequenceLength = thisSequenceLength;
  68. currentI1 = i1;
  69. currentI2 = i2;
  70. }
  71. break;
  72. }
  73. }
  74. else
  75. {
  76. if (thisSequenceLength > currentLongestCommonSequenceLength)
  77. {
  78. currentLongestCommonSequenceLength = thisSequenceLength;
  79. currentI1 = i1;
  80. currentI2 = i2;
  81. }
  82. break;
  83. }
  84. }
  85. }
  86. }
  87. // never start a common section with a paragraph mark.
  88. while (true)
  89. {
  90. if (currentLongestCommonSequenceLength <= 1)
  91. break;
  92. ComparisonUnit firstCommon = cul1[currentI1];
  93. if (!(firstCommon is ComparisonUnitWord firstCommonWord))
  94. break;
  95. // if the word contains more than one atom, then not a paragraph mark
  96. if (firstCommonWord.Contents.Count != 1)
  97. break;
  98. if (!(firstCommonWord.Contents.First() is ComparisonUnitAtom firstCommonAtom))
  99. break;
  100. if (firstCommonAtom.ContentElement.Name != W.pPr)
  101. break;
  102. --currentLongestCommonSequenceLength;
  103. if (currentLongestCommonSequenceLength == 0)
  104. {
  105. currentI1 = -1;
  106. currentI2 = -1;
  107. }
  108. else
  109. {
  110. ++currentI1;
  111. ++currentI2;
  112. }
  113. }
  114. var isOnlyParagraphMark = false;
  115. if (currentLongestCommonSequenceLength == 1)
  116. {
  117. ComparisonUnit firstCommon = cul1[currentI1];
  118. if (firstCommon is ComparisonUnitWord firstCommonWord)
  119. {
  120. // if the word contains more than one atom, then not a paragraph mark
  121. if (firstCommonWord.Contents.Count == 1)
  122. {
  123. if (firstCommonWord.Contents.First() is ComparisonUnitAtom firstCommonAtom)
  124. {
  125. if (firstCommonAtom.ContentElement.Name == W.pPr)
  126. isOnlyParagraphMark = true;
  127. }
  128. }
  129. }
  130. }
  131. // don't match just a single character
  132. if (currentLongestCommonSequenceLength == 1)
  133. {
  134. if (cul2[currentI2] is ComparisonUnitAtom cuw2)
  135. {
  136. if (cuw2.ContentElement.Name == W.t && cuw2.ContentElement.Value == " ")
  137. {
  138. currentI1 = -1;
  139. currentI2 = -1;
  140. currentLongestCommonSequenceLength = 0;
  141. }
  142. }
  143. }
  144. // don't match only word break characters
  145. if (currentLongestCommonSequenceLength > 0 && currentLongestCommonSequenceLength <= 3)
  146. {
  147. ComparisonUnit[] commonSequence = cul1.Skip(currentI1).Take(currentLongestCommonSequenceLength).ToArray();
  148. // if they are all ComparisonUnitWord objects
  149. bool oneIsNotWord = commonSequence.Any(cs => !(cs is ComparisonUnitWord));
  150. bool allAreWords = !oneIsNotWord;
  151. if (allAreWords)
  152. {
  153. bool contentOtherThanWordSplitChars = commonSequence
  154. .Cast<ComparisonUnitWord>()
  155. .Any(cs =>
  156. {
  157. bool otherThanText = cs.DescendantContentAtoms().Any(dca => dca.ContentElement.Name != W.t);
  158. if (otherThanText) return true;
  159. bool otherThanWordSplit = cs
  160. .DescendantContentAtoms()
  161. .Any(dca =>
  162. {
  163. string charValue = dca.ContentElement.Value;
  164. bool isWordSplit = settings.WordSeparators.Contains(charValue[0]);
  165. return !isWordSplit;
  166. });
  167. return otherThanWordSplit;
  168. });
  169. if (!contentOtherThanWordSplitChars)
  170. {
  171. currentI1 = -1;
  172. currentI2 = -1;
  173. currentLongestCommonSequenceLength = 0;
  174. }
  175. }
  176. }
  177. // if we are only looking at text, and if the longest common subsequence is less than 15% of the whole, then forget it,
  178. // don't find that LCS.
  179. if (!isOnlyParagraphMark && currentLongestCommonSequenceLength > 0)
  180. {
  181. bool anyButWord1 = cul1.Any(cu => !(cu is ComparisonUnitWord));
  182. bool anyButWord2 = cul2.Any(cu => !(cu is ComparisonUnitWord));
  183. if (!anyButWord1 && !anyButWord2)
  184. {
  185. int maxLen = Math.Max(cul1.Length, cul2.Length);
  186. if (currentLongestCommonSequenceLength / (double) maxLen < settings.DetailThreshold)
  187. {
  188. currentI1 = -1;
  189. currentI2 = -1;
  190. currentLongestCommonSequenceLength = 0;
  191. }
  192. }
  193. }
  194. if (currentI1 == -1 && currentI2 == -1)
  195. {
  196. int leftLength = unknown.ComparisonUnitArray1.Length;
  197. int leftTables = unknown
  198. .ComparisonUnitArray1
  199. .OfType<ComparisonUnitGroup>()
  200. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Table);
  201. int leftRows = unknown
  202. .ComparisonUnitArray1
  203. .OfType<ComparisonUnitGroup>()
  204. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Row);
  205. int leftParagraphs = unknown
  206. .ComparisonUnitArray1
  207. .OfType<ComparisonUnitGroup>()
  208. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Paragraph);
  209. int leftTextboxes = unknown
  210. .ComparisonUnitArray1
  211. .OfType<ComparisonUnitGroup>()
  212. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Textbox);
  213. int leftWords = unknown
  214. .ComparisonUnitArray1
  215. .OfType<ComparisonUnitWord>()
  216. .Count();
  217. int rightLength = unknown.ComparisonUnitArray2.Length;
  218. int rightTables = unknown
  219. .ComparisonUnitArray2
  220. .OfType<ComparisonUnitGroup>()
  221. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Table);
  222. int rightRows = unknown
  223. .ComparisonUnitArray2
  224. .OfType<ComparisonUnitGroup>()
  225. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Row);
  226. int rightParagraphs = unknown
  227. .ComparisonUnitArray2
  228. .OfType<ComparisonUnitGroup>()
  229. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Paragraph);
  230. int rightTextboxes = unknown
  231. .ComparisonUnitArray2
  232. .OfType<ComparisonUnitGroup>()
  233. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Textbox);
  234. int rightWords = unknown
  235. .ComparisonUnitArray2
  236. .OfType<ComparisonUnitWord>()
  237. .Count();
  238. // if either side has both words, rows and text boxes, then we need to separate out into separate
  239. // unknown correlated sequences
  240. // group adjacent based on whether word, row, or textbox
  241. // in most cases, the count of groups will be the same, but they may differ
  242. // if the first group on either side is word, then create a deleted or inserted corr sequ for it.
  243. // then have counter on both sides pointing to the first matched pairs of rows
  244. // create an unknown corr sequ for it.
  245. // increment both counters
  246. // if one is at end but the other is not, then tag the remaining content as inserted or deleted, and done.
  247. // if both are at the end, then done
  248. // return the new list of corr sequ
  249. bool leftOnlyWordsRowsTextboxes = leftLength == leftWords + leftRows + leftTextboxes;
  250. bool rightOnlyWordsRowsTextboxes = rightLength == rightWords + rightRows + rightTextboxes;
  251. if ((leftWords > 0 || rightWords > 0) &&
  252. (leftRows > 0 || rightRows > 0 || leftTextboxes > 0 || rightTextboxes > 0) &&
  253. leftOnlyWordsRowsTextboxes &&
  254. rightOnlyWordsRowsTextboxes)
  255. {
  256. IGrouping<string, ComparisonUnit>[] leftGrouped = unknown
  257. .ComparisonUnitArray1
  258. .GroupAdjacent(cu =>
  259. {
  260. if (cu is ComparisonUnitWord)
  261. {
  262. return "Word";
  263. }
  264. var cug = cu as ComparisonUnitGroup;
  265. if (cug?.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  266. {
  267. return "Row";
  268. }
  269. if (cug?.ComparisonUnitGroupType == ComparisonUnitGroupType.Textbox)
  270. {
  271. return "Textbox";
  272. }
  273. throw new OpenXmlPowerToolsException("Internal error");
  274. })
  275. .ToArray();
  276. IGrouping<string, ComparisonUnit>[] rightGrouped = unknown
  277. .ComparisonUnitArray2
  278. .GroupAdjacent(cu =>
  279. {
  280. if (cu is ComparisonUnitWord)
  281. {
  282. return "Word";
  283. }
  284. var cug = cu as ComparisonUnitGroup;
  285. if (cug?.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  286. {
  287. return "Row";
  288. }
  289. if (cug?.ComparisonUnitGroupType == ComparisonUnitGroupType.Textbox)
  290. {
  291. return "Textbox";
  292. }
  293. throw new OpenXmlPowerToolsException("Internal error");
  294. })
  295. .ToArray();
  296. var iLeft = 0;
  297. var iRight = 0;
  298. // create an unknown corr sequ for it.
  299. // increment both counters
  300. // if one is at end but the other is not, then tag the remaining content as inserted or deleted, and done.
  301. // if both are at the end, then done
  302. // return the new list of corr sequ
  303. while (true)
  304. {
  305. if (leftGrouped[iLeft].Key == rightGrouped[iRight].Key)
  306. {
  307. var unknownCorrelatedSequence = new CorrelatedSequence
  308. {
  309. ComparisonUnitArray1 = leftGrouped[iLeft].ToArray(),
  310. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  311. CorrelationStatus = CorrelationStatus.Unknown
  312. };
  313. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  314. ++iLeft;
  315. ++iRight;
  316. }
  317. // have to decide which of the following two branches to do first based on whether the left contains a paragraph mark
  318. // i.e. cant insert a string of deleted text right before a table.
  319. else if (leftGrouped[iLeft].Key == "Word" &&
  320. leftGrouped[iLeft]
  321. .Select(lg => lg.DescendantContentAtoms())
  322. .SelectMany(m => m).Last()
  323. .ContentElement
  324. .Name != W.pPr &&
  325. rightGrouped[iRight].Key == "Row")
  326. {
  327. var insertedCorrelatedSequence = new CorrelatedSequence
  328. {
  329. ComparisonUnitArray1 = null,
  330. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  331. CorrelationStatus = CorrelationStatus.Inserted
  332. };
  333. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  334. ++iRight;
  335. }
  336. else if (rightGrouped[iRight].Key == "Word" &&
  337. rightGrouped[iRight]
  338. .Select(lg => lg.DescendantContentAtoms())
  339. .SelectMany(m => m)
  340. .Last()
  341. .ContentElement
  342. .Name != W.pPr &&
  343. leftGrouped[iLeft].Key == "Row")
  344. {
  345. var insertedCorrelatedSequence = new CorrelatedSequence
  346. {
  347. ComparisonUnitArray1 = null,
  348. ComparisonUnitArray2 = leftGrouped[iLeft].ToArray(),
  349. CorrelationStatus = CorrelationStatus.Inserted
  350. };
  351. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  352. ++iLeft;
  353. }
  354. else if (leftGrouped[iLeft].Key == "Word" && rightGrouped[iRight].Key != "Word")
  355. {
  356. var deletedCorrelatedSequence = new CorrelatedSequence
  357. {
  358. ComparisonUnitArray1 = leftGrouped[iLeft].ToArray(),
  359. ComparisonUnitArray2 = null,
  360. CorrelationStatus = CorrelationStatus.Deleted
  361. };
  362. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  363. ++iLeft;
  364. }
  365. else if (leftGrouped[iLeft].Key != "Word" && rightGrouped[iRight].Key == "Word")
  366. {
  367. var insertedCorrelatedSequence = new CorrelatedSequence
  368. {
  369. ComparisonUnitArray1 = null,
  370. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  371. CorrelationStatus = CorrelationStatus.Inserted
  372. };
  373. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  374. ++iRight;
  375. }
  376. if (iLeft == leftGrouped.Length && iRight == rightGrouped.Length)
  377. return newListOfCorrelatedSequence;
  378. // if there is content on the left, but not content on the right
  379. if (iRight == rightGrouped.Length)
  380. {
  381. for (int j = iLeft; j < leftGrouped.Length; j++)
  382. {
  383. var deletedCorrelatedSequence = new CorrelatedSequence
  384. {
  385. ComparisonUnitArray1 = leftGrouped[j].ToArray(),
  386. ComparisonUnitArray2 = null,
  387. CorrelationStatus = CorrelationStatus.Deleted
  388. };
  389. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  390. }
  391. return newListOfCorrelatedSequence;
  392. }
  393. // there is content on the right but not on the left
  394. if (iLeft == leftGrouped.Length)
  395. {
  396. for (int j = iRight; j < rightGrouped.Length; j++)
  397. {
  398. var insertedCorrelatedSequence = new CorrelatedSequence
  399. {
  400. ComparisonUnitArray1 = null,
  401. ComparisonUnitArray2 = rightGrouped[j].ToArray(),
  402. CorrelationStatus = CorrelationStatus.Inserted
  403. };
  404. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  405. }
  406. return newListOfCorrelatedSequence;
  407. }
  408. // else continue on next round.
  409. }
  410. }
  411. // if both sides contain tables and paragraphs, then split into multiple unknown corr sequ
  412. if (leftTables > 0 && rightTables > 0 &&
  413. leftParagraphs > 0 && rightParagraphs > 0 &&
  414. (leftLength > 1 || rightLength > 1))
  415. {
  416. IGrouping<string, ComparisonUnit>[] leftGrouped = unknown
  417. .ComparisonUnitArray1
  418. .GroupAdjacent(cu =>
  419. {
  420. var cug = (ComparisonUnitGroup) cu;
  421. return cug.ComparisonUnitGroupType == ComparisonUnitGroupType.Table ? "Table" : "Para";
  422. })
  423. .ToArray();
  424. IGrouping<string, ComparisonUnit>[] rightGrouped = unknown
  425. .ComparisonUnitArray2
  426. .GroupAdjacent(cu =>
  427. {
  428. var cug = (ComparisonUnitGroup) cu;
  429. return cug.ComparisonUnitGroupType == ComparisonUnitGroupType.Table ? "Table" : "Para";
  430. })
  431. .ToArray();
  432. var iLeft = 0;
  433. var iRight = 0;
  434. // create an unknown corr sequ for it.
  435. // increment both counters
  436. // if one is at end but the other is not, then tag the remaining content as inserted or deleted, and done.
  437. // if both are at the end, then done
  438. // return the new list of corr sequ
  439. while (true)
  440. {
  441. if (leftGrouped[iLeft].Key == "Table" && rightGrouped[iRight].Key == "Table" ||
  442. leftGrouped[iLeft].Key == "Para" && rightGrouped[iRight].Key == "Para")
  443. {
  444. var unknownCorrelatedSequence = new CorrelatedSequence
  445. {
  446. ComparisonUnitArray1 = leftGrouped[iLeft].ToArray(),
  447. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  448. CorrelationStatus = CorrelationStatus.Unknown
  449. };
  450. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  451. ++iLeft;
  452. ++iRight;
  453. }
  454. else if (leftGrouped[iLeft].Key == "Para" && rightGrouped[iRight].Key == "Table")
  455. {
  456. var deletedCorrelatedSequence = new CorrelatedSequence
  457. {
  458. ComparisonUnitArray1 = leftGrouped[iLeft].ToArray(),
  459. ComparisonUnitArray2 = null,
  460. CorrelationStatus = CorrelationStatus.Deleted
  461. };
  462. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  463. ++iLeft;
  464. }
  465. else if (leftGrouped[iLeft].Key == "Table" && rightGrouped[iRight].Key == "Para")
  466. {
  467. var insertedCorrelatedSequence = new CorrelatedSequence
  468. {
  469. ComparisonUnitArray1 = null,
  470. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  471. CorrelationStatus = CorrelationStatus.Inserted
  472. };
  473. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  474. ++iRight;
  475. }
  476. if (iLeft == leftGrouped.Length && iRight == rightGrouped.Length)
  477. return newListOfCorrelatedSequence;
  478. // if there is content on the left, but not content on the right
  479. if (iRight == rightGrouped.Length)
  480. {
  481. for (int j = iLeft; j < leftGrouped.Length; j++)
  482. {
  483. var deletedCorrelatedSequence = new CorrelatedSequence
  484. {
  485. ComparisonUnitArray1 = leftGrouped[j].ToArray(),
  486. ComparisonUnitArray2 = null,
  487. CorrelationStatus = CorrelationStatus.Deleted
  488. };
  489. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  490. }
  491. return newListOfCorrelatedSequence;
  492. }
  493. // there is content on the right but not on the left
  494. if (iLeft == leftGrouped.Length)
  495. {
  496. for (int j = iRight; j < rightGrouped.Length; j++)
  497. {
  498. var insertedCorrelatedSequence = new CorrelatedSequence
  499. {
  500. ComparisonUnitArray1 = null,
  501. ComparisonUnitArray2 = rightGrouped[j].ToArray(),
  502. CorrelationStatus = CorrelationStatus.Inserted
  503. };
  504. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  505. }
  506. return newListOfCorrelatedSequence;
  507. }
  508. // else continue on next round.
  509. }
  510. }
  511. // If both sides consists of a single table, and if the table contains merged cells, then mark as deleted/inserted
  512. if (leftTables == 1 && leftLength == 1 &&
  513. rightTables == 1 && rightLength == 1)
  514. {
  515. List<CorrelatedSequence> result = DoLcsAlgorithmForTable(unknown);
  516. if (result != null)
  517. return result;
  518. }
  519. // If either side contains only paras or tables, then flatten and iterate.
  520. bool leftOnlyParasTablesTextboxes = leftLength == leftTables + leftParagraphs + leftTextboxes;
  521. bool rightOnlyParasTablesTextboxes = rightLength == rightTables + rightParagraphs + rightTextboxes;
  522. if (leftOnlyParasTablesTextboxes && rightOnlyParasTablesTextboxes)
  523. {
  524. // flatten paras and tables, and iterate
  525. ComparisonUnit[] left = unknown
  526. .ComparisonUnitArray1
  527. .Select(cu => cu.Contents)
  528. .SelectMany(m => m)
  529. .ToArray();
  530. ComparisonUnit[] right = unknown
  531. .ComparisonUnitArray2
  532. .Select(cu => cu.Contents)
  533. .SelectMany(m => m)
  534. .ToArray();
  535. var unknownCorrelatedSequence = new CorrelatedSequence
  536. {
  537. CorrelationStatus = CorrelationStatus.Unknown,
  538. ComparisonUnitArray1 = left,
  539. ComparisonUnitArray2 = right
  540. };
  541. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  542. return newListOfCorrelatedSequence;
  543. }
  544. // if first of left is a row and first of right is a row
  545. // then flatten the row to cells and iterate.
  546. if (unknown.ComparisonUnitArray1.FirstOrDefault() is ComparisonUnitGroup firstLeft &&
  547. unknown.ComparisonUnitArray2.FirstOrDefault() is ComparisonUnitGroup firstRight)
  548. {
  549. if (firstLeft.ComparisonUnitGroupType == ComparisonUnitGroupType.Row &&
  550. firstRight.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  551. {
  552. ComparisonUnit[] leftContent = firstLeft.Contents.ToArray();
  553. ComparisonUnit[] rightContent = firstRight.Contents.ToArray();
  554. int lenLeft = leftContent.Length;
  555. int lenRight = rightContent.Length;
  556. if (lenLeft < lenRight)
  557. {
  558. leftContent = leftContent
  559. .Concat(Enumerable.Repeat<ComparisonUnit>(null, lenRight - lenLeft))
  560. .ToArray();
  561. }
  562. else if (lenRight < lenLeft)
  563. {
  564. rightContent = rightContent
  565. .Concat(Enumerable.Repeat<ComparisonUnit>(null, lenLeft - lenRight))
  566. .ToArray();
  567. }
  568. List<CorrelatedSequence> newCs = leftContent.Zip(rightContent, (l, r) =>
  569. {
  570. if (l != null && r != null)
  571. {
  572. var unknownCorrelatedSequence = new CorrelatedSequence
  573. {
  574. ComparisonUnitArray1 = new[] { l },
  575. ComparisonUnitArray2 = new[] { r },
  576. CorrelationStatus = CorrelationStatus.Unknown
  577. };
  578. return new[] { unknownCorrelatedSequence };
  579. }
  580. if (l == null)
  581. {
  582. var insertedCorrelatedSequence = new CorrelatedSequence
  583. {
  584. ComparisonUnitArray1 = null,
  585. ComparisonUnitArray2 = r.Contents.ToArray(),
  586. CorrelationStatus = CorrelationStatus.Inserted
  587. };
  588. return new[] { insertedCorrelatedSequence };
  589. }
  590. var deletedCorrelatedSequence = new CorrelatedSequence
  591. {
  592. ComparisonUnitArray1 = l.Contents.ToArray(),
  593. ComparisonUnitArray2 = null,
  594. CorrelationStatus = CorrelationStatus.Deleted
  595. };
  596. return new[] { deletedCorrelatedSequence };
  597. })
  598. .SelectMany(m => m)
  599. .ToList();
  600. foreach (CorrelatedSequence cs in newCs)
  601. {
  602. newListOfCorrelatedSequence.Add(cs);
  603. }
  604. ComparisonUnit[] remainderLeft = unknown
  605. .ComparisonUnitArray1
  606. .Skip(1)
  607. .ToArray();
  608. ComparisonUnit[] remainderRight = unknown
  609. .ComparisonUnitArray2
  610. .Skip(1)
  611. .ToArray();
  612. if (remainderLeft.Length > 0 && remainderRight.Length == 0)
  613. {
  614. var deletedCorrelatedSequence = new CorrelatedSequence
  615. {
  616. CorrelationStatus = CorrelationStatus.Deleted,
  617. ComparisonUnitArray1 = remainderLeft,
  618. ComparisonUnitArray2 = null
  619. };
  620. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  621. }
  622. else if (remainderRight.Length > 0 && remainderLeft.Length == 0)
  623. {
  624. var insertedCorrelatedSequence = new CorrelatedSequence
  625. {
  626. CorrelationStatus = CorrelationStatus.Inserted,
  627. ComparisonUnitArray1 = null,
  628. ComparisonUnitArray2 = remainderRight
  629. };
  630. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  631. }
  632. else if (remainderLeft.Length > 0 && remainderRight.Length > 0)
  633. {
  634. var unknownCorrelatedSequence2 = new CorrelatedSequence
  635. {
  636. CorrelationStatus = CorrelationStatus.Unknown,
  637. ComparisonUnitArray1 = remainderLeft,
  638. ComparisonUnitArray2 = remainderRight
  639. };
  640. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence2);
  641. }
  642. if (False)
  643. {
  644. var sb = new StringBuilder();
  645. foreach (CorrelatedSequence item in newListOfCorrelatedSequence)
  646. {
  647. sb.Append(item).Append(Environment.NewLine);
  648. }
  649. string sbs = sb.ToString();
  650. TestUtil.NotePad(sbs);
  651. }
  652. return newListOfCorrelatedSequence;
  653. }
  654. if (firstLeft.ComparisonUnitGroupType == ComparisonUnitGroupType.Cell &&
  655. firstRight.ComparisonUnitGroupType == ComparisonUnitGroupType.Cell)
  656. {
  657. ComparisonUnit[] left = firstLeft
  658. .Contents
  659. .ToArray();
  660. ComparisonUnit[] right = firstRight
  661. .Contents
  662. .ToArray();
  663. var unknownCorrelatedSequence = new CorrelatedSequence
  664. {
  665. CorrelationStatus = CorrelationStatus.Unknown,
  666. ComparisonUnitArray1 = left,
  667. ComparisonUnitArray2 = right
  668. };
  669. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  670. ComparisonUnit[] remainderLeft = unknown
  671. .ComparisonUnitArray1
  672. .Skip(1)
  673. .ToArray();
  674. ComparisonUnit[] remainderRight = unknown
  675. .ComparisonUnitArray2
  676. .Skip(1)
  677. .ToArray();
  678. if (remainderLeft.Length > 0 && remainderRight.Length == 0)
  679. {
  680. var deletedCorrelatedSequence = new CorrelatedSequence
  681. {
  682. CorrelationStatus = CorrelationStatus.Deleted,
  683. ComparisonUnitArray1 = remainderLeft,
  684. ComparisonUnitArray2 = null
  685. };
  686. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  687. }
  688. else if (remainderRight.Length > 0 && remainderLeft.Length == 0)
  689. {
  690. var insertedCorrelatedSequence = new CorrelatedSequence
  691. {
  692. CorrelationStatus = CorrelationStatus.Inserted,
  693. ComparisonUnitArray1 = null,
  694. ComparisonUnitArray2 = remainderRight
  695. };
  696. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  697. }
  698. else if (remainderLeft.Length > 0 && remainderRight.Length > 0)
  699. {
  700. var unknownCorrelatedSequence2 = new CorrelatedSequence
  701. {
  702. CorrelationStatus = CorrelationStatus.Unknown,
  703. ComparisonUnitArray1 = remainderLeft,
  704. ComparisonUnitArray2 = remainderRight
  705. };
  706. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence2);
  707. }
  708. return newListOfCorrelatedSequence;
  709. }
  710. }
  711. if (unknown.ComparisonUnitArray1.Any() && unknown.ComparisonUnitArray2.Any())
  712. {
  713. if (unknown.ComparisonUnitArray1.First() is ComparisonUnitWord &&
  714. unknown.ComparisonUnitArray2.First() is ComparisonUnitGroup right &&
  715. right.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  716. {
  717. var insertedCorrelatedSequence3 = new CorrelatedSequence
  718. {
  719. CorrelationStatus = CorrelationStatus.Inserted,
  720. ComparisonUnitArray1 = null,
  721. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  722. };
  723. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence3);
  724. var deletedCorrelatedSequence3 = new CorrelatedSequence
  725. {
  726. CorrelationStatus = CorrelationStatus.Deleted,
  727. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  728. ComparisonUnitArray2 = null
  729. };
  730. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence3);
  731. return newListOfCorrelatedSequence;
  732. }
  733. if (unknown.ComparisonUnitArray2.First() is ComparisonUnitWord &&
  734. unknown.ComparisonUnitArray1.First() is ComparisonUnitGroup left2 &&
  735. left2.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  736. {
  737. var deletedCorrelatedSequence3 = new CorrelatedSequence
  738. {
  739. CorrelationStatus = CorrelationStatus.Deleted,
  740. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  741. ComparisonUnitArray2 = null
  742. };
  743. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence3);
  744. var insertedCorrelatedSequence3 = new CorrelatedSequence
  745. {
  746. CorrelationStatus = CorrelationStatus.Inserted,
  747. ComparisonUnitArray1 = null,
  748. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  749. };
  750. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence3);
  751. return newListOfCorrelatedSequence;
  752. }
  753. ComparisonUnitAtom lastContentAtomLeft = unknown
  754. .ComparisonUnitArray1
  755. .Select(cu => cu.DescendantContentAtoms().Last())
  756. .LastOrDefault();
  757. ComparisonUnitAtom lastContentAtomRight = unknown
  758. .ComparisonUnitArray2
  759. .Select(cu => cu.DescendantContentAtoms().Last())
  760. .LastOrDefault();
  761. if (lastContentAtomLeft != null && lastContentAtomRight != null)
  762. {
  763. if (lastContentAtomLeft.ContentElement.Name == W.pPr &&
  764. lastContentAtomRight.ContentElement.Name != W.pPr)
  765. {
  766. var insertedCorrelatedSequence5 = new CorrelatedSequence
  767. {
  768. CorrelationStatus = CorrelationStatus.Inserted,
  769. ComparisonUnitArray1 = null,
  770. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  771. };
  772. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence5);
  773. var deletedCorrelatedSequence5 = new CorrelatedSequence
  774. {
  775. CorrelationStatus = CorrelationStatus.Deleted,
  776. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  777. ComparisonUnitArray2 = null
  778. };
  779. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence5);
  780. return newListOfCorrelatedSequence;
  781. }
  782. if (lastContentAtomLeft.ContentElement.Name != W.pPr &&
  783. lastContentAtomRight.ContentElement.Name == W.pPr)
  784. {
  785. var deletedCorrelatedSequence5 = new CorrelatedSequence
  786. {
  787. CorrelationStatus = CorrelationStatus.Deleted,
  788. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  789. ComparisonUnitArray2 = null
  790. };
  791. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence5);
  792. var insertedCorrelatedSequence5 = new CorrelatedSequence
  793. {
  794. CorrelationStatus = CorrelationStatus.Inserted,
  795. ComparisonUnitArray1 = null,
  796. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  797. };
  798. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence5);
  799. return newListOfCorrelatedSequence;
  800. }
  801. }
  802. }
  803. var deletedCorrelatedSequence4 = new CorrelatedSequence
  804. {
  805. CorrelationStatus = CorrelationStatus.Deleted,
  806. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  807. ComparisonUnitArray2 = null
  808. };
  809. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence4);
  810. var insertedCorrelatedSequence4 = new CorrelatedSequence
  811. {
  812. CorrelationStatus = CorrelationStatus.Inserted,
  813. ComparisonUnitArray1 = null,
  814. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  815. };
  816. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence4);
  817. return newListOfCorrelatedSequence;
  818. }
  819. ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  820. // here we have the longest common subsequence.
  821. // but it may start in the middle of a paragraph.
  822. // therefore need to dispose of the content from the beginning of the longest common subsequence to the
  823. // beginning of the paragraph.
  824. // this should be in a separate unknown region
  825. // if countCommonAtEnd != 0, and if it contains a paragraph mark, then if there are comparison units in
  826. // the same paragraph before the common at end (in either version)
  827. // then we want to put all of those comparison units into a single unknown, where they must be resolved
  828. // against each other. We don't want those comparison units to go into the middle unknown comparison unit.
  829. var remainingInLeftParagraph = 0;
  830. var remainingInRightParagraph = 0;
  831. if (currentLongestCommonSequenceLength != 0)
  832. {
  833. List<ComparisonUnit> commonSeq = unknown
  834. .ComparisonUnitArray1
  835. .Skip(currentI1)
  836. .Take(currentLongestCommonSequenceLength)
  837. .ToList();
  838. ComparisonUnit firstOfCommonSeq = commonSeq.First();
  839. if (firstOfCommonSeq is ComparisonUnitWord)
  840. {
  841. // are there any paragraph marks in the common seq at end?
  842. if (commonSeq.Any(cu =>
  843. {
  844. ComparisonUnitAtom firstComparisonUnitAtom = cu.Contents.OfType<ComparisonUnitAtom>().FirstOrDefault();
  845. if (firstComparisonUnitAtom == null)
  846. return false;
  847. return firstComparisonUnitAtom.ContentElement.Name == W.pPr;
  848. }))
  849. {
  850. remainingInLeftParagraph = unknown
  851. .ComparisonUnitArray1
  852. .Take(currentI1)
  853. .Reverse()
  854. .TakeWhile(cu =>
  855. {
  856. if (!(cu is ComparisonUnitWord))
  857. return false;
  858. ComparisonUnitAtom firstComparisonUnitAtom =
  859. cu.Contents.OfType<ComparisonUnitAtom>().FirstOrDefault();
  860. if (firstComparisonUnitAtom == null)
  861. return true;
  862. return firstComparisonUnitAtom.ContentElement.Name != W.pPr;
  863. })
  864. .Count();
  865. remainingInRightParagraph = unknown
  866. .ComparisonUnitArray2
  867. .Take(currentI2)
  868. .Reverse()
  869. .TakeWhile(cu =>
  870. {
  871. if (!(cu is ComparisonUnitWord))
  872. return false;
  873. ComparisonUnitAtom firstComparisonUnitAtom =
  874. cu.Contents.OfType<ComparisonUnitAtom>().FirstOrDefault();
  875. if (firstComparisonUnitAtom == null)
  876. return true;
  877. return firstComparisonUnitAtom.ContentElement.Name != W.pPr;
  878. })
  879. .Count();
  880. }
  881. }
  882. }
  883. int countBeforeCurrentParagraphLeft = currentI1 - remainingInLeftParagraph;
  884. int countBeforeCurrentParagraphRight = currentI2 - remainingInRightParagraph;
  885. if (countBeforeCurrentParagraphLeft > 0 && countBeforeCurrentParagraphRight == 0)
  886. {
  887. var deletedCorrelatedSequence = new CorrelatedSequence
  888. {
  889. CorrelationStatus = CorrelationStatus.Deleted,
  890. ComparisonUnitArray1 = cul1
  891. .Take(countBeforeCurrentParagraphLeft)
  892. .ToArray(),
  893. ComparisonUnitArray2 = null
  894. };
  895. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  896. }
  897. else if (countBeforeCurrentParagraphLeft == 0 && countBeforeCurrentParagraphRight > 0)
  898. {
  899. var insertedCorrelatedSequence = new CorrelatedSequence
  900. {
  901. CorrelationStatus = CorrelationStatus.Inserted,
  902. ComparisonUnitArray1 = null,
  903. ComparisonUnitArray2 = cul2
  904. .Take(countBeforeCurrentParagraphRight)
  905. .ToArray()
  906. };
  907. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  908. }
  909. else if (countBeforeCurrentParagraphLeft > 0 && countBeforeCurrentParagraphRight > 0)
  910. {
  911. var unknownCorrelatedSequence = new CorrelatedSequence
  912. {
  913. CorrelationStatus = CorrelationStatus.Unknown,
  914. ComparisonUnitArray1 = cul1
  915. .Take(countBeforeCurrentParagraphLeft)
  916. .ToArray(),
  917. ComparisonUnitArray2 = cul2
  918. .Take(countBeforeCurrentParagraphRight)
  919. .ToArray()
  920. };
  921. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  922. }
  923. else if (countBeforeCurrentParagraphLeft == 0 && countBeforeCurrentParagraphRight == 0)
  924. {
  925. // nothing to do
  926. }
  927. if (remainingInLeftParagraph > 0 && remainingInRightParagraph == 0)
  928. {
  929. var deletedCorrelatedSequence = new CorrelatedSequence
  930. {
  931. CorrelationStatus = CorrelationStatus.Deleted,
  932. ComparisonUnitArray1 = cul1
  933. .Skip(countBeforeCurrentParagraphLeft)
  934. .Take(remainingInLeftParagraph)
  935. .ToArray(),
  936. ComparisonUnitArray2 = null
  937. };
  938. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  939. }
  940. else if (remainingInLeftParagraph == 0 && remainingInRightParagraph > 0)
  941. {
  942. var insertedCorrelatedSequence = new CorrelatedSequence
  943. {
  944. CorrelationStatus = CorrelationStatus.Inserted,
  945. ComparisonUnitArray1 = null,
  946. ComparisonUnitArray2 = cul2
  947. .Skip(countBeforeCurrentParagraphRight)
  948. .Take(remainingInRightParagraph)
  949. .ToArray()
  950. };
  951. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  952. }
  953. else if (remainingInLeftParagraph > 0 && remainingInRightParagraph > 0)
  954. {
  955. var unknownCorrelatedSequence = new CorrelatedSequence
  956. {
  957. CorrelationStatus = CorrelationStatus.Unknown,
  958. ComparisonUnitArray1 = cul1
  959. .Skip(countBeforeCurrentParagraphLeft)
  960. .Take(remainingInLeftParagraph)
  961. .ToArray(),
  962. ComparisonUnitArray2 = cul2
  963. .Skip(countBeforeCurrentParagraphRight)
  964. .Take(remainingInRightParagraph)
  965. .ToArray()
  966. };
  967. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  968. }
  969. else if (remainingInLeftParagraph == 0 && remainingInRightParagraph == 0)
  970. {
  971. // nothing to do
  972. }
  973. var middleEqual = new CorrelatedSequence
  974. {
  975. CorrelationStatus = CorrelationStatus.Equal,
  976. ComparisonUnitArray1 = cul1
  977. .Skip(currentI1)
  978. .Take(currentLongestCommonSequenceLength)
  979. .ToArray(),
  980. ComparisonUnitArray2 = cul2
  981. .Skip(currentI2)
  982. .Take(currentLongestCommonSequenceLength)
  983. .ToArray()
  984. };
  985. newListOfCorrelatedSequence.Add(middleEqual);
  986. int endI1 = currentI1 + currentLongestCommonSequenceLength;
  987. int endI2 = currentI2 + currentLongestCommonSequenceLength;
  988. ComparisonUnit[] remaining1 = cul1
  989. .Skip(endI1)
  990. .ToArray();
  991. ComparisonUnit[] remaining2 = cul2
  992. .Skip(endI2)
  993. .ToArray();
  994. // here is the point that we want to make a new unknown from this point to the end of the paragraph that
  995. // contains the equal parts.
  996. // this will never hurt anything, and will in many cases result in a better difference.
  997. if (middleEqual.ComparisonUnitArray1[middleEqual.ComparisonUnitArray1.Length - 1] is ComparisonUnitWord leftCuw)
  998. {
  999. ComparisonUnitAtom lastContentAtom = leftCuw.DescendantContentAtoms().LastOrDefault();
  1000. // if the middleEqual did not end with a paragraph mark
  1001. if (lastContentAtom != null && lastContentAtom.ContentElement.Name != W.pPr)
  1002. {
  1003. int idx1 = FindIndexOfNextParaMark(remaining1);
  1004. int idx2 = FindIndexOfNextParaMark(remaining2);
  1005. var unknownCorrelatedSequenceRemaining = new CorrelatedSequence
  1006. {
  1007. CorrelationStatus = CorrelationStatus.Unknown,
  1008. ComparisonUnitArray1 = remaining1.Take(idx1).ToArray(),
  1009. ComparisonUnitArray2 = remaining2.Take(idx2).ToArray()
  1010. };
  1011. newListOfCorrelatedSequence.Add(unknownCorrelatedSequenceRemaining);
  1012. var unknownCorrelatedSequenceAfter = new CorrelatedSequence
  1013. {
  1014. CorrelationStatus = CorrelationStatus.Unknown,
  1015. ComparisonUnitArray1 = remaining1.Skip(idx1).ToArray(),
  1016. ComparisonUnitArray2 = remaining2.Skip(idx2).ToArray()
  1017. };
  1018. newListOfCorrelatedSequence.Add(unknownCorrelatedSequenceAfter);
  1019. return newListOfCorrelatedSequence;
  1020. }
  1021. }
  1022. var unknownCorrelatedSequence20 = new CorrelatedSequence
  1023. {
  1024. CorrelationStatus = CorrelationStatus.Unknown,
  1025. ComparisonUnitArray1 = remaining1,
  1026. ComparisonUnitArray2 = remaining2
  1027. };
  1028. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence20);
  1029. return newListOfCorrelatedSequence;
  1030. }
  1031. private static List<CorrelatedSequence> DoLcsAlgorithmForTable(CorrelatedSequence unknown)
  1032. {
  1033. var newListOfCorrelatedSequence = new List<CorrelatedSequence>();
  1034. ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  1035. // if we have a table with the same number of rows, and all rows have equal CorrelatedSHA1Hash, then we can
  1036. // flatten and compare every corresponding row.
  1037. // This is true regardless of whether there are horizontally or vertically merged cells, since that
  1038. // characteristic is incorporated into the CorrespondingSHA1Hash. This is probably not very common, but it
  1039. // will never do any harm.
  1040. var tblGroup1 = (ComparisonUnitGroup) unknown.ComparisonUnitArray1.First();
  1041. var tblGroup2 = (ComparisonUnitGroup) unknown.ComparisonUnitArray2.First();
  1042. if (tblGroup1.Contents.Count == tblGroup2.Contents.Count) // if there are the same number of rows
  1043. {
  1044. var zipped = tblGroup1
  1045. .Contents
  1046. .Zip(
  1047. tblGroup2.Contents,
  1048. (r1, r2) => new
  1049. {
  1050. Row1 = r1 as ComparisonUnitGroup,
  1051. Row2 = r2 as ComparisonUnitGroup
  1052. })
  1053. .ToList();
  1054. bool canCollapse = zipped.All(z => z.Row1.CorrelatedSHA1Hash == z.Row2.CorrelatedSHA1Hash);
  1055. if (canCollapse)
  1056. {
  1057. newListOfCorrelatedSequence = zipped
  1058. .Select(z =>
  1059. {
  1060. var unknownCorrelatedSequence = new CorrelatedSequence
  1061. {
  1062. ComparisonUnitArray1 = new ComparisonUnit[] { z.Row1 },
  1063. ComparisonUnitArray2 = new ComparisonUnit[] { z.Row2 },
  1064. CorrelationStatus = CorrelationStatus.Unknown
  1065. };
  1066. return unknownCorrelatedSequence;
  1067. })
  1068. .ToList();
  1069. return newListOfCorrelatedSequence;
  1070. }
  1071. }
  1072. ComparisonUnitAtom firstContentAtom1 = tblGroup1.DescendantContentAtoms().FirstOrDefault();
  1073. if (firstContentAtom1 == null)
  1074. {
  1075. throw new OpenXmlPowerToolsException("Internal error");
  1076. }
  1077. XElement tblElement1 = firstContentAtom1
  1078. .AncestorElements
  1079. .Reverse()
  1080. .First(a => a.Name == W.tbl);
  1081. ComparisonUnitAtom firstContentAtom2 = tblGroup2.DescendantContentAtoms().FirstOrDefault();
  1082. if (firstContentAtom2 == null)
  1083. {
  1084. throw new OpenXmlPowerToolsException("Internal error");
  1085. }
  1086. XElement tblElement2 = firstContentAtom2
  1087. .AncestorElements
  1088. .Reverse()
  1089. .First(a => a.Name == W.tbl);
  1090. bool leftContainsMerged = tblElement1
  1091. .Descendants()
  1092. .Any(d => d.Name == W.vMerge || d.Name == W.gridSpan);
  1093. bool rightContainsMerged = tblElement2
  1094. .Descendants()
  1095. .Any(d => d.Name == W.vMerge || d.Name == W.gridSpan);
  1096. if (leftContainsMerged || rightContainsMerged)
  1097. {
  1098. // If StructureSha1Hash is the same for both tables, then we know that the structure of the tables is
  1099. // identical, so we can break into correlated sequences for rows.
  1100. if (tblGroup1.StructureSHA1Hash != null &&
  1101. tblGroup2.StructureSHA1Hash != null &&
  1102. tblGroup1.StructureSHA1Hash == tblGroup2.StructureSHA1Hash)
  1103. {
  1104. var zipped = tblGroup1.Contents.Zip(tblGroup2.Contents, (r1, r2) => new
  1105. {
  1106. Row1 = r1 as ComparisonUnitGroup,
  1107. Row2 = r2 as ComparisonUnitGroup
  1108. });
  1109. newListOfCorrelatedSequence = zipped
  1110. .Select(z =>
  1111. {
  1112. var unknownCorrelatedSequence = new CorrelatedSequence
  1113. {
  1114. ComparisonUnitArray1 = new ComparisonUnit[] { z.Row1 },
  1115. ComparisonUnitArray2 = new ComparisonUnit[] { z.Row2 },
  1116. CorrelationStatus = CorrelationStatus.Unknown
  1117. };
  1118. return unknownCorrelatedSequence;
  1119. })
  1120. .ToList();
  1121. return newListOfCorrelatedSequence;
  1122. }
  1123. // otherwise flatten to rows
  1124. var deletedCorrelatedSequence = new CorrelatedSequence
  1125. {
  1126. ComparisonUnitArray1 = unknown
  1127. .ComparisonUnitArray1
  1128. .Select(z => z.Contents)
  1129. .SelectMany(m => m)
  1130. .ToArray(),
  1131. ComparisonUnitArray2 = null,
  1132. CorrelationStatus = CorrelationStatus.Deleted
  1133. };
  1134. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  1135. var insertedCorrelatedSequence = new CorrelatedSequence
  1136. {
  1137. ComparisonUnitArray1 = null,
  1138. ComparisonUnitArray2 = unknown
  1139. .ComparisonUnitArray2
  1140. .Select(z => z.Contents)
  1141. .SelectMany(m => m)
  1142. .ToArray(),
  1143. CorrelationStatus = CorrelationStatus.Inserted
  1144. };
  1145. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  1146. return newListOfCorrelatedSequence;
  1147. }
  1148. return null;
  1149. }
  1150. private static int FindIndexOfNextParaMark(ComparisonUnit[] cul)
  1151. {
  1152. for (var i = 0; i < cul.Length; i++)
  1153. {
  1154. var cuw = (ComparisonUnitWord) cul[i];
  1155. ComparisonUnitAtom lastAtom = cuw.DescendantContentAtoms().LastOrDefault();
  1156. if (lastAtom?.ContentElement.Name == W.pPr)
  1157. {
  1158. return i;
  1159. }
  1160. }
  1161. return cul.Length;
  1162. }
  1163. }
  1164. }