WmlComparer.Private.Methods.Lcs.cs 61 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. using OpenXmlPowerTools.Previous;
  4. using System;
  5. using System.Collections.Generic;
  6. using System.Linq;
  7. using System.Text;
  8. using System.Xml.Linq;
  9. namespace OpenXmlPowerTools
  10. {
  11. public static partial class WmlComparer
  12. {
  13. private static List<CorrelatedSequence> DoLcsAlgorithm(CorrelatedSequence unknown, WmlComparerSettings settings)
  14. {
  15. var newListOfCorrelatedSequence = new List<CorrelatedSequence>();
  16. ComparisonUnit[] cul1 = unknown.ComparisonUnitArray1;
  17. ComparisonUnit[] cul2 = unknown.ComparisonUnitArray2;
  18. // first thing to do - if we have an unknown with zero length on left or right side, create appropriate
  19. // this is a code optimization that enables easier processing of cases elsewhere.
  20. if (cul1.Length > 0 && cul2.Length == 0)
  21. {
  22. var deletedCorrelatedSequence = new CorrelatedSequence
  23. {
  24. CorrelationStatus = CorrelationStatus.Deleted,
  25. ComparisonUnitArray1 = cul1,
  26. ComparisonUnitArray2 = null
  27. };
  28. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  29. return newListOfCorrelatedSequence;
  30. }
  31. if (cul1.Length == 0 && cul2.Length > 0)
  32. {
  33. var insertedCorrelatedSequence = new CorrelatedSequence
  34. {
  35. CorrelationStatus = CorrelationStatus.Inserted,
  36. ComparisonUnitArray1 = null,
  37. ComparisonUnitArray2 = cul2
  38. };
  39. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  40. return newListOfCorrelatedSequence;
  41. }
  42. if (cul1.Length == 0 && cul2.Length == 0)
  43. {
  44. // this will effectively remove the unknown with no data on either side from the current data model.
  45. return newListOfCorrelatedSequence;
  46. }
  47. var currentLongestCommonSequenceLength = 0;
  48. int currentI1 = -1;
  49. int currentI2 = -1;
  50. for (var i1 = 0; i1 < cul1.Length - currentLongestCommonSequenceLength; i1++)
  51. {
  52. for (var i2 = 0; i2 < cul2.Length - currentLongestCommonSequenceLength; i2++)
  53. {
  54. var thisSequenceLength = 0;
  55. int thisI1 = i1;
  56. int thisI2 = i2;
  57. while (true)
  58. {
  59. if (cul1[thisI1].SHA1Hash == cul2[thisI2].SHA1Hash)
  60. {
  61. thisI1++;
  62. thisI2++;
  63. thisSequenceLength++;
  64. if (thisI1 == cul1.Length || thisI2 == cul2.Length)
  65. {
  66. if (thisSequenceLength > currentLongestCommonSequenceLength)
  67. {
  68. currentLongestCommonSequenceLength = thisSequenceLength;
  69. currentI1 = i1;
  70. currentI2 = i2;
  71. }
  72. break;
  73. }
  74. }
  75. else
  76. {
  77. if (thisSequenceLength > currentLongestCommonSequenceLength)
  78. {
  79. currentLongestCommonSequenceLength = thisSequenceLength;
  80. currentI1 = i1;
  81. currentI2 = i2;
  82. }
  83. break;
  84. }
  85. }
  86. }
  87. }
  88. // never start a common section with a paragraph mark.
  89. while (true)
  90. {
  91. if (currentLongestCommonSequenceLength <= 1)
  92. break;
  93. ComparisonUnit firstCommon = cul1[currentI1];
  94. if (!(firstCommon is ComparisonUnitWord firstCommonWord))
  95. break;
  96. // if the word contains more than one atom, then not a paragraph mark
  97. if (firstCommonWord.Contents.Count != 1)
  98. break;
  99. if (!(firstCommonWord.Contents.First() is ComparisonUnitAtom firstCommonAtom))
  100. break;
  101. if (firstCommonAtom.ContentElement.Name != W.pPr)
  102. break;
  103. --currentLongestCommonSequenceLength;
  104. if (currentLongestCommonSequenceLength == 0)
  105. {
  106. currentI1 = -1;
  107. currentI2 = -1;
  108. }
  109. else
  110. {
  111. ++currentI1;
  112. ++currentI2;
  113. }
  114. }
  115. var isOnlyParagraphMark = false;
  116. if (currentLongestCommonSequenceLength == 1)
  117. {
  118. ComparisonUnit firstCommon = cul1[currentI1];
  119. if (firstCommon is ComparisonUnitWord firstCommonWord)
  120. {
  121. // if the word contains more than one atom, then not a paragraph mark
  122. if (firstCommonWord.Contents.Count == 1)
  123. {
  124. if (firstCommonWord.Contents.First() is ComparisonUnitAtom firstCommonAtom)
  125. {
  126. if (firstCommonAtom.ContentElement.Name == W.pPr)
  127. isOnlyParagraphMark = true;
  128. }
  129. }
  130. }
  131. }
  132. // don't match just a single character
  133. if (currentLongestCommonSequenceLength == 1)
  134. {
  135. if (cul2[currentI2] is ComparisonUnitAtom cuw2)
  136. {
  137. if (cuw2.ContentElement.Name == W.t && cuw2.ContentElement.Value == " ")
  138. {
  139. currentI1 = -1;
  140. currentI2 = -1;
  141. currentLongestCommonSequenceLength = 0;
  142. }
  143. }
  144. }
  145. // don't match only word break characters
  146. if (currentLongestCommonSequenceLength > 0 && currentLongestCommonSequenceLength <= 3)
  147. {
  148. ComparisonUnit[] commonSequence = cul1.Skip(currentI1).Take(currentLongestCommonSequenceLength).ToArray();
  149. // if they are all ComparisonUnitWord objects
  150. bool oneIsNotWord = commonSequence.Any(cs => !(cs is ComparisonUnitWord));
  151. bool allAreWords = !oneIsNotWord;
  152. if (allAreWords)
  153. {
  154. bool contentOtherThanWordSplitChars = commonSequence
  155. .Cast<ComparisonUnitWord>()
  156. .Any(cs =>
  157. {
  158. bool otherThanText = cs.DescendantContentAtoms().Any(dca => dca.ContentElement.Name != W.t);
  159. if (otherThanText) return true;
  160. bool otherThanWordSplit = cs
  161. .DescendantContentAtoms()
  162. .Any(dca =>
  163. {
  164. string charValue = dca.ContentElement.Value;
  165. bool isWordSplit = settings.WordSeparators.Contains(charValue[0]);
  166. return !isWordSplit;
  167. });
  168. return otherThanWordSplit;
  169. });
  170. if (!contentOtherThanWordSplitChars)
  171. {
  172. currentI1 = -1;
  173. currentI2 = -1;
  174. currentLongestCommonSequenceLength = 0;
  175. }
  176. }
  177. }
  178. // if we are only looking at text, and if the longest common subsequence is less than 15% of the whole, then forget it,
  179. // don't find that LCS.
  180. if (!isOnlyParagraphMark && currentLongestCommonSequenceLength > 0)
  181. {
  182. bool anyButWord1 = cul1.Any(cu => !(cu is ComparisonUnitWord));
  183. bool anyButWord2 = cul2.Any(cu => !(cu is ComparisonUnitWord));
  184. if (!anyButWord1 && !anyButWord2)
  185. {
  186. int maxLen = Math.Max(cul1.Length, cul2.Length);
  187. if (currentLongestCommonSequenceLength / (double) maxLen < settings.DetailThreshold)
  188. {
  189. currentI1 = -1;
  190. currentI2 = -1;
  191. currentLongestCommonSequenceLength = 0;
  192. }
  193. }
  194. }
  195. if (currentI1 == -1 && currentI2 == -1)
  196. {
  197. int leftLength = unknown.ComparisonUnitArray1.Length;
  198. int leftTables = unknown
  199. .ComparisonUnitArray1
  200. .OfType<ComparisonUnitGroup>()
  201. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Table);
  202. int leftRows = unknown
  203. .ComparisonUnitArray1
  204. .OfType<ComparisonUnitGroup>()
  205. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Row);
  206. int leftParagraphs = unknown
  207. .ComparisonUnitArray1
  208. .OfType<ComparisonUnitGroup>()
  209. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Paragraph);
  210. int leftTextboxes = unknown
  211. .ComparisonUnitArray1
  212. .OfType<ComparisonUnitGroup>()
  213. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Textbox);
  214. int leftWords = unknown
  215. .ComparisonUnitArray1
  216. .OfType<ComparisonUnitWord>()
  217. .Count();
  218. int rightLength = unknown.ComparisonUnitArray2.Length;
  219. int rightTables = unknown
  220. .ComparisonUnitArray2
  221. .OfType<ComparisonUnitGroup>()
  222. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Table);
  223. int rightRows = unknown
  224. .ComparisonUnitArray2
  225. .OfType<ComparisonUnitGroup>()
  226. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Row);
  227. int rightParagraphs = unknown
  228. .ComparisonUnitArray2
  229. .OfType<ComparisonUnitGroup>()
  230. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Paragraph);
  231. int rightTextboxes = unknown
  232. .ComparisonUnitArray2
  233. .OfType<ComparisonUnitGroup>()
  234. .Count(l => l.ComparisonUnitGroupType == ComparisonUnitGroupType.Textbox);
  235. int rightWords = unknown
  236. .ComparisonUnitArray2
  237. .OfType<ComparisonUnitWord>()
  238. .Count();
  239. // if either side has both words, rows and text boxes, then we need to separate out into separate
  240. // unknown correlated sequences
  241. // group adjacent based on whether word, row, or textbox
  242. // in most cases, the count of groups will be the same, but they may differ
  243. // if the first group on either side is word, then create a deleted or inserted corr sequ for it.
  244. // then have counter on both sides pointing to the first matched pairs of rows
  245. // create an unknown corr sequ for it.
  246. // increment both counters
  247. // if one is at end but the other is not, then tag the remaining content as inserted or deleted, and done.
  248. // if both are at the end, then done
  249. // return the new list of corr sequ
  250. bool leftOnlyWordsRowsTextboxes = leftLength == leftWords + leftRows + leftTextboxes;
  251. bool rightOnlyWordsRowsTextboxes = rightLength == rightWords + rightRows + rightTextboxes;
  252. if ((leftWords > 0 || rightWords > 0) &&
  253. (leftRows > 0 || rightRows > 0 || leftTextboxes > 0 || rightTextboxes > 0) &&
  254. leftOnlyWordsRowsTextboxes &&
  255. rightOnlyWordsRowsTextboxes)
  256. {
  257. IGrouping<string, ComparisonUnit>[] leftGrouped = unknown
  258. .ComparisonUnitArray1
  259. .GroupAdjacent(cu =>
  260. {
  261. if (cu is ComparisonUnitWord)
  262. {
  263. return "Word";
  264. }
  265. var cug = cu as ComparisonUnitGroup;
  266. if (cug?.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  267. {
  268. return "Row";
  269. }
  270. if (cug?.ComparisonUnitGroupType == ComparisonUnitGroupType.Textbox)
  271. {
  272. return "Textbox";
  273. }
  274. throw new OpenXmlPowerToolsException("Internal error");
  275. })
  276. .ToArray();
  277. IGrouping<string, ComparisonUnit>[] rightGrouped = unknown
  278. .ComparisonUnitArray2
  279. .GroupAdjacent(cu =>
  280. {
  281. if (cu is ComparisonUnitWord)
  282. {
  283. return "Word";
  284. }
  285. var cug = cu as ComparisonUnitGroup;
  286. if (cug?.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  287. {
  288. return "Row";
  289. }
  290. if (cug?.ComparisonUnitGroupType == ComparisonUnitGroupType.Textbox)
  291. {
  292. return "Textbox";
  293. }
  294. throw new OpenXmlPowerToolsException("Internal error");
  295. })
  296. .ToArray();
  297. var iLeft = 0;
  298. var iRight = 0;
  299. // create an unknown corr sequ for it.
  300. // increment both counters
  301. // if one is at end but the other is not, then tag the remaining content as inserted or deleted, and done.
  302. // if both are at the end, then done
  303. // return the new list of corr sequ
  304. while (true)
  305. {
  306. if (leftGrouped[iLeft].Key == rightGrouped[iRight].Key)
  307. {
  308. var unknownCorrelatedSequence = new CorrelatedSequence
  309. {
  310. ComparisonUnitArray1 = leftGrouped[iLeft].ToArray(),
  311. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  312. CorrelationStatus = CorrelationStatus.Unknown
  313. };
  314. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  315. ++iLeft;
  316. ++iRight;
  317. }
  318. // have to decide which of the following two branches to do first based on whether the left contains a paragraph mark
  319. // i.e. cant insert a string of deleted text right before a table.
  320. else if (leftGrouped[iLeft].Key == "Word" &&
  321. leftGrouped[iLeft]
  322. .Select(lg => lg.DescendantContentAtoms())
  323. .SelectMany(m => m).Last()
  324. .ContentElement
  325. .Name != W.pPr &&
  326. rightGrouped[iRight].Key == "Row")
  327. {
  328. var insertedCorrelatedSequence = new CorrelatedSequence
  329. {
  330. ComparisonUnitArray1 = null,
  331. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  332. CorrelationStatus = CorrelationStatus.Inserted
  333. };
  334. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  335. ++iRight;
  336. }
  337. else if (rightGrouped[iRight].Key == "Word" &&
  338. rightGrouped[iRight]
  339. .Select(lg => lg.DescendantContentAtoms())
  340. .SelectMany(m => m)
  341. .Last()
  342. .ContentElement
  343. .Name != W.pPr &&
  344. leftGrouped[iLeft].Key == "Row")
  345. {
  346. var insertedCorrelatedSequence = new CorrelatedSequence
  347. {
  348. ComparisonUnitArray1 = null,
  349. ComparisonUnitArray2 = leftGrouped[iLeft].ToArray(),
  350. CorrelationStatus = CorrelationStatus.Inserted
  351. };
  352. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  353. ++iLeft;
  354. }
  355. else if (leftGrouped[iLeft].Key == "Word" && rightGrouped[iRight].Key != "Word")
  356. {
  357. var deletedCorrelatedSequence = new CorrelatedSequence
  358. {
  359. ComparisonUnitArray1 = leftGrouped[iLeft].ToArray(),
  360. ComparisonUnitArray2 = null,
  361. CorrelationStatus = CorrelationStatus.Deleted
  362. };
  363. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  364. ++iLeft;
  365. }
  366. else if (leftGrouped[iLeft].Key != "Word" && rightGrouped[iRight].Key == "Word")
  367. {
  368. var insertedCorrelatedSequence = new CorrelatedSequence
  369. {
  370. ComparisonUnitArray1 = null,
  371. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  372. CorrelationStatus = CorrelationStatus.Inserted
  373. };
  374. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  375. ++iRight;
  376. }
  377. if (iLeft == leftGrouped.Length && iRight == rightGrouped.Length)
  378. return newListOfCorrelatedSequence;
  379. // if there is content on the left, but not content on the right
  380. if (iRight == rightGrouped.Length)
  381. {
  382. for (int j = iLeft; j < leftGrouped.Length; j++)
  383. {
  384. var deletedCorrelatedSequence = new CorrelatedSequence
  385. {
  386. ComparisonUnitArray1 = leftGrouped[j].ToArray(),
  387. ComparisonUnitArray2 = null,
  388. CorrelationStatus = CorrelationStatus.Deleted
  389. };
  390. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  391. }
  392. return newListOfCorrelatedSequence;
  393. }
  394. // there is content on the right but not on the left
  395. if (iLeft == leftGrouped.Length)
  396. {
  397. for (int j = iRight; j < rightGrouped.Length; j++)
  398. {
  399. var insertedCorrelatedSequence = new CorrelatedSequence
  400. {
  401. ComparisonUnitArray1 = null,
  402. ComparisonUnitArray2 = rightGrouped[j].ToArray(),
  403. CorrelationStatus = CorrelationStatus.Inserted
  404. };
  405. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  406. }
  407. return newListOfCorrelatedSequence;
  408. }
  409. // else continue on next round.
  410. }
  411. }
  412. // if both sides contain tables and paragraphs, then split into multiple unknown corr sequ
  413. if (leftTables > 0 && rightTables > 0 &&
  414. leftParagraphs > 0 && rightParagraphs > 0 &&
  415. (leftLength > 1 || rightLength > 1))
  416. {
  417. IGrouping<string, ComparisonUnit>[] leftGrouped = unknown
  418. .ComparisonUnitArray1
  419. .GroupAdjacent(cu =>
  420. {
  421. var cug = (ComparisonUnitGroup) cu;
  422. return cug.ComparisonUnitGroupType == ComparisonUnitGroupType.Table ? "Table" : "Para";
  423. })
  424. .ToArray();
  425. IGrouping<string, ComparisonUnit>[] rightGrouped = unknown
  426. .ComparisonUnitArray2
  427. .GroupAdjacent(cu =>
  428. {
  429. var cug = (ComparisonUnitGroup) cu;
  430. return cug.ComparisonUnitGroupType == ComparisonUnitGroupType.Table ? "Table" : "Para";
  431. })
  432. .ToArray();
  433. var iLeft = 0;
  434. var iRight = 0;
  435. // create an unknown corr sequ for it.
  436. // increment both counters
  437. // if one is at end but the other is not, then tag the remaining content as inserted or deleted, and done.
  438. // if both are at the end, then done
  439. // return the new list of corr sequ
  440. while (true)
  441. {
  442. if (leftGrouped[iLeft].Key == "Table" && rightGrouped[iRight].Key == "Table" ||
  443. leftGrouped[iLeft].Key == "Para" && rightGrouped[iRight].Key == "Para")
  444. {
  445. var unknownCorrelatedSequence = new CorrelatedSequence
  446. {
  447. ComparisonUnitArray1 = leftGrouped[iLeft].ToArray(),
  448. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  449. CorrelationStatus = CorrelationStatus.Unknown
  450. };
  451. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  452. ++iLeft;
  453. ++iRight;
  454. }
  455. else if (leftGrouped[iLeft].Key == "Para" && rightGrouped[iRight].Key == "Table")
  456. {
  457. var deletedCorrelatedSequence = new CorrelatedSequence
  458. {
  459. ComparisonUnitArray1 = leftGrouped[iLeft].ToArray(),
  460. ComparisonUnitArray2 = null,
  461. CorrelationStatus = CorrelationStatus.Deleted
  462. };
  463. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  464. ++iLeft;
  465. }
  466. else if (leftGrouped[iLeft].Key == "Table" && rightGrouped[iRight].Key == "Para")
  467. {
  468. var insertedCorrelatedSequence = new CorrelatedSequence
  469. {
  470. ComparisonUnitArray1 = null,
  471. ComparisonUnitArray2 = rightGrouped[iRight].ToArray(),
  472. CorrelationStatus = CorrelationStatus.Inserted
  473. };
  474. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  475. ++iRight;
  476. }
  477. if (iLeft == leftGrouped.Length && iRight == rightGrouped.Length)
  478. return newListOfCorrelatedSequence;
  479. // if there is content on the left, but not content on the right
  480. if (iRight == rightGrouped.Length)
  481. {
  482. for (int j = iLeft; j < leftGrouped.Length; j++)
  483. {
  484. var deletedCorrelatedSequence = new CorrelatedSequence
  485. {
  486. ComparisonUnitArray1 = leftGrouped[j].ToArray(),
  487. ComparisonUnitArray2 = null,
  488. CorrelationStatus = CorrelationStatus.Deleted
  489. };
  490. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  491. }
  492. return newListOfCorrelatedSequence;
  493. }
  494. // there is content on the right but not on the left
  495. if (iLeft == leftGrouped.Length)
  496. {
  497. for (int j = iRight; j < rightGrouped.Length; j++)
  498. {
  499. var insertedCorrelatedSequence = new CorrelatedSequence
  500. {
  501. ComparisonUnitArray1 = null,
  502. ComparisonUnitArray2 = rightGrouped[j].ToArray(),
  503. CorrelationStatus = CorrelationStatus.Inserted
  504. };
  505. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  506. }
  507. return newListOfCorrelatedSequence;
  508. }
  509. // else continue on next round.
  510. }
  511. }
  512. // If both sides consists of a single table, and if the table contains merged cells, then mark as deleted/inserted
  513. if (leftTables == 1 && leftLength == 1 &&
  514. rightTables == 1 && rightLength == 1)
  515. {
  516. List<CorrelatedSequence> result = DoLcsAlgorithmForTable(unknown);
  517. if (result != null)
  518. return result;
  519. }
  520. // If either side contains only paras or tables, then flatten and iterate.
  521. bool leftOnlyParasTablesTextboxes = leftLength == leftTables + leftParagraphs + leftTextboxes;
  522. bool rightOnlyParasTablesTextboxes = rightLength == rightTables + rightParagraphs + rightTextboxes;
  523. if (leftOnlyParasTablesTextboxes && rightOnlyParasTablesTextboxes)
  524. {
  525. // flatten paras and tables, and iterate
  526. ComparisonUnit[] left = unknown
  527. .ComparisonUnitArray1
  528. .Select(cu => cu.Contents)
  529. .SelectMany(m => m)
  530. .ToArray();
  531. ComparisonUnit[] right = unknown
  532. .ComparisonUnitArray2
  533. .Select(cu => cu.Contents)
  534. .SelectMany(m => m)
  535. .ToArray();
  536. var unknownCorrelatedSequence = new CorrelatedSequence
  537. {
  538. CorrelationStatus = CorrelationStatus.Unknown,
  539. ComparisonUnitArray1 = left,
  540. ComparisonUnitArray2 = right
  541. };
  542. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  543. return newListOfCorrelatedSequence;
  544. }
  545. // if first of left is a row and first of right is a row
  546. // then flatten the row to cells and iterate.
  547. if (unknown.ComparisonUnitArray1.FirstOrDefault() is ComparisonUnitGroup firstLeft &&
  548. unknown.ComparisonUnitArray2.FirstOrDefault() is ComparisonUnitGroup firstRight)
  549. {
  550. if (firstLeft.ComparisonUnitGroupType == ComparisonUnitGroupType.Row &&
  551. firstRight.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  552. {
  553. ComparisonUnit[] leftContent = firstLeft.Contents.ToArray();
  554. ComparisonUnit[] rightContent = firstRight.Contents.ToArray();
  555. int lenLeft = leftContent.Length;
  556. int lenRight = rightContent.Length;
  557. if (lenLeft < lenRight)
  558. {
  559. leftContent = leftContent
  560. .Concat(Enumerable.Repeat<ComparisonUnit>(null, lenRight - lenLeft))
  561. .ToArray();
  562. }
  563. else if (lenRight < lenLeft)
  564. {
  565. rightContent = rightContent
  566. .Concat(Enumerable.Repeat<ComparisonUnit>(null, lenLeft - lenRight))
  567. .ToArray();
  568. }
  569. List<CorrelatedSequence> newCs = leftContent.Zip(rightContent, (l, r) =>
  570. {
  571. if (l != null && r != null)
  572. {
  573. var unknownCorrelatedSequence = new CorrelatedSequence
  574. {
  575. ComparisonUnitArray1 = new[] { l },
  576. ComparisonUnitArray2 = new[] { r },
  577. CorrelationStatus = CorrelationStatus.Unknown
  578. };
  579. return new[] { unknownCorrelatedSequence };
  580. }
  581. if (l == null)
  582. {
  583. var insertedCorrelatedSequence = new CorrelatedSequence
  584. {
  585. ComparisonUnitArray1 = null,
  586. ComparisonUnitArray2 = r.Contents.ToArray(),
  587. CorrelationStatus = CorrelationStatus.Inserted
  588. };
  589. return new[] { insertedCorrelatedSequence };
  590. }
  591. var deletedCorrelatedSequence = new CorrelatedSequence
  592. {
  593. ComparisonUnitArray1 = l.Contents.ToArray(),
  594. ComparisonUnitArray2 = null,
  595. CorrelationStatus = CorrelationStatus.Deleted
  596. };
  597. return new[] { deletedCorrelatedSequence };
  598. })
  599. .SelectMany(m => m)
  600. .ToList();
  601. foreach (CorrelatedSequence cs in newCs)
  602. {
  603. newListOfCorrelatedSequence.Add(cs);
  604. }
  605. ComparisonUnit[] remainderLeft = unknown
  606. .ComparisonUnitArray1
  607. .Skip(1)
  608. .ToArray();
  609. ComparisonUnit[] remainderRight = unknown
  610. .ComparisonUnitArray2
  611. .Skip(1)
  612. .ToArray();
  613. if (remainderLeft.Length > 0 && remainderRight.Length == 0)
  614. {
  615. var deletedCorrelatedSequence = new CorrelatedSequence
  616. {
  617. CorrelationStatus = CorrelationStatus.Deleted,
  618. ComparisonUnitArray1 = remainderLeft,
  619. ComparisonUnitArray2 = null
  620. };
  621. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  622. }
  623. else if (remainderRight.Length > 0 && remainderLeft.Length == 0)
  624. {
  625. var insertedCorrelatedSequence = new CorrelatedSequence
  626. {
  627. CorrelationStatus = CorrelationStatus.Inserted,
  628. ComparisonUnitArray1 = null,
  629. ComparisonUnitArray2 = remainderRight
  630. };
  631. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  632. }
  633. else if (remainderLeft.Length > 0 && remainderRight.Length > 0)
  634. {
  635. var unknownCorrelatedSequence2 = new CorrelatedSequence
  636. {
  637. CorrelationStatus = CorrelationStatus.Unknown,
  638. ComparisonUnitArray1 = remainderLeft,
  639. ComparisonUnitArray2 = remainderRight
  640. };
  641. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence2);
  642. }
  643. if (False)
  644. {
  645. var sb = new StringBuilder();
  646. foreach (CorrelatedSequence item in newListOfCorrelatedSequence)
  647. {
  648. sb.Append(item).Append(Environment.NewLine);
  649. }
  650. string sbs = sb.ToString();
  651. TestUtil.NotePad(sbs);
  652. }
  653. return newListOfCorrelatedSequence;
  654. }
  655. if (firstLeft.ComparisonUnitGroupType == ComparisonUnitGroupType.Cell &&
  656. firstRight.ComparisonUnitGroupType == ComparisonUnitGroupType.Cell)
  657. {
  658. ComparisonUnit[] left = firstLeft
  659. .Contents
  660. .ToArray();
  661. ComparisonUnit[] right = firstRight
  662. .Contents
  663. .ToArray();
  664. var unknownCorrelatedSequence = new CorrelatedSequence
  665. {
  666. CorrelationStatus = CorrelationStatus.Unknown,
  667. ComparisonUnitArray1 = left,
  668. ComparisonUnitArray2 = right
  669. };
  670. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  671. ComparisonUnit[] remainderLeft = unknown
  672. .ComparisonUnitArray1
  673. .Skip(1)
  674. .ToArray();
  675. ComparisonUnit[] remainderRight = unknown
  676. .ComparisonUnitArray2
  677. .Skip(1)
  678. .ToArray();
  679. if (remainderLeft.Length > 0 && remainderRight.Length == 0)
  680. {
  681. var deletedCorrelatedSequence = new CorrelatedSequence
  682. {
  683. CorrelationStatus = CorrelationStatus.Deleted,
  684. ComparisonUnitArray1 = remainderLeft,
  685. ComparisonUnitArray2 = null
  686. };
  687. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  688. }
  689. else if (remainderRight.Length > 0 && remainderLeft.Length == 0)
  690. {
  691. var insertedCorrelatedSequence = new CorrelatedSequence
  692. {
  693. CorrelationStatus = CorrelationStatus.Inserted,
  694. ComparisonUnitArray1 = null,
  695. ComparisonUnitArray2 = remainderRight
  696. };
  697. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  698. }
  699. else if (remainderLeft.Length > 0 && remainderRight.Length > 0)
  700. {
  701. var unknownCorrelatedSequence2 = new CorrelatedSequence
  702. {
  703. CorrelationStatus = CorrelationStatus.Unknown,
  704. ComparisonUnitArray1 = remainderLeft,
  705. ComparisonUnitArray2 = remainderRight
  706. };
  707. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence2);
  708. }
  709. return newListOfCorrelatedSequence;
  710. }
  711. }
  712. if (unknown.ComparisonUnitArray1.Any() && unknown.ComparisonUnitArray2.Any())
  713. {
  714. if (unknown.ComparisonUnitArray1.First() is ComparisonUnitWord &&
  715. unknown.ComparisonUnitArray2.First() is ComparisonUnitGroup right &&
  716. right.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  717. {
  718. var insertedCorrelatedSequence3 = new CorrelatedSequence
  719. {
  720. CorrelationStatus = CorrelationStatus.Inserted,
  721. ComparisonUnitArray1 = null,
  722. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  723. };
  724. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence3);
  725. var deletedCorrelatedSequence3 = new CorrelatedSequence
  726. {
  727. CorrelationStatus = CorrelationStatus.Deleted,
  728. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  729. ComparisonUnitArray2 = null
  730. };
  731. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence3);
  732. return newListOfCorrelatedSequence;
  733. }
  734. if (unknown.ComparisonUnitArray2.First() is ComparisonUnitWord &&
  735. unknown.ComparisonUnitArray1.First() is ComparisonUnitGroup left2 &&
  736. left2.ComparisonUnitGroupType == ComparisonUnitGroupType.Row)
  737. {
  738. var deletedCorrelatedSequence3 = new CorrelatedSequence
  739. {
  740. CorrelationStatus = CorrelationStatus.Deleted,
  741. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  742. ComparisonUnitArray2 = null
  743. };
  744. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence3);
  745. var insertedCorrelatedSequence3 = new CorrelatedSequence
  746. {
  747. CorrelationStatus = CorrelationStatus.Inserted,
  748. ComparisonUnitArray1 = null,
  749. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  750. };
  751. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence3);
  752. return newListOfCorrelatedSequence;
  753. }
  754. ComparisonUnitAtom lastContentAtomLeft = unknown
  755. .ComparisonUnitArray1
  756. .Select(cu => cu.DescendantContentAtoms().Last())
  757. .LastOrDefault();
  758. ComparisonUnitAtom lastContentAtomRight = unknown
  759. .ComparisonUnitArray2
  760. .Select(cu => cu.DescendantContentAtoms().Last())
  761. .LastOrDefault();
  762. if (lastContentAtomLeft != null && lastContentAtomRight != null)
  763. {
  764. if (lastContentAtomLeft.ContentElement.Name == W.pPr &&
  765. lastContentAtomRight.ContentElement.Name != W.pPr)
  766. {
  767. var insertedCorrelatedSequence5 = new CorrelatedSequence
  768. {
  769. CorrelationStatus = CorrelationStatus.Inserted,
  770. ComparisonUnitArray1 = null,
  771. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  772. };
  773. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence5);
  774. var deletedCorrelatedSequence5 = new CorrelatedSequence
  775. {
  776. CorrelationStatus = CorrelationStatus.Deleted,
  777. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  778. ComparisonUnitArray2 = null
  779. };
  780. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence5);
  781. return newListOfCorrelatedSequence;
  782. }
  783. if (lastContentAtomLeft.ContentElement.Name != W.pPr &&
  784. lastContentAtomRight.ContentElement.Name == W.pPr)
  785. {
  786. var deletedCorrelatedSequence5 = new CorrelatedSequence
  787. {
  788. CorrelationStatus = CorrelationStatus.Deleted,
  789. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  790. ComparisonUnitArray2 = null
  791. };
  792. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence5);
  793. var insertedCorrelatedSequence5 = new CorrelatedSequence
  794. {
  795. CorrelationStatus = CorrelationStatus.Inserted,
  796. ComparisonUnitArray1 = null,
  797. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  798. };
  799. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence5);
  800. return newListOfCorrelatedSequence;
  801. }
  802. }
  803. }
  804. var deletedCorrelatedSequence4 = new CorrelatedSequence
  805. {
  806. CorrelationStatus = CorrelationStatus.Deleted,
  807. ComparisonUnitArray1 = unknown.ComparisonUnitArray1,
  808. ComparisonUnitArray2 = null
  809. };
  810. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence4);
  811. var insertedCorrelatedSequence4 = new CorrelatedSequence
  812. {
  813. CorrelationStatus = CorrelationStatus.Inserted,
  814. ComparisonUnitArray1 = null,
  815. ComparisonUnitArray2 = unknown.ComparisonUnitArray2
  816. };
  817. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence4);
  818. return newListOfCorrelatedSequence;
  819. }
  820. ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  821. // here we have the longest common subsequence.
  822. // but it may start in the middle of a paragraph.
  823. // therefore need to dispose of the content from the beginning of the longest common subsequence to the
  824. // beginning of the paragraph.
  825. // this should be in a separate unknown region
  826. // if countCommonAtEnd != 0, and if it contains a paragraph mark, then if there are comparison units in
  827. // the same paragraph before the common at end (in either version)
  828. // then we want to put all of those comparison units into a single unknown, where they must be resolved
  829. // against each other. We don't want those comparison units to go into the middle unknown comparison unit.
  830. var remainingInLeftParagraph = 0;
  831. var remainingInRightParagraph = 0;
  832. if (currentLongestCommonSequenceLength != 0)
  833. {
  834. List<ComparisonUnit> commonSeq = unknown
  835. .ComparisonUnitArray1
  836. .Skip(currentI1)
  837. .Take(currentLongestCommonSequenceLength)
  838. .ToList();
  839. ComparisonUnit firstOfCommonSeq = commonSeq.First();
  840. if (firstOfCommonSeq is ComparisonUnitWord)
  841. {
  842. // are there any paragraph marks in the common seq at end?
  843. if (commonSeq.Any(cu =>
  844. {
  845. ComparisonUnitAtom firstComparisonUnitAtom = cu.Contents.OfType<ComparisonUnitAtom>().FirstOrDefault();
  846. if (firstComparisonUnitAtom == null)
  847. return false;
  848. return firstComparisonUnitAtom.ContentElement.Name == W.pPr;
  849. }))
  850. {
  851. remainingInLeftParagraph = unknown
  852. .ComparisonUnitArray1
  853. .Take(currentI1)
  854. .Reverse()
  855. .TakeWhile(cu =>
  856. {
  857. if (!(cu is ComparisonUnitWord))
  858. return false;
  859. ComparisonUnitAtom firstComparisonUnitAtom =
  860. cu.Contents.OfType<ComparisonUnitAtom>().FirstOrDefault();
  861. if (firstComparisonUnitAtom == null)
  862. return true;
  863. return firstComparisonUnitAtom.ContentElement.Name != W.pPr;
  864. })
  865. .Count();
  866. remainingInRightParagraph = unknown
  867. .ComparisonUnitArray2
  868. .Take(currentI2)
  869. .Reverse()
  870. .TakeWhile(cu =>
  871. {
  872. if (!(cu is ComparisonUnitWord))
  873. return false;
  874. ComparisonUnitAtom firstComparisonUnitAtom =
  875. cu.Contents.OfType<ComparisonUnitAtom>().FirstOrDefault();
  876. if (firstComparisonUnitAtom == null)
  877. return true;
  878. return firstComparisonUnitAtom.ContentElement.Name != W.pPr;
  879. })
  880. .Count();
  881. }
  882. }
  883. }
  884. int countBeforeCurrentParagraphLeft = currentI1 - remainingInLeftParagraph;
  885. int countBeforeCurrentParagraphRight = currentI2 - remainingInRightParagraph;
  886. if (countBeforeCurrentParagraphLeft > 0 && countBeforeCurrentParagraphRight == 0)
  887. {
  888. var deletedCorrelatedSequence = new CorrelatedSequence
  889. {
  890. CorrelationStatus = CorrelationStatus.Deleted,
  891. ComparisonUnitArray1 = cul1
  892. .Take(countBeforeCurrentParagraphLeft)
  893. .ToArray(),
  894. ComparisonUnitArray2 = null
  895. };
  896. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  897. }
  898. else if (countBeforeCurrentParagraphLeft == 0 && countBeforeCurrentParagraphRight > 0)
  899. {
  900. var insertedCorrelatedSequence = new CorrelatedSequence
  901. {
  902. CorrelationStatus = CorrelationStatus.Inserted,
  903. ComparisonUnitArray1 = null,
  904. ComparisonUnitArray2 = cul2
  905. .Take(countBeforeCurrentParagraphRight)
  906. .ToArray()
  907. };
  908. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  909. }
  910. else if (countBeforeCurrentParagraphLeft > 0 && countBeforeCurrentParagraphRight > 0)
  911. {
  912. var unknownCorrelatedSequence = new CorrelatedSequence
  913. {
  914. CorrelationStatus = CorrelationStatus.Unknown,
  915. ComparisonUnitArray1 = cul1
  916. .Take(countBeforeCurrentParagraphLeft)
  917. .ToArray(),
  918. ComparisonUnitArray2 = cul2
  919. .Take(countBeforeCurrentParagraphRight)
  920. .ToArray()
  921. };
  922. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  923. }
  924. else if (countBeforeCurrentParagraphLeft == 0 && countBeforeCurrentParagraphRight == 0)
  925. {
  926. // nothing to do
  927. }
  928. if (remainingInLeftParagraph > 0 && remainingInRightParagraph == 0)
  929. {
  930. var deletedCorrelatedSequence = new CorrelatedSequence
  931. {
  932. CorrelationStatus = CorrelationStatus.Deleted,
  933. ComparisonUnitArray1 = cul1
  934. .Skip(countBeforeCurrentParagraphLeft)
  935. .Take(remainingInLeftParagraph)
  936. .ToArray(),
  937. ComparisonUnitArray2 = null
  938. };
  939. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  940. }
  941. else if (remainingInLeftParagraph == 0 && remainingInRightParagraph > 0)
  942. {
  943. var insertedCorrelatedSequence = new CorrelatedSequence
  944. {
  945. CorrelationStatus = CorrelationStatus.Inserted,
  946. ComparisonUnitArray1 = null,
  947. ComparisonUnitArray2 = cul2
  948. .Skip(countBeforeCurrentParagraphRight)
  949. .Take(remainingInRightParagraph)
  950. .ToArray()
  951. };
  952. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  953. }
  954. else if (remainingInLeftParagraph > 0 && remainingInRightParagraph > 0)
  955. {
  956. var unknownCorrelatedSequence = new CorrelatedSequence
  957. {
  958. CorrelationStatus = CorrelationStatus.Unknown,
  959. ComparisonUnitArray1 = cul1
  960. .Skip(countBeforeCurrentParagraphLeft)
  961. .Take(remainingInLeftParagraph)
  962. .ToArray(),
  963. ComparisonUnitArray2 = cul2
  964. .Skip(countBeforeCurrentParagraphRight)
  965. .Take(remainingInRightParagraph)
  966. .ToArray()
  967. };
  968. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence);
  969. }
  970. else if (remainingInLeftParagraph == 0 && remainingInRightParagraph == 0)
  971. {
  972. // nothing to do
  973. }
  974. var middleEqual = new CorrelatedSequence
  975. {
  976. CorrelationStatus = CorrelationStatus.Equal,
  977. ComparisonUnitArray1 = cul1
  978. .Skip(currentI1)
  979. .Take(currentLongestCommonSequenceLength)
  980. .ToArray(),
  981. ComparisonUnitArray2 = cul2
  982. .Skip(currentI2)
  983. .Take(currentLongestCommonSequenceLength)
  984. .ToArray()
  985. };
  986. newListOfCorrelatedSequence.Add(middleEqual);
  987. int endI1 = currentI1 + currentLongestCommonSequenceLength;
  988. int endI2 = currentI2 + currentLongestCommonSequenceLength;
  989. ComparisonUnit[] remaining1 = cul1
  990. .Skip(endI1)
  991. .ToArray();
  992. ComparisonUnit[] remaining2 = cul2
  993. .Skip(endI2)
  994. .ToArray();
  995. // here is the point that we want to make a new unknown from this point to the end of the paragraph that
  996. // contains the equal parts.
  997. // this will never hurt anything, and will in many cases result in a better difference.
  998. if (middleEqual.ComparisonUnitArray1[middleEqual.ComparisonUnitArray1.Length - 1] is ComparisonUnitWord leftCuw)
  999. {
  1000. ComparisonUnitAtom lastContentAtom = leftCuw.DescendantContentAtoms().LastOrDefault();
  1001. // if the middleEqual did not end with a paragraph mark
  1002. if (lastContentAtom != null && lastContentAtom.ContentElement.Name != W.pPr)
  1003. {
  1004. int idx1 = FindIndexOfNextParaMark(remaining1);
  1005. int idx2 = FindIndexOfNextParaMark(remaining2);
  1006. var unknownCorrelatedSequenceRemaining = new CorrelatedSequence
  1007. {
  1008. CorrelationStatus = CorrelationStatus.Unknown,
  1009. ComparisonUnitArray1 = remaining1.Take(idx1).ToArray(),
  1010. ComparisonUnitArray2 = remaining2.Take(idx2).ToArray()
  1011. };
  1012. newListOfCorrelatedSequence.Add(unknownCorrelatedSequenceRemaining);
  1013. var unknownCorrelatedSequenceAfter = new CorrelatedSequence
  1014. {
  1015. CorrelationStatus = CorrelationStatus.Unknown,
  1016. ComparisonUnitArray1 = remaining1.Skip(idx1).ToArray(),
  1017. ComparisonUnitArray2 = remaining2.Skip(idx2).ToArray()
  1018. };
  1019. newListOfCorrelatedSequence.Add(unknownCorrelatedSequenceAfter);
  1020. return newListOfCorrelatedSequence;
  1021. }
  1022. }
  1023. var unknownCorrelatedSequence20 = new CorrelatedSequence
  1024. {
  1025. CorrelationStatus = CorrelationStatus.Unknown,
  1026. ComparisonUnitArray1 = remaining1,
  1027. ComparisonUnitArray2 = remaining2
  1028. };
  1029. newListOfCorrelatedSequence.Add(unknownCorrelatedSequence20);
  1030. return newListOfCorrelatedSequence;
  1031. }
  1032. private static List<CorrelatedSequence> DoLcsAlgorithmForTable(CorrelatedSequence unknown)
  1033. {
  1034. var newListOfCorrelatedSequence = new List<CorrelatedSequence>();
  1035. ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  1036. // if we have a table with the same number of rows, and all rows have equal CorrelatedSHA1Hash, then we can
  1037. // flatten and compare every corresponding row.
  1038. // This is true regardless of whether there are horizontally or vertically merged cells, since that
  1039. // characteristic is incorporated into the CorrespondingSHA1Hash. This is probably not very common, but it
  1040. // will never do any harm.
  1041. var tblGroup1 = (ComparisonUnitGroup) unknown.ComparisonUnitArray1.First();
  1042. var tblGroup2 = (ComparisonUnitGroup) unknown.ComparisonUnitArray2.First();
  1043. if (tblGroup1.Contents.Count == tblGroup2.Contents.Count) // if there are the same number of rows
  1044. {
  1045. var zipped = tblGroup1
  1046. .Contents
  1047. .Zip(
  1048. tblGroup2.Contents,
  1049. (r1, r2) => new
  1050. {
  1051. Row1 = r1 as ComparisonUnitGroup,
  1052. Row2 = r2 as ComparisonUnitGroup
  1053. })
  1054. .ToList();
  1055. bool canCollapse = zipped.All(z => z.Row1.CorrelatedSHA1Hash == z.Row2.CorrelatedSHA1Hash);
  1056. if (canCollapse)
  1057. {
  1058. newListOfCorrelatedSequence = zipped
  1059. .Select(z =>
  1060. {
  1061. var unknownCorrelatedSequence = new CorrelatedSequence
  1062. {
  1063. ComparisonUnitArray1 = new ComparisonUnit[] { z.Row1 },
  1064. ComparisonUnitArray2 = new ComparisonUnit[] { z.Row2 },
  1065. CorrelationStatus = CorrelationStatus.Unknown
  1066. };
  1067. return unknownCorrelatedSequence;
  1068. })
  1069. .ToList();
  1070. return newListOfCorrelatedSequence;
  1071. }
  1072. }
  1073. ComparisonUnitAtom firstContentAtom1 = tblGroup1.DescendantContentAtoms().FirstOrDefault();
  1074. if (firstContentAtom1 == null)
  1075. {
  1076. throw new OpenXmlPowerToolsException("Internal error");
  1077. }
  1078. XElement tblElement1 = firstContentAtom1
  1079. .AncestorElements
  1080. .Reverse()
  1081. .First(a => a.Name == W.tbl);
  1082. ComparisonUnitAtom firstContentAtom2 = tblGroup2.DescendantContentAtoms().FirstOrDefault();
  1083. if (firstContentAtom2 == null)
  1084. {
  1085. throw new OpenXmlPowerToolsException("Internal error");
  1086. }
  1087. XElement tblElement2 = firstContentAtom2
  1088. .AncestorElements
  1089. .Reverse()
  1090. .First(a => a.Name == W.tbl);
  1091. bool leftContainsMerged = tblElement1
  1092. .Descendants()
  1093. .Any(d => d.Name == W.vMerge || d.Name == W.gridSpan);
  1094. bool rightContainsMerged = tblElement2
  1095. .Descendants()
  1096. .Any(d => d.Name == W.vMerge || d.Name == W.gridSpan);
  1097. if (leftContainsMerged || rightContainsMerged)
  1098. {
  1099. // If StructureSha1Hash is the same for both tables, then we know that the structure of the tables is
  1100. // identical, so we can break into correlated sequences for rows.
  1101. if (tblGroup1.StructureSHA1Hash != null &&
  1102. tblGroup2.StructureSHA1Hash != null &&
  1103. tblGroup1.StructureSHA1Hash == tblGroup2.StructureSHA1Hash)
  1104. {
  1105. var zipped = tblGroup1.Contents.Zip(tblGroup2.Contents, (r1, r2) => new
  1106. {
  1107. Row1 = r1 as ComparisonUnitGroup,
  1108. Row2 = r2 as ComparisonUnitGroup
  1109. });
  1110. newListOfCorrelatedSequence = zipped
  1111. .Select(z =>
  1112. {
  1113. var unknownCorrelatedSequence = new CorrelatedSequence
  1114. {
  1115. ComparisonUnitArray1 = new ComparisonUnit[] { z.Row1 },
  1116. ComparisonUnitArray2 = new ComparisonUnit[] { z.Row2 },
  1117. CorrelationStatus = CorrelationStatus.Unknown
  1118. };
  1119. return unknownCorrelatedSequence;
  1120. })
  1121. .ToList();
  1122. return newListOfCorrelatedSequence;
  1123. }
  1124. // otherwise flatten to rows
  1125. var deletedCorrelatedSequence = new CorrelatedSequence
  1126. {
  1127. ComparisonUnitArray1 = unknown
  1128. .ComparisonUnitArray1
  1129. .Select(z => z.Contents)
  1130. .SelectMany(m => m)
  1131. .ToArray(),
  1132. ComparisonUnitArray2 = null,
  1133. CorrelationStatus = CorrelationStatus.Deleted
  1134. };
  1135. newListOfCorrelatedSequence.Add(deletedCorrelatedSequence);
  1136. var insertedCorrelatedSequence = new CorrelatedSequence
  1137. {
  1138. ComparisonUnitArray1 = null,
  1139. ComparisonUnitArray2 = unknown
  1140. .ComparisonUnitArray2
  1141. .Select(z => z.Contents)
  1142. .SelectMany(m => m)
  1143. .ToArray(),
  1144. CorrelationStatus = CorrelationStatus.Inserted
  1145. };
  1146. newListOfCorrelatedSequence.Add(insertedCorrelatedSequence);
  1147. return newListOfCorrelatedSequence;
  1148. }
  1149. return null;
  1150. }
  1151. private static int FindIndexOfNextParaMark(ComparisonUnit[] cul)
  1152. {
  1153. for (var i = 0; i < cul.Length; i++)
  1154. {
  1155. var cuw = (ComparisonUnitWord) cul[i];
  1156. ComparisonUnitAtom lastAtom = cuw.DescendantContentAtoms().LastOrDefault();
  1157. if (lastAtom?.ContentElement.Name == W.pPr)
  1158. {
  1159. return i;
  1160. }
  1161. }
  1162. return cul.Length;
  1163. }
  1164. }
  1165. }