OpenXmlRegex.cs 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. using System;
  4. using System.Collections.Generic;
  5. using System.Linq;
  6. using System.Text.RegularExpressions;
  7. using System.Xml.Linq;
  8. namespace OpenXmlPowerTools
  9. {
  10. public class OpenXmlRegex
  11. {
  /// <summary>
  /// Grouping key that PmlSearchAndReplaceTransform gives to paragraph children that are
  /// passed through unchanged instead of being merged with adjacent runs.
  /// </summary>
  private const string DontConsolidate = "DontConsolidate";

  /// <summary>
  /// Element names that ReplaceInternal treats as revision-tracking markup carrying a w:id
  /// attribute: each such element receives an id when it has none, and duplicate ids are
  /// replaced. The set is only used for membership tests, so listing order has no meaning.
  /// </summary>
  private static readonly HashSet<XName> RevTrackMarkupWithId = new HashSet<XName>
  {
      W.cellDel, W.cellIns, W.cellMerge,
      W.customXmlDelRangeEnd, W.customXmlDelRangeStart,
      W.customXmlInsRangeEnd, W.customXmlInsRangeStart,
      W.customXmlMoveFromRangeEnd, W.customXmlMoveFromRangeStart,
      W.customXmlMoveToRangeEnd, W.customXmlMoveToRangeStart,
      W.del, W.ins,
      W.moveFrom, W.moveFromRangeEnd, W.moveFromRangeStart,
      W.moveTo, W.moveToRangeEnd, W.moveToRangeStart,
      W.pPrChange, W.rPrChange, W.sectPrChange,
      W.tblGridChange, W.tblPrChange, W.tblPrExChange, W.tcPrChange
  };
  /// <summary>
  /// Counts the matches of <paramref name="regex"/> in <paramref name="content"/>.
  /// No replacement text and no callback are supplied, so nothing is substituted.
  /// </summary>
  /// <param name="content">Elements to search; must not be null.</param>
  /// <param name="regex">Pattern to look for; must not be null.</param>
  /// <returns>The number of matches found.</returns>
  public static int Match(IEnumerable<XElement> content, Regex regex)
  {
      // A null replacement selects the match-only path of the transforms.
      const string noReplacement = null;
      Func<XElement, Match, bool> noCallback = null;

      int matchCount = ReplaceInternal(content, regex, noReplacement, noCallback, false, null, true);
      return matchCount;
  }
  /// <summary>
  /// Counts the matches of <paramref name="regex"/> in <paramref name="content"/> and, when
  /// <paramref name="found"/> is not null, invokes it once per match with the paragraph
  /// element that contains the match and the match itself. Nothing is substituted.
  /// </summary>
  /// <param name="content">Elements to search; must not be null.</param>
  /// <param name="regex">Pattern to look for; must not be null.</param>
  /// <param name="found">Optional per-match notification; null means "just count".</param>
  /// <returns>The number of matches found.</returns>
  public static int Match(IEnumerable<XElement> content, Regex regex, Action<XElement, Match> found)
  {
      // ReplaceInternal expects a bool-returning callback; adapt the Action and always
      // answer true.
      Func<XElement, Match, bool> notify = delegate(XElement paragraph, Match match)
      {
          if (found != null)
          {
              found(paragraph, match);
          }

          return true;
      };

      return ReplaceInternal(content, regex, null, notify, false, null, true);
  }
  /// <summary>
  /// Replaces (or deletes) the text matched by <paramref name="regex"/>, without revision
  /// tracking and with run coalescing enabled.
  /// <list type="bullet">
  /// <item>Non-empty replacement, null callback: every match is replaced.</item>
  /// <item>Empty replacement, null callback: every match is deleted.</item>
  /// <item>Non-null callback: it is asked per match and returns true to replace/delete that
  /// match or false to leave it alone (returning true once and false afterwards changes
  /// only the first match).</item>
  /// </list>
  /// </summary>
  /// <param name="content">Elements to process; must not be null.</param>
  /// <param name="regex">Pattern to look for; must not be null.</param>
  /// <param name="replacement">Replacement text; "" deletes the matched text.</param>
  /// <param name="doReplacement">Optional per-match filter, or null to act on all matches.</param>
  /// <returns>The number of matches found, including any the callback declined.</returns>
  public static int Replace(IEnumerable<XElement> content, Regex regex, string replacement,
      Func<XElement, Match, bool> doReplacement)
  {
      const bool trackRevisions = false;
      const bool coalesceContent = true;

      return ReplaceInternal(content, regex, replacement, doReplacement, trackRevisions, null,
          coalesceContent);
  }
  /// <summary>
  /// Same as the four-parameter Replace overload, but lets the caller switch off the
  /// coalescing of adjacent runs after replacement, which DocumentAssembler requires.
  /// Revision tracking is not used.
  /// </summary>
  /// <param name="content">Elements to process; must not be null.</param>
  /// <param name="regex">Pattern to look for; must not be null.</param>
  /// <param name="replacement">Replacement text; "" deletes the matched text.</param>
  /// <param name="doReplacement">Optional per-match filter, or null to act on all matches.</param>
  /// <param name="coalesceContent">False to leave the rewritten runs uncoalesced.</param>
  /// <returns>The number of matches found, including any the callback declined.</returns>
  public static int Replace(IEnumerable<XElement> content, Regex regex, string replacement,
      Func<XElement, Match, bool> doReplacement, bool coalesceContent)
  {
      const bool trackRevisions = false;
      const string noAuthor = null;

      return ReplaceInternal(content, regex, replacement, doReplacement, trackRevisions, noAuthor,
          coalesceContent);
  }
  /// <summary>
  /// Replaces (or deletes) the text matched by <paramref name="regex"/>, optionally recording
  /// the change as revision-tracking markup. Run coalescing is enabled.
  /// <list type="bullet">
  /// <item>Non-empty replacement, null callback: every match is replaced.</item>
  /// <item>Empty replacement, null callback: every match is deleted.</item>
  /// <item>Non-null callback: it is asked per match and returns true to replace/delete that
  /// match or false to leave it alone (returning true once and false afterwards changes
  /// only the first match).</item>
  /// <item>trackRevisions true: the change is written as revision-tracking markup
  /// attributed to <paramref name="author"/>.</item>
  /// </list>
  /// </summary>
  /// <param name="content">Elements to process; must not be null.</param>
  /// <param name="regex">Pattern to look for; must not be null.</param>
  /// <param name="replacement">Replacement text; "" deletes the matched text.</param>
  /// <param name="doReplacement">Optional per-match filter, or null to act on all matches.</param>
  /// <param name="trackRevisions">True to emit revision-tracking markup.</param>
  /// <param name="author">Revision author written to the tracking markup.</param>
  /// <returns>The number of matches found, including any the callback declined.</returns>
  /// <exception cref="OpenXmlPowerToolsException">
  /// <paramref name="trackRevisions"/> is true and the content is PresentationML/DrawingML.
  /// </exception>
  public static int Replace(IEnumerable<XElement> content, Regex regex, string replacement,
      Func<XElement, Match, bool> doReplacement, bool trackRevisions, string author)
  {
      const bool coalesceContent = true;

      return ReplaceInternal(content, regex, replacement, doReplacement, trackRevisions, author,
          coalesceContent);
  }
  /// <summary>
  /// Shared implementation behind the public Match and Replace overloads. The namespace of
  /// the first element decides which transform is used: WordprocessingML (W.w) or
  /// PresentationML/DrawingML (P.p or A.a). Each content element has its child nodes
  /// replaced by the transformed nodes. Content in any other namespace is left untouched.
  /// </summary>
  /// <param name="content">Elements to process; must not be null.</param>
  /// <param name="regex">Pattern to look for; must not be null.</param>
  /// <param name="replacement">Replacement text, "" to delete, or null for match-only.</param>
  /// <param name="callback">Optional per-match callback.</param>
  /// <param name="trackRevisions">True to emit revision-tracking markup (Wml only).</param>
  /// <param name="revisionTrackingAuthor">Author written to revision-tracking markup.</param>
  /// <param name="coalesceContent">Whether the Wml transform coalesces adjacent runs.</param>
  /// <returns>The number of matches found; 0 for empty or unsupported content.</returns>
  /// <exception cref="ArgumentNullException">content or regex is null.</exception>
  /// <exception cref="OpenXmlPowerToolsException">
  /// trackRevisions is true for PresentationML/DrawingML content.
  /// </exception>
  private static int ReplaceInternal(IEnumerable<XElement> content, Regex regex, string replacement,
      Func<XElement, Match, bool> callback, bool trackRevisions, string revisionTrackingAuthor,
      bool coalesceContent)
  {
      if (content == null) throw new ArgumentNullException("content");
      if (regex == null) throw new ArgumentNullException("regex");

      // Materialize once: the loops below modify the tree, so a lazy query must not be
      // re-evaluated while that happens.
      IList<XElement> contentList = content as IList<XElement> ?? content.ToList();

      // A null first element covers the empty case, so no separate Any() check is needed.
      XElement first = contentList.FirstOrDefault();
      if (first == null)
      {
          return 0;
      }

      if (first.Name.Namespace == W.w)
      {
          var replInfo = new ReplaceInternalInfo { Count = 0 };
          foreach (XElement c in contentList)
          {
              var newC = (XElement) WmlSearchAndReplaceTransform(c, regex, replacement, callback,
                  trackRevisions, revisionTrackingAuthor, replInfo, coalesceContent);
              c.ReplaceNodes(newC.Nodes());
          }

          // The transform can introduce w:ins / w:del elements without ids; repair them
          // across the whole tree that holds the first content element.
          AssignUniqueRevisionTrackingIds(first.AncestorsAndSelf().Last());
          return replInfo.Count;
      }

      if ((first.Name.Namespace == P.p) || (first.Name.Namespace == A.a))
      {
          if (trackRevisions)
          {
              throw new OpenXmlPowerToolsException("PPTX does not support revision tracking");
          }

          var counter = new ReplaceInternalInfo { Count = 0 };
          foreach (XElement c in contentList)
          {
              var newC = (XElement) PmlSearchAndReplaceTransform(c, regex, replacement, callback, counter);
              c.ReplaceNodes(newC.Nodes());
          }

          return counter.Count;
      }

      return 0;
  }

  /// <summary>
  /// Gives every revision-tracking element under (and including) <paramref name="root"/> a
  /// w:id attribute and replaces duplicated ids so that each element has its own value.
  /// New ids start one above the highest id already present (and never below 1).
  /// </summary>
  private static void AssignUniqueRevisionTrackingIds(XElement root)
  {
      // One traversal serves all three steps; only attributes change below, never the
      // element structure, so the snapshot stays valid.
      List<XElement> revTrackingElements = root
          .DescendantsAndSelf()
          .Where(d => RevTrackMarkupWithId.Contains(d.Name))
          .ToList();

      // The leading 0 keeps Max() defined when no element carries an id yet.
      int nextId = new[] { 0 }
          .Concat(revTrackingElements.Attributes(W.id).Select(a => (int) a))
          .Max() + 1;

      foreach (XElement item in revTrackingElements.Where(d => d.Attribute(W.id) == null))
      {
          item.Add(new XAttribute(W.id, nextId++));
      }

      List<IGrouping<int, XElement>> duplicateIdGroups = revTrackingElements
          .GroupBy(d => (int) d.Attribute(W.id))
          .Where(g => g.Count() > 1)
          .ToList();

      // The first element of each group keeps its id; the others get fresh ones.
      foreach (IGrouping<int, XElement> group in duplicateIdGroups)
      {
          foreach (XElement duplicate in group.Skip(1))
          {
              duplicate.SetAttributeValue(W.id, nextId++);
          }
      }
  }
  /// <summary>
  /// Rebuilds a WordprocessingML node for search and replace.
  /// <list type="bullet">
  /// <item>A non-element node is returned as-is.</item>
  /// <item>A w:p is searched and, when <paramref name="replacement"/> is not null, rewritten.</item>
  /// <item>A w:ins that directly contains runs is split so that every split run gets its
  /// own w:ins wrapper carrying the original attributes.</item>
  /// <item>A w:r is split into one run per w:t character and one run per other non-rPr child.</item>
  /// <item>Any other element is copied with its child nodes transformed recursively.</item>
  /// </list>
  /// </summary>
  /// <returns>
  /// The input node, a new XElement, a lazily evaluated sequence of runs (for w:r), or a
  /// list of transformed children (for w:ins).
  /// </returns>
  private static object WmlSearchAndReplaceTransform(XNode node, Regex regex, string replacement,
      Func<XElement, Match, bool> callback, bool trackRevisions, string revisionTrackingAuthor,
      ReplaceInternalInfo replInfo, bool coalesceContent)
  {
      var element = node as XElement;
      if (element == null)
      {
          return node;
      }

      // Every recursive call forwards the same settings, so bind them once.
      Func<XNode, object> recurse = child => WmlSearchAndReplaceTransform(child, regex, replacement,
          callback, trackRevisions, revisionTrackingAuthor, replInfo, coalesceContent);

      if (element.Name == W.p)
      {
          return WmlTransformParagraph(element, regex, replacement, callback, trackRevisions,
              revisionTrackingAuthor, replInfo, coalesceContent, recurse);
      }

      if (element.Name == W.ins && element.Elements(W.r).Any())
      {
          List<object> transformedChildren = element
              .Elements()
              .Select(child => recurse(child))
              .ToList();

          var rewrapped = new List<object>(transformedChildren.Count);
          foreach (object transformed in transformedChildren)
          {
              // Split runs come back as a sequence; each one gets its own copy of this
              // w:ins. Anything else is passed on as returned.
              var splitRuns = transformed as IEnumerable<XElement>;
              if (splitRuns == null)
              {
                  rewrapped.Add(transformed);
              }
              else
              {
                  rewrapped.Add(splitRuns.Select(splitRun =>
                      new XElement(W.ins, element.Attributes(), splitRun)));
              }
          }

          return rewrapped;
      }

      if (element.Name == W.r)
      {
          return WmlSplitRun(element);
      }

      return new XElement(element.Name,
          element.Attributes(),
          element.Nodes().Select(child => recurse(child)));
  }

  /// <summary>
  /// Searches one w:p. Without a match the paragraph is rebuilt with its untouched runs kept
  /// verbatim. With a match the runs are split, the matches are counted in
  /// <paramref name="replInfo"/>, and then either reported (null replacement, in which case
  /// the original paragraph is returned) or replaced.
  /// </summary>
  private static object WmlTransformParagraph(XElement paragraph, Regex regex, string replacement,
      Func<XElement, Match, bool> callback, bool trackRevisions, string revisionTrackingAuthor,
      ReplaceInternalInfo replInfo, bool coalesceContent, Func<XNode, object> recurse)
  {
      string preliminaryContent = paragraph
          .DescendantsTrimmed(W.txbxContent)
          .Where(d => WmlIsRunOutsideDeletion(d))
          .Select(UnicodeMapper.RunToString)
          .StringConcatenate();

      if (!regex.IsMatch(preliminaryContent))
      {
          var rebuiltParagraph = new XElement(W.p,
              paragraph.Attributes(),
              paragraph.Nodes().Select(child => WmlKeepOrTransformChild(child, recurse)));

          if (coalesceContent)
          {
              return WordprocessingMLUtil.CoalesceAdjacentRunsWithIdenticalFormatting(rebuiltParagraph);
          }

          return rebuiltParagraph;
      }

      var paragraphWithSplitRuns = new XElement(W.p,
          paragraph.Attributes(),
          paragraph.Nodes().Select(child => recurse(child)));

      // Build the searchable text and a parallel list of the runs that produced it.
      var runTexts = new List<string>();
      var runs = new List<XElement>();
      foreach (XElement run in paragraphWithSplitRuns
          .DescendantsTrimmed(W.txbxContent)
          .Where(d => WmlIsRunOutsideDeletion(d)))
      {
          runTexts.Add(UnicodeMapper.RunToString(run));
          runs.Add(run);
      }

      string content = runTexts.StringConcatenate();
      XElement[] alignedRuns = runs.ToArray();

      MatchCollection matches = regex.Matches(content);
      replInfo.Count += matches.Count;

      // Match-only mode: report each match and hand back the original paragraph.
      if (replacement == null)
      {
          if (callback != null)
          {
              foreach (Match reported in matches)
              {
                  callback(paragraph, reported);
              }
          }

          return paragraph;
      }

      foreach (Match match in matches)
      {
          if (match.Length == 0)
          {
              continue;
          }

          if (callback != null && !callback(paragraph, match))
          {
              continue;
          }

          // The match index and length are used directly as run positions.
          List<XElement> matchedRuns = alignedRuns
              .Skip(match.Index)
              .Take(match.Length)
              .ToList();

          if (trackRevisions)
          {
              WmlReplaceWithRevisionTracking(matchedRuns, match, replacement, revisionTrackingAuthor);
          }
          else
          {
              WmlReplaceWithoutRevisionTracking(matchedRuns, match, replacement);
          }
      }

      if (coalesceContent)
      {
          return WordprocessingMLUtil.CoalesceAdjacentRunsWithIdenticalFormatting(paragraphWithSplitRuns);
      }

      return paragraphWithSplitRuns;
  }

  /// <summary>
  /// Child handling for a paragraph that does not match: w:pPr, runs that contain w:t, any
  /// element with a direct w:tab child, and w:ins elements whose runs contain w:t are kept
  /// verbatim; every other element is transformed recursively.
  /// </summary>
  private static object WmlKeepOrTransformChild(XNode child, Func<XNode, object> recurse)
  {
      var e = child as XElement;
      if (e == null)
      {
          return child;
      }

      bool keepVerbatim =
          e.Name == W.pPr
          || (e.Name == W.r && e.Elements(W.t).Any())
          || e.Elements(W.tab).Any()
          || (e.Name == W.ins && e.Elements(W.r).Elements(W.t).Any());

      if (keepVerbatim)
      {
          return e;
      }

      return recurse(e);
  }

  /// <summary>
  /// Records the replacement of <paramref name="matchedRuns"/> as tracked changes: a w:ins
  /// holding the new text is placed before the first matched run (unless the replacement is
  /// empty), and each matched run is marked as deleted or, when it sits in a w:ins by the
  /// same author, dropped together with that w:ins.
  /// </summary>
  private static void WmlReplaceWithRevisionTracking(List<XElement> matchedRuns, Match match,
      string replacement, string revisionTrackingAuthor)
  {
      XElement firstRun = matchedRuns.First();
      XElement firstRunProperties = firstRun.Element(W.rPr);

      if (replacement != "")
      {
          // We coalesce runs as some methods, e.g., in DocumentAssembler,
          // will try to find the replacement string even though they
          // set coalesceContent to false.
          string newTextValue = match.Result(replacement);
          List<XElement> newRuns = UnicodeMapper.StringToCoalescedRunList(newTextValue,
              firstRunProperties);
          var newIns = new XElement(W.ins,
              WmlNewRevisionAttributes(revisionTrackingAuthor),
              newRuns);
          WmlEnclosingInsOrSelf(firstRun).AddBeforeSelf(newIns);
      }

      foreach (XElement run in matchedRuns)
      {
          XElement parentIns = run.Parent;
          if (parentIns == null || parentIns.Name != W.ins)
          {
              // Plain run: wrap a delText copy of it in w:del.
              var delRun = new XElement(W.del,
                  WmlNewRevisionAttributes(revisionTrackingAuthor),
                  TransformToDelText(run));
              run.ReplaceWith(delRun);
              continue;
          }

          XElement container = parentIns.Parent;
          if (container == null)
          {
              continue;
          }

          List<XElement> updatedChildren;
          if ((string) parentIns.Attribute(W.author) == revisionTrackingAuthor)
          {
              // Inserted earlier by the same author: the insertion is simply dropped.
              updatedChildren = container
                  .Elements()
                  .Where(c => c != parentIns)
                  .ToList();
          }
          else
          {
              // Inserted by someone else: keep that w:ins and mark its content as deleted.
              updatedChildren = container
                  .Elements()
                  .Select(c =>
                  {
                      if (c != parentIns)
                      {
                          return c;
                      }

                      return new XElement(W.ins,
                          parentIns.Attributes(),
                          new XElement(W.del,
                              WmlNewRevisionAttributes(revisionTrackingAuthor),
                              parentIns.Elements().Select(TransformToDelText)));
                  })
                  .ToList();
          }

          container.ReplaceNodes(updatedChildren);
      }
  }

  /// <summary>
  /// Replaces <paramref name="matchedRuns"/> directly: all but the first run are removed
  /// (together with an enclosing w:ins, if any) and the first run, or its enclosing w:ins,
  /// is replaced by runs holding the new text with the first run's properties.
  /// </summary>
  private static void WmlReplaceWithoutRevisionTracking(List<XElement> matchedRuns, Match match,
      string replacement)
  {
      XElement firstRun = matchedRuns.First();
      XElement firstRunProperties = firstRun.Element(W.rPr);

      // Snapshot before removing, because removal changes the tree being walked.
      foreach (XElement surplusRun in matchedRuns.Skip(1).ToList())
      {
          WmlEnclosingInsOrSelf(surplusRun).Remove();
      }

      // We coalesce runs as some methods, e.g., in DocumentAssembler,
      // will try to find the replacement string even though they
      // set coalesceContent to false.
      string newTextValue = match.Result(replacement);
      List<XElement> newRuns = UnicodeMapper.StringToCoalescedRunList(newTextValue,
          firstRunProperties);
      WmlEnclosingInsOrSelf(firstRun).ReplaceWith(newRuns);
  }

  /// <summary>
  /// Lazily splits a w:r: every character of each w:t child becomes its own run, every other
  /// child except w:rPr becomes its own run, and each new run repeats the original w:rPr.
  /// </summary>
  private static IEnumerable<XElement> WmlSplitRun(XElement run)
  {
      foreach (XElement child in run.Elements())
      {
          if (child.Name == W.rPr)
          {
              continue;
          }

          if (child.Name == W.t)
          {
              foreach (char c in (string) child)
              {
                  yield return new XElement(W.r,
                      run.Elements(W.rPr),
                      new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(c), c));
              }
          }
          else
          {
              yield return new XElement(W.r, run.Elements(W.rPr), child);
          }
      }
  }

  /// <summary>True for a w:r whose parent is not a w:del.</summary>
  private static bool WmlIsRunOutsideDeletion(XElement d)
  {
      if (d.Name != W.r)
      {
          return false;
      }

      return !(d.Parent != null && d.Parent.Name == W.del);
  }

  /// <summary>Returns the run's parent when that parent is a w:ins, otherwise the run itself.</summary>
  private static XElement WmlEnclosingInsOrSelf(XElement run)
  {
      XElement parent = run.Parent;
      if (parent != null && parent.Name == W.ins)
      {
          return parent;
      }

      return run;
  }

  /// <summary>Creates the w:author and w:date attributes for a new revision element, stamped with the current UTC time.</summary>
  private static XAttribute[] WmlNewRevisionAttributes(string revisionTrackingAuthor)
  {
      return new[]
      {
          new XAttribute(W.author, revisionTrackingAuthor),
          new XAttribute(W.date, DateTime.UtcNow.ToString("s") + "Z")
      };
  }
  /// <summary>
  /// Produces a copy of <paramref name="node"/> in which every w:t element is replaced by a
  /// w:delText element with the same text. Other elements are copied with their attributes
  /// and recursively converted children; non-element nodes are returned unchanged.
  /// </summary>
  private static object TransformToDelText(XNode node)
  {
      var source = node as XElement;
      if (source == null)
      {
          return node;
      }

      if (source.Name != W.t)
      {
          return new XElement(source.Name,
              source.Attributes(),
              source.Nodes().Select(TransformToDelText));
      }

      string text = source.Value;
      return new XElement(W.delText,
          XmlUtil.GetXmlSpaceAttribute(text),
          text);
  }
  /// <summary>
  /// Rebuilds a DrawingML node for search and replace.
  /// <list type="bullet">
  /// <item>A non-element node is returned as-is.</item>
  /// <item>An a:p is searched; with a null <paramref name="replacement"/> each match is
  /// reported to <paramref name="callback"/> (when one is supplied) and the original
  /// paragraph is returned; otherwise the matched runs are replaced and adjacent runs with
  /// identical a:rPr are merged.</item>
  /// <item>An a:r that contains a:t is split into one run per character.</item>
  /// <item>Any other element is copied with its child nodes transformed recursively.</item>
  /// </list>
  /// The replacement string is inserted literally; it is not expanded with Match.Result.
  /// </summary>
  /// <param name="node">Node to transform.</param>
  /// <param name="regex">Pattern to look for.</param>
  /// <param name="replacement">Replacement text, "" to delete, or null for match-only.</param>
  /// <param name="callback">Optional per-match callback; may be null in every mode.</param>
  /// <param name="counter">Receives the number of matches found.</param>
  private static object PmlSearchAndReplaceTransform(XNode node, Regex regex, string replacement,
      Func<XElement, Match, bool> callback, ReplaceInternalInfo counter)
  {
      var element = node as XElement;
      if (element == null) return node;

      if (element.Name == A.p)
      {
          XElement paragraph = element;
          string contents = element.Descendants(A.t).Select(t => (string) t).StringConcatenate();
          if (!regex.IsMatch(contents))
              return new XElement(element.Name, element.Attributes(), element.Nodes());

          var paragraphWithSplitRuns = new XElement(A.p,
              paragraph.Attributes(),
              paragraph.Nodes()
                  .Select(n => PmlSearchAndReplaceTransform(n, regex, replacement, callback, counter)));

          List<XElement> runsTrimmed = paragraphWithSplitRuns
              .Descendants(A.r)
              .ToList();

          // Runs without a:t contribute a single placeholder character so that string
          // positions keep lining up with run positions.
          var charsAndRuns = runsTrimmed
              .Select(r =>
                  r.Element(A.t) != null
                      ? new { Ch = r.Element(A.t).Value, r }
                      : new { Ch = "\x01", r })
              .ToList();

          string content = charsAndRuns.Select(t => t.Ch).StringConcatenate();
          XElement[] alignedRuns = charsAndRuns.Select(t => t.r).ToArray();

          MatchCollection matchCollection = regex.Matches(content);
          counter.Count += matchCollection.Count;

          if (replacement == null)
          {
              // Match-only mode. Match(content, regex) passes a null callback, so guard
              // before invoking it instead of throwing NullReferenceException.
              if (callback != null)
              {
                  foreach (Match match in matchCollection.Cast<Match>())
                      callback(paragraph, match);
              }
          }
          else
          {
              foreach (Match match in matchCollection.Cast<Match>())
              {
                  // An empty match selects no runs, and First() below would throw.
                  // Skipped before the callback, as the Wml transform does.
                  if (match.Length == 0) continue;
                  if ((callback != null) && !callback(paragraph, match)) continue;

                  List<XElement> runCollection = alignedRuns
                      .Skip(match.Index)
                      .Take(match.Length)
                      .ToList();

                  // Keep the first run because its run properties are reused.
                  XElement firstRun = runCollection.First();

                  // Binds to the LINQ to XML extension Remove(this IEnumerable<XElement>),
                  // which snapshots the sequence and removes each element from its parent.
                  runCollection.Skip(1).Remove();

                  var newFirstRun = new XElement(A.r,
                      firstRun.Element(A.rPr),
                      new XElement(A.t, replacement));

                  // Swaps the new run into firstRun's position in its parent.
                  firstRun.ReplaceWith(newFirstRun);
              }

              XElement paragraphWithReplacedRuns = paragraphWithSplitRuns;

              // Adjacent a:r elements that hold only an a:t (plus optional a:rPr) are
              // grouped by their serialized a:rPr; everything else gets DontConsolidate.
              IEnumerable<IGrouping<string, XElement>> groupedAdjacentRunsWithIdenticalFormatting =
                  paragraphWithReplacedRuns
                      .Elements()
                      .GroupAdjacent(ce =>
                      {
                          if (ce.Name != A.r)
                              return DontConsolidate;
                          if ((ce.Elements().Count(e => e.Name != A.rPr) != 1) || (ce.Element(A.t) == null))
                              return DontConsolidate;

                          XElement rPr = ce.Element(A.rPr);
                          return rPr == null ? "" : rPr.ToString(SaveOptions.None);
                      });

              var paragraphWithConsolidatedRuns = new XElement(A.p,
                  groupedAdjacentRunsWithIdenticalFormatting.Select(g =>
                  {
                      if (g.Key == DontConsolidate)
                          return (object) g;

                      string textValue = g.Select(r => r.Element(A.t).Value).StringConcatenate();
                      XAttribute xs = XmlUtil.GetXmlSpaceAttribute(textValue);
                      return new XElement(A.r,
                          g.First().Elements(A.rPr),
                          new XElement(A.t, xs, textValue));
                  }));

              paragraph = paragraphWithConsolidatedRuns;
          }

          return paragraph;
      }

      if ((element.Name == A.r) && element.Elements(A.t).Any())
      {
          // Split the run: one run per a:t character, one run per other non-rPr child,
          // each repeating the original a:rPr.
          return element.Elements()
              .Where(e => e.Name != A.rPr)
              .Select(e =>
              {
                  if (e.Name == A.t)
                  {
                      var s = (string) e;
                      IEnumerable<XElement> collectionOfSubRuns = s.Select(c => new XElement(A.r,
                          element.Elements(A.rPr),
                          new XElement(A.t, XmlUtil.GetXmlSpaceAttribute(c), c)));
                      return (object) collectionOfSubRuns;
                  }

                  return new XElement(A.r,
                      element.Elements(A.rPr),
                      e);
              });
      }

      return new XElement(element.Name,
          element.Attributes(),
          element.Nodes().Select(n => PmlSearchAndReplaceTransform(n, regex, replacement, callback, counter)));
  }
  /// <summary>
  /// Mutable match counter that ReplaceInternal creates and the search-and-replace
  /// transforms increment as they find matches.
  /// </summary>
  private class ReplaceInternalInfo
  {
      /// <summary>Running total of matches found so far.</summary>
      public int Count { get; set; }
  }
  485. }
  486. }