UnicodeMapper.cs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. /***************************************************************************
  4. Copyright (c) Microsoft Corporation 2016.
  5. This code is licensed using the Microsoft Public License (Ms-PL). The text of the license can be found here:
  6. http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx
  7. Developer: Thomas Barnekow
  8. Email: thomas@barnekow.info
  9. ***************************************************************************/
  10. using System;
  11. using System.Collections.Generic;
  12. using System.Linq;
  13. using System.Xml.Linq;
  14. namespace OpenXmlPowerTools
  15. {
  16. public class UnicodeMapper
  17. {
  // Unicode character values.

  /// <summary>
  /// U+0001. <c>RunToString</c> returns this character for elements it does not
  /// recognize, and <c>CharToRunChild</c> maps it back to <c>null</c>.
  /// </summary>
  public static readonly char StartOfHeading = '\u0001';

  /// <summary>U+0009. Text representation of a w:tab element.</summary>
  public static readonly char HorizontalTabulation = '\u0009';

  /// <summary>U+000A. Not used by the conversion methods of this class.</summary>
  public static readonly char LineFeed = '\u000A';

  /// <summary>U+000C. Text representation of a w:br element with w:type="page".</summary>
  public static readonly char FormFeed = '\u000C';

  /// <summary>
  /// U+000D. Text representation of a w:cr element and of a w:br element whose
  /// w:type attribute is missing or "textWrapping".
  /// </summary>
  public static readonly char CarriageReturn = '\u000D';

  /// <summary>U+00AD. Text representation of a w:softHyphen element.</summary>
  public static readonly char SoftHyphen = '\u00AD';

  /// <summary>U+2011. Text representation of a w:noBreakHyphen element.</summary>
  public static readonly char NonBreakingHyphen = '\u2011';

  // Unicode area boundaries.

  /// <summary>
  /// U+E000, the first character of the private use area. Substitute characters
  /// for w:sym elements are allocated starting right after this value.
  /// </summary>
  public static readonly char StartOfPrivateUseArea = '\uE000';

  /// <summary>
  /// U+F000, the offset that Open XML adds to symbol character values (see the
  /// remark on <c>_lastUnicodeChar</c> below).
  /// </summary>
  public static readonly char StartOfSymbolArea = '\uF000';

  /// <summary>U+F8FF, the last character of the private use area.</summary>
  public static readonly char EndOfPrivateUseArea = '\uF8FF';

  // Dictionaries for w:sym stringification. Both are filled by SymToChar(XElement).
  // NOTE(review): they are static and mutated without any locking in this class;
  // confirm that callers do not use them from multiple threads concurrently.

  // Maps the serialized form of a standardized w:sym element (w:font and w:char
  // attributes only) to the character that represents it.
  private static readonly Dictionary<string, char> SymStringToUnicodeCharDictionary =
  new Dictionary<string, char>();

  // Maps a representing character back to its standardized w:sym element.
  private static readonly Dictionary<char, XElement> UnicodeCharToSymDictionary =
  new Dictionary<char, XElement>();

  // Represents the Unicode value that was last used to map an actual character
  // onto a special value in the private use area, which starts at U+E000.
  // In Open XML, U+F000 is added to the actual Unicode values, so we should be
  // well outside that range and would have to map 4096 different characters
  // to get into the area starting at U+F000.
  private static char _lastUnicodeChar = StartOfPrivateUseArea;
  /// <summary>
  /// Converts an Open XML run, or a single run child element, into text.
  /// w:t, w:br, w:cr, w:noBreakHyphen, w:softHyphen, w:sym, and w:tab become their
  /// corresponding Unicode strings; w:fldChar and w:instrText become placeholder
  /// characters; w:rPr becomes the empty string; everything else becomes U+0001.
  /// </summary>
  /// <param name="element">An Open XML run or run child element.</param>
  /// <returns>The corresponding Unicode value or U+0001.</returns>
  public static string RunToString(XElement element)
  {
      XName name = element.Name;

      // A run is the concatenation of its stringified children. A run that sits
      // directly inside w:del is not expanded; it ends up as U+0001 below.
      if (name == W.r)
      {
          XElement parent = element.Parent;
          if (parent == null || parent.Name != W.del)
          {
              return element.Elements().Select(RunToString).StringConcatenate();
          }
      }

      // Run properties do not contribute any text.
      if (name == W.rPr)
      {
          return string.Empty;
      }

      // The text element contributes its value.
      if (name == W.t)
      {
          return (string) element;
      }

      // Breaks: the cast yields null when the w:type attribute is missing.
      if (name == W.br)
      {
          switch ((string) element.Attribute(W.type))
          {
              case null:
              case "textWrapping":
                  return CarriageReturn.ToString();
              case "page":
                  return FormFeed.ToString();
          }

          // Any other break type is treated like an unrecognized element.
      }

      // Elements that stand for a single special character.
      if (name == W.cr)
      {
          return CarriageReturn.ToString();
      }

      if (name == W.noBreakHyphen)
      {
          return NonBreakingHyphen.ToString();
      }

      if (name == W.softHyphen)
      {
          return SoftHyphen.ToString();
      }

      if (name == W.tab)
      {
          return HorizontalTabulation.ToString();
      }

      // Field characters: braces mark the beginning and end of a field, any other
      // (or missing) field character type becomes an underscore.
      if (name == W.fldChar)
      {
          string fieldCharType = (string) element.Attribute(W.fldCharType);
          if (fieldCharType == "begin")
          {
              return "{";
          }

          return fieldCharType == "end" ? "}" : "_";
      }

      // Field instruction text is collapsed into a single underscore.
      if (name == W.instrText)
      {
          return "_";
      }

      // A w:sym element becomes the Unicode character that SymToChar assigns to
      // its combination of w:font and w:char attribute values.
      if (name == W.sym)
      {
          return SymToChar(element).ToString();
      }

      // Everything else is represented by a character that doesn't typically
      // appear in documents.
      return StartOfHeading.ToString();
  }
  /// <summary>
  /// Translates a symbol, given by its w:font attribute value and its Unicode
  /// character, into the Unicode character used to represent it. A substitute value
  /// is used if the same Unicode value is already used with a different w:font value.
  ///
  /// Values below U+1000 are shifted into the private use area by adding U+F000
  /// (which is also done by MS Word).
  /// </summary>
  /// <remarks>
  /// For w:sym elements, the w:char attribute value is typically greater than "F000",
  /// because U+F000 is added to the actual Unicode value to shift the value into
  /// the Unicode private use area.
  /// </remarks>
  /// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
  /// <param name="unicodeValue">The unicode value.</param>
  /// <returns>The Unicode character used to represent the symbol.</returns>
  public static char SymToChar(string fontAttributeValue, char unicodeValue)
  {
      // Widen the character to its numeric value and defer to the int overload.
      int codePoint = unicodeValue;
      return SymToChar(fontAttributeValue, codePoint);
  }
  /// <summary>
  /// Translates a symbol, given by its w:font attribute value and its numeric
  /// Unicode value, into the Unicode character used to represent it. A substitute
  /// value is used if the same Unicode value is already used with a different
  /// w:font value.
  ///
  /// Values below U+1000 are shifted into the private use area by adding U+F000
  /// (which is also done by MS Word).
  /// </summary>
  /// <remarks>
  /// For w:sym elements, the w:char attribute value is typically greater than "F000",
  /// because U+F000 is added to the actual Unicode value to shift the value into
  /// the Unicode private use area.
  /// </remarks>
  /// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
  /// <param name="unicodeValue">The unicode value.</param>
  /// <returns>The Unicode character used to represent the symbol.</returns>
  public static char SymToChar(string fontAttributeValue, int unicodeValue)
  {
      int shiftedValue = unicodeValue;
      if (shiftedValue < 0x1000)
      {
          shiftedValue += 0xF000;
      }

      // The string overload expects the value formatted like a w:char attribute,
      // i.e., as at least four upper-case hexadecimal digits.
      string charAttributeValue = shiftedValue.ToString("X4");
      return SymToChar(fontAttributeValue, charAttributeValue);
  }
  /// <summary>
  /// Translates a symbol, given by its w:font and w:char attribute values, into the
  /// Unicode character used to represent it. A substitute value is used if the same
  /// Unicode value is already used with a different w:font value.
  ///
  /// The w:char attribute value is used as is.
  /// </summary>
  /// <remarks>
  /// For w:sym elements, the w:char attribute value is typically greater than "F000",
  /// because U+F000 is added to the actual Unicode value to shift the value into
  /// the Unicode private use area.
  /// </remarks>
  /// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
  /// <param name="charAttributeValue">The w:char attribute value, e.g., "F028".</param>
  /// <returns>The Unicode character used to represent the symbol.</returns>
  /// <exception cref="ArgumentException">
  /// <paramref name="fontAttributeValue" /> or <paramref name="charAttributeValue" />
  /// is null or empty.
  /// </exception>
  public static char SymToChar(string fontAttributeValue, string charAttributeValue)
  {
      if (string.IsNullOrEmpty(fontAttributeValue))
      {
          throw new ArgumentException("Argument is null or empty.", "fontAttributeValue");
      }

      if (string.IsNullOrEmpty(charAttributeValue))
      {
          throw new ArgumentException("Argument is null or empty.", "charAttributeValue");
      }

      // Build the w:sym element the XElement overload works with.
      var sym = new XElement(W.sym);
      sym.Add(new XAttribute(W.font, fontAttributeValue));
      sym.Add(new XAttribute(W._char, charAttributeValue));
      sym.Add(new XAttribute(XNamespace.Xmlns + "w", W.w));

      return SymToChar(sym);
  }
  /// <summary>
  /// Represent a w:sym element as a Unicode value, mapping the Unicode value
  /// specified in the w:char attribute to a substitute value to be able to
  /// use a Unicode value in conjunction with different fonts.
  /// </summary>
  /// <remarks>
  /// The mapping is remembered in static dictionaries, so the same combination of
  /// w:font and w:char attribute values always yields the same character, and
  /// <c>CharToRunChild</c> can translate that character back into a w:sym element.
  /// </remarks>
  /// <param name="sym">The w:sym element to be stringified.</param>
  /// <returns>The Unicode character representing the w:sym element.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="sym" /> is null.</exception>
  /// <exception cref="ArgumentException">
  /// <paramref name="sym" /> is not a w:sym element or lacks the w:font or w:char attribute.
  /// </exception>
  /// <exception cref="InvalidOperationException">
  /// A substitute value is required, but no unused value is left.
  /// </exception>
  public static char SymToChar(XElement sym)
  {
      if (sym == null)
          throw new ArgumentNullException("sym");
      if (sym.Name != W.sym)
          throw new ArgumentException(string.Format("Not a w:sym: {0}", sym.Name), "sym");

      XAttribute fontAttribute = sym.Attribute(W.font);
      string fontAttributeValue = fontAttribute != null ? fontAttribute.Value : null;
      if (fontAttributeValue == null)
          throw new ArgumentException("w:sym element has no w:font attribute.", "sym");

      XAttribute charAttribute = sym.Attribute(W._char);
      string charAttributeValue = charAttribute != null ? charAttribute.Value : null;
      if (charAttributeValue == null)
          throw new ArgumentException("w:sym element has no w:char attribute.", "sym");

      // Reduce the element to its w:font and w:char attributes, so that otherwise
      // different w:sym elements with the same font and character share one key.
      var standardizedSym = new XElement(W.sym,
          new XAttribute(W.font, fontAttributeValue),
          new XAttribute(W._char, charAttributeValue),
          new XAttribute(XNamespace.Xmlns + "w", W.w));
      string standardizedSymString = standardizedSym.ToString(SaveOptions.None);

      // Return the Unicode value if this symbol has been mapped before.
      char knownChar;
      if (SymStringToUnicodeCharDictionary.TryGetValue(standardizedSymString, out knownChar))
          return knownChar;

      // Determine Unicode value to be used to represent the current w:sym element.
      // Use the actual Unicode value if it has not yet been used with another font.
      // Otherwise, create a special Unicode value in the private use area to represent
      // the current w:sym element.
      var unicodeChar = (char) Convert.ToInt32(charAttributeValue, 16);
      if (UnicodeCharToSymDictionary.ContainsKey(unicodeChar))
      {
          // The next substitute value may itself already be taken, e.g., by a symbol
          // whose actual w:char value lies in the substitute range. Skip such values
          // instead of failing with a duplicate key after the first dictionary has
          // already been updated.
          do
          {
              // Never wrap around to U+0000.
              if (_lastUnicodeChar == char.MaxValue)
                  throw new InvalidOperationException(
                      "No unused Unicode value is left to represent the w:sym element.");

              unicodeChar = ++_lastUnicodeChar;
          }
          while (UnicodeCharToSymDictionary.ContainsKey(unicodeChar));
      }

      SymStringToUnicodeCharDictionary.Add(standardizedSymString, unicodeChar);
      UnicodeCharToSymDictionary.Add(unicodeChar, standardizedSym);
      return unicodeChar;
  }
  /// <summary>
  /// Turn the specified text value into a list of runs with coalesced text elements.
  /// Each run will have the specified run properties.
  /// </summary>
  /// <remarks>
  /// Adjacent characters that translate into w:t elements are merged into a single
  /// run with one w:t element. Every other run child (e.g., w:br, w:tab, w:sym) gets
  /// a run of its own. Characters for which <c>CharToRunChild</c> returns null, i.e.,
  /// U+0001, are skipped, so text on both sides of such a character is coalesced.
  /// </remarks>
  /// <param name="textValue">The text value to transform.</param>
  /// <param name="runProperties">The run properties to apply.</param>
  /// <returns>A list of runs representing the text value.</returns>
  public static List<XElement> StringToCoalescedRunList(string textValue, XElement runProperties)
  {
      return textValue
          .Select(CharToRunChild)
          // CharToRunChild yields null for U+0001; without this filter, the key
          // selector below would dereference null.
          .Where(runChild => runChild != null)
          .GroupAdjacent(runChild => runChild.Name == W.t)
          .SelectMany(grouping => grouping.Key
              ? StringToSingleRunList(grouping.Select(t => (string) t).StringConcatenate(), runProperties)
              : grouping.Select(runChild => new XElement(W.r, runProperties, runChild)))
          .ToList();
  }
  /// <summary>
  /// Wraps the specified text value into one run with one text element and returns
  /// that run as the only item of a list. The run gets the specified run properties.
  /// </summary>
  /// <param name="textValue">The text value to transform.</param>
  /// <param name="runProperties">The run properties to apply.</param>
  /// <returns>A list with a single run.</returns>
  public static IEnumerable<XElement> StringToSingleRunList(string textValue, XElement runProperties)
  {
      // The text element carries whatever xml:space attribute XmlUtil derives from the text.
      var text = new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(textValue), textValue);

      var runs = new List<XElement>();
      runs.Add(new XElement(W.r, runProperties, text));
      return runs;
  }
  /// <summary>
  /// Turns the specified text value into a list of runs, one run per character,
  /// each having the specified run properties.
  /// </summary>
  /// <param name="textValue">The text value to transform.</param>
  /// <param name="runProperties">The run properties to apply.</param>
  /// <returns>A list of runs representing the text value.</returns>
  public static List<XElement> StringToRunList(string textValue, XElement runProperties)
  {
      IEnumerable<XElement> runs =
          from character in textValue
          select CharToRun(character, runProperties);

      return runs.ToList();
  }
  /// <summary>
  /// Builds a w:r element for the specified character. The character is translated
  /// into the corresponding Open XML element (e.g., w:t, w:br, w:tab), which becomes
  /// the run's content after the run properties.
  /// </summary>
  /// <param name="character">The character.</param>
  /// <param name="runProperties">The w:rPr element to be added to the w:r element.</param>
  /// <returns>The w:r element.</returns>
  public static XElement CharToRun(char character, XElement runProperties)
  {
      // May be null (for U+0001), in which case the run gets no such child.
      XElement runChild = CharToRunChild(character);
      return new XElement(W.r, runProperties, runChild);
  }
  /// <summary>
  /// Create an Open XML element (e.g., w:t, w:br, w:tab) from the specified
  /// character.
  /// </summary>
  /// <remarks>
  /// Characters that <c>SymToChar</c> has assigned to a w:sym element are turned
  /// back into a copy of that element. The caller owns the returned element and may
  /// add it to a document or modify it without affecting later calls.
  /// </remarks>
  /// <param name="character">The character.</param>
  /// <returns>The Open XML element or null, if the character equals <see cref="StartOfHeading" /> (U+0001).</returns>
  public static XElement CharToRunChild(char character)
  {
      // Ignore the special character that represents the Open XML elements we
      // wanted to ignore.
      if (character == StartOfHeading)
          return null;

      // Translate special characters into their corresponding Open XML elements.
      // Turn a Carriage Return into an empty w:br element, regardless of whether
      // the former was created from an equivalent w:cr element.
      if (character == CarriageReturn)
          return new XElement(W.br);
      if (character == FormFeed)
          return new XElement(W.br, new XAttribute(W.type, "page"));
      if (character == HorizontalTabulation)
          return new XElement(W.tab);
      if (character == NonBreakingHyphen)
          return new XElement(W.noBreakHyphen);
      if (character == SoftHyphen)
          return new XElement(W.softHyphen);

      // Translate symbol characters into their corresponding w:sym elements.
      // Hand out a copy rather than the cached instance: an element without a parent
      // is attached (not cloned) when it is added to a tree, which would tie the
      // static cache to the caller's document and expose it to later modifications.
      XElement cachedSym;
      if (UnicodeCharToSymDictionary.TryGetValue(character, out cachedSym))
          return new XElement(cachedSym);

      // Turn "normal" characters into text elements.
      return new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(character), character);
  }
  297. }
  298. }