UnicodeMapper.cs 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. // Copyright (c) Microsoft. All rights reserved.
  2. // Licensed under the MIT license. See LICENSE file in the project root for full license information.
  3. /***************************************************************************
  4. Copyright (c) Thomas Barnekow 2016.
  5. Developer: Thomas Barnekow
  6. Email: thomas@barnekow.info
  7. ***************************************************************************/
  8. using System;
  9. using System.Collections.Generic;
  10. using System.Linq;
  11. using System.Xml.Linq;
  12. namespace OpenXmlPowerTools
  13. {
  14. public class UnicodeMapper
  15. {
  // Characters used to represent Open XML run children in stringified runs.
  // StartOfHeading (U+0001) is the placeholder for anything that is not recognized.
  public static readonly char StartOfHeading = '\u0001';
  public static readonly char HorizontalTabulation = '\t';   // U+0009
  public static readonly char LineFeed = '\n';               // U+000A
  public static readonly char FormFeed = '\f';               // U+000C
  public static readonly char CarriageReturn = '\r';         // U+000D
  public static readonly char SoftHyphen = '\u00AD';
  public static readonly char NonBreakingHyphen = '\u2011';

  // Boundaries of the Unicode private use area (U+E000 to U+F8FF) and of the
  // sub-range starting at U+F000 into which Open XML shifts symbol characters.
  public static readonly char StartOfPrivateUseArea = '\uE000';
  public static readonly char StartOfSymbolArea = '\uF000';
  public static readonly char EndOfPrivateUseArea = '\uF8FF';

  // Lookup tables for w:sym stringification: the serialized w:sym element mapped
  // to the character representing it, and that character mapped back to the element.
  private static readonly Dictionary<string, char> SymStringToUnicodeCharDictionary = new Dictionary<string, char>();
  private static readonly Dictionary<char, XElement> UnicodeCharToSymDictionary = new Dictionary<char, XElement>();

  // The substitute character handed out most recently for a w:sym whose actual
  // Unicode value was already taken by another font. Substitutes are drawn from
  // the private use area starting at U+E000. Because Open XML adds U+F000 to the
  // actual values, 4096 substitutes would be needed before reaching the range
  // that starts at U+F000.
  private static char _lastUnicodeChar = StartOfPrivateUseArea;
  /// <summary>
  /// Converts an Open XML run, or a single child element of a run, into the text
  /// that represents it. A w:r that is not a direct child of w:del yields the
  /// concatenation of its children's strings. w:t yields its value; w:rPr yields
  /// the empty string; w:br, w:cr, w:noBreakHyphen, w:softHyphen, w:tab and w:sym
  /// yield their corresponding Unicode characters; w:fldChar yields "{" (begin),
  /// "}" (end) or "_" (anything else); w:instrText yields "_". Every other
  /// element, including a w:r directly inside w:del and a w:br with an
  /// unrecognized w:type, yields U+0001.
  /// </summary>
  /// <param name="element">The w:r element or run child element to stringify.</param>
  /// <returns>The string representing the element, or U+0001 if it is not recognized.</returns>
  public static string RunToString(XElement element)
  {
      XName name = element.Name;

      // A run is expanded into its children unless it sits directly inside w:del.
      if (name == W.r)
      {
          XElement parent = element.Parent;
          bool isDeletedRun = parent != null && parent.Name == W.del;
          if (!isDeletedRun)
              return element.Elements().Select(child => RunToString(child)).StringConcatenate();
      }

      // Run properties carry no text.
      if (name == W.rPr)
          return string.Empty;

      // Text elements contribute their value as is.
      if (name == W.t)
          return (string)element;

      // Breaks: a missing or "textWrapping" type is a carriage return, "page" is a
      // form feed. Any other type drops through to the fallback at the end.
      if (name == W.br)
      {
          string breakType = (string)element.Attribute(W.type);
          switch (breakType)
          {
              case null:
              case "textWrapping":
                  return CarriageReturn.ToString();
              case "page":
                  return FormFeed.ToString();
          }
      }

      // Elements that stand for a single special character.
      if (name == W.cr)
          return CarriageReturn.ToString();
      if (name == W.noBreakHyphen)
          return NonBreakingHyphen.ToString();
      if (name == W.softHyphen)
          return SoftHyphen.ToString();
      if (name == W.tab)
          return HorizontalTabulation.ToString();

      // Field characters are rendered as braces around the field, with "_" for
      // everything that is neither the beginning nor the end of the field.
      if (name == W.fldChar)
      {
          string fieldCharType = (string)element.Attribute(W.fldCharType);
          if (fieldCharType == "begin")
              return "{";
          return fieldCharType == "end" ? "}" : "_";
      }
      if (name == W.instrText)
          return "_";

      // A w:sym is mapped to a single character. Its w:char value is either the
      // glyph's own Unicode value or that value shifted into the private use area
      // by adding U+F000.
      if (name == W.sym)
          return SymToChar(element).ToString();

      // Anything else becomes a character that does not typically occur in documents.
      return StartOfHeading.ToString();
  }
  /// <summary>
  /// Maps a symbol, given by its w:font attribute value and a Unicode character,
  /// to the character used to represent it. The character is widened to its
  /// integer code and handed to the overload taking an integer, which shifts
  /// codes below U+1000 into the private use area by adding U+F000 (as MS Word
  /// does). A substitute character is returned if the resulting value is
  /// already in use with a different font.
  /// </summary>
  /// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
  /// <param name="unicodeValue">The Unicode character of the symbol.</param>
  /// <returns>The Unicode character used to represent the symbol.</returns>
  public static char SymToChar(string fontAttributeValue, char unicodeValue)
  {
      int codeUnit = unicodeValue;
      return SymToChar(fontAttributeValue, codeUnit);
  }
  /// <summary>
  /// Maps a symbol, given by its w:font attribute value and an integer Unicode
  /// value, to the character used to represent it. Values below U+1000 are
  /// shifted into the Unicode private use area by adding U+F000 (which MS Word
  /// does as well); other values are used unchanged. The result is formatted as
  /// an uppercase hexadecimal string of at least four digits and passed on as
  /// the w:char attribute value. A substitute character is returned if that
  /// value is already in use with a different font.
  /// </summary>
  /// <remarks>
  /// For w:sym elements, the w:char attribute value is typically greater than
  /// "F000" because of this shift into the private use area.
  /// </remarks>
  /// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
  /// <param name="unicodeValue">The Unicode value of the symbol.</param>
  /// <returns>The Unicode character used to represent the symbol.</returns>
  public static char SymToChar(string fontAttributeValue, int unicodeValue)
  {
      int shiftedValue = unicodeValue;
      if (shiftedValue < 0x1000)
          shiftedValue += 0xF000;

      string charAttributeValue = shiftedValue.ToString("X4");
      return SymToChar(fontAttributeValue, charAttributeValue);
  }
  /// <summary>
  /// Maps a symbol, given by its w:font and w:char attribute values, to the
  /// character used to represent it. A w:sym element is built from the two
  /// values, which are used exactly as given, and handed to the overload taking
  /// an element. A substitute character is returned if the w:char value is
  /// already in use with a different font.
  /// </summary>
  /// <remarks>
  /// For w:sym elements, the w:char attribute value is typically greater than
  /// "F000", because U+F000 is added to the actual Unicode value to shift it
  /// into the Unicode private use area.
  /// </remarks>
  /// <param name="fontAttributeValue">The w:font attribute value, e.g., "Wingdings".</param>
  /// <param name="charAttributeValue">The w:char attribute value, e.g., "F028".</param>
  /// <returns>The Unicode character used to represent the symbol.</returns>
  /// <exception cref="ArgumentException">Either argument is null or empty.</exception>
  public static char SymToChar(string fontAttributeValue, string charAttributeValue)
  {
      if (string.IsNullOrEmpty(fontAttributeValue))
      {
          throw new ArgumentException("Argument is null or empty.", "fontAttributeValue");
      }
      if (string.IsNullOrEmpty(charAttributeValue))
      {
          throw new ArgumentException("Argument is null or empty.", "charAttributeValue");
      }

      var fontAttribute = new XAttribute(W.font, fontAttributeValue);
      var charAttribute = new XAttribute(W._char, charAttributeValue);
      var namespaceDeclaration = new XAttribute(XNamespace.Xmlns + "w", W.w);

      XElement sym = new XElement(W.sym, fontAttribute, charAttribute, namespaceDeclaration);
      return SymToChar(sym);
  }
  /// <summary>
  /// Represents a w:sym element as a single Unicode character. The first time a
  /// given combination of w:font and w:char is seen, it is assigned the
  /// character denoted by w:char, unless that character already represents
  /// another w:sym; in that case a substitute from the private use area
  /// (following U+E000) is assigned instead. The assignment is remembered, so
  /// the same combination always yields the same character afterwards.
  /// </summary>
  /// <remarks>
  /// The assignments are kept in static dictionaries, which this method reads
  /// and updates without any synchronization.
  /// </remarks>
  /// <param name="sym">The w:sym element to be stringified.</param>
  /// <returns>The Unicode character representing the w:sym element.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="sym" /> is null.</exception>
  /// <exception cref="ArgumentException">
  /// <paramref name="sym" /> is not a w:sym element or lacks a w:font or w:char attribute.
  /// </exception>
  public static char SymToChar(XElement sym)
  {
      if (sym == null)
          throw new ArgumentNullException("sym");
      if (sym.Name != W.sym)
          throw new ArgumentException(string.Format("Not a w:sym: {0}", sym.Name), "sym");

      XAttribute fontAttribute = sym.Attribute(W.font);
      string fontAttributeValue = fontAttribute != null ? fontAttribute.Value : null;
      if (fontAttributeValue == null)
          throw new ArgumentException("w:sym element has no w:font attribute.", "sym");

      XAttribute charAttribute = sym.Attribute(W._char);
      string charAttributeValue = charAttribute != null ? charAttribute.Value : null;
      if (charAttributeValue == null)
          throw new ArgumentException("w:sym element has no w:char attribute.", "sym");

      // Build a w:sym carrying only w:font, w:char and the namespace declaration,
      // so that its serialized form can serve as the dictionary key regardless of
      // any other attributes on the element passed in.
      var standardizedSym = new XElement(W.sym,
          new XAttribute(W.font, fontAttributeValue),
          new XAttribute(W._char, charAttributeValue),
          new XAttribute(XNamespace.Xmlns + "w", W.w));
      string standardizedSymString = standardizedSym.ToString(SaveOptions.None);

      // Return the character assigned earlier, if any.
      char knownChar;
      if (SymStringToUnicodeCharDictionary.TryGetValue(standardizedSymString, out knownChar))
          return knownChar;

      // Use the actual Unicode value if it does not yet represent another w:sym.
      // Otherwise, take the next substitute from the private use area.
      var unicodeChar = (char)Convert.ToInt32(charAttributeValue, 16);
      if (UnicodeCharToSymDictionary.ContainsKey(unicodeChar))
      {
          // A substitute may itself have been claimed already by a w:sym whose
          // actual value lies in the private use area (e.g., w:char="E001").
          // Skip such values; adding a duplicate key below would throw.
          do
          {
              unicodeChar = ++_lastUnicodeChar;
          }
          while (UnicodeCharToSymDictionary.ContainsKey(unicodeChar));
      }

      SymStringToUnicodeCharDictionary.Add(standardizedSymString, unicodeChar);
      UnicodeCharToSymDictionary.Add(unicodeChar, standardizedSym);
      return unicodeChar;
  }
  /// <summary>
  /// Turn the specified text value into a list of runs with coalesced text elements.
  /// Adjacent characters that map to w:t elements are merged into a single run with
  /// one w:t element; every other character (e.g., a tab or a break) gets a run of
  /// its own. Each run will have the specified run properties. Characters equal to
  /// <see cref="StartOfHeading" /> (U+0001), for which there is no Open XML element,
  /// are skipped.
  /// </summary>
  /// <param name="textValue">The text value to transform.</param>
  /// <param name="runProperties">The run properties to apply.</param>
  /// <returns>A list of runs representing the text value.</returns>
  public static List<XElement> StringToCoalescedRunList(string textValue, XElement runProperties)
  {
      return textValue
          .Select(CharToRunChild)
          // CharToRunChild returns null for U+0001; drop those entries so that the
          // key selector below does not dereference a null element.
          .Where(e => e != null)
          .GroupAdjacent(e => e.Name == W.t)
          .SelectMany(grouping => grouping.Key
              ? StringToSingleRunList(grouping.Select(t => (string)t).StringConcatenate(), runProperties)
              : grouping.Select(e => new XElement(W.r, runProperties, e)))
          .ToList();
  }
  /// <summary>
  /// Wraps the specified text value in one w:r element that holds the specified
  /// run properties followed by a single w:t element with that text, and returns
  /// that run as the only entry of a list. The w:t element also receives whatever
  /// XmlUtil.GetXmlSpaceAttribute returns for the text.
  /// </summary>
  /// <param name="textValue">The text value to transform.</param>
  /// <param name="runProperties">The run properties to apply.</param>
  /// <returns>A list with a single run.</returns>
  public static IEnumerable<XElement> StringToSingleRunList(string textValue, XElement runProperties)
  {
      XElement text = new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(textValue), textValue);

      var runs = new List<XElement>();
      runs.Add(new XElement(W.r, runProperties, text));
      return runs;
  }
  /// <summary>
  /// Converts the specified text value into one run per character, in order.
  /// Every run is created by <see cref="CharToRun" /> and therefore carries the
  /// specified run properties.
  /// </summary>
  /// <param name="textValue">The text value to transform.</param>
  /// <param name="runProperties">The run properties to apply.</param>
  /// <returns>A list of runs representing the text value.</returns>
  public static List<XElement> StringToRunList(string textValue, XElement runProperties)
  {
      IEnumerable<XElement> runs =
          from character in textValue
          select CharToRun(character, runProperties);

      return runs.ToList();
  }
  /// <summary>
  /// Builds a w:r element for the specified character. The run contains the
  /// specified run properties followed by the Open XML element (e.g., w:t, w:br,
  /// w:tab) that <see cref="CharToRunChild" /> produces for the character.
  /// </summary>
  /// <param name="character">The character.</param>
  /// <param name="runProperties">The w:rPr element to be added to the w:r element.</param>
  /// <returns>The w:r element.</returns>
  public static XElement CharToRun(char character, XElement runProperties)
  {
      XElement runChild = CharToRunChild(character);
      return new XElement(W.r, runProperties, runChild);
  }
  /// <summary>
  /// Create an Open XML element (e.g., w:t, w:br, w:tab, w:sym) from the specified
  /// character. Characters that were assigned to a w:sym element by SymToChar
  /// yield a copy of that w:sym element; all characters without a special meaning
  /// yield a w:t element containing the character.
  /// </summary>
  /// <param name="character">The character.</param>
  /// <returns>The Open XML element or null, if the character equals <see cref="StartOfHeading" /> (U+0001).</returns>
  public static XElement CharToRunChild(char character)
  {
      // Ignore the special character that represents the Open XML elements we
      // wanted to ignore.
      if (character == StartOfHeading)
          return null;

      // Translate special characters into their corresponding Open XML elements.
      // Turn a Carriage Return into an empty w:br element, regardless of whether
      // the former was created from an equivalent w:cr element.
      if (character == CarriageReturn)
          return new XElement(W.br);
      if (character == FormFeed)
          return new XElement(W.br, new XAttribute(W.type, "page"));
      if (character == HorizontalTabulation)
          return new XElement(W.tab);
      if (character == NonBreakingHyphen)
          return new XElement(W.noBreakHyphen);
      if (character == SoftHyphen)
          return new XElement(W.softHyphen);

      // Translate symbol characters into their corresponding w:sym elements.
      // Hand out a copy rather than the instance held in the static dictionary:
      // a parentless element is attached as is when added to a tree, so the
      // stored instance would otherwise become part of (and be modifiable
      // through) the first document it is inserted into.
      XElement sym;
      if (UnicodeCharToSymDictionary.TryGetValue(character, out sym))
          return new XElement(sym);

      // Turn "normal" characters into text elements.
      return new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(character), character);
  }
  295. }
  296. }