// Copyright (c) Microsoft. All rights reserved. // Licensed under the MIT license. See LICENSE file in the project root for full license information. /*************************************************************************** Copyright (c) Microsoft Corporation 2016. This code is licensed using the Microsoft Public License (Ms-PL). The text of the license can be found here: http://www.microsoft.com/resources/sharedsource/licensingbasics/publiclicense.mspx Developer: Thomas Barnekow Email: thomas@barnekow.info ***************************************************************************/ using System; using System.Collections.Generic; using System.Linq; using System.Xml.Linq; namespace OpenXmlPowerTools { public class UnicodeMapper { // Unicode character values. public static readonly char StartOfHeading = '\u0001'; public static readonly char HorizontalTabulation = '\u0009'; public static readonly char LineFeed = '\u000A'; public static readonly char FormFeed = '\u000C'; public static readonly char CarriageReturn = '\u000D'; public static readonly char SoftHyphen = '\u00AD'; public static readonly char NonBreakingHyphen = '\u2011'; // Unicode area boundaries. public static readonly char StartOfPrivateUseArea = '\uE000'; public static readonly char StartOfSymbolArea = '\uF000'; public static readonly char EndOfPrivateUseArea = '\uF8FF'; // Dictionaries for w:sym stringification. private static readonly Dictionary SymStringToUnicodeCharDictionary = new Dictionary(); private static readonly Dictionary UnicodeCharToSymDictionary = new Dictionary(); // Represents the Unicode value that was last used to map an actual character // onto a special value in the private use area, which starts at U+E000. // In Open XML, U+F000 is added to the actual Unicode values, so we should be // well outside that range and would have to map 4096 different characters // to get into the area starting at U+F000. private static char _lastUnicodeChar = StartOfPrivateUseArea; /// /// Stringify an Open XML run, turning (a) w:t, w:br, w:cr, w:noBreakHyphen, /// w:softHyphen, w:sym, and w:tab into their corresponding Unicode strings /// and (b) everything else into U+0001. /// /// An Open XML run or run child element. /// The corresponding Unicode value or U+0001. public static string RunToString(XElement element) { if (element.Name == W.r && (element.Parent == null || element.Parent.Name != W.del)) return element.Elements().Select(RunToString).StringConcatenate(); // We need to ignore run properties. if (element.Name == W.rPr) return string.Empty; // For w:t elements, we obviously want the element's value. if (element.Name == W.t) return (string) element; // Turn elements representing special characters into their corresponding // unicode characters. if (element.Name == W.br) { XAttribute typeAttribute = element.Attribute(W.type); string type = typeAttribute != null ? typeAttribute.Value : null; if (type == null || type == "textWrapping") return CarriageReturn.ToString(); if (type == "page") return FormFeed.ToString(); } if (element.Name == W.cr) return CarriageReturn.ToString(); if (element.Name == W.noBreakHyphen) return NonBreakingHyphen.ToString(); if (element.Name == W.softHyphen) return SoftHyphen.ToString(); if (element.Name == W.tab) return HorizontalTabulation.ToString(); if (element.Name == W.fldChar) { var fldCharType = element.Attributes(W.fldCharType).Select(a => a.Value).FirstOrDefault(); switch (fldCharType) { case "begin": return "{"; case "end": return "}"; default: return "_"; } } if (element.Name == W.instrText) return "_"; // Turn w:sym elements into Unicode character values. A w:char attribute // value can be stored (a) directly in its Unicode character value from // the font glyph or (b) in a Unicode character value created by adding // U+F000 to the character value, thereby shifting the value into the // Unicode private use area. if (element.Name == W.sym) return SymToChar(element).ToString(); // Elements we don't recognize will be turned into a character that // doesn't typically appear in documents. return StartOfHeading.ToString(); } /// /// Translate a symbol into a Unicode character, using the specified w:font attribute /// value and unicode value (represented by the w:sym element's w:char attribute), /// using a substitute value for the actual Unicode value if the same Unicode value /// is already used in conjunction with a different w:font attribute value. /// /// Add U+F000 to the Unicode value if the specified value is less than U+1000, which /// shifts the value into the Unicode private use area (which is also done by MS Word). /// /// /// For w:sym elements, the w:char attribute value is typically greater than "F000", /// because U+F000 is added to the actual Unicode value to shift the value into /// the Unicode private use area. /// /// The w:font attribute value, e.g., "Wingdings". /// The unicode value. /// The Unicode character used to represent the symbol. public static char SymToChar(string fontAttributeValue, char unicodeValue) { return SymToChar(fontAttributeValue, (int) unicodeValue); } /// /// Translate a symbol into a Unicode character, using the specified w:font attribute /// value and unicode value (represented by the w:sym element's w:char attribute), /// using a substitute value for the actual Unicode value if the same Unicode value /// is already used in conjunction with a different w:font attribute value. /// /// Add U+F000 to the Unicode value if the specified value is less than U+1000, which /// shifts the value into the Unicode private use area (which is also done by MS Word). /// /// /// For w:sym elements, the w:char attribute value is typically greater than "F000", /// because U+F000 is added to the actual Unicode value to shift the value into /// the Unicode private use area. /// /// The w:font attribute value, e.g., "Wingdings". /// The unicode value. /// The Unicode character used to represent the symbol. public static char SymToChar(string fontAttributeValue, int unicodeValue) { int effectiveUnicodeValue = unicodeValue < 0x1000 ? 0xF000 + unicodeValue : unicodeValue; return SymToChar(fontAttributeValue, effectiveUnicodeValue.ToString("X4")); } /// /// Translate a symbol into a Unicode character, using the specified w:font and /// w:char attribute values, using a substitute value for the actual Unicode /// value if the same Unicode value is already used in conjunction with a different /// w:font attribute value. /// /// Do not alter the w:char attribute value. /// /// /// For w:sym elements, the w:char attribute value is typically greater than "F000", /// because U+F000 is added to the actual Unicode value to shift the value into /// the Unicode private use area. /// /// The w:font attribute value, e.g., "Wingdings". /// The w:char attribute value, e.g., "F028". /// The Unicode character used to represent the symbol. public static char SymToChar(string fontAttributeValue, string charAttributeValue) { if (string.IsNullOrEmpty(fontAttributeValue)) throw new ArgumentException("Argument is null or empty.", "fontAttributeValue"); if (string.IsNullOrEmpty(charAttributeValue)) throw new ArgumentException("Argument is null or empty.", "charAttributeValue"); return SymToChar(new XElement(W.sym, new XAttribute(W.font, fontAttributeValue), new XAttribute(W._char, charAttributeValue), new XAttribute(XNamespace.Xmlns + "w", W.w))); } /// /// Represent a w:sym element as a Unicode value, mapping the Unicode value /// specified in the w:char attribute to a substitute value to be able to /// use a Unicode value in conjunction with different fonts. /// /// The w:sym element to be stringified. /// A single-character Unicode string representing the w:sym element. public static char SymToChar(XElement sym) { if (sym == null) throw new ArgumentNullException("sym"); if (sym.Name != W.sym) throw new ArgumentException(string.Format("Not a w:sym: {0}", sym.Name), "sym"); XAttribute fontAttribute = sym.Attribute(W.font); string fontAttributeValue = fontAttribute != null ? fontAttribute.Value : null; if (fontAttributeValue == null) throw new ArgumentException("w:sym element has no w:font attribute.", "sym"); XAttribute charAttribute = sym.Attribute(W._char); string charAttributeValue = charAttribute != null ? charAttribute.Value : null; if (charAttributeValue == null) throw new ArgumentException("w:sym element has no w:char attribute.", "sym"); // Return Unicode value if it is in the dictionary. var standardizedSym = new XElement(W.sym, new XAttribute(W.font, fontAttributeValue), new XAttribute(W._char, charAttributeValue), new XAttribute(XNamespace.Xmlns + "w", W.w)); string standardizedSymString = standardizedSym.ToString(SaveOptions.None); if (SymStringToUnicodeCharDictionary.ContainsKey(standardizedSymString)) return SymStringToUnicodeCharDictionary[standardizedSymString]; // Determine Unicode value to be used to represent the current w:sym element. // Use the actual Unicode value if it has not yet been used with another font. // Otherwise, create a special Unicode value in the private use area to represent // the current w:sym element. var unicodeChar = (char) Convert.ToInt32(charAttributeValue, 16); if (UnicodeCharToSymDictionary.ContainsKey(unicodeChar)) unicodeChar = ++_lastUnicodeChar; SymStringToUnicodeCharDictionary.Add(standardizedSymString, unicodeChar); UnicodeCharToSymDictionary.Add(unicodeChar, standardizedSym); return unicodeChar; } /// /// Turn the specified text value into a list of runs with coalesced text elements. /// Each run will have the specified run properties. /// /// The text value to transform. /// The run properties to apply. /// A list of runs representing the text value. public static List StringToCoalescedRunList(string textValue, XElement runProperties) { return textValue .Select(CharToRunChild) .GroupAdjacent(e => e.Name == W.t) .SelectMany(grouping => grouping.Key ? StringToSingleRunList(grouping.Select(t => (string) t).StringConcatenate(), runProperties) : grouping.Select(e => new XElement(W.r, runProperties, e))) .ToList(); } /// /// Turn the specified text value into a list consisting of a single run having one /// text element with that text value. The run will have the specified run properties. /// /// The text value to transform. /// The run properties to apply. /// A list with a single run. public static IEnumerable StringToSingleRunList(string textValue, XElement runProperties) { var run = new XElement(W.r, runProperties, new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(textValue), textValue)); return new List { run }; } /// /// Turn the specified text value into a list of runs, each having the specified /// run properties. /// /// The text value to transform. /// The run properties to apply. /// A list of runs representing the text value. public static List StringToRunList(string textValue, XElement runProperties) { return textValue.Select(character => CharToRun(character, runProperties)).ToList(); } /// /// Create a w:r element from the specified character, which will be turned /// into a corresponding Open XML element (e.g., w:t, w:br, w:tab). /// /// The character. /// The w:rPr element to be added to the w:r element. /// The w:r element. public static XElement CharToRun(char character, XElement runProperties) { return new XElement(W.r, runProperties, CharToRunChild(character)); } /// /// Create an Open XML element (e.g., w:t, w:br, w:tab) from the specified /// character. /// /// The character. /// The Open XML element or null, if the character equals (U+0001). public static XElement CharToRunChild(char character) { // Ignore the special character that represents the Open XML elements we // wanted to ignore. if (character == StartOfHeading) return null; // Translate special characters into their corresponding Open XML elements. // Turn a Carriage Return into an empty w:br element, regardless of whether // the former was created from an equivalent w:cr element. if (character == CarriageReturn) return new XElement(W.br); if (character == FormFeed) return new XElement(W.br, new XAttribute(W.type, "page")); if (character == HorizontalTabulation) return new XElement(W.tab); if (character == NonBreakingHyphen) return new XElement(W.noBreakHyphen); if (character == SoftHyphen) return new XElement(W.softHyphen); // Translate symbol characters into their corresponding w:sym elements. if (UnicodeCharToSymDictionary.ContainsKey(character)) return UnicodeCharToSymDictionary[character]; // Turn "normal" characters into text elements. return new XElement(W.t, XmlUtil.GetXmlSpaceAttribute(character), character); } } }