// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.IO.Packaging;
using System.Linq;
using System.Text;
using System.Xml.Linq;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Validation;
using System.Globalization;

namespace OpenXmlPowerTools
{
    /// <summary>
    /// Switches that control which optional sections <see cref="MetricsGetter"/> adds to its report.
    /// </summary>
    public class MetricsGetterSettings
    {
        /// <summary>When true, non-element nodes found while walking content controls are copied into the report.</summary>
        public bool IncludeTextInContentControls;

        /// <summary>When true, a TableData element with every row/cell is emitted for each spreadsheet table.</summary>
        public bool IncludeXlsxTableCellData;

        /// <summary>When true, a Namespaces element listing the namespace declarations found in the package is emitted.</summary>
        public bool RetrieveNamespaceList;

        /// <summary>When true, a ContentTypes element listing the package's part content types is emitted.</summary>
        public bool RetrieveContentTypeList;
    }

    /// <summary>
    /// Produces an XML report (a <c>Metrics</c> element) describing a WordprocessingML,
    /// SpreadsheetML or PresentationML document.
    /// </summary>
    public class MetricsGetter
    {
        // Single 1x1 bitmap-backed Graphics instance, created on first use, used only for measuring text.
        private static Lazy<Graphics> Graphics { get; } = new Lazy<Graphics>(() =>
        {
            Image image = new Bitmap(1, 1);
            return System.Drawing.Graphics.FromImage(image);
        });

        /// <summary>
        /// Loads the file and dispatches to the metrics routine that matches its extension.
        /// </summary>
        /// <param name="fileName">Path of the document to examine.</param>
        /// <param name="settings">Options selecting the optional report sections.</param>
        /// <returns>The Metrics element, or null when the extension is not a recognized Open XML type.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public static XElement GetMetrics(string fileName, MetricsGetterSettings settings)
        {
            FileInfo fi = new FileInfo(fileName);
            if (!fi.Exists)
            {
                // The (message, fileName) overload does not format the message, so format it explicitly.
                throw new FileNotFoundException(string.Format("{0} does not exist.", fi.FullName), fi.FullName);
            }

            if (Util.IsWordprocessingML(fi.Extension))
            {
                WmlDocument wmlDoc = new WmlDocument(fi.FullName, true);
                return GetDocxMetrics(wmlDoc, settings);
            }
            if (Util.IsSpreadsheetML(fi.Extension))
            {
                SmlDocument smlDoc = new SmlDocument(fi.FullName, true);
                return GetXlsxMetrics(smlDoc, settings);
            }
            if (Util.IsPresentationML(fi.Extension))
            {
                PmlDocument pmlDoc = new PmlDocument(fi.FullName, true);
                return GetPptxMetrics(pmlDoc, settings);
            }
            return null;
        }

        /// <summary>
        /// Computes metrics for a word-processing document. If the first attempt fails with an
        /// <c>OpenXmlPowerToolsException</c> whose text contains "Invalid Hyperlink", the package URIs are
        /// repaired (except on NET35 builds) and the analysis is retried once with the InvalidHyperlink flag set.
        /// </summary>
        /// <param name="wmlDoc">The document to examine; it is not modified.</param>
        /// <param name="settings">Options selecting the optional report sections.</param>
        /// <returns>
        /// The Metrics element. For any other <c>OpenXmlPowerToolsException</c> a Metrics element carrying an
        /// Error attribute is returned instead.
        /// </returns>
        public static XElement GetDocxMetrics(WmlDocument wmlDoc, MetricsGetterSettings settings)
        {
            try
            {
                return GetDocxMetricsCore(wmlDoc, wmlDoc.FileName, false, settings);
            }
            catch (OpenXmlPowerToolsException e)
            {
                if (e.ToString().Contains("Invalid Hyperlink"))
                {
                    WmlDocument repairedDoc;
                    using (MemoryStream ms = new MemoryStream())
                    {
                        ms.Write(wmlDoc.DocumentByteArray, 0, wmlDoc.DocumentByteArray.Length);
#if !NET35
                        UriFixer.FixInvalidUri(ms, brokenUri => FixUri(brokenUri));
#endif
                        repairedDoc = new WmlDocument("dummy.docx", ms.ToArray());
                    }

                    // Report the caller's file name rather than the placeholder used for the repaired copy;
                    // fall back to the placeholder only when the original document has no name.
                    return GetDocxMetricsCore(repairedDoc, wmlDoc.FileName ?? repairedDoc.FileName, true, settings);
                }
            }

            var metrics = new XElement(H.Metrics,
                new XAttribute(H.FileName, wmlDoc.FileName),
                new XAttribute(H.FileType, "WordprocessingML"),
                new XAttribute(H.Error, "Unknown error, metrics not determined"));
            return metrics;
        }

        /// <summary>
        /// Opens a writable in-memory copy of <paramref name="wmlDoc"/>, accepts tracked revisions if any,
        /// and gathers the metrics. A RevisionTracking element is appended when revisions were present.
        /// </summary>
        private static XElement GetDocxMetricsCore(WmlDocument wmlDoc, string reportedFileName, bool invalidHyperlink, MetricsGetterSettings settings)
        {
            using (MemoryStream ms = new MemoryStream())
            {
                ms.Write(wmlDoc.DocumentByteArray, 0, wmlDoc.DocumentByteArray.Length);
                using (WordprocessingDocument document = WordprocessingDocument.Open(ms, true))
                {
                    bool hasTrackedRevisions = RevisionAccepter.HasTrackedRevisions(document);
                    if (hasTrackedRevisions)
                        RevisionAccepter.AcceptRevisions(document);

                    XElement metrics = GetWmlMetrics(reportedFileName, invalidHyperlink, document, settings);
                    if (hasTrackedRevisions)
                        metrics.Add(new XElement(H.RevisionTracking, new XAttribute(H.Val, true)));
                    return metrics;
                }
            }
        }

        /// <summary>
        /// Measures <paramref name="text"/> with a font built from the given family and style at
        /// <paramref name="sz"/> / 2 and returns the truncated width. Any exception yields 0.
        /// </summary>
        private static int _getTextWidth(FontFamily ff, FontStyle fs, decimal sz, string text)
        {
            try
            {
                using (var f = new Font(ff, (float)sz / 2f, fs))
                {
                    var proposedSize = new Size(int.MaxValue, int.MaxValue);
                    var sf = Graphics.Value.MeasureString(text, f, proposedSize);
                    return (int)sf.Width;
                }
            }
            catch
            {
                return 0;
            }
        }

        /// <summary>
        /// Returns the measured width of <paramref name="text"/> for the given font family, style and size.
        /// </summary>
        /// <param name="ff">Font family to measure with.</param>
        /// <param name="fs">Font style to measure with.</param>
        /// <param name="sz">Size value; the font is created at half this value.</param>
        /// <param name="text">Text to measure.</param>
        /// <returns>The width truncated to an integer, or 0 when measuring fails.</returns>
        public static int GetTextWidth(FontFamily ff, FontStyle fs, decimal sz, string text)
        {
            // NOTE(review): _getTextWidth already catches every exception and returns 0, so the fallback
            // handlers below are not reached for failures inside it. They are kept unchanged so that
            // behavior stays identical; confirm the intended fallback policy before removing either side.
            try
            {
                return _getTextWidth(ff, fs, sz, text);
            }
            catch (ArgumentException)
            {
                try
                {
                    const FontStyle fs2 = FontStyle.Regular;
                    return _getTextWidth(ff, fs2, sz, text);
                }
                catch (ArgumentException)
                {
                    const FontStyle fs2 = FontStyle.Bold;
                    try
                    {
                        return _getTextWidth(ff, fs2, sz, text);
                    }
                    catch (ArgumentException)
                    {
                        // if both regular and bold fail, then get metrics for Times New Roman
                        // use the original FontStyle (in fs)
                        var ff2 = new FontFamily("Times New Roman");
                        return _getTextWidth(ff2, fs, sz, text);
                    }
                }
            }
            catch (OverflowException)
            {
                // This happened on Azure but interestingly enough not while testing locally.
                return 0;
            }
        }

        /// <summary>Replacement URI substituted for every broken URI, regardless of its original value.</summary>
        private static Uri FixUri(string brokenUri)
        {
            return new Uri("http://broken-link/");
        }

        /// <summary>
        /// Assembles the Metrics element for an open word-processing document: style hierarchy,
        /// miscellaneous metrics, per-part content-control information and the optional lists.
        /// </summary>
        private static XElement GetWmlMetrics(string fileName, bool invalidHyperlink, WordprocessingDocument wDoc, MetricsGetterSettings settings)
        {
            // Per-part metrics are collected before the other sections are evaluated (original statement order).
            var parts = new XElement(H.Parts,
                wDoc.GetAllParts().Select(part => GetMetricsForWmlPart(part, settings)));
            if (!parts.HasElements)
                parts = null;

            var metrics = new XElement(H.Metrics,
                new XAttribute(H.FileName, fileName),
                new XAttribute(H.FileType, "WordprocessingML"),
                GetStyleHierarchy(wDoc),
                GetMiscWmlMetrics(wDoc, invalidHyperlink),
                parts,
                settings.RetrieveNamespaceList ? RetrieveNamespaceList(wDoc) : null,
                settings.RetrieveContentTypeList ? RetrieveContentTypeList(wDoc) : null
            );
            return metrics;
        }

        /// <summary>
        /// Lists the distinct content types of all non-relationship parts of the package, sorted.
        /// </summary>
        private static XElement RetrieveContentTypeList(OpenXmlPackage oxPkg)
        {
            Package pkg = oxPkg.Package;

            var nonRelationshipParts = pkg.GetParts()
                .Cast<PackagePart>()
                .Where(p => p.ContentType != "application/vnd.openxmlformats-package.relationships+xml");
            var contentTypes = nonRelationshipParts
                .Select(p => p.ContentType)
                .OrderBy(t => t)
                .Distinct();
            var xe = new XElement(H.ContentTypes,
                contentTypes.Select(ct => new XElement(H.ContentType, new XAttribute(H.Val, ct))));
            return xe;
        }

        /// <summary>
        /// Lists every distinct (prefix, namespace name) declaration found in the XML parts of the package.
        /// Parts that fail to load as XML are skipped.
        /// </summary>
        private static XElement RetrieveNamespaceList(OpenXmlPackage oxPkg)
        {
            Package pkg = oxPkg.Package;

            var nonRelationshipParts = pkg.GetParts()
                .Cast<PackagePart>()
                .Where(p => p.ContentType != "application/vnd.openxmlformats-package.relationships+xml");
            var xmlParts = nonRelationshipParts
                .Where(p => p.ContentType.EndsWith("xml", StringComparison.OrdinalIgnoreCase));

            // Each entry is "prefix|namespaceName"; the set removes duplicates across parts.
            var uniqueNamespaces = new HashSet<string>();
            foreach (var xp in xmlParts)
            {
                using (Stream st = xp.GetStream())
                {
                    try
                    {
                        XDocument xdoc = XDocument.Load(st);
                        var namespaces = xdoc
                            .Descendants()
                            .Attributes()
                            .Where(a => a.IsNamespaceDeclaration)
                            .Select(a => string.Format("{0}|{1}", a.Name.LocalName, a.Value))
                            .ToList();
                        foreach (var item in namespaces)
                            uniqueNamespaces.Add(item);
                    }
                    // if catch exception, forget about it. Just trying to get a most complete survey possible of all namespaces in all documents.
                    // if caught exception, chances are the document is bad anyway.
                    catch (Exception)
                    {
                        continue;
                    }
                }
            }

            var xe = new XElement(H.Namespaces,
                uniqueNamespaces.OrderBy(t => t).Select(n =>
                {
                    // Split on the first separator only, so a namespace name containing '|' is kept whole.
                    var spl = n.Split(new[] { '|' }, 2);
                    return new XElement(H.Namespace,
                        new XAttribute(H.NamespacePrefix, spl[0]),
                        new XAttribute(H.NamespaceName, spl[1]));
                }));
            return xe;
        }

        /// <summary>
        /// Collects validation, feature-count, numbering and font/language metrics for the document.
        /// </summary>
        private static List<XElement> GetMiscWmlMetrics(WordprocessingDocument document, bool invalidHyperlink)
        {
            List<XElement> metrics = new List<XElement>();
            List<string> notes = new List<string>();
            Dictionary<XName, int> elementCountDictionary = new Dictionary<XName, int>();

            if (invalidHyperlink)
                metrics.Add(new XElement(H.InvalidHyperlink, new XAttribute(H.Val, invalidHyperlink)));

            // The boolean result is not part of the report; only the metrics it adds are used.
            ValidateWordprocessingDocument(document, metrics, notes, elementCountDictionary);
            return metrics;
        }

        /// <summary>
        /// Validates the document against each supported Office version and adds feature counts,
        /// element/paragraph statistics, numbering formats, settings flags and font/language metrics
        /// to <paramref name="metrics"/>.
        /// </summary>
        /// <returns>The accumulated validity flag (see the review notes inside the body).</returns>
        private static bool ValidateWordprocessingDocument(WordprocessingDocument wDoc, List<XElement> metrics, List<string> notes, Dictionary<XName, int> metricCountDictionary)
        {
            // NOTE(review): results are OR-ed, so the flag is true when ANY version validates — confirm intent.
            bool valid = ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
            valid |= ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
            valid |= ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif

            int elementCount = 0;
            int paragraphCount = 0;
            int textCount = 0;
            foreach (var part in wDoc.ContentParts())
            {
                XDocument xDoc = part.GetXDocument();
                foreach (var e in xDoc.Descendants())
                {
                    XName featureMetric = GetFeatureMetricName(e.Name);
                    if (featureMetric != null)
                    {
                        IncrementMetric(metricCountDictionary, featureMetric);
                    }
                    else if (e.Name == VML.imagedata || e.Name == VML.fill || e.Name == VML.stroke || e.Name == A.blip)
                    {
                        // Any of these three attributes may carry the relationship id of the image.
                        var relId = (string)e.Attribute(R.embed);
                        if (relId != null)
                            ValidateImageExists(part, relId, metricCountDictionary);
                        relId = (string)e.Attribute(R.pict);
                        if (relId != null)
                            ValidateImageExists(part, relId, metricCountDictionary);
                        relId = (string)e.Attribute(R.id);
                        if (relId != null)
                            ValidateImageExists(part, relId, metricCountDictionary);
                    }

                    // Element, paragraph and text statistics cover the main document part only.
                    if (part.Uri == wDoc.MainDocumentPart.Uri)
                    {
                        elementCount++;
                        if (e.Name == W.p)
                            paragraphCount++;
                        if (e.Name == W.t)
                            textCount += ((string)e).Length;
                    }
                }
            }

            foreach (var item in metricCountDictionary)
            {
                metrics.Add(new XElement(item.Key, new XAttribute(H.Val, item.Value)));
            }

            metrics.Add(new XElement(H.ElementCount, new XAttribute(H.Val, elementCount)));

            // Guard against a document without paragraphs: 0.0 / 0 is NaN and casting NaN to int is unspecified.
            int averageParagraphLength = paragraphCount == 0
                ? 0
                : (int)((double)textCount / (double)paragraphCount);
            metrics.Add(new XElement(H.AverageParagraphLength, new XAttribute(H.Val, averageParagraphLength)));

            if (wDoc.GetAllParts().Any(part => part.ContentType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"))
                metrics.Add(new XElement(H.EmbeddedXlsx, new XAttribute(H.Val, true)));

            NumberingFormatListAssembly(wDoc, metrics);

            XDocument wxDoc = wDoc.MainDocumentPart.GetXDocument();
            foreach (var d in wxDoc.Descendants())
            {
                if (d.Name == W.saveThroughXslt)
                {
                    string rid = (string)d.Attribute(R.id);
                    // A missing settings part is treated like a missing relationship instead of throwing.
                    var tempExternalRelationship = wDoc
                        .MainDocumentPart
                        .DocumentSettingsPart
                        ?.ExternalRelationships
                        .FirstOrDefault(h => h.Id == rid);
                    if (tempExternalRelationship == null)
                        metrics.Add(new XElement(H.InvalidSaveThroughXslt, new XAttribute(H.Val, true)));
                    // NOTE(review): runs for every saveThroughXslt element, not only when the relationship
                    // is missing — kept as in the original; confirm whether it belongs inside the 'if'.
                    valid = false;
                }
                else if (d.Name == W.trackRevisions)
                    metrics.Add(new XElement(H.TrackRevisionsEnabled, new XAttribute(H.Val, true)));
                else if (d.Name == W.documentProtection)
                    metrics.Add(new XElement(H.DocumentProtection, new XAttribute(H.Val, true)));
            }

            FontAndCharSetAnalysis(wDoc, metrics, notes);

            return valid;
        }

        /// <summary>
        /// Maps a markup element name to the feature metric that counts it, or null when the element
        /// is not one of the counted features.
        /// </summary>
        private static XName GetFeatureMetricName(XName elementName)
        {
            if (elementName == W.txbxContent) return H.TextBox;
            if (elementName == W.sdt) return H.ContentControl;
            if (elementName == W.customXml) return H.CustomXmlMarkup;
            if (elementName == W.fldChar) return H.ComplexField;
            if (elementName == W.fldSimple) return H.SimpleField;
            if (elementName == W.altChunk) return H.AltChunk;
            if (elementName == W.tbl) return H.Table;
            if (elementName == W.hyperlink) return H.Hyperlink;
            if (elementName == W.framePr) return H.LegacyFrame;
            if (elementName == W.control) return H.ActiveX;
            if (elementName == W.subDoc) return H.SubDocument;
            return null;
        }

        /// <summary>
        /// Validates the package against one file-format version. When errors exist, adds the general
        /// SdkValidationError flag (once) and a version-specific element describing up to three errors.
        /// </summary>
        /// <param name="package">The word-processing, spreadsheet or presentation package to validate.</param>
        /// <param name="metrics">List that receives the validation metrics.</param>
        /// <param name="versionToValidateAgainst">File-format version used by the validator.</param>
        /// <param name="versionSpecificMetricName">Element name for the version-specific error report.</param>
        /// <returns>True when the validator reported no errors.</returns>
        private static bool ValidateAgainstSpecificVersion(OpenXmlPackage package, List<XElement> metrics, DocumentFormat.OpenXml.FileFormatVersions versionToValidateAgainst, XName versionSpecificMetricName)
        {
            OpenXmlValidator validator = new OpenXmlValidator(versionToValidateAgainst);

            // Materialize once so the error sequence is not enumerated again when it is reported.
            var errors = validator.Validate(package).ToList();
            bool valid = errors.Count == 0;
            if (!valid)
            {
                if (!metrics.Any(e => e.Name == H.SdkValidationError))
                    metrics.Add(new XElement(H.SdkValidationError, new XAttribute(H.Val, true)));
                metrics.Add(new XElement(versionSpecificMetricName,
                    new XAttribute(H.Val, true),
                    errors.Take(3).Select(err =>
                    {
                        StringBuilder sb = new StringBuilder();
                        if (err.Description.Length > 300)
                            sb.Append(PtUtils.MakeValidXml(err.Description.Substring(0, 300) + " ... elided ...") + Environment.NewLine);
                        else
                            sb.Append(PtUtils.MakeValidXml(err.Description) + Environment.NewLine);
                        sb.Append(" in part " + PtUtils.MakeValidXml(err.Part.Uri.ToString()) + Environment.NewLine);
                        sb.Append(" at " + PtUtils.MakeValidXml(err.Path.XPath) + Environment.NewLine);
                        return sb.ToString();
                    })));
            }
            return valid;
        }

        /// <summary>Adds one to the counter stored under <paramref name="xName"/>, creating it at 1 if absent.</summary>
        private static void IncrementMetric(Dictionary<XName, int> metricCountDictionary, XName xName)
        {
            int current;
            metricCountDictionary.TryGetValue(xName, out current); // stays 0 when the key is absent
            metricCountDictionary[xName] = current + 1;
        }

        /// <summary>
        /// Counts a ReferenceToNullImage when <paramref name="part"/> has no child part with the
        /// relationship id <paramref name="relId"/>.
        /// </summary>
        private static void ValidateImageExists(OpenXmlPart part, string relId, Dictionary<XName, int> metrics)
        {
            if (!part.Parts.Any(ipp => ipp.RelationshipId == relId))
                IncrementMetric(metrics, H.ReferenceToNullImage);
        }

        /// <summary>
        /// Adds a NumberingFormatList element containing the comma-separated, distinct numbering formats
        /// used by list paragraphs in all content parts. Nothing is added when no list item is found.
        /// </summary>
        private static void NumberingFormatListAssembly(WordprocessingDocument wDoc, List<XElement> metrics)
        {
            List<string> numFmtList = new List<string>();
            foreach (var part in wDoc.ContentParts())
            {
                var xDoc = part.GetXDocument();
                var formatsInPart = xDoc
                    .Descendants(W.p)
                    .Select(p =>
                    {
                        ListItemRetriever.RetrieveListItem(wDoc, p, null);
                        ListItemRetriever.ListItemInfo lif = p.Annotation<ListItemRetriever.ListItemInfo>();
                        if (lif == null || !lif.IsListItem)
                            return null;

                        var lvl = lif.Lvl(ListItemRetriever.GetParagraphLevel(p));
                        if (lvl == null)
                            return null;

                        string numFmtForLevel = (string)lvl.Elements(W.numFmt).Attributes(W.val).FirstOrDefault();
                        if (numFmtForLevel == null)
                        {
                            // Custom formats are wrapped in mc:AlternateContent/mc:Choice.
                            var numFmtElement = lvl.Elements(MC.AlternateContent).Elements(MC.Choice).Elements(W.numFmt).FirstOrDefault();
                            if (numFmtElement != null && (string)numFmtElement.Attribute(W.val) == "custom")
                                numFmtForLevel = (string)numFmtElement.Attribute(W.format);
                        }
                        return numFmtForLevel;
                    })
                    .Where(s => s != null);

                // Distinct over the combined list so a format used in several parts is reported once.
                numFmtList = numFmtList.Concat(formatsInPart).Distinct().ToList();
            }
            if (numFmtList.Any())
            {
                var nfls = numFmtList.StringConcatenate(s => s + ",").TrimEnd(',');
                metrics.Add(new XElement(H.NumberingFormatList, new XAttribute(H.Val, PtUtils.MakeValidXml(nfls))));
            }
        }

        /// <summary>Running totals gathered while analyzing the runs of a document.</summary>
        private class FormattingMetrics
        {
            public int RunCount;
            public int RunWithoutRprCount;
            public int ZeroLengthText;
            public int MultiFontRun;
            public int AsciiCharCount;
            public int CSCharCount;
            public int EastAsiaCharCount;
            public int HAnsiCharCount;
            public int AsciiRunCount;
            public int CSRunCount;
            public int EastAsiaRunCount;
            public int HAnsiRunCount;
            public List<string> Languages;

            public FormattingMetrics()
            {
                Languages = new List<string>();
            }
        }

        /// <summary>
        /// Runs the formatting assembler over the document, analyzes every run in every content part and
        /// adds the run/character/language totals to <paramref name="metrics"/>. Except for RunCount,
        /// totals are only emitted when greater than zero.
        /// </summary>
        private static void FontAndCharSetAnalysis(WordprocessingDocument wDoc, List<XElement> metrics, List<string> notes)
        {
            FormattingAssemblerSettings settings = new FormattingAssemblerSettings
            {
                RemoveStyleNamesFromParagraphAndRunProperties = false,
                ClearStyles = true,
                RestrictToSupportedNumberingFormats = false,
                RestrictToSupportedLanguages = false,
            };
            FormattingAssembler.AssembleFormatting(wDoc, settings);
            var formattingMetrics = new FormattingMetrics();

            foreach (var part in wDoc.ContentParts())
            {
                var xDoc = part.GetXDocument();
                foreach (var run in xDoc.Descendants(W.r))
                {
                    formattingMetrics.RunCount++;
                    AnalyzeRun(run, metrics, notes, formattingMetrics, part.Uri.ToString());
                }
            }

            metrics.Add(new XElement(H.RunCount, new XAttribute(H.Val, formattingMetrics.RunCount)));
            AddIfPositive(metrics, H.RunWithoutRprCount, formattingMetrics.RunWithoutRprCount);
            AddIfPositive(metrics, H.ZeroLengthText, formattingMetrics.ZeroLengthText);
            AddIfPositive(metrics, H.MultiFontRun, formattingMetrics.MultiFontRun);
            AddIfPositive(metrics, H.AsciiCharCount, formattingMetrics.AsciiCharCount);
            AddIfPositive(metrics, H.CSCharCount, formattingMetrics.CSCharCount);
            AddIfPositive(metrics, H.EastAsiaCharCount, formattingMetrics.EastAsiaCharCount);
            AddIfPositive(metrics, H.HAnsiCharCount, formattingMetrics.HAnsiCharCount);
            AddIfPositive(metrics, H.AsciiRunCount, formattingMetrics.AsciiRunCount);
            AddIfPositive(metrics, H.CSRunCount, formattingMetrics.CSRunCount);
            AddIfPositive(metrics, H.EastAsiaRunCount, formattingMetrics.EastAsiaRunCount);
            AddIfPositive(metrics, H.HAnsiRunCount, formattingMetrics.HAnsiRunCount);
            if (formattingMetrics.Languages.Any())
            {
                var uls = formattingMetrics.Languages.StringConcatenate(s => s + ",").TrimEnd(',');
                metrics.Add(new XElement(H.Languages, new XAttribute(H.Val, PtUtils.MakeValidXml(uls))));
            }
        }

        /// <summary>Adds a <paramref name="name"/> element with a Val attribute when <paramref name="value"/> is above zero.</summary>
        private static void AddIfPositive(List<XElement> metrics, XName name, int value)
        {
            if (value > 0)
                metrics.Add(new XElement(name, new XAttribute(H.Val, value)));
        }

        /// <summary>
        /// Classifies the characters of one run by font type and updates the totals and language list in
        /// <paramref name="formattingMetrics"/>. Runs without text only increment ZeroLengthText.
        /// </summary>
        /// <param name="run">The w:r element to analyze.</param>
        /// <param name="attList">Not used by this method.</param>
        /// <param name="notes">Receives a note for every run that has no rPr child.</param>
        /// <param name="formattingMetrics">Totals to update.</param>
        /// <param name="uri">URI of the part containing the run, used in notes.</param>
        private static void AnalyzeRun(XElement run, List<XElement> attList, List<string> notes, FormattingMetrics formattingMetrics, string uri)
        {
            var runText = run.Elements()
                .Where(e => e.Name == W.t || e.Name == W.delText)
                .Select(t => (string)t)
                .StringConcatenate();
            if (runText.Length == 0)
            {
                formattingMetrics.ZeroLengthText++;
                return;
            }

            var rPr = run.Element(W.rPr);
            if (rPr == null)
            {
                formattingMetrics.RunWithoutRprCount++;
                notes.Add(PtUtils.MakeValidXml(string.Format("Error in part {0}: run without rPr at {1}", uri, run.GetXPath())));
                rPr = new XElement(W.rPr);
            }

            FormattingAssembler.CharStyleAttributes csa = new FormattingAssembler.CharStyleAttributes(null, rPr);
            var fontTypeArray = runText
                .Select(ch => FormattingAssembler.DetermineFontTypeFromCharacter(ch, csa))
                .ToArray();
            var distinctFontTypeArray = fontTypeArray
                .Distinct()
                .ToArray();
            var distinctFonts = distinctFontTypeArray
                .Select(ft => GetFontFromFontType(csa, ft))
                .Distinct();

            var languages = distinctFontTypeArray
                .Select(ft =>
                {
                    if (ft == FormattingAssembler.FontType.Ascii)
                        return csa.LatinLang;
                    if (ft == FormattingAssembler.FontType.CS)
                        return csa.BidiLang;
                    if (ft == FormattingAssembler.FontType.EastAsia)
                        return csa.EastAsiaLang;
                    //if (ft == FormattingAssembler.FontType.HAnsi)
                    return csa.LatinLang;
                })
                // A run without an explicit language is attributed to the current culture.
                .Select(l => string.IsNullOrEmpty(l) ? CultureInfo.CurrentCulture.Name : l)
                .Distinct()
                .ToList();
            if (languages.Any(l => !formattingMetrics.Languages.Contains(l)))
                formattingMetrics.Languages = formattingMetrics.Languages.Concat(languages).Distinct().ToList();

            var multiFontRun = distinctFonts.Count() > 1;
            if (multiFontRun)
            {
                // Mixed-font runs contribute to the per-type character totals but to no per-type run total.
                formattingMetrics.MultiFontRun++;
                formattingMetrics.AsciiCharCount += fontTypeArray.Count(ft => ft == FormattingAssembler.FontType.Ascii);
                formattingMetrics.CSCharCount += fontTypeArray.Count(ft => ft == FormattingAssembler.FontType.CS);
                formattingMetrics.EastAsiaCharCount += fontTypeArray.Count(ft => ft == FormattingAssembler.FontType.EastAsia);
                formattingMetrics.HAnsiCharCount += fontTypeArray.Count(ft => ft == FormattingAssembler.FontType.HAnsi);
            }
            else
            {
                // Single-font run: the whole run is attributed to the font type of its first character.
                switch (fontTypeArray[0])
                {
                    case FormattingAssembler.FontType.Ascii:
                        formattingMetrics.AsciiCharCount += runText.Length;
                        formattingMetrics.AsciiRunCount++;
                        break;
                    case FormattingAssembler.FontType.CS:
                        formattingMetrics.CSCharCount += runText.Length;
                        formattingMetrics.CSRunCount++;
                        break;
                    case FormattingAssembler.FontType.EastAsia:
                        formattingMetrics.EastAsiaCharCount += runText.Length;
                        formattingMetrics.EastAsiaRunCount++;
                        break;
                    case FormattingAssembler.FontType.HAnsi:
                        formattingMetrics.HAnsiCharCount += runText.Length;
                        formattingMetrics.HAnsiRunCount++;
                        break;
                }
            }
        }

        /// <summary>Returns the font recorded in <paramref name="csa"/> for the given font type (Ascii font for unknown types).</summary>
        private static string GetFontFromFontType(FormattingAssembler.CharStyleAttributes csa, FormattingAssembler.FontType ft)
        {
            switch (ft)
            {
                case FormattingAssembler.FontType.Ascii:
                    return csa.AsciiFont;
                case FormattingAssembler.FontType.CS:
                    return csa.CsFont;
                case FormattingAssembler.FontType.EastAsia:
                    return csa.EastAsiaFont;
                case FormattingAssembler.FontType.HAnsi:
                    return csa.HAnsiFont;
                default: // dummy
                    return csa.AsciiFont;
            }
        }

        /// <summary>
        /// Computes metrics for a spreadsheet: SDK validation results, table information per sheet and
        /// the optional namespace / content-type lists.
        /// </summary>
        /// <param name="smlDoc">The spreadsheet to examine.</param>
        /// <param name="settings">Options selecting the optional report sections.</param>
        /// <returns>The Metrics element.</returns>
        public static XElement GetXlsxMetrics(SmlDocument smlDoc, MetricsGetterSettings settings)
        {
            using (OpenXmlMemoryStreamDocument streamDoc = new OpenXmlMemoryStreamDocument(smlDoc))
            using (SpreadsheetDocument sDoc = streamDoc.GetSpreadsheetDocument())
            {
                List<XElement> metrics = new List<XElement>();

                // Each call is made for the metrics it adds; the boolean results are not reported.
                ValidateAgainstSpecificVersion(sDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
                ValidateAgainstSpecificVersion(sDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
                ValidateAgainstSpecificVersion(sDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif
                return new XElement(H.Metrics,
                    new XAttribute(H.FileName, smlDoc.FileName),
                    new XAttribute(H.FileType, "SpreadsheetML"),
                    metrics,
                    GetTableInfoForWorkbook(sDoc, settings),
                    settings.RetrieveNamespaceList ? RetrieveNamespaceList(sDoc) : null,
                    settings.RetrieveContentTypeList ? RetrieveContentTypeList(sDoc) : null);
            }
        }

        /// <summary>Builds the Sheets element containing table information for every sheet of the workbook.</summary>
        private static XElement GetTableInfoForWorkbook(SpreadsheetDocument spreadsheet, MetricsGetterSettings settings)
        {
            var workbookPart = spreadsheet.WorkbookPart;
            var xd = workbookPart.GetXDocument();
            var partInformation = new XElement(H.Sheets,
                xd.Root
                    .Element(S.sheets)
                    .Elements(S.sheet)
                    .Select(sh =>
                    {
                        var rid = (string)sh.Attribute(R.id);
                        var sheetName = (string)sh.Attribute("name");
                        WorksheetPart worksheetPart = (WorksheetPart)workbookPart.GetPartById(rid);
                        return GetTableInfoForSheet(spreadsheet, worksheetPart, sheetName, settings);
                    }));
            return partInformation;
        }

        /// <summary>
        /// Describes the tables defined on one worksheet: name, display name, columns and, when
        /// <see cref="MetricsGetterSettings.IncludeXlsxTableCellData"/> is set, the cell values.
        /// </summary>
        /// <param name="spreadsheetDocument">The open spreadsheet that owns the sheet.</param>
        /// <param name="sheetPart">The worksheet part to inspect.</param>
        /// <param name="sheetName">Name reported for the sheet.</param>
        /// <param name="settings">Options selecting the optional report sections.</param>
        /// <returns>The Sheet element, or null when the sheet defines no tables.</returns>
        public static XElement GetTableInfoForSheet(SpreadsheetDocument spreadsheetDocument, WorksheetPart sheetPart, string sheetName, MetricsGetterSettings settings)
        {
            var xd = sheetPart.GetXDocument();
            XElement sheetInformation = new XElement(H.Sheet,
                new XAttribute(H.Name, sheetName),
                xd.Root.Elements(S.tableParts).Elements(S.tablePart).Select(tp =>
                {
                    string rId = (string)tp.Attribute(R.id);
                    TableDefinitionPart tablePart = (TableDefinitionPart)sheetPart.GetPartById(rId);
                    var txd = tablePart.GetXDocument();
                    var tableName = (string)txd.Root.Attribute("displayName");

                    XElement tableCellData = null;
                    if (settings.IncludeXlsxTableCellData)
                    {
                        var xlsxTable = spreadsheetDocument.Table(tableName);
                        tableCellData = new XElement(H.TableData,
                            xlsxTable.TableRows()
                                .Select(row =>
                                {
                                    var rowElement = new XElement(H.Row,
                                        xlsxTable.TableColumns().Select(col =>
                                        {
                                            var cellElement = new XElement(H.Cell,
                                                new XAttribute(H.Name, col.Name),
                                                new XAttribute(H.Val, (string)row[col.Name]));
                                            return cellElement;
                                        }));
                                    return rowElement;
                                }));
                    }

                    var table = new XElement(H.Table,
                        new XAttribute(H.Name, (string)txd.Root.Attribute("name")),
                        new XAttribute(H.DisplayName, tableName),
                        new XElement(H.Columns,
                            txd.Root.Element(S.tableColumns).Elements(S.tableColumn)
                                .Select(tc => new XElement(H.Column,
                                    new XAttribute(H.Name, (string)tc.Attribute("name"))))),
                        tableCellData
                    );
                    return table;
                })
            );
            if (!sheetInformation.HasElements)
                return null;
            return sheetInformation;
        }

        /// <summary>
        /// Computes metrics for a presentation: SDK validation results and the optional
        /// namespace / content-type lists.
        /// </summary>
        /// <param name="pmlDoc">The presentation to examine.</param>
        /// <param name="settings">Options selecting the optional report sections.</param>
        /// <returns>The Metrics element.</returns>
        public static XElement GetPptxMetrics(PmlDocument pmlDoc, MetricsGetterSettings settings)
        {
            using (OpenXmlMemoryStreamDocument streamDoc = new OpenXmlMemoryStreamDocument(pmlDoc))
            using (PresentationDocument pDoc = streamDoc.GetPresentationDocument())
            {
                List<XElement> metrics = new List<XElement>();

                // Each call is made for the metrics it adds; the boolean results are not reported.
                ValidateAgainstSpecificVersion(pDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
                ValidateAgainstSpecificVersion(pDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
                ValidateAgainstSpecificVersion(pDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif
                return new XElement(H.Metrics,
                    new XAttribute(H.FileName, pmlDoc.FileName),
                    new XAttribute(H.FileType, "PresentationML"),
                    metrics,
                    settings.RetrieveNamespaceList ? RetrieveNamespaceList(pDoc) : null,
                    settings.RetrieveContentTypeList ? RetrieveContentTypeList(pDoc) : null);
            }
        }

        /// <summary>
        /// Builds a StyleHierarchy element in which every style is nested under the style it is based on.
        /// </summary>
        /// <returns>The StyleHierarchy element, or null when the document has no style definitions part.</returns>
        private static object GetStyleHierarchy(WordprocessingDocument document)
        {
            var stylePart = document.MainDocumentPart.StyleDefinitionsPart;
            if (stylePart == null)
                return null;

            var xd = stylePart.GetXDocument();

            // First definition wins for duplicate ids; styles without a styleId cannot be placed and are skipped.
            var stylesById = new Dictionary<string, XElement>();
            foreach (var style in xd.Root.Elements(W.style))
            {
                var id = (string)style.Attribute(W.styleId);
                if (id != null && !stylesById.ContainsKey(id))
                    stylesById.Add(id, style);
            }

            // Each entry is the slash-separated chain from the outermost base style down to the style itself.
            var stylesWithPath = xd.Root
                .Elements(W.style)
                .Where(s => s.Attribute(W.styleId) != null)
                .Select(s =>
                {
                    var styleString = (string)s.Attribute(W.styleId);
                    var visited = new HashSet<string> { styleString };
                    var thisStyle = s;
                    while (true)
                    {
                        var baseStyle = (string)thisStyle.Elements(W.basedOn).Attributes(W.val).FirstOrDefault();
                        if (baseStyle == null)
                            break;

                        // Stop on a basedOn cycle or a reference to an undefined style instead of looping
                        // forever or producing a chain whose parent cannot be found.
                        XElement baseElement;
                        if (!visited.Add(baseStyle) || !stylesById.TryGetValue(baseStyle, out baseElement))
                            break;

                        styleString = baseStyle + "/" + styleString;
                        thisStyle = baseElement;
                    }
                    return styleString;
                })
                .OrderBy(n => n)
                .ToList();

            XElement styleHierarchy = new XElement(H.StyleHierarchy);
            foreach (var item in stylesWithPath)
            {
                var styleChain = item.Split('/');
                XElement elementToAddTo = styleHierarchy;
                foreach (var inChain in styleChain.PtSkipLast(1))
                {
                    var next = elementToAddTo.Elements(H.Style).FirstOrDefault(z => (string)z.Attribute(H.Id) == inChain);
                    if (next == null)
                        break; // ancestor not present (broken chain): attach at the deepest ancestor found
                    elementToAddTo = next;
                }

                var styleToAdd = styleChain.Last();
                XElement styleElement;
                stylesById.TryGetValue(styleToAdd, out styleElement);
                var styleType = styleElement == null ? null : (string)styleElement.Attribute(W.type);
                elementToAddTo.Add(
                    new XElement(H.Style,
                        new XAttribute(H.Id, styleToAdd),
                        styleType != null ? new XAttribute(H.Type, styleType) : null));
            }
            return styleHierarchy;
        }

        /// <summary>
        /// Returns a Part element describing the content controls of a main document, header, footer,
        /// footnotes or endnotes part; null for other parts or when the part has no content controls.
        /// </summary>
        private static XElement GetMetricsForWmlPart(OpenXmlPart part, MetricsGetterSettings settings)
        {
            XElement contentControls = null;
            if (part is MainDocumentPart ||
                part is HeaderPart ||
                part is FooterPart ||
                part is FootnotesPart ||
                part is EndnotesPart)
            {
                var xd = part.GetXDocument();
                contentControls = (XElement)GetContentControlsTransform(xd.Root, settings);
                if (!contentControls.HasElements)
                    contentControls = null;
            }
            var partMetrics = new XElement(H.Part,
                new XAttribute(H.ContentType, part.ContentType),
                new XAttribute(H.Uri, part.Uri.ToString()),
                contentControls);
            if (partMetrics.HasElements)
                return partMetrics;
            return null;
        }

        /// <summary>
        /// Recursive transform that keeps only content controls (w:sdt) from a part's markup.
        /// The root becomes a ContentControls element, each w:sdt becomes a ContentControl element with
        /// Type/Tag/Alias/XPath attributes, and other elements are replaced by their transformed children.
        /// Non-element nodes are kept only when IncludeTextInContentControls is set.
        /// </summary>
        private static object GetContentControlsTransform(XNode node, MetricsGetterSettings settings)
        {
            XElement element = node as XElement;
            if (element != null)
            {
                if (element == element.Document.Root)
                    return new XElement(H.ContentControls,
                        element.Nodes().Select(n => GetContentControlsTransform(n, settings)));

                if (element.Name == W.sdt)
                {
                    var sdtPr = element.Elements(W.sdtPr).ToList();

                    var tag = (string)sdtPr.Elements(W.tag).Attributes(W.val).FirstOrDefault();
                    XAttribute tagAttr = tag != null ? new XAttribute(H.Tag, tag) : null;

                    var alias = (string)sdtPr.Elements(W.alias).Attributes(W.val).FirstOrDefault();
                    XAttribute aliasAttr = alias != null ? new XAttribute(H.Alias, alias) : null;

                    var xPathAttr = new XAttribute(H.XPath, element.GetXPath());

                    var isText = sdtPr.Elements(W.text).Any();
                    var isBibliography = sdtPr.Elements(W.bibliography).Any();
                    var isCitation = sdtPr.Elements(W.citation).Any();
                    var isComboBox = sdtPr.Elements(W.comboBox).Any();
                    var isDate = sdtPr.Elements(W.date).Any();
                    var isDocPartList = sdtPr.Elements(W.docPartList).Any();
                    var isDocPartObj = sdtPr.Elements(W.docPartObj).Any();
                    var isDropDownList = sdtPr.Elements(W.dropDownList).Any();
                    var isEquation = sdtPr.Elements(W.equation).Any();
                    var isGroup = sdtPr.Elements(W.group).Any();
                    var isPicture = sdtPr.Elements(W.picture).Any();

                    // Rich text is either declared explicitly or is the fallback when no other kind is present.
                    var isRichText = sdtPr.Elements(W.richText).Any() ||
                        (!isText &&
                        !isBibliography &&
                        !isCitation &&
                        !isComboBox &&
                        !isDate &&
                        !isDocPartList &&
                        !isDocPartObj &&
                        !isDropDownList &&
                        !isEquation &&
                        !isGroup &&
                        !isPicture);

                    // Later assignments override earlier ones, so the last matching kind wins.
                    string type = null;
                    if (isText) type = "Text";
                    if (isBibliography) type = "Bibliography";
                    if (isCitation) type = "Citation";
                    if (isComboBox) type = "ComboBox";
                    if (isDate) type = "Date";
                    if (isDocPartList) type = "DocPartList";
                    if (isDocPartObj) type = "DocPartObj";
                    if (isDropDownList) type = "DropDownList";
                    if (isEquation) type = "Equation";
                    if (isGroup) type = "Group";
                    if (isPicture) type = "Picture";
                    if (isRichText) type = "RichText";
                    var typeAttr = new XAttribute(H.Type, type);

                    return new XElement(H.ContentControl,
                        typeAttr,
                        tagAttr,
                        aliasAttr,
                        xPathAttr,
                        element.Nodes().Select(n => GetContentControlsTransform(n, settings)));
                }

                return element.Nodes().Select(n => GetContentControlsTransform(n, settings));
            }
            if (settings.IncludeTextInContentControls)
                return node;
            return null;
        }
    }

    /// <summary>
    /// Element and attribute names (all in no namespace) used in metrics reports.
    /// </summary>
    public static class H
    {
        public static XName ActiveX = "ActiveX";
        public static XName Alias = "Alias";
        public static XName AltChunk = "AltChunk";
        public static XName Arguments = "Arguments";
        public static XName AsciiCharCount = "AsciiCharCount";
        public static XName AsciiRunCount = "AsciiRunCount";
        public static XName AverageParagraphLength = "AverageParagraphLength";
        public static XName BaselineReport = "BaselineReport";
        public static XName Batch = "Batch";
        public static XName BatchName = "BatchName";
        public static XName BatchSelector = "BatchSelector";
        public static XName CSCharCount = "CSCharCount";
        public static XName CSRunCount = "CSRunCount";
        public static XName Catalog = "Catalog";
        public static XName CatalogList = "CatalogList";
        public static XName CatalogListFile = "CatalogListFile";
        public static XName CaughtException = "CaughtException";
        public static XName Cell = "Cell";
        public static XName Column = "Column";
        public static XName Columns = "Columns";
        public static XName ComplexField = "ComplexField";
        public static XName Computer = "Computer";
        public static XName Computers = "Computers";
        public static XName ContentControl = "ContentControl";
        public static XName ContentControls = "ContentControls";
        public static XName ContentType = "ContentType";
        public static XName ContentTypes = "ContentTypes";
        public static XName CustomXmlMarkup = "CustomXmlMarkup";
        public static XName DLL = "DLL";
        public static XName DefaultDialogValuesFile = "DefaultDialogValuesFile";
        public static XName DefaultValues = "DefaultValues";
        public static XName Dependencies = "Dependencies";
        public static XName DestinationDir = "DestinationDir";
        public static XName Directory = "Directory";
        public static XName DirectoryPattern = "DirectoryPattern";
        public static XName DisplayName = "DisplayName";
        public static XName DoJobQueueName = "DoJobQueueName";
        public static XName Document = "Document";
        public static XName DocumentProtection = "DocumentProtection";
        public static XName DocumentSelector = "DocumentSelector";
        public static XName DocumentType = "DocumentType";
        public static XName Documents = "Documents";
        public static XName EastAsiaCharCount = "EastAsiaCharCount";
        public static XName EastAsiaRunCount = "EastAsiaRunCount";
        public static XName ElementCount = "ElementCount";
        public static XName EmbeddedXlsx = "EmbeddedXlsx";
        public static XName Error = "Error";
        public static XName Exception = "Exception";
        public static XName Exe = "Exe";
        public static XName ExeRoot = "ExeRoot";
        public static XName Extension = "Extension";
        public static XName File = "File";
        public static XName FileLength = "FileLength";
        public static XName FileName = "FileName";
        public static XName FilePattern = "FilePattern";
        public static XName FileType = "FileType";
        public static XName Guid = "Guid";
        public static XName HAnsiCharCount = "HAnsiCharCount";
        public static XName HAnsiRunCount = "HAnsiRunCount";
        public static XName RevisionTracking = "RevisionTracking";
        public static XName Hyperlink = "Hyperlink";
        public static XName IPAddress = "IPAddress";
        public static XName Id = "Id";
        public static XName Invalid = "Invalid";
        public static XName InvalidHyperlink = "InvalidHyperlink";
        public static XName InvalidHyperlinkException = "InvalidHyperlinkException";
        public static XName InvalidSaveThroughXslt = "InvalidSaveThroughXslt";
        public static XName JobComplete = "JobComplete";
        public static XName JobExe = "JobExe";
        public static XName JobName = "JobName";
        public static XName JobSpec = "JobSpec";
        public static XName Languages = "Languages";
        public static XName LegacyFrame = "LegacyFrame";
        public static XName LocalDoJobQueue = "LocalDoJobQueue";
        public static XName MachineName = "MachineName";
        public static XName MaxConcurrentJobs = "MaxConcurrentJobs";
        public static XName MaxDocumentsInJob = "MaxDocumentsInJob";
        public static XName MaxParagraphLength = "MaxParagraphLength";
        public static XName Message = "Message";
        public static XName Metrics = "Metrics";
        public static XName MultiDirectory = "MultiDirectory";
        public static XName MultiFontRun = "MultiFontRun";
        public static XName MultiServerQueue = "MultiServerQueue";
        public static XName Name = "Name";
        public static XName Namespaces = "Namespaces";
        public static XName Namespace = "Namespace";
        public static XName NamespaceName = "NamespaceName";
        public static XName NamespacePrefix = "NamespacePrefix";
        public static XName Note = "Note";
        public static XName NumberingFormatList = "NumberingFormatList";
        public static XName ObjectDisposedException = "ObjectDisposedException";
        public static XName ParagraphCount = "ParagraphCount";
        public static XName Part = "Part";
        public static XName Parts = "Parts";
        public static XName PassedDocuments = "PassedDocuments";
        public static XName Path = "Path";
        public static XName ProduceCatalog = "ProduceCatalog";
        public static XName ReferenceToNullImage = "ReferenceToNullImage";
        public static XName Report = "Report";
        public static XName Root = "Root";
        public static XName RootDirectory = "RootDirectory";
        public static XName Row = "Row";
        public static XName RunCount = "RunCount";
        public static XName RunWithoutRprCount = "RunWithoutRprCount";
        public static XName SdkValidationError = "SdkValidationError";
        public static XName SdkValidationError2007 = "SdkValidationError2007";
        public static XName SdkValidationError2010 = "SdkValidationError2010";
        public static XName SdkValidationError2013 = "SdkValidationError2013";
        public static XName Sheet = "Sheet";
        public static XName Sheets = "Sheets";
        public static XName SimpleField = "SimpleField";
        public static XName Skip = "Skip";
        public static XName SmartTag = "SmartTag";
        public static XName SourceRootDir = "SourceRootDir";
        public static XName SpawnerJobExeLocation = "SpawnerJobExeLocation";
        public static XName SpawnerReady = "SpawnerReady";
        public static XName Style = "Style";
        public static XName StyleHierarchy = "StyleHierarchy";
        public static XName SubDocument = "SubDocument";
        public static XName Table = "Table";
        public static XName TableData = "TableData";
        public static XName Tag = "Tag";
        public static XName Take = "Take";
        public static XName TextBox = "TextBox";
        public static XName TrackRevisionsEnabled = "TrackRevisionsEnabled";
        public static XName Type = "Type";
        public static XName Uri = "Uri";
        public static XName Val = "Val";
        public static XName Valid = "Valid";
        public static XName WindowStyle = "WindowStyle";
        public static XName XPath = "XPath";
        public static XName ZeroLengthText = "ZeroLengthText";
        public static XName custDataLst = "custDataLst";
        public static XName custShowLst = "custShowLst";
        public static XName kinsoku = "kinsoku";
        public static XName modifyVerifier = "modifyVerifier";
        public static XName photoAlbum = "photoAlbum";
    }
}