12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071 |
- // Copyright (c) Microsoft. All rights reserved.
- // Licensed under the MIT license. See LICENSE file in the project root for full license information.
- using System;
- using System.Collections.Generic;
- using System.Drawing;
- using System.IO;
- using System.IO.Packaging;
- using System.Linq;
- using System.Text;
- using System.Xml.Linq;
- using DocumentFormat.OpenXml.Packaging;
- using DocumentFormat.OpenXml.Validation;
- using System.Globalization;
- namespace OpenXmlPowerTools
- {
// Options that switch optional sections of the metrics report on or off.
// Every option is off unless the caller sets it.
public class MetricsGetterSettings
{
    // Consumed by the content-control transform, which is outside this view;
    // presumably controls whether text inside content controls is reported — TODO confirm.
    public bool IncludeTextInContentControls = false;

    // When true, GetTableInfoForSheet adds a TableData element with every row/cell of each table.
    public bool IncludeXlsxTableCellData = false;

    // When true, the report gets a Namespaces element listing the namespace declarations found in the package.
    public bool RetrieveNamespaceList = false;

    // When true, the report gets a ContentTypes element listing the content types of the package parts.
    public bool RetrieveContentTypeList = false;
}
- public class MetricsGetter
- {
// Drawing surface shared by the text-measuring helpers. It is created on first use
// and is backed by a 1x1 bitmap, since it is only needed for MeasureString.
private static Lazy<Graphics> Graphics { get; } = new Lazy<Graphics>(
    () => System.Drawing.Graphics.FromImage(new Bitmap(1, 1)));
// Loads the Open XML file at fileName and returns its metrics, choosing the
// word-processing, spreadsheet or presentation path from the file extension.
//   fileName - path of the document to analyze.
//   settings - options forwarded unchanged to the format-specific method.
// Returns the metrics element, or null when the extension matches none of the three formats.
// Throws FileNotFoundException when the file does not exist.
public static XElement GetMetrics(string fileName, MetricsGetterSettings settings)
{
    FileInfo fi = new FileInfo(fileName);
    if (!fi.Exists)
    {
        // The (message, fileName) constructor does not format its message, so the
        // placeholder has to be expanded here; the path is also passed as FileName.
        throw new FileNotFoundException(string.Format("{0} does not exist.", fi.FullName), fi.FullName);
    }

    if (Util.IsWordprocessingML(fi.Extension))
    {
        WmlDocument wmlDoc = new WmlDocument(fi.FullName, true);
        return GetDocxMetrics(wmlDoc, settings);
    }
    if (Util.IsSpreadsheetML(fi.Extension))
    {
        SmlDocument smlDoc = new SmlDocument(fi.FullName, true);
        return GetXlsxMetrics(smlDoc, settings);
    }
    if (Util.IsPresentationML(fi.Extension))
    {
        PmlDocument pmlDoc = new PmlDocument(fi.FullName, true);
        return GetPptxMetrics(pmlDoc, settings);
    }

    // Unrecognized extension: no metrics.
    return null;
}
// Computes metrics for a word-processing document held in memory.
// The first attempt works on the bytes as given. If that attempt fails with an
// OpenXmlPowerToolsException whose text mentions "Invalid Hyperlink", the broken
// URIs are rewritten (not on NET35) and the measurement is repeated with the
// InvalidHyperlink flag set. Any other OpenXmlPowerToolsException produces a
// minimal Metrics element carrying an Error attribute.
public static XElement GetDocxMetrics(WmlDocument wmlDoc, MetricsGetterSettings settings)
{
    try
    {
        return GetDocxMetricsFromCopy(wmlDoc, false, settings);
    }
    catch (OpenXmlPowerToolsException e)
    {
        if (e.ToString().Contains("Invalid Hyperlink"))
        {
            using (MemoryStream repairStream = new MemoryStream())
            {
                repairStream.Write(wmlDoc.DocumentByteArray, 0, wmlDoc.DocumentByteArray.Length);
#if !NET35
                UriFixer.FixInvalidUri(repairStream, brokenUri => FixUri(brokenUri));
#endif
                // The repaired document is given a placeholder name.
                wmlDoc = new WmlDocument("dummy.docx", repairStream.ToArray());
            }
            return GetDocxMetricsFromCopy(wmlDoc, true, settings);
        }
    }

    return new XElement(H.Metrics,
        new XAttribute(H.FileName, wmlDoc.FileName),
        new XAttribute(H.FileType, "WordprocessingML"),
        new XAttribute(H.Error, "Unknown error, metrics not determined"));
}

// Opens a writable in-memory copy of wmlDoc, accepts tracked revisions if there are any,
// gathers the metrics, and appends a RevisionTracking element when revisions were present.
private static XElement GetDocxMetricsFromCopy(WmlDocument wmlDoc, bool invalidHyperlink, MetricsGetterSettings settings)
{
    using (MemoryStream copy = new MemoryStream())
    {
        copy.Write(wmlDoc.DocumentByteArray, 0, wmlDoc.DocumentByteArray.Length);
        using (WordprocessingDocument document = WordprocessingDocument.Open(copy, true))
        {
            bool hadRevisions = RevisionAccepter.HasTrackedRevisions(document);
            if (hadRevisions)
                RevisionAccepter.AcceptRevisions(document);

            XElement result = GetWmlMetrics(wmlDoc.FileName, invalidHyperlink, document, settings);
            if (hadRevisions)
                result.Add(new XElement(H.RevisionTracking, new XAttribute(H.Val, true)));
            return result;
        }
    }
}
// Measures text with the given family/style on the shared Graphics surface and returns
// the width reported by MeasureString, truncated to an int. The font size passed to
// Font is sz / 2 (presumably sz is in half-points — TODO confirm).
// ArgumentException (raised by Font, e.g. when the family does not support the style)
// is deliberately allowed to escape so that GetTextWidth can try another style.
// Any other failure is treated as "cannot measure" and yields 0.
private static int _getTextWidth(FontFamily ff, FontStyle fs, decimal sz, string text)
{
    try
    {
        using (var f = new Font(ff, (float)sz / 2f, fs))
        {
            var proposedSize = new Size(int.MaxValue, int.MaxValue);
            var sf = Graphics.Value.MeasureString(text, f, proposedSize);
            return (int)sf.Width;
        }
    }
    catch (ArgumentException)
    {
        // Let the caller fall back to a different style or family.
        throw;
    }
    catch
    {
        // Best effort: this includes the OverflowException that was observed on Azure.
        return 0;
    }
}

// Returns the measured width of text, trying in order: the requested style, Regular,
// Bold, and finally Times New Roman with the requested style. Returns 0 when nothing
// can be measured; never throws ArgumentException.
public static int GetTextWidth(FontFamily ff, FontStyle fs, decimal sz, string text)
{
    try
    {
        return _getTextWidth(ff, fs, sz, text);
    }
    catch (ArgumentException)
    {
        // requested style not usable with this family; try Regular
    }

    try
    {
        return _getTextWidth(ff, FontStyle.Regular, sz, text);
    }
    catch (ArgumentException)
    {
        // Regular not usable either; try Bold
    }

    try
    {
        return _getTextWidth(ff, FontStyle.Bold, sz, text);
    }
    catch (ArgumentException)
    {
        // if both regular and bold fail, then get metrics for Times New Roman
    }

    try
    {
        // use the original FontStyle (in fs)
        using (var ff2 = new FontFamily("Times New Roman"))
        {
            return _getTextWidth(ff2, fs, sz, text);
        }
    }
    catch (ArgumentException)
    {
        // Times New Roman is missing or the arguments are unusable with any font.
        return 0;
    }
}
// Callback handed to UriFixer: whatever the malformed hyperlink text was,
// it is replaced by the same fixed placeholder address.
private static Uri FixUri(string brokenUri) => new Uri("http://broken-link/");
// Assembles the Metrics element for an open word-processing document: file name and type,
// style hierarchy, miscellaneous metrics, per-part metrics, and (when requested in settings)
// the namespace and content-type lists.
private static XElement GetWmlMetrics(string fileName, bool invalidHyperlink, WordprocessingDocument wDoc, MetricsGetterSettings settings)
{
    // Per-part metrics are collected before anything else, exactly as in the original ordering.
    XElement partsElement = new XElement(H.Parts,
        wDoc.GetAllParts().Select(p => GetMetricsForWmlPart(p, settings)));

    // An empty Parts element is left out of the report.
    XElement partsOrNothing = partsElement.HasElements ? partsElement : null;

    XAttribute fileNameAttr = new XAttribute(H.FileName, fileName);
    XAttribute fileTypeAttr = new XAttribute(H.FileType, "WordprocessingML");
    object styleHierarchy = GetStyleHierarchy(wDoc);
    List<XElement> miscMetrics = GetMiscWmlMetrics(wDoc, invalidHyperlink);

    XElement namespaceList = null;
    if (settings.RetrieveNamespaceList)
        namespaceList = RetrieveNamespaceList(wDoc);

    XElement contentTypeList = null;
    if (settings.RetrieveContentTypeList)
        contentTypeList = RetrieveContentTypeList(wDoc);

    return new XElement(H.Metrics,
        fileNameAttr,
        fileTypeAttr,
        styleHierarchy,
        miscMetrics,
        partsOrNothing,
        namespaceList,
        contentTypeList);
}
// Builds a ContentTypes element with one ContentType child per distinct content type
// used by the package's parts, sorted, ignoring relationship parts.
private static XElement RetrieveContentTypeList(OpenXmlPackage oxPkg)
{
    const string relationshipsContentType = "application/vnd.openxmlformats-package.relationships+xml";

    var sortedTypes = oxPkg.Package
        .GetParts()
        .Cast<ZipPackagePart>()
        .Select(zipPart => zipPart.ContentType)
        .Where(contentType => contentType != relationshipsContentType)
        .OrderBy(contentType => contentType)
        .Distinct();

    var result = new XElement(H.ContentTypes);
    foreach (var contentType in sortedTypes)
        result.Add(new XElement(H.ContentType, new XAttribute(H.Val, contentType)));
    return result;
}
// Builds a Namespaces element listing every distinct (prefix, namespace name) pair declared
// anywhere in the package's XML parts. Relationship parts are ignored, and so is any part
// whose content type does not end in "xml". Parts that fail to load as XML are skipped.
private static XElement RetrieveNamespaceList(OpenXmlPackage oxPkg)
{
    Package pkg = oxPkg.Package;
    var xmlParts = pkg.GetParts()
        .Cast<ZipPackagePart>()
        .Where(p => p.ContentType != "application/vnd.openxmlformats-package.relationships+xml")
        // Content types are protocol identifiers: compare ordinally, not with the current culture.
        .Where(p => p.ContentType.EndsWith("xml", StringComparison.OrdinalIgnoreCase));

    // Each entry is "prefix|namespaceName"; the set removes duplicates across all parts.
    var uniqueNamespaces = new HashSet<string>();
    foreach (var xp in xmlParts)
    {
        using (Stream st = xp.GetStream())
        {
            try
            {
                XDocument xdoc = XDocument.Load(st);
                var declarations = xdoc
                    .Descendants()
                    .Attributes()
                    .Where(a => a.IsNamespaceDeclaration);
                foreach (var declaration in declarations)
                    uniqueNamespaces.Add(string.Format("{0}|{1}", declaration.Name.LocalName, declaration.Value));
            }
            // if catch exception, forget about it. Just trying to get a most complete survey possible of all namespaces in all documents.
            // if caught exception, chances are the document is bad anyway.
            catch (Exception)
            {
                continue;
            }
        }
    }

    var xe = new XElement(H.Namespaces,
        uniqueNamespaces.OrderBy(t => t).Select(n =>
        {
            // Split on the first '|' only: an XML prefix cannot contain '|', but a
            // namespace name could, and must not be truncated.
            var spl = n.Split(new[] { '|' }, 2);
            return new XElement(H.Namespace,
                new XAttribute(H.NamespacePrefix, spl[0]),
                new XAttribute(H.NamespaceName, spl[1]));
        }));
    return xe;
}
// Collects the miscellaneous document-level metric elements: an InvalidHyperlink marker when
// the document had to be repaired, followed by everything ValidateWordprocessingDocument adds.
private static List<XElement> GetMiscWmlMetrics(WordprocessingDocument document, bool invalidHyperlink)
{
    List<XElement> metrics = new List<XElement>();
    if (invalidHyperlink)
        metrics.Add(new XElement(H.InvalidHyperlink, new XAttribute(H.Val, invalidHyperlink)));

    // The notes list and the counter dictionary are scratch state for the validation pass.
    // Its boolean result was never read here, so it is no longer stored; the call is kept
    // for the elements it appends to 'metrics'.
    ValidateWordprocessingDocument(document, metrics, new List<string>(), new Dictionary<XName, int>());
    return metrics;
}
// Validates the document against the Office 2007/2010 (and, except on NET35, 2013) schemas and
// appends document-level metrics to 'metrics': feature counters, element count, average paragraph
// length, embedded-spreadsheet flag, numbering formats, a few settings flags, and the font/charset
// analysis. 'metricCountDictionary' accumulates the feature counters; 'notes' is passed on to
// FontAndCharSetAnalysis.
// Returns the combined validity flag described in the notes below.
private static bool ValidateWordprocessingDocument(WordprocessingDocument wDoc, List<XElement> metrics, List<string> notes, Dictionary<XName, int> metricCountDictionary)
{
    // NOTE(review): results are OR-ed, so 'valid' is true when the document passes for at least
    // one version — confirm that this, rather than "passes for all versions", is intended.
    bool valid = ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
    valid |= ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
    valid |= ValidateAgainstSpecificVersion(wDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif

    // These three totals are taken from the main document part only.
    int elementCount = 0;
    int paragraphCount = 0;
    int textCount = 0;
    foreach (var part in wDoc.ContentParts())
    {
        XDocument xDoc = part.GetXDocument();
        foreach (var e in xDoc.Descendants())
        {
            if (e.Name == W.txbxContent)
                IncrementMetric(metricCountDictionary, H.TextBox);
            else if (e.Name == W.sdt)
                IncrementMetric(metricCountDictionary, H.ContentControl);
            else if (e.Name == W.customXml)
                IncrementMetric(metricCountDictionary, H.CustomXmlMarkup);
            else if (e.Name == W.fldChar)
                IncrementMetric(metricCountDictionary, H.ComplexField);
            else if (e.Name == W.fldSimple)
                IncrementMetric(metricCountDictionary, H.SimpleField);
            else if (e.Name == W.altChunk)
                IncrementMetric(metricCountDictionary, H.AltChunk);
            else if (e.Name == W.tbl)
                IncrementMetric(metricCountDictionary, H.Table);
            else if (e.Name == W.hyperlink)
                IncrementMetric(metricCountDictionary, H.Hyperlink);
            else if (e.Name == W.framePr)
                IncrementMetric(metricCountDictionary, H.LegacyFrame);
            else if (e.Name == W.control)
                IncrementMetric(metricCountDictionary, H.ActiveX);
            else if (e.Name == W.subDoc)
                IncrementMetric(metricCountDictionary, H.SubDocument);
            else if (e.Name == VML.imagedata || e.Name == VML.fill || e.Name == VML.stroke || e.Name == A.blip)
            {
                // An image may be referenced through r:embed, r:pict or r:id; check each one present.
                var relId = (string)e.Attribute(R.embed);
                if (relId != null)
                    ValidateImageExists(part, relId, metricCountDictionary);
                relId = (string)e.Attribute(R.pict);
                if (relId != null)
                    ValidateImageExists(part, relId, metricCountDictionary);
                relId = (string)e.Attribute(R.id);
                if (relId != null)
                    ValidateImageExists(part, relId, metricCountDictionary);
            }
            if (part.Uri == wDoc.MainDocumentPart.Uri)
            {
                elementCount++;
                if (e.Name == W.p)
                    paragraphCount++;
                if (e.Name == W.t)
                    textCount += ((string)e).Length;
            }
        }
    }

    foreach (var item in metricCountDictionary)
    {
        metrics.Add(
            new XElement(item.Key, new XAttribute(H.Val, item.Value)));
    }
    metrics.Add(new XElement(H.ElementCount, new XAttribute(H.Val, elementCount)));

    // A document without paragraphs would otherwise divide by zero and cast NaN to int,
    // whose result is unspecified; report 0 instead.
    int averageParagraphLength = paragraphCount == 0
        ? 0
        : (int)((double)textCount / (double)paragraphCount);
    metrics.Add(new XElement(H.AverageParagraphLength, new XAttribute(H.Val, averageParagraphLength)));

    if (wDoc.GetAllParts().Any(part => part.ContentType == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"))
        metrics.Add(new XElement(H.EmbeddedXlsx, new XAttribute(H.Val, true)));
    NumberingFormatListAssembly(wDoc, metrics);

    XDocument wxDoc = wDoc.MainDocumentPart.GetXDocument();
    foreach (var d in wxDoc.Descendants())
    {
        if (d.Name == W.saveThroughXslt)
        {
            string rid = (string)d.Attribute(R.id);
            // The settings part is optional; a missing part means the relationship cannot exist.
            var settingsPart = wDoc.MainDocumentPart.DocumentSettingsPart;
            var tempExternalRelationship = settingsPart == null
                ? null
                : settingsPart.ExternalRelationships.FirstOrDefault(h => h.Id == rid);
            if (tempExternalRelationship == null)
                metrics.Add(new XElement(H.InvalidSaveThroughXslt, new XAttribute(H.Val, true)));
            // As in the original, any saveThroughXslt element marks the document invalid,
            // whether or not its relationship resolves.
            valid = false;
        }
        else if (d.Name == W.trackRevisions)
            metrics.Add(new XElement(H.TrackRevisionsEnabled, new XAttribute(H.Val, true)));
        else if (d.Name == W.documentProtection)
            metrics.Add(new XElement(H.DocumentProtection, new XAttribute(H.Val, true)));
    }

    FontAndCharSetAnalysis(wDoc, metrics, notes);
    return valid;
}
// Validates a word-processing document against one Office schema version.
// On failure it adds a generic SdkValidationError element (once per metrics list) and a
// version-specific element whose text describes up to three errors.
// Returns true when the validator reports no errors.
private static bool ValidateAgainstSpecificVersion(WordprocessingDocument wDoc, List<XElement> metrics, DocumentFormat.OpenXml.FileFormatVersions versionToValidateAgainst, XName versionSpecificMetricName)
{
    OpenXmlValidator validator = new OpenXmlValidator(versionToValidateAgainst);

    // Only the first three errors are ever reported. Materializing them once avoids
    // enumerating the validator's result twice (it may be lazily evaluated, depending on
    // the SDK version) and still tells us whether any error exists.
    var reportedErrors = validator.Validate(wDoc).Take(3).ToList();
    if (reportedErrors.Count == 0)
        return true;

    if (!metrics.Any(e => e.Name == H.SdkValidationError))
        metrics.Add(new XElement(H.SdkValidationError, new XAttribute(H.Val, true)));
    metrics.Add(new XElement(versionSpecificMetricName, new XAttribute(H.Val, true),
        reportedErrors.Select(err =>
        {
            StringBuilder sb = new StringBuilder();
            // Long descriptions are cut at 300 characters.
            if (err.Description.Length > 300)
                sb.Append(PtUtils.MakeValidXml(err.Description.Substring(0, 300) + " ... elided ...") + Environment.NewLine);
            else
                sb.Append(PtUtils.MakeValidXml(err.Description) + Environment.NewLine);
            sb.Append(" in part " + PtUtils.MakeValidXml(err.Part.Uri.ToString()) + Environment.NewLine);
            sb.Append(" at " + PtUtils.MakeValidXml(err.Path.XPath) + Environment.NewLine);
            return sb.ToString();
        })));
    return false;
}
// Validates a spreadsheet document against one Office schema version.
// On failure it adds a generic SdkValidationError element (once per metrics list) and a
// version-specific element whose text describes up to three errors.
// Returns true when the validator reports no errors.
private static bool ValidateAgainstSpecificVersion(SpreadsheetDocument sDoc, List<XElement> metrics, DocumentFormat.OpenXml.FileFormatVersions versionToValidateAgainst, XName versionSpecificMetricName)
{
    OpenXmlValidator validator = new OpenXmlValidator(versionToValidateAgainst);

    // Only the first three errors are ever reported. Materializing them once avoids
    // enumerating the validator's result twice (it may be lazily evaluated, depending on
    // the SDK version) and still tells us whether any error exists.
    var reportedErrors = validator.Validate(sDoc).Take(3).ToList();
    if (reportedErrors.Count == 0)
        return true;

    if (!metrics.Any(e => e.Name == H.SdkValidationError))
        metrics.Add(new XElement(H.SdkValidationError, new XAttribute(H.Val, true)));
    metrics.Add(new XElement(versionSpecificMetricName, new XAttribute(H.Val, true),
        reportedErrors.Select(err =>
        {
            StringBuilder sb = new StringBuilder();
            // Long descriptions are cut at 300 characters.
            if (err.Description.Length > 300)
                sb.Append(PtUtils.MakeValidXml(err.Description.Substring(0, 300) + " ... elided ...") + Environment.NewLine);
            else
                sb.Append(PtUtils.MakeValidXml(err.Description) + Environment.NewLine);
            sb.Append(" in part " + PtUtils.MakeValidXml(err.Part.Uri.ToString()) + Environment.NewLine);
            sb.Append(" at " + PtUtils.MakeValidXml(err.Path.XPath) + Environment.NewLine);
            return sb.ToString();
        })));
    return false;
}
// Validates a presentation document against one Office schema version.
// On failure it adds a generic SdkValidationError element (once per metrics list) and a
// version-specific element whose text describes up to three errors.
// Returns true when the validator reports no errors.
private static bool ValidateAgainstSpecificVersion(PresentationDocument pDoc, List<XElement> metrics, DocumentFormat.OpenXml.FileFormatVersions versionToValidateAgainst, XName versionSpecificMetricName)
{
    OpenXmlValidator validator = new OpenXmlValidator(versionToValidateAgainst);

    // Only the first three errors are ever reported. Materializing them once avoids
    // enumerating the validator's result twice (it may be lazily evaluated, depending on
    // the SDK version) and still tells us whether any error exists.
    var reportedErrors = validator.Validate(pDoc).Take(3).ToList();
    if (reportedErrors.Count == 0)
        return true;

    if (!metrics.Any(e => e.Name == H.SdkValidationError))
        metrics.Add(new XElement(H.SdkValidationError, new XAttribute(H.Val, true)));
    metrics.Add(new XElement(versionSpecificMetricName, new XAttribute(H.Val, true),
        reportedErrors.Select(err =>
        {
            StringBuilder sb = new StringBuilder();
            // Long descriptions are cut at 300 characters.
            if (err.Description.Length > 300)
                sb.Append(PtUtils.MakeValidXml(err.Description.Substring(0, 300) + " ... elided ...") + Environment.NewLine);
            else
                sb.Append(PtUtils.MakeValidXml(err.Description) + Environment.NewLine);
            sb.Append(" in part " + PtUtils.MakeValidXml(err.Part.Uri.ToString()) + Environment.NewLine);
            sb.Append(" at " + PtUtils.MakeValidXml(err.Path.XPath) + Environment.NewLine);
            return sb.ToString();
        })));
    return false;
}
// Adds one to the counter stored under xName, creating it at 1 when it is not there yet.
private static void IncrementMetric(Dictionary<XName, int> metricCountDictionary, XName xName)
{
    // TryGetValue leaves 'current' at 0 for a missing key, so a single lookup covers both cases.
    int current;
    metricCountDictionary.TryGetValue(xName, out current);
    metricCountDictionary[xName] = current + 1;
}
// Counts a ReferenceToNullImage when 'part' has no child part registered under relationship id 'relId'.
private static void ValidateImageExists(OpenXmlPart part, string relId, Dictionary<XName, int> metrics)
{
    // Any() states the existence check directly and does not depend on the pair type
    // being a reference type (a FirstOrDefault() == null test would).
    bool targetExists = part.Parts.Any(ipp => ipp.RelationshipId == relId);
    if (!targetExists)
        IncrementMetric(metrics, H.ReferenceToNullImage);
}
// Adds a NumberingFormatList element whose value is the comma-separated list of numbering
// formats used by list-item paragraphs in the document's content parts. Nothing is added
// when no list item is found.
private static void NumberingFormatListAssembly(WordprocessingDocument wDoc, List<XElement> metrics)
{
    // Formats are kept in first-seen order; 'seen' de-duplicates across ALL parts, so a format
    // used in both the body and, say, a header is listed only once.
    List<string> numFmtList = new List<string>();
    var seen = new HashSet<string>();

    foreach (var part in wDoc.ContentParts())
    {
        var xDoc = part.GetXDocument();
        foreach (var p in xDoc.Descendants(W.p))
        {
            // RetrieveListItem is called for its effect of annotating the paragraph.
            ListItemRetriever.RetrieveListItem(wDoc, p, null);
            ListItemRetriever.ListItemInfo lif = p.Annotation<ListItemRetriever.ListItemInfo>();
            if (lif == null || !lif.IsListItem)
                continue;

            // Look the level definition up once per paragraph.
            var lvl = lif.Lvl(ListItemRetriever.GetParagraphLevel(p));
            if (lvl == null)
                continue;

            string numFmtForLevel = (string)lvl.Elements(W.numFmt).Attributes(W.val).FirstOrDefault();
            if (numFmtForLevel == null)
            {
                // Custom formats live inside mc:AlternateContent/mc:Choice; report their w:format.
                var numFmtElement = lvl.Elements(MC.AlternateContent).Elements(MC.Choice).Elements(W.numFmt).FirstOrDefault();
                if (numFmtElement != null && (string)numFmtElement.Attribute(W.val) == "custom")
                    numFmtForLevel = (string)numFmtElement.Attribute(W.format);
            }

            if (numFmtForLevel != null && seen.Add(numFmtForLevel))
                numFmtList.Add(numFmtForLevel);
        }
    }

    if (numFmtList.Any())
    {
        var nfls = numFmtList.StringConcatenate(s => s + ",").TrimEnd(',');
        metrics.Add(new XElement(H.NumberingFormatList, new XAttribute(H.Val, PtUtils.MakeValidXml(nfls))));
    }
}
// Running totals filled in while the runs of a document are analyzed.
class FormattingMetrics
{
    // Run-level totals.
    public int RunCount;
    public int RunWithoutRprCount;
    public int ZeroLengthText;
    public int MultiFontRun;

    // Characters attributed to each font type.
    public int AsciiCharCount;
    public int CSCharCount;
    public int EastAsiaCharCount;
    public int HAnsiCharCount;

    // Single-font runs attributed to each font type.
    public int AsciiRunCount;
    public int CSRunCount;
    public int EastAsiaRunCount;
    public int HAnsiRunCount;

    // Distinct language tags seen so far; starts out empty.
    public List<string> Languages = new List<string>();
}
// Runs the formatting assembler over the document, analyzes every run in every content part,
// and appends the resulting totals to 'metrics'. RunCount is always reported; the other
// totals only when greater than zero; Languages only when at least one was recorded.
private static void FontAndCharSetAnalysis(WordprocessingDocument wDoc, List<XElement> metrics, List<string> notes)
{
    var assemblerSettings = new FormattingAssemblerSettings();
    assemblerSettings.RemoveStyleNamesFromParagraphAndRunProperties = false;
    assemblerSettings.ClearStyles = true;
    assemblerSettings.RestrictToSupportedNumberingFormats = false;
    assemblerSettings.RestrictToSupportedLanguages = false;
    FormattingAssembler.AssembleFormatting(wDoc, assemblerSettings);

    var totals = new FormattingMetrics();
    foreach (var part in wDoc.ContentParts())
    {
        string partUri = part.Uri.ToString();
        foreach (var run in part.GetXDocument().Descendants(W.r))
        {
            totals.RunCount++;
            AnalyzeRun(run, metrics, notes, totals, partUri);
        }
    }

    metrics.Add(new XElement(H.RunCount, new XAttribute(H.Val, totals.RunCount)));

    // Emits <name Val="count"/> only for totals that are actually present.
    Action<XName, int> addWhenPositive = (name, count) =>
    {
        if (count > 0)
            metrics.Add(new XElement(name, new XAttribute(H.Val, count)));
    };
    addWhenPositive(H.RunWithoutRprCount, totals.RunWithoutRprCount);
    addWhenPositive(H.ZeroLengthText, totals.ZeroLengthText);
    addWhenPositive(H.MultiFontRun, totals.MultiFontRun);
    addWhenPositive(H.AsciiCharCount, totals.AsciiCharCount);
    addWhenPositive(H.CSCharCount, totals.CSCharCount);
    addWhenPositive(H.EastAsiaCharCount, totals.EastAsiaCharCount);
    addWhenPositive(H.HAnsiCharCount, totals.HAnsiCharCount);
    addWhenPositive(H.AsciiRunCount, totals.AsciiRunCount);
    addWhenPositive(H.CSRunCount, totals.CSRunCount);
    addWhenPositive(H.EastAsiaRunCount, totals.EastAsiaRunCount);
    addWhenPositive(H.HAnsiRunCount, totals.HAnsiRunCount);

    if (totals.Languages.Any())
    {
        var languageList = totals.Languages.StringConcatenate(s => s + ",").TrimEnd(',');
        metrics.Add(new XElement(H.Languages, new XAttribute(H.Val, PtUtils.MakeValidXml(languageList))));
    }
}
// Classifies one run and updates the totals in 'formattingMetrics':
//  - a run with no text only bumps ZeroLengthText;
//  - a run without rPr is counted, noted in 'notes', and then analyzed with an empty rPr;
//  - the languages implied by the run's font types are merged into Languages;
//  - a run whose characters resolve to more than one font counts as a multi-font run and its
//    characters are tallied per font type; otherwise the whole run is credited to the font
//    type of its first character.
// 'attList' is not used by this method.
private static void AnalyzeRun(XElement run, List<XElement> attList, List<string> notes, FormattingMetrics formattingMetrics, string uri)
{
    // Text of the run: its w:t and w:delText children, concatenated.
    var runText = run.Elements()
        .Where(child => child.Name == W.t || child.Name == W.delText)
        .Select(child => (string)child)
        .StringConcatenate();
    if (runText.Length == 0)
    {
        formattingMetrics.ZeroLengthText++;
        return;
    }

    var runProperties = run.Element(W.rPr);
    if (runProperties == null)
    {
        formattingMetrics.RunWithoutRprCount++;
        notes.Add(PtUtils.MakeValidXml(string.Format("Error in part {0}: run without rPr at {1}", uri, run.GetXPath())));
        runProperties = new XElement(W.rPr);
    }
    var csa = new FormattingAssembler.CharStyleAttributes(null, runProperties);

    // One font type per character, plus the distinct set of types in the run.
    var fontTypePerChar = runText
        .Select(ch => FormattingAssembler.DetermineFontTypeFromCharacter(ch, csa))
        .ToArray();
    var fontTypesUsed = fontTypePerChar.Distinct().ToArray();
    var fontsUsed = fontTypesUsed
        .Select(ft => GetFontFromFontType(csa, ft))
        .Distinct();

    // CS maps to the bidi language, EastAsia to the East Asian one, everything else to Latin;
    // a missing or empty language falls back to the current culture's name.
    var runLanguages = fontTypesUsed
        .Select(ft =>
        {
            switch (ft)
            {
                case FormattingAssembler.FontType.CS:
                    return csa.BidiLang;
                case FormattingAssembler.FontType.EastAsia:
                    return csa.EastAsiaLang;
                default:
                    return csa.LatinLang;
            }
        })
        .Select(lang => (lang == null || lang == "") ? CultureInfo.CurrentCulture.Name : lang)
        .Distinct();
    bool hasNewLanguage = runLanguages.Any(lang => !formattingMetrics.Languages.Contains(lang));
    if (hasNewLanguage)
        formattingMetrics.Languages = formattingMetrics.Languages.Concat(runLanguages).Distinct().ToList();

    if (fontsUsed.Count() > 1)
    {
        formattingMetrics.MultiFontRun++;
        formattingMetrics.AsciiCharCount += fontTypePerChar.Count(ft => ft == FormattingAssembler.FontType.Ascii);
        formattingMetrics.CSCharCount += fontTypePerChar.Count(ft => ft == FormattingAssembler.FontType.CS);
        formattingMetrics.EastAsiaCharCount += fontTypePerChar.Count(ft => ft == FormattingAssembler.FontType.EastAsia);
        formattingMetrics.HAnsiCharCount += fontTypePerChar.Count(ft => ft == FormattingAssembler.FontType.HAnsi);
        return;
    }

    // Single-font run: credit every character and the run itself to the first character's type.
    int length = runText.Length;
    switch (fontTypePerChar[0])
    {
        case FormattingAssembler.FontType.Ascii:
            formattingMetrics.AsciiRunCount++;
            formattingMetrics.AsciiCharCount += length;
            break;
        case FormattingAssembler.FontType.CS:
            formattingMetrics.CSRunCount++;
            formattingMetrics.CSCharCount += length;
            break;
        case FormattingAssembler.FontType.EastAsia:
            formattingMetrics.EastAsiaRunCount++;
            formattingMetrics.EastAsiaCharCount += length;
            break;
        case FormattingAssembler.FontType.HAnsi:
            formattingMetrics.HAnsiRunCount++;
            formattingMetrics.HAnsiCharCount += length;
            break;
    }
}
// Picks the font name recorded in 'csa' for the given font type.
// Any type other than CS, EastAsia or HAnsi (including Ascii) yields the ASCII font.
private static string GetFontFromFontType(FormattingAssembler.CharStyleAttributes csa, FormattingAssembler.FontType ft)
{
    if (ft == FormattingAssembler.FontType.CS)
        return csa.CsFont;
    if (ft == FormattingAssembler.FontType.EastAsia)
        return csa.EastAsiaFont;
    if (ft == FormattingAssembler.FontType.HAnsi)
        return csa.HAnsiFont;
    return csa.AsciiFont;
}
// Computes metrics for a spreadsheet held in memory: SDK validation results for Office
// 2007/2010 (and 2013 except on NET35), table information per sheet, and - when requested
// in settings - the namespace and content-type lists.
public static XElement GetXlsxMetrics(SmlDocument smlDoc, MetricsGetterSettings settings)
{
    using (OpenXmlMemoryStreamDocument streamDoc = new OpenXmlMemoryStreamDocument(smlDoc))
    using (SpreadsheetDocument sDoc = streamDoc.GetSpreadsheetDocument())
    {
        // Each call appends its findings to 'metrics'. The boolean results were only ever
        // accumulated into an unused local, so they are not stored.
        List<XElement> metrics = new List<XElement>();
        ValidateAgainstSpecificVersion(sDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
        ValidateAgainstSpecificVersion(sDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
        ValidateAgainstSpecificVersion(sDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif
        return new XElement(H.Metrics,
            new XAttribute(H.FileName, smlDoc.FileName),
            new XAttribute(H.FileType, "SpreadsheetML"),
            metrics,
            GetTableInfoForWorkbook(sDoc, settings),
            settings.RetrieveNamespaceList ? RetrieveNamespaceList(sDoc) : null,
            settings.RetrieveContentTypeList ? RetrieveContentTypeList(sDoc) : null);
    }
}
// Builds a Sheets element holding the table information of every sheet listed in the workbook.
// Sheets for which GetTableInfoForSheet returns null contribute nothing.
private static XElement GetTableInfoForWorkbook(SpreadsheetDocument spreadsheet, MetricsGetterSettings settings)
{
    var workbookPart = spreadsheet.WorkbookPart;
    var sheetEntries = workbookPart.GetXDocument()
        .Root
        .Element(S.sheets)
        .Elements(S.sheet);

    var sheetsElement = new XElement(H.Sheets);
    foreach (var sheetEntry in sheetEntries)
    {
        // Each workbook entry names the sheet and points at its worksheet part by relationship id.
        var relationshipId = (string)sheetEntry.Attribute(R.id);
        var name = (string)sheetEntry.Attribute("name");
        var worksheetPart = (WorksheetPart)workbookPart.GetPartById(relationshipId);
        sheetsElement.Add(GetTableInfoForSheet(spreadsheet, worksheetPart, name, settings));
    }
    return sheetsElement;
}
// Describes the tables defined on one worksheet: a Sheet element named after the sheet, with
// one Table child per table part (name, display name, column names and - when
// settings.IncludeXlsxTableCellData is set - the cell values of every row).
// Returns null when the sheet has no tables.
public static XElement GetTableInfoForSheet(SpreadsheetDocument spreadsheetDocument, WorksheetPart sheetPart, string sheetName,
    MetricsGetterSettings settings)
{
    var sheetXDoc = sheetPart.GetXDocument();
    var sheetElement = new XElement(H.Sheet, new XAttribute(H.Name, sheetName));

    var tablePartRefs = sheetXDoc.Root.Elements(S.tableParts).Elements(S.tablePart);
    foreach (var tablePartRef in tablePartRefs)
    {
        // Resolve the table definition part referenced from the sheet.
        string relationshipId = (string)tablePartRef.Attribute(R.id);
        var definitionPart = (TableDefinitionPart)sheetPart.GetPartById(relationshipId);
        var tableRoot = definitionPart.GetXDocument().Root;
        var displayName = (string)tableRoot.Attribute("displayName");

        XElement cellData = null;
        if (settings.IncludeXlsxTableCellData)
        {
            // One Row per table row, one Cell per column, keyed by column name.
            var xlsxTable = spreadsheetDocument.Table(displayName);
            cellData = new XElement(H.TableData,
                xlsxTable.TableRows().Select(row =>
                    new XElement(H.Row,
                        xlsxTable.TableColumns().Select(col =>
                            new XElement(H.Cell,
                                new XAttribute(H.Name, col.Name),
                                new XAttribute(H.Val, (string)row[col.Name]))))));
        }

        var columnsElement = new XElement(H.Columns,
            tableRoot.Element(S.tableColumns)
                .Elements(S.tableColumn)
                .Select(tc => new XElement(H.Column,
                    new XAttribute(H.Name, (string)tc.Attribute("name")))));

        sheetElement.Add(
            new XElement(H.Table,
                new XAttribute(H.Name, (string)tableRoot.Attribute("name")),
                new XAttribute(H.DisplayName, displayName),
                columnsElement,
                cellData));
    }

    return sheetElement.HasElements ? sheetElement : null;
}
// Computes metrics for a presentation held in memory: SDK validation results for Office
// 2007/2010 (and 2013 except on NET35) and - when requested in settings - the namespace
// and content-type lists.
public static XElement GetPptxMetrics(PmlDocument pmlDoc, MetricsGetterSettings settings)
{
    using (OpenXmlMemoryStreamDocument streamDoc = new OpenXmlMemoryStreamDocument(pmlDoc))
    using (PresentationDocument pDoc = streamDoc.GetPresentationDocument())
    {
        // Each call appends its findings to 'metrics'. The boolean results were only ever
        // accumulated into an unused local, so they are not stored.
        List<XElement> metrics = new List<XElement>();
        ValidateAgainstSpecificVersion(pDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2007, H.SdkValidationError2007);
        ValidateAgainstSpecificVersion(pDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2010, H.SdkValidationError2010);
#if !NET35
        ValidateAgainstSpecificVersion(pDoc, metrics, DocumentFormat.OpenXml.FileFormatVersions.Office2013, H.SdkValidationError2013);
#endif
        return new XElement(H.Metrics,
            new XAttribute(H.FileName, pmlDoc.FileName),
            new XAttribute(H.FileType, "PresentationML"),
            metrics,
            settings.RetrieveNamespaceList ? RetrieveNamespaceList(pDoc) : null,
            settings.RetrieveContentTypeList ? RetrieveContentTypeList(pDoc) : null);
    }
}
// Builds a StyleHierarchy element in which every style of the document appears as a Style
// element (Id, plus Type when the style declares one) nested under the style it is based on.
// Returns null when the document has no style definitions part.
// Malformed style sheets are tolerated: circular or self-referencing basedOn chains are cut
// at the first repeated style, styles without a styleId are skipped, and a style whose base
// is not defined is attached to the nearest ancestor that does exist (the root if none).
private static object GetStyleHierarchy(WordprocessingDocument document)
{
    var stylePart = document.MainDocumentPart.StyleDefinitionsPart;
    if (stylePart == null)
        return null;
    var xd = stylePart.GetXDocument();

    // styleId -> defining element (first definition wins), replacing repeated linear searches.
    var stylesById = new Dictionary<string, XElement>();
    foreach (var style in xd.Root.Elements(W.style))
    {
        var id = (string)style.Attribute(W.styleId);
        if (id != null && !stylesById.ContainsKey(id))
            stylesById.Add(id, style);
    }

    // For each style, the '/'-separated chain from its outermost base down to the style itself.
    var stylesWithPath = xd.Root
        .Elements(W.style)
        .Where(s => (string)s.Attribute(W.styleId) != null)
        .Select(s =>
        {
            var styleString = (string)s.Attribute(W.styleId);
            var visited = new HashSet<string> { styleString };
            var thisStyle = s;
            while (true)
            {
                var baseStyle = (string)thisStyle.Elements(W.basedOn).Attributes(W.val).FirstOrDefault();
                // Stop at the top of the chain, or when the chain loops back on itself.
                if (baseStyle == null || !visited.Add(baseStyle))
                    break;
                styleString = baseStyle + "/" + styleString;
                if (!stylesById.TryGetValue(baseStyle, out thisStyle))
                    break;
            }
            return styleString;
        })
        .OrderBy(n => n)
        .ToList();

    XElement styleHierarchy = new XElement(H.StyleHierarchy);
    foreach (var item in stylesWithPath)
    {
        var styleChain = item.Split('/');
        XElement elementToAddTo = styleHierarchy;
        foreach (var inChain in styleChain.PtSkipLast(1))
        {
            // Descend only through ancestors that are actually present in the tree.
            var child = elementToAddTo.Elements(H.Style).FirstOrDefault(z => (string)z.Attribute(H.Id) == inChain);
            if (child != null)
                elementToAddTo = child;
        }

        var styleToAdd = styleChain.Last();
        XElement definition;
        stylesById.TryGetValue(styleToAdd, out definition);
        var styleType = definition != null ? (string)definition.Attribute(W.type) : null;
        elementToAddTo.Add(
            new XElement(H.Style,
                new XAttribute(H.Id, styleToAdd),
                // XAttribute rejects null values, so a missing w:type is simply omitted.
                styleType != null ? new XAttribute(H.Type, styleType) : null));
    }
    return styleHierarchy;
}
/// <summary>
/// Produces the <c>Part</c> metrics element for one part of a word-processing package.
/// Content controls are collected only for main document, header, footer, footnotes and
/// endnotes parts.
/// </summary>
/// <param name="part">The package part to inspect.</param>
/// <param name="settings">Settings forwarded to the content-control transform.</param>
/// <returns>
/// A <c>Part</c> element carrying the content type, URI and content controls of the part,
/// or <c>null</c> when no content controls were found.
/// </returns>
private static XElement GetMetricsForWmlPart(OpenXmlPart part, MetricsGetterSettings settings)
{
    bool isInspectedPart =
        part is MainDocumentPart ||
        part is HeaderPart ||
        part is FooterPart ||
        part is FootnotesPart ||
        part is EndnotesPart;

    XElement contentControls = null;
    if (isInspectedPart)
    {
        var partXml = part.GetXDocument();
        var transformed = (XElement)GetContentControlsTransform(partXml.Root, settings);

        // An empty ContentControls wrapper carries no information; treat it as absent.
        if (transformed.HasElements)
            contentControls = transformed;
    }

    var partMetrics = new XElement(H.Part,
        new XAttribute(H.ContentType, part.ContentType),
        new XAttribute(H.Uri, part.Uri.ToString()),
        contentControls);

    // Only attributes and no child elements means there is nothing to report for this part.
    return partMetrics.HasElements ? partMetrics : null;
}
/// <summary>
/// Recursively transforms a node of a part's XML tree into a summary of the content
/// controls (<c>w:sdt</c> elements) it contains.
/// </summary>
/// <param name="node">The node to transform.</param>
/// <param name="settings">
/// Controls whether non-element nodes are carried into the result
/// (<c>IncludeTextInContentControls</c>).
/// </param>
/// <returns>
/// For the document root, a <c>ContentControls</c> element; for a <c>w:sdt</c> element, a
/// <c>ContentControl</c> element with <c>Type</c>, optional <c>Tag</c>/<c>Alias</c> and
/// <c>XPath</c> attributes; for any other element, the lazily evaluated sequence of its
/// transformed child nodes; for a non-element node, the node itself or <c>null</c>
/// depending on the settings.
/// </returns>
private static object GetContentControlsTransform(XNode node, MetricsGetterSettings settings)
{
    var element = node as XElement;
    if (element == null)
    {
        // Non-element nodes are only passed through on request.
        if (settings.IncludeTextInContentControls)
            return node;
        return null;
    }

    if (element == element.Document.Root)
    {
        return new XElement(H.ContentControls,
            element.Nodes().Select(child => GetContentControlsTransform(child, settings)));
    }

    if (element.Name != W.sdt)
    {
        // Not a content control: flatten to whatever its descendants produce.
        return element.Nodes().Select(child => GetContentControlsTransform(child, settings));
    }

    var sdtPr = element.Elements(W.sdtPr);

    var tagValue = (string)sdtPr.Elements(W.tag).Attributes(W.val).FirstOrDefault();
    XAttribute tagAttr = null;
    if (tagValue != null)
        tagAttr = new XAttribute(H.Tag, tagValue);

    var aliasValue = (string)sdtPr.Elements(W.alias).Attributes(W.val).FirstOrDefault();
    XAttribute aliasAttr = null;
    if (aliasValue != null)
        aliasAttr = new XAttribute(H.Alias, aliasValue);

    var xPathAttr = new XAttribute(H.XPath, element.GetXPath());

    // Marker elements in w:sdtPr, listed from highest to lowest precedence: the first
    // marker found decides the reported type. When no marker is present at all the
    // control is reported as rich text.
    var markersByPrecedence = new[]
    {
        new KeyValuePair<XName, string>(W.richText, "RichText"),
        new KeyValuePair<XName, string>(W.picture, "Picture"),
        new KeyValuePair<XName, string>(W.group, "Group"),
        new KeyValuePair<XName, string>(W.equation, "Equation"),
        new KeyValuePair<XName, string>(W.dropDownList, "DropDownList"),
        new KeyValuePair<XName, string>(W.docPartObj, "DocPartObj"),
        new KeyValuePair<XName, string>(W.docPartList, "DocPartList"),
        new KeyValuePair<XName, string>(W.date, "Date"),
        new KeyValuePair<XName, string>(W.comboBox, "ComboBox"),
        new KeyValuePair<XName, string>(W.citation, "Citation"),
        new KeyValuePair<XName, string>(W.bibliography, "Bibliography"),
        new KeyValuePair<XName, string>(W.text, "Text"),
    };

    string type = markersByPrecedence
        .Where(marker => sdtPr.Elements(marker.Key).Any())
        .Select(marker => marker.Value)
        .FirstOrDefault() ?? "RichText";

    return new XElement(H.ContentControl,
        new XAttribute(H.Type, type),
        tagAttr,
        aliasAttr,
        xPathAttr,
        element.Nodes().Select(child => GetContentControlsTransform(child, settings)));
}
- }
/// <summary>
/// Names (in no namespace) of the elements and attributes used when building metrics
/// and report XML. Each field's value is the field's own name.
/// </summary>
/// <remarks>
/// The fields are <c>readonly</c> so that these shared names cannot be reassigned at run time.
/// </remarks>
public static class H
{
    public static readonly XName ActiveX = "ActiveX";
    public static readonly XName Alias = "Alias";
    public static readonly XName AltChunk = "AltChunk";
    public static readonly XName Arguments = "Arguments";
    public static readonly XName AsciiCharCount = "AsciiCharCount";
    public static readonly XName AsciiRunCount = "AsciiRunCount";
    public static readonly XName AverageParagraphLength = "AverageParagraphLength";
    public static readonly XName BaselineReport = "BaselineReport";
    public static readonly XName Batch = "Batch";
    public static readonly XName BatchName = "BatchName";
    public static readonly XName BatchSelector = "BatchSelector";
    public static readonly XName CSCharCount = "CSCharCount";
    public static readonly XName CSRunCount = "CSRunCount";
    public static readonly XName Catalog = "Catalog";
    public static readonly XName CatalogList = "CatalogList";
    public static readonly XName CatalogListFile = "CatalogListFile";
    public static readonly XName CaughtException = "CaughtException";
    public static readonly XName Cell = "Cell";
    public static readonly XName Column = "Column";
    public static readonly XName Columns = "Columns";
    public static readonly XName ComplexField = "ComplexField";
    public static readonly XName Computer = "Computer";
    public static readonly XName Computers = "Computers";
    public static readonly XName ContentControl = "ContentControl";
    public static readonly XName ContentControls = "ContentControls";
    public static readonly XName ContentType = "ContentType";
    public static readonly XName ContentTypes = "ContentTypes";
    public static readonly XName CustomXmlMarkup = "CustomXmlMarkup";
    public static readonly XName DLL = "DLL";
    public static readonly XName DefaultDialogValuesFile = "DefaultDialogValuesFile";
    public static readonly XName DefaultValues = "DefaultValues";
    public static readonly XName Dependencies = "Dependencies";
    public static readonly XName DestinationDir = "DestinationDir";
    public static readonly XName Directory = "Directory";
    public static readonly XName DirectoryPattern = "DirectoryPattern";
    public static readonly XName DisplayName = "DisplayName";
    public static readonly XName DoJobQueueName = "DoJobQueueName";
    public static readonly XName Document = "Document";
    public static readonly XName DocumentProtection = "DocumentProtection";
    public static readonly XName DocumentSelector = "DocumentSelector";
    public static readonly XName DocumentType = "DocumentType";
    public static readonly XName Documents = "Documents";
    public static readonly XName EastAsiaCharCount = "EastAsiaCharCount";
    public static readonly XName EastAsiaRunCount = "EastAsiaRunCount";
    public static readonly XName ElementCount = "ElementCount";
    public static readonly XName EmbeddedXlsx = "EmbeddedXlsx";
    public static readonly XName Error = "Error";
    public static readonly XName Exception = "Exception";
    public static readonly XName Exe = "Exe";
    public static readonly XName ExeRoot = "ExeRoot";
    public static readonly XName Extension = "Extension";
    public static readonly XName File = "File";
    public static readonly XName FileLength = "FileLength";
    public static readonly XName FileName = "FileName";
    public static readonly XName FilePattern = "FilePattern";
    public static readonly XName FileType = "FileType";
    public static readonly XName Guid = "Guid";
    public static readonly XName HAnsiCharCount = "HAnsiCharCount";
    public static readonly XName HAnsiRunCount = "HAnsiRunCount";
    public static readonly XName RevisionTracking = "RevisionTracking";
    public static readonly XName Hyperlink = "Hyperlink";
    public static readonly XName IPAddress = "IPAddress";
    public static readonly XName Id = "Id";
    public static readonly XName Invalid = "Invalid";
    public static readonly XName InvalidHyperlink = "InvalidHyperlink";
    public static readonly XName InvalidHyperlinkException = "InvalidHyperlinkException";
    public static readonly XName InvalidSaveThroughXslt = "InvalidSaveThroughXslt";
    public static readonly XName JobComplete = "JobComplete";
    public static readonly XName JobExe = "JobExe";
    public static readonly XName JobName = "JobName";
    public static readonly XName JobSpec = "JobSpec";
    public static readonly XName Languages = "Languages";
    public static readonly XName LegacyFrame = "LegacyFrame";
    public static readonly XName LocalDoJobQueue = "LocalDoJobQueue";
    public static readonly XName MachineName = "MachineName";
    public static readonly XName MaxConcurrentJobs = "MaxConcurrentJobs";
    public static readonly XName MaxDocumentsInJob = "MaxDocumentsInJob";
    public static readonly XName MaxParagraphLength = "MaxParagraphLength";
    public static readonly XName Message = "Message";
    public static readonly XName Metrics = "Metrics";
    public static readonly XName MultiDirectory = "MultiDirectory";
    public static readonly XName MultiFontRun = "MultiFontRun";
    public static readonly XName MultiServerQueue = "MultiServerQueue";
    public static readonly XName Name = "Name";
    public static readonly XName Namespaces = "Namespaces";
    public static readonly XName Namespace = "Namespace";
    public static readonly XName NamespaceName = "NamespaceName";
    public static readonly XName NamespacePrefix = "NamespacePrefix";
    public static readonly XName Note = "Note";
    public static readonly XName NumberingFormatList = "NumberingFormatList";
    public static readonly XName ObjectDisposedException = "ObjectDisposedException";
    public static readonly XName ParagraphCount = "ParagraphCount";
    public static readonly XName Part = "Part";
    public static readonly XName Parts = "Parts";
    public static readonly XName PassedDocuments = "PassedDocuments";
    public static readonly XName Path = "Path";
    public static readonly XName ProduceCatalog = "ProduceCatalog";
    public static readonly XName ReferenceToNullImage = "ReferenceToNullImage";
    public static readonly XName Report = "Report";
    public static readonly XName Root = "Root";
    public static readonly XName RootDirectory = "RootDirectory";
    public static readonly XName Row = "Row";
    public static readonly XName RunCount = "RunCount";
    public static readonly XName RunWithoutRprCount = "RunWithoutRprCount";
    public static readonly XName SdkValidationError = "SdkValidationError";
    public static readonly XName SdkValidationError2007 = "SdkValidationError2007";
    public static readonly XName SdkValidationError2010 = "SdkValidationError2010";
    public static readonly XName SdkValidationError2013 = "SdkValidationError2013";
    public static readonly XName Sheet = "Sheet";
    public static readonly XName Sheets = "Sheets";
    public static readonly XName SimpleField = "SimpleField";
    public static readonly XName Skip = "Skip";
    public static readonly XName SmartTag = "SmartTag";
    public static readonly XName SourceRootDir = "SourceRootDir";
    public static readonly XName SpawnerJobExeLocation = "SpawnerJobExeLocation";
    public static readonly XName SpawnerReady = "SpawnerReady";
    public static readonly XName Style = "Style";
    public static readonly XName StyleHierarchy = "StyleHierarchy";
    public static readonly XName SubDocument = "SubDocument";
    public static readonly XName Table = "Table";
    public static readonly XName TableData = "TableData";
    public static readonly XName Tag = "Tag";
    public static readonly XName Take = "Take";
    public static readonly XName TextBox = "TextBox";
    public static readonly XName TrackRevisionsEnabled = "TrackRevisionsEnabled";
    public static readonly XName Type = "Type";
    public static readonly XName Uri = "Uri";
    public static readonly XName Val = "Val";
    public static readonly XName Valid = "Valid";
    public static readonly XName WindowStyle = "WindowStyle";
    public static readonly XName XPath = "XPath";
    public static readonly XName ZeroLengthText = "ZeroLengthText";
    public static readonly XName custDataLst = "custDataLst";
    public static readonly XName custShowLst = "custShowLst";
    public static readonly XName kinsoku = "kinsoku";
    public static readonly XName modifyVerifier = "modifyVerifier";
    public static readonly XName photoAlbum = "photoAlbum";
}
- }
|