using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace HTEXLib.COMM.Helpers
{
    /// <summary>
    /// Helpers for tidying HTML fragments and for extracting their text and image URLs.
    /// </summary>
    public class HtmlHelper
    {
        // NOTE(review): the tag literals below were reconstructed. The copy of this file that was
        // reviewed had every "<...>" sequence stripped out, so the literals are inferred from the
        // Substring lengths the code used (4 for the leading closer; 3, 4 and 5 for the trailing
        // openers). Confirm them against the upstream source before relying on exact behaviour.
        private const string ParagraphClose = "</p>";

        // Checked in this order for trailing tags (matches the 3/4/5 length order of the original
        // checks). The order of the leading-tag checks could not be recovered - TODO confirm.
        private static readonly string[] ParagraphOpens = { "<p>", "<p >", "<p  >" };

        // Matches a paragraph element that contains nothing but whitespace.
        private static readonly Regex EmptyParagraphRegex = new Regex(@"<p([^>]{0,})>\s*</p>");

        // Matches an img tag and captures the value of its src attribute in the "imgUrl" group.
        // Built once instead of on every call.
        private static readonly Regex ImageTagRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Removes unbalanced paragraph tags from the edges of an HTML fragment and drops
        /// paragraphs that contain only whitespace.
        /// </summary>
        /// <param name="str">The HTML fragment to clean.</param>
        /// <returns>
        /// The cleaned fragment. A null or empty input is returned unchanged.
        /// </returns>
        public static string DoUselessTag(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return str;
            }

            // Ordinal comparisons keep the match length equal to the literal length, which the
            // Substring arithmetic below depends on.
            if (str.StartsWith(ParagraphClose, StringComparison.Ordinal))
            {
                str = str.Substring(ParagraphClose.Length);
            }

            // A fragment that ends with an opening tag has a dangling opener: cut it off.
            foreach (string open in ParagraphOpens)
            {
                if (str.EndsWith(open, StringComparison.Ordinal))
                {
                    str = str.Substring(0, str.Length - open.Length);
                }
            }

            // Without any closing tag, a leading opener can never be balanced, so every
            // occurrence of that opener variant is removed.
            if (!str.Contains(ParagraphClose))
            {
                foreach (string open in ParagraphOpens)
                {
                    if (str.StartsWith(open, StringComparison.Ordinal))
                    {
                        str = str.Replace(open, "");
                    }
                }
            }

            // Conversely, a trailing closer with no opener anywhere is unbalanced: remove all closers.
            if (str.EndsWith(ParagraphClose, StringComparison.Ordinal) && !ContainsParagraphOpen(str))
            {
                str = str.Replace(ParagraphClose, "");
            }

            return EmptyParagraphRegex.Replace(str, "");
        }

        /// <summary>
        /// Builds a string made of the document's inner text with spaces removed, followed by the
        /// src URL of every img tag. Markup other than text and image URLs is left out so that a
        /// SHA1 checksum computed over the result is not disturbed by tag or spacing differences.
        /// </summary>
        /// <param name="html">The HTML to reduce.</param>
        /// <returns>The inner text (without spaces) with all image URLs appended in document order.</returns>
        public static string DoTextImg(string html)
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            StringBuilder builder = new StringBuilder(doc.DocumentNode.InnerText.Replace(" ", ""));

            // GetHtmlImageUrlList never returns null, so no emptiness guard is needed here.
            foreach (string url in GetHtmlImageUrlList(html))
            {
                builder.Append(url);
            }

            return builder.ToString();
        }

        /// <summary>
        /// Extracts the src attribute value of every img tag found in the given HTML text.
        /// </summary>
        /// <param name="sHtmlText">The HTML text to scan.</param>
        /// <returns>
        /// The captured URLs in order of appearance; an empty list when the input is null or
        /// empty or contains no img tag.
        /// </returns>
        public static List<string> GetHtmlImageUrlList(string sHtmlText)
        {
            List<string> urls = new List<string>();
            if (string.IsNullOrEmpty(sHtmlText))
            {
                return urls;
            }

            // Collect the captured src value of each matched img tag.
            foreach (Match match in ImageTagRegex.Matches(sHtmlText))
            {
                urls.Add(match.Groups["imgUrl"].Value);
            }

            return urls;
        }

        // Returns true when the text contains any of the known opening paragraph tag variants.
        private static bool ContainsParagraphOpen(string text)
        {
            foreach (string open in ParagraphOpens)
            {
                if (text.Contains(open))
                {
                    return true;
                }
            }

            return false;
        }
    }
}