12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485 |
- using HtmlAgilityPack;
- using System;
- using System.Collections.Generic;
- using System.Text;
- using System.Text.RegularExpressions;
- using TEAMModelOS.SDK.Helper.Common.CollectionHelper;
- namespace TEAMModelOS.SDK.Helper.Common.StringHelper
- {
- public class HtmlHelper
- {
/// <summary>
/// Strips unbalanced paragraph tags from the edges of an HTML fragment.
/// </summary>
/// <remarks>
/// The steps run once each, in this order:
/// 1. a leading "&lt;/p&gt;" is removed;
/// 2. a trailing opening tag ("&lt;p&gt;", "&lt;p &gt;", "&lt;p  &gt;") is removed;
/// 3. if the text starts with an opening tag and contains no "&lt;/p&gt;" at all,
///    every occurrence of that opening-tag spelling is removed;
/// 4. if the text ends with "&lt;/p&gt;" and contains no opening tag at all,
///    every "&lt;/p&gt;" is removed.
/// All comparisons are ordinal and case-sensitive.
/// </remarks>
/// <param name="str">HTML fragment to clean; may be null or empty.</param>
/// <returns>The cleaned fragment; null or empty input is returned unchanged.</returns>
public static string DoUselessTag(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return str;
    }

    const string closeTag = "</p>";

    // Opening-tag spellings handled by this helper: no space, one space, two spaces.
    // The removal length is always taken from the tag itself so a match can never
    // strip more characters than it matched.
    string[] trailingOpenTags = { "<p>", "<p >", "<p  >" };
    // Leading tags are checked in a different order than trailing ones; the order is
    // kept because each step operates on the result of the previous one.
    string[] leadingOpenTags = { "<p >", "<p  >", "<p>" };

    if (str.StartsWith(closeTag, StringComparison.Ordinal))
    {
        str = str.Substring(closeTag.Length);
    }

    foreach (string openTag in trailingOpenTags)
    {
        if (str.EndsWith(openTag, StringComparison.Ordinal))
        {
            str = str.Substring(0, str.Length - openTag.Length);
        }
    }

    foreach (string openTag in leadingOpenTags)
    {
        // Only drop opening tags when nothing in the text closes them.
        if (str.StartsWith(openTag, StringComparison.Ordinal) && !str.Contains(closeTag))
        {
            str = str.Replace(openTag, "");
        }
    }

    if (str.EndsWith(closeTag, StringComparison.Ordinal))
    {
        bool hasOpenTag = false;
        foreach (string openTag in trailingOpenTags)
        {
            if (str.Contains(openTag))
            {
                hasOpenTag = true;
                break;
            }
        }

        // Only drop closing tags when nothing in the text opens them.
        if (!hasOpenTag)
        {
            str = str.Replace(closeTag, "");
        }
    }

    return str;
}
/// <summary>
/// Reduces HTML to its inner text followed by the src URL of every img tag found in it.
/// Per the original author's note, the goal is to keep other tags from interfering so
/// that a SHA-1 checksum computed over the result is more accurate.
/// </summary>
/// <param name="html">HTML markup to reduce.</param>
/// <returns>The parsed document's inner text with the image URLs appended in match order.</returns>
public static string DoTextImg(string html)
{
    HtmlDocument parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    List<string> imageUrls = GetHtmlImageUrlList(html);
    StringBuilder result = new StringBuilder(parsed.DocumentNode.InnerText);

    // Nothing to append when no image URLs were found.
    if (!imageUrls.IsNotEmpty())
    {
        return result.ToString();
    }

    for (int i = 0; i < imageUrls.Count; i++)
    {
        result.Append(imageUrls[i]);
    }

    return result.ToString();
}
// Matches an img tag and captures its src value in the "imgUrl" group.
// Built once and reused: Regex instances are immutable and safe to share.
private static readonly Regex ImgSrcRegex = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

/// <summary>
/// Extracts the src attribute value of every img tag in the given HTML text.
/// </summary>
/// <param name="sHtmlText">HTML text to scan; may be null or empty.</param>
/// <returns>
/// The captured src values in the order the tags appear; an empty list when the
/// input is null, empty, or contains no matching img tag. Never null.
/// </returns>
public static List<string> GetHtmlImageUrlList(string sHtmlText)
{
    List<string> urls = new List<string>();
    if (string.IsNullOrEmpty(sHtmlText))
    {
        return urls;
    }

    // Collect the captured URL of each matched img tag.
    foreach (Match match in ImgSrcRegex.Matches(sHtmlText))
    {
        urls.Add(match.Groups["imgUrl"].Value);
    }

    return urls;
}
- }
- }
|