HtmlHelper.cs 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. using HtmlAgilityPack;
  2. using System;
  3. using System.Collections.Generic;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. using TEAMModelOS.SDK.Helper.Common.CollectionHelper;
  7. namespace TEAMModelOS.SDK.Helper.Common.StringHelper
  8. {
  /// <summary>
  /// Static helpers for tidying HTML fragments: removes stray paragraph tags and
  /// extracts the plain text and image URLs of a fragment.
  /// </summary>
  public class HtmlHelper
  {
      /// <summary>
      /// Matches an img tag and captures the value of its src attribute in the
      /// "imgUrl" group. Built once because the pattern never changes between calls.
      /// </summary>
      private static readonly Regex ImgSrcRegex = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

      /// <summary>
      /// Removes unbalanced paragraph tags from the edges of an HTML fragment:
      /// a leading closing tag, a trailing opening tag, and opening/closing tags
      /// that have no counterpart anywhere in the fragment.
      /// </summary>
      /// <param name="str">The HTML fragment to clean; may be null or empty.</param>
      /// <returns>
      /// The cleaned fragment. A null or empty input is returned unchanged.
      /// </returns>
      public static string DoUselessTag(string str)
      {
          if (string.IsNullOrEmpty(str))
          {
              return str;
          }

          // Ordinal comparisons keep the match length equal to the tag length,
          // which the Substring arithmetic below relies on.
          if (str.StartsWith("</p>", StringComparison.Ordinal))
          {
              str = str.Substring(4);
          }

          str = TrimTrailingTag(str, "<p>");

          // Two passes, as in the previous implementation, but each pass now removes
          // exactly the tag's own length (the second pass used to cut one character
          // too many and could throw on very short input).
          // NOTE(review): the repeated "<p >" checks may originally have targeted
          // variants with more spaces - confirm against the upstream file.
          str = TrimTrailingTag(str, "<p >");
          str = TrimTrailingTag(str, "<p >");

          // Opening tags without any closing tag in the fragment are dropped everywhere.
          str = RemoveUnclosedOpeningTag(str, "<p >");
          str = RemoveUnclosedOpeningTag(str, "<p >");
          str = RemoveUnclosedOpeningTag(str, "<p>");

          // A trailing closing tag with no opening tag at all is dropped everywhere.
          if (str.EndsWith("</p>", StringComparison.Ordinal)
              && !str.Contains("<p>")
              && !str.Contains("<p >"))
          {
              str = str.Replace("</p>", "");
          }

          return str;
      }

      /// <summary>
      /// Extracts the text content and the img URLs from an HTML string, ignoring
      /// all other markup, so that a more accurate SHA-1 checksum can be computed
      /// from the result.
      /// </summary>
      /// <param name="html">The HTML to reduce to text plus image URLs.</param>
      /// <returns>
      /// The document's inner text followed by every image URL found, concatenated
      /// without separators.
      /// </returns>
      public static string DoTextImg(string html)
      {
          HtmlDocument doc = new HtmlDocument();
          doc.LoadHtml(html);
          List<string> urls = GetHtmlImageUrlList(html);
          StringBuilder builder = new StringBuilder(doc.DocumentNode.InnerText);
          if (urls.IsNotEmpty())
          {
              foreach (string url in urls)
              {
                  builder.Append(url);
              }
          }
          return builder.ToString();
      }

      /// <summary>
      /// Collects the src value of every img tag found in the given HTML text.
      /// </summary>
      /// <param name="sHtmlText">The HTML text to scan; may be null or empty.</param>
      /// <returns>
      /// The captured URLs in document order; an empty list when the input is null,
      /// empty, or contains no img tag.
      /// </returns>
      public static List<string> GetHtmlImageUrlList(string sHtmlText)
      {
          List<string> urls = new List<string>();
          if (string.IsNullOrEmpty(sHtmlText))
          {
              return urls;
          }

          // Each match is one img tag; the named group holds its URL.
          foreach (Match match in ImgSrcRegex.Matches(sHtmlText))
          {
              urls.Add(match.Groups["imgUrl"].Value);
          }
          return urls;
      }

      /// <summary>
      /// Returns <paramref name="text"/> without <paramref name="tag"/> when the text
      /// ends with that tag; otherwise returns the text unchanged.
      /// </summary>
      private static string TrimTrailingTag(string text, string tag)
      {
          return text.EndsWith(tag, StringComparison.Ordinal)
              ? text.Substring(0, text.Length - tag.Length)
              : text;
      }

      /// <summary>
      /// Removes every occurrence of <paramref name="openingTag"/> when the text starts
      /// with it and contains no closing paragraph tag; otherwise returns the text unchanged.
      /// </summary>
      private static string RemoveUnclosedOpeningTag(string text, string openingTag)
      {
          if (text.StartsWith(openingTag, StringComparison.Ordinal) && !text.Contains("</p>"))
          {
              return text.Replace(openingTag, "");
          }
          return text;
      }
  }
  80. }