HtmlHelper.cs 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. using HtmlAgilityPack;
  2. using System;
  3. using System.Collections.Generic;
  4. using System.Text;
  5. using System.Text.RegularExpressions;
  6. namespace HTEXLib.COMM.Helpers
  7. {
  8. public class HtmlHelper
  9. {
  // Opening paragraph tag with optional whitespace before '>' ("<p>", "<p >", "<p  >", ...).
  private static readonly Regex OpenParagraphTag = new Regex(@"<p\s*>");

  // Same tag, anchored at the very start of the input.
  private static readonly Regex LeadingOpenParagraphTag = new Regex(@"\A<p\s*>");

  // One or more such tags sitting at the very end of the input.
  private static readonly Regex TrailingOpenParagraphTags = new Regex(@"(?:<p\s*>)+\z");

  // A paragraph element that contains nothing but whitespace.
  private static readonly Regex EmptyParagraph = new Regex("<p([^>]{0,})>\\s*</p>");

  /// <summary>
  /// Removes unbalanced or empty paragraph tags from an HTML fragment.
  /// </summary>
  /// <remarks>
  /// The following clean-ups are applied in order:
  /// a leading "&lt;/p&gt;" is dropped; dangling opening tags at the end are dropped;
  /// when the fragment starts with an opening tag but has no "&lt;/p&gt;" at all, every
  /// occurrence of that opening tag is removed; when the fragment ends with "&lt;/p&gt;"
  /// but has no opening tag at all, every "&lt;/p&gt;" is removed; finally, paragraphs
  /// containing only whitespace are removed.
  /// Only attribute-less opening tags (optionally with whitespace before the closing
  /// bracket) are treated as dangling; comparisons are ordinal and case-sensitive.
  /// </remarks>
  /// <param name="str">The HTML fragment to clean. May be null or empty.</param>
  /// <returns>The cleaned fragment; null or empty input is returned unchanged.</returns>
  public static string DoUselessTag(string str)
  {
      if (string.IsNullOrEmpty(str))
      {
          return str;
      }

      // Ordinal comparison: the fixed-length Substring below is only valid when the
      // prefix really is these four characters.
      if (str.StartsWith("</p>", StringComparison.Ordinal))
      {
          str = str.Substring(4);
      }

      // Strip the matched tag(s) by their actual length instead of a hard-coded count,
      // so no content character is lost and short inputs cannot underflow.
      str = TrailingOpenParagraphTags.Replace(str, "");

      // A leading opening tag with no closing tag anywhere is unbalanced: remove every
      // occurrence of that exact tag. Each pass shortens the string, so the loop ends.
      while (!str.Contains("</p>"))
      {
          Match leading = LeadingOpenParagraphTag.Match(str);
          if (!leading.Success)
          {
              break;
          }
          str = str.Replace(leading.Value, "");
      }

      // A trailing closing tag with no opening tag anywhere is unbalanced as well.
      if (str.EndsWith("</p>", StringComparison.Ordinal) && !OpenParagraphTag.IsMatch(str))
      {
          str = str.Replace("</p>", "");
      }

      return EmptyParagraph.Replace(str, "");
  }
  /// <summary>
  /// Reduces an HTML fragment to its text content followed by the URLs of its img tags.
  /// Other tags and "&amp;nbsp;" entities are dropped so that a more accurate SHA-1
  /// checksum can be computed from the result.
  /// </summary>
  /// <param name="html">The HTML fragment to reduce.</param>
  /// <returns>
  /// The inner text of the parsed document with every "&amp;nbsp;" removed, followed by
  /// all image URLs found in <paramref name="html"/>, concatenated without separators.
  /// </returns>
  public static string DoTextImg(string html)
  {
      HtmlDocument document = new HtmlDocument();
      document.LoadHtml(html);

      // Image URLs are taken from the raw markup, not from the parsed document.
      List<string> imageUrls = GetHtmlImageUrlList(html);
      string plainText = document.DocumentNode.InnerText.Replace("&nbsp;", "");

      if (!imageUrls.IsNotEmpty())
      {
          return plainText;
      }

      return plainText + string.Concat(imageUrls);
  }
  // Matches an img tag and captures the value of its src attribute in the "imgUrl" group.
  // Built once and reused instead of being re-parsed on every call.
  private static readonly Regex ImgTagRegex = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

  /// <summary>
  /// Extracts the src URL of every img tag found in the given HTML text.
  /// </summary>
  /// <param name="sHtmlText">The HTML text to scan. May be null or empty.</param>
  /// <returns>
  /// The captured URLs in document order, one entry per matched img tag (duplicates are
  /// kept). An empty list is returned when the input is null, empty, or has no img tags.
  /// </returns>
  public static List<string> GetHtmlImageUrlList(string sHtmlText)
  {
      List<string> urls = new List<string>();
      if (string.IsNullOrEmpty(sHtmlText))
      {
          return urls;
      }

      foreach (Match match in ImgTagRegex.Matches(sHtmlText))
      {
          urls.Add(match.Groups["imgUrl"].Value);
      }

      return urls;
  }
  80. }
  81. }