// HtmlAnalyzeService.cs (17 KB)
  1. using DocumentFormat.OpenXml.Packaging;
  2. using HtmlAgilityPack;
  3. using Microsoft.AspNetCore.Http;
  4. using OpenXmlPowerTools;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.Drawing.Imaging;
  8. using System.IO;
  9. using System.Linq;
  10. using System.Text;
  11. using System.Text.RegularExpressions;
  12. using TEAMModelOS.SDK.Helper.Common.CollectionHelper;
  13. using TEAMModelOS.SDK.Helper.Common.FileHelper;
  14. using TEAMModelOS.SDK.Helper.Common.StringHelper;
  15. using TEAMModelOS.SDK.Helper.Security.ShaHash;
  16. using TEAMModelOS.Service.Models.Core;
  17. using TEAMModelOS.Service.Models.Evaluation.Models;
  18. using TEAMModelOS.Service.Services.Evaluation.Interfaces;
  19. namespace TEAMModelOS.Service.Services.Evaluation.Implements
  20. {
  21. public class HtmlAnalyzeService : IHtmlAnalyzeService
  22. {
  // Marker tags embedded in the question HTML. They are never reassigned, so they are
  // declared readonly to keep this shared static state immutable.

  // Tag stripped from the question stem.
  private static readonly string SummaryTag = "【题文】";
  // Tag that opens the answer section (and closes the stem for most question types).
  private static readonly string AnswerTag = "【答案】";
  // Tag that opens the explanation section.
  private static readonly string AnalysisTag = "【解析】";
  // Tag that closes the explanation section / a single question.
  private static readonly string EndedTag = "【结束】";
  // Letters used as option keys, in order.
  private static readonly string Options = "ABCDEFGHIJ";
  // Brackets that wrap an inline answer in a fill-in-the-blank question.
  private static readonly string CompleteStart = "【";
  private static readonly string CompleteEnd = "】";
  // Tags that delimit the shared material of a compose question.
  private static readonly string ComposeStart = "【综合题】";
  private static readonly string ComposeEnd = "【综合题-题干】";
  // Prefix of the tags nested inside a compose question; ComposeConvert rewrites it to
  // CompleteStart so the nested tags become ordinary question tags.
  private static readonly string ComposeTag = "【综合题-";

  // Question type key -> "startTag|endTag" pair used to cut questions of that type
  // out of the HTML.
  private static readonly Dictionary<string, string> TestType = new Dictionary<string, string>
  {
      { "Single", "【单选题】|【结束】" },
      { "Multiple", "【多选题】|【结束】" },
      { "Judge", "【判断题】|【结束】" },
      { "Complete", "【填空题】|【结束】" },
      { "Subjective", "【问答题】|【结束】" },
      { "Compose", "【综合题】|【完结】" }
  };
  /// <summary>
  /// Parses tagged question HTML into a flat list of items. Runs synchronously
  /// despite the Async suffix.
  /// </summary>
  /// <param name="html">HTML containing the tagged questions.</param>
  /// <param name="Lang">Forwarded to ComposeConvert; not otherwise used here.</param>
  /// <returns>All parsed items, grouped in the order of the TestType keys.</returns>
  public List<ItemInfo> AnalyzeWordAsync(string html, string Lang)
  {
      // Drop class attributes and span tags, then normalise tabs and dir="ltr" markers,
      // so the tag-based patterns below see plain markup.
      string cleaned = Regex.Replace(html, "class=\"([^\"]*)\"", "");
      cleaned = Regex.Replace(cleaned, "<span([^>]{0,})>", "");
      cleaned = cleaned.Replace("\t", " ")
                       .Replace("<span>", "")
                       .Replace("</span>", "")
                       .Replace("dir=\"ltr\"", "");

      Dictionary<string, List<string>> grouped = ConvertTest(cleaned);
      List<ItemInfo> result = new List<ItemInfo>();

      foreach (KeyValuePair<string, List<string>> entry in grouped)
      {
          List<ItemInfo> converted;
          switch (entry.Key)
          {
              case "Single":
                  converted = SingleConvert(entry.Key, entry.Value);
                  break;
              case "Multiple":
                  converted = MultipleConvert(entry.Key, entry.Value);
                  break;
              case "Judge":
                  converted = JudgeConvert(entry.Key, entry.Value);
                  break;
              case "Complete":
                  converted = CompleteConvert(entry.Key, entry.Value);
                  break;
              case "Subjective":
                  converted = SubjectiveConvert(entry.Key, entry.Value);
                  break;
              case "Compose":
                  converted = ComposeConvert(entry.Key, entry.Value, Lang);
                  break;
              default:
                  // Unknown type keys are skipped.
                  continue;
          }

          // Items returned at this level use their own hash as the parent hash.
          foreach (ItemInfo item in converted)
          {
              item.pShaCode = item.shaCode;
          }
          result.AddRange(converted);
      }

      return result;
  }
  /// <summary>
  /// Converts the fragments collected under the "Single" key by delegating to OptionProcess.
  /// </summary>
  private List<ItemInfo> SingleConvert(string TypeKey, List<string> list)
      => OptionProcess(TypeKey, list);
  /// <summary>
  /// Converts the fragments collected under the "Multiple" key by delegating to OptionProcess.
  /// </summary>
  private List<ItemInfo> MultipleConvert(string TypeKey, List<string> list)
      => OptionProcess(TypeKey, list);
  /// <summary>
  /// Converts the fragments collected under the "Judge" key by delegating to OptionProcess.
  /// </summary>
  private List<ItemInfo> JudgeConvert(string TypeKey, List<string> list)
      => OptionProcess(TypeKey, list);
  /// <summary>
  /// Converts the fragments collected under the "Complete" key by delegating to CompleteProcess.
  /// </summary>
  private List<ItemInfo> CompleteConvert(string TypeKey, List<string> list)
      => CompleteProcess(TypeKey, list);
  /// <summary>
  /// Builds fill-in-the-blank items: every 【…】 span in the stem is recorded as an answer
  /// and replaced by an underlined placeholder, then the item hash and explanation are set.
  /// </summary>
  /// <param name="TypeKey">Question type key passed through to ConvertTestInfo.</param>
  /// <param name="tests">Raw question fragments of this type.</param>
  /// <returns>The converted items.</returns>
  private List<ItemInfo> CompleteProcess(string TypeKey, List<string> tests)
  {
      List<ItemInfo> testInfos = ConvertTestInfo(tests, TypeKey);
      HtmlDocument doc = new HtmlDocument();
      // Loop-invariant: matches one bracketed answer, lazily.
      string blankPattern = CompleteStart + "([\\s\\S]*?)" + CompleteEnd;

      foreach (ItemInfo testInfo in testInfos)
      {
          List<string> ans = new List<string>();
          testInfo.question = testInfo.question.Replace(AnalysisTag, "").Replace(SummaryTag, "").Replace(AnswerTag, "");
          List<ReplaceDto> replaces = new List<ReplaceDto>();
          int index = 1;

          for (Match m = Regex.Match(testInfo.question, blankPattern); m.Success; m = m.NextMatch())
          {
              string an = m.Groups[1].ToString();
              doc.LoadHtml(an);
              string anstr = doc.DocumentNode.InnerText;

              // Placeholder width: three &nbsp; per UTF-8 byte of the answer's plain text.
              // Encoding.Default is always UTF-8 on .NET Core, so naming UTF8 explicitly
              // keeps the same width while removing the platform ambiguity.
              int length = Encoding.UTF8.GetByteCount(anstr);
              StringBuilder nbsp = new StringBuilder();
              for (int i = 0; i < length * 3; i++)
              {
                  nbsp.Append("&nbsp;");
              }

              replaces.Add(new ReplaceDto
              {
                  oldstr = CompleteStart + an + CompleteEnd,
                  newstr = "<underline data=\"" + index + "\"><u>" + nbsp + "</u></underline>"
              });
              ans.Add(an);
              index++;
          }

          string textImg = testInfo.question;
          // Blank out the answers in the stem.
          foreach (ReplaceDto replace in replaces)
          {
              testInfo.question = testInfo.question.Replace(replace.oldstr, replace.newstr);
              // NOTE(review): DoUselessTag runs on the whole stem after every replacement. If it
              // rewrites markup inside a blank that has not been replaced yet, that blank's
              // oldstr would no longer match - verify against HtmlHelper.
              testInfo.question = HtmlHelper.DoUselessTag(testInfo.question);
              // Intended: keep only the stem text and images, without the underline markup.
              // NOTE(review): textImg is re-derived from testInfo.question, where oldstr has
              // already been replaced by the underline markup, so the hashed text still
              // contains that markup. Left unchanged because altering it changes shaCode
              // for every fill-in item - confirm whether stored hashes depend on it.
              textImg = testInfo.question.Replace(replace.oldstr, "");
          }
          textImg = HtmlHelper.DoTextImg(textImg);
          testInfo.shaCode = ShaHashHelper.GetSHA1(textImg);

          // Explanation: strip its delimiting tags and clean the markup.
          testInfo.explain = testInfo.explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          testInfo.explain = HtmlHelper.DoUselessTag(testInfo.explain);
          testInfo.answer.AddRange(ans);
      }
      return testInfos;
  }
  /// <summary>
  /// Builds option-based items (used for the Single, Multiple and Judge keys): splits the
  /// option block out of the stem, hashes options plus stem, and normalises the answers
  /// into distinct single-character strings.
  /// </summary>
  /// <param name="typeKey">Question type key passed through to ConvertTestInfo.</param>
  /// <param name="list">Raw question fragments of this type.</param>
  /// <returns>The converted items.</returns>
  private List<ItemInfo> OptionProcess(string typeKey, List<string> list)
  {
      string[] optionsKeys = Options.Select(s => s.ToString()).ToArray();
      // Separator that follows an option letter, and the lazy body that follows it.
      const string separator = "(\\.|\\.|\\、|\\:|\\:)";
      const string lazyBody = "([\\s\\S]*?)";
      // An option match starts with "<letter><separator>", i.e. two characters.
      const int prefixLength = 2;

      List<ItemInfo> testInfos = ConvertTestInfo(list, typeKey);
      foreach (ItemInfo testInfo in testInfos)
      {
          // Everything from the first option letter up to and including the answer tag.
          string optsHtml = Regex.Match(testInfo.question, optionsKeys[0] + separator + lazyBody + AnswerTag).Value;
          StringBuilder textImg = new StringBuilder();

          // When no option block is found optsHtml is empty. string.Replace throws
          // ArgumentException for an empty search string, so option handling is skipped
          // and the item is still converted with an empty option list.
          if (optsHtml.Length > 0)
          {
              // NOTE: the loop stops before the last key, so the final letter in Options is
              // only ever used as a terminator and is never emitted as an option itself.
              for (int i = 0; i < optionsKeys.Length - 1; i++)
              {
                  // First try "<this letter> ... <next letter>", then "<this letter> ... answer tag".
                  int suffixLength = prefixLength;
                  string optHtml = Regex.Match(optsHtml, optionsKeys[i] + separator + lazyBody + optionsKeys[i + 1] + separator).Value;
                  if (string.IsNullOrEmpty(optHtml))
                  {
                      suffixLength = AnswerTag.Length;
                      optHtml = Regex.Match(optsHtml, optionsKeys[i] + separator + lazyBody + AnswerTag).Value;
                  }
                  if (string.IsNullOrEmpty(optHtml))
                  {
                      continue;
                  }

                  // Cut off the leading "<letter><separator>" and the trailing terminator.
                  optHtml = optHtml.Substring(prefixLength, optHtml.Length - prefixLength - suffixLength);
                  optHtml = HtmlHelper.DoUselessTag(optHtml);
                  textImg.Append(HtmlHelper.DoTextImg(optHtml));
                  testInfo.option.Add(new CodeValue { Code = optionsKeys[i], Value = optHtml });
              }

              testInfo.question = testInfo.question.Replace(optsHtml, "");
          }

          // Stem: strip the remaining tags, clean the markup, and fold it into the hash input.
          testInfo.question = testInfo.question.Replace(SummaryTag, "").Replace(AnswerTag, "");
          testInfo.question = HtmlHelper.DoUselessTag(testInfo.question);
          textImg.Append(HtmlHelper.DoTextImg(testInfo.question));
          testInfo.shaCode = ShaHashHelper.GetSHA1(textImg.ToString());

          // Answers: strip tags and surrounding whitespace, then keep each distinct character.
          HashSet<string> ans = new HashSet<string>();
          foreach (string raw in testInfo.answer)
          {
              string answer = raw.Replace(AnswerTag, "").Replace(AnalysisTag, "").Trim();
              foreach (char letter in answer)
              {
                  ans.Add(letter.ToString());
              }
          }
          testInfo.answer = ans.ToList();

          // Explanation: strip its delimiting tags and clean the markup.
          testInfo.explain = testInfo.explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          testInfo.explain = HtmlHelper.DoUselessTag(testInfo.explain);
      }
      return testInfos;
  }
  /// <summary>
  /// Builds items for the "Subjective" key: cleans stem, answers and explanation and
  /// hashes the stem's text and images.
  /// </summary>
  /// <param name="TypeKey">Question type key passed through to ConvertTestInfo.</param>
  /// <param name="tests">Raw question fragments of this type.</param>
  /// <returns>The converted items.</returns>
  private List<ItemInfo> SubjectiveConvert(string TypeKey, List<string> tests)
  {
      List<ItemInfo> items = ConvertTestInfo(tests, TypeKey);
      foreach (ItemInfo item in items)
      {
          // Stem: remove the section tags, then clean the markup.
          string stem = item.question.Replace(AnalysisTag, "").Replace(SummaryTag, "").Replace(AnswerTag, "");
          item.question = HtmlHelper.DoUselessTag(stem);

          // A null result is hashed as the empty string, as a StringBuilder round-trip would give.
          string hashInput = HtmlHelper.DoTextImg(item.question) ?? string.Empty;
          item.shaCode = ShaHashHelper.GetSHA1(hashInput);

          // Answers are cleaned in place so the list instance stays the same.
          List<string> answers = item.answer;
          for (int idx = 0; idx < answers.Count; idx++)
          {
              string cleaned = answers[idx].Replace(AnswerTag, "").Replace(AnalysisTag, "");
              answers[idx] = HtmlHelper.DoUselessTag(cleaned);
          }

          // Explanation: remove its delimiting tags, then clean the markup.
          string explanation = item.explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          item.explain = HtmlHelper.DoUselessTag(explanation);
      }
      return items;
  }
  /// <summary>
  /// Builds compose items: the shared material becomes the parent question and the
  /// remaining HTML is parsed recursively into child items.
  /// </summary>
  /// <param name="TypeKey">Question type key stored on each parent item.</param>
  /// <param name="list">Raw compose fragments.</param>
  /// <param name="Lang">Forwarded to the recursive AnalyzeWordAsync call.</param>
  /// <returns>One parent item per fragment, with its children attached.</returns>
  private List<ItemInfo> ComposeConvert(string TypeKey, List<string> list, string Lang)
  {
      List<ItemInfo> exerciseDtos = new List<ItemInfo>();
      // Loop-invariant: matches the shared material between the compose tags.
      string materialPattern = ComposeStart + "([\\s\\S]*?)" + ComposeEnd;

      foreach (string html in list)
      {
          ItemInfo exercise = new ItemInfo() { type = TypeKey };
          Match mt = Regex.Match(html, materialPattern);
          exercise.question = HtmlHelper.DoUselessTag(mt.Value.Replace(ComposeStart, "").Replace(ComposeEnd, ""));

          // Remove the material and turn the nested "【综合题-" tags into ordinary question
          // tags so the sub-questions can be parsed like top-level ones.
          string testinfo = Regex.Replace(html, materialPattern, "").Replace(ComposeTag, CompleteStart);

          // Hash input: the material's text and images followed by each child's hash.
          StringBuilder testQs = new StringBuilder(HtmlHelper.DoTextImg(exercise.question));
          List<ItemInfo> dtos = AnalyzeWordAsync(testinfo, Lang);
          bool hasChildren = dtos.IsNotEmpty();
          if (hasChildren)
          {
              foreach (ItemInfo child in dtos)
              {
                  testQs.Append(child.shaCode);
              }
          }

          // Always assign the hash. Previously it was only set when at least one
          // sub-question parsed, leaving shaCode (and the pShaCode later copied from it)
          // unset for a compose item without children.
          exercise.shaCode = ShaHashHelper.GetSHA1(testQs.ToString());

          if (hasChildren)
          {
              // Children point at the parent hash, replacing the self-reference that
              // AnalyzeWordAsync assigned.
              foreach (ItemInfo child in dtos)
              {
                  child.pShaCode = exercise.shaCode;
              }
              exercise.children.AddRange(dtos);
          }
          exerciseDtos.Add(exercise);
      }
      return exerciseDtos;
  }
  /// <summary>
  /// Cuts each raw question fragment into stem, answer and explanation sections.
  /// The sections still contain their delimiting tags; callers strip them.
  /// </summary>
  /// <param name="tests">Raw question fragments of one type.</param>
  /// <param name="TypeKey">Key into TestType; "Complete" fragments have no separate answer section.</param>
  /// <returns>One item per fragment with type, question, answer (except "Complete") and explain set.</returns>
  public static List<ItemInfo> ConvertTestInfo(List<string> tests, string TypeKey)
  {
      const string lazyBody = "([\\s\\S]*?)";
      List<ItemInfo> parsed = new List<ItemInfo>();

      foreach (string html in tests)
      {
          // Opening tag of this question type, e.g. the part before '|' in TestType.
          string typeTag = TestType[TypeKey].Split("|")[0];
          bool isComplete = TypeKey.Equals("Complete");

          ItemInfo item = new ItemInfo();
          item.type = TypeKey;

          // Stem: from the type tag to the answer tag, or to the analysis tag for
          // "Complete", whose answers live inline in the stem.
          string stemEnd = isComplete ? AnalysisTag : AnswerTag;
          item.question = Regex.Match(html, typeTag + lazyBody + stemEnd).Value.Replace(typeTag, "");

          if (!isComplete)
          {
              string answer = Regex.Match(html, AnswerTag + lazyBody + AnalysisTag).Value;
              // Choice and judge answers are reduced to plain text (HTML tags removed).
              bool plainTextAnswer = TypeKey.Equals("Single") || TypeKey.Equals("Multiple") || TypeKey.Equals("Judge");
              if (plainTextAnswer)
              {
                  HtmlDocument doc = new HtmlDocument();
                  doc.LoadHtml(answer);
                  answer = doc.DocumentNode.InnerText;
              }
              item.answer = new List<string>() { answer };
          }

          // Explanation: from the analysis tag to the end tag.
          item.explain = Regex.Match(html, AnalysisTag + lazyBody + EndedTag).Value;

          parsed.Add(item);
      }
      return parsed;
  }
  /// <summary>
  /// Splits the HTML into raw question fragments grouped by question type.
  /// </summary>
  /// <param name="testHtml">HTML containing the tagged questions.</param>
  /// <returns>
  /// For every key in TestType, the fragments found between that type's start and end
  /// tags (tags included); the list is empty when nothing matched.
  /// </returns>
  public static Dictionary<string, List<string>> ConvertTest(string testHtml)
  {
      Dictionary<string, List<string>> grouped = new Dictionary<string, List<string>>();

      foreach (KeyValuePair<string, string> type in TestType)
      {
          string[] bounds = type.Value.Split("|");
          string openTag = bounds[0];
          string closeTag = bounds[1];

          List<string> fragments = new List<string>();
          foreach (Match hit in Regex.Matches(testHtml, openTag + "([\\s\\S]*?)" + closeTag))
          {
              // Re-wrap the captured body in its tags so later stages can match on them.
              fragments.Add(openTag + hit.Groups[1].Value + closeTag);
          }

          grouped.Add(type.Key, fragments);
      }

      return grouped;
  }
  334. }
  /// <summary>
  /// A pending text substitution: occurrences of oldstr are replaced by newstr.
  /// </summary>
  internal class ReplaceDto
  {
      /// <summary>Text to search for.</summary>
      public string oldstr { get; set; }

      /// <summary>Text that takes its place.</summary>
      public string newstr { get; set; }
  }
  340. }