// HtmlAnalyzeService.cs (18 KB)
  1. using DocumentFormat.OpenXml.Packaging;
  2. using HtmlAgilityPack;
  3. using Microsoft.AspNetCore.Http;
  4. using OpenXmlPowerTools;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.Drawing.Imaging;
  8. using System.IO;
  9. using System.Linq;
  10. using System.Text;
  11. using System.Text.RegularExpressions;
  12. using System.Threading.Tasks;
  13. using System.Xml.Linq;
  14. using TEAMModelOS.Model.Evaluation.Dtos.Own;
  15. using TEAMModelOS.SDK.Context.Configuration;
  16. using TEAMModelOS.SDK.Context.Constant;
  17. using TEAMModelOS.SDK.Extension.SnowFlake;
  18. using TEAMModelOS.SDK.Helper.Common.CollectionHelper;
  19. using TEAMModelOS.SDK.Helper.Common.FileHelper;
  20. using TEAMModelOS.SDK.Helper.Common.StringHelper;
  21. using TEAMModelOS.SDK.Helper.Security.ShaHash;
  22. using TEAMModelOS.SDK.Module.AzureBlob.Container;
  23. using TEAMModelOS.SDK.Module.AzureBlob.Interfaces;
  24. using TEAMModelOS.SDK.Module.AzureTable.Interfaces;
  25. using TEAMModelOS.Service.Models.Core;
  26. using TEAMModelOS.Service.Models.Evaluation.Dtos.Own;
  27. using TEAMModelOS.Service.Services.Evaluation.Interfaces;
  28. namespace TEAMModelOS.Service.Services.Evaluation.Implements
  29. {
  30. public class HtmlAnalyzeService : IHtmlAnalyzeService
  31. {
  // Section markers looked for inside the HTML of an imported question.
  // They are never reassigned, so they are declared as constants.
  private const string SummaryTag = "【题文】";
  private const string AnswerTag = "【答案】";
  private const string AnalysisTag = "【解析】";
  private const string EndedTag = "【结束】";

  // Candidate option letters, probed in this order by OptionProcess.
  private const string Options = "ABCDEFGHIJ";

  // Delimiters that wrap an answer embedded in a fill-in-the-blank question.
  private const string CompleteStart = "【";
  private const string CompleteEnd = "】";

  // Markers used by ComposeConvert: the material runs from ComposeStart to ComposeEnd,
  // and every remaining ComposeTag prefix is rewritten to CompleteStart before the
  // nested questions are parsed.
  private const string ComposeStart = "【综合题】";
  private const string ComposeEnd = "【综合题-题干】";
  private const string ComposeTag = "【综合题-";

  // Question type key -> "openingTag|closingTag". The value is split on '|' wherever it is used.
  // Note that "Compose" closes with a different tag than the other types.
  private static readonly Dictionary<string, string> TestType = new Dictionary<string, string>
  {
      { "Single", "【单选题】|【结束】" },
      { "Multiple", "【多选题】|【结束】" },
      { "Judge", "【判断题】|【结束】" },
      { "Complete", "【填空题】|【结束】" },
      { "Subjective", "【问答题】|【结束】" },
      { "Compose", "【综合题】|【完结】" }
  };
  /// <summary>
  /// Parses the HTML of a converted document into exercises.
  /// Despite the name, this method runs synchronously.
  /// </summary>
  /// <param name="html">HTML to analyse. A null value yields an empty result.</param>
  /// <param name="Lang">Language code; only forwarded to <c>ComposeConvert</c>.</param>
  /// <returns>
  /// All exercises found, grouped in the order of the type table. Each returned exercise
  /// has <c>PShaCode</c> set to its own <c>ShaCode</c>.
  /// </returns>
  public List<ExerciseDto> AnalyzeWordAsync(string html, string Lang)
  {
      List<ExerciseDto> tests = new List<ExerciseDto>();
      if (html == null)
      {
          // Regex.Replace would throw ArgumentNullException on null input; there is nothing to parse.
          return tests;
      }

      // Strip class attributes and span tags so the tag markers are not split by styling markup.
      string classpattern = "class=\"([^\"]*)\"";
      html = Regex.Replace(html, classpattern, "");
      string pattern = "<span([^>]{0,})>";
      html = Regex.Replace(html, pattern, "");
      html = html.Replace("\t", " ").Replace("<span>", "").Replace("</span>", "").Replace("dir=\"ltr\"", "");

      foreach (KeyValuePair<string, List<string>> group in ConvertTest(html))
      {
          List<ExerciseDto> converted;
          switch (group.Key)
          {
              case "Single":
                  converted = SingleConvert(group.Key, group.Value);
                  break;
              case "Multiple":
                  converted = MultipleConvert(group.Key, group.Value);
                  break;
              case "Judge":
                  converted = JudgeConvert(group.Key, group.Value);
                  break;
              case "Complete":
                  converted = CompleteConvert(group.Key, group.Value);
                  break;
              case "Subjective":
                  converted = SubjectiveConvert(group.Key, group.Value);
                  break;
              case "Compose":
                  converted = ComposeConvert(group.Key, group.Value, Lang);
                  break;
              default:
                  // Unknown type keys are ignored.
                  continue;
          }

          // A top-level exercise is its own parent.
          foreach (ExerciseDto exercise in converted)
          {
              exercise.PShaCode = exercise.ShaCode;
          }
          tests.AddRange(converted);
      }
      return tests;
  }
  /// <summary>Converts the fragments of the "Single" type by delegating to <c>OptionProcess</c>.</summary>
  private List<ExerciseDto> SingleConvert(string TypeKey, List<string> list) => OptionProcess(TypeKey, list);
  /// <summary>Converts the fragments of the "Multiple" type by delegating to <c>OptionProcess</c>.</summary>
  private List<ExerciseDto> MultipleConvert(string TypeKey, List<string> list) => OptionProcess(TypeKey, list);
  /// <summary>Converts the fragments of the "Judge" type by delegating to <c>OptionProcess</c>.</summary>
  private List<ExerciseDto> JudgeConvert(string TypeKey, List<string> list) => OptionProcess(TypeKey, list);
  /// <summary>Converts the fragments of the "Complete" type by delegating to <c>CompleteProcess</c>.</summary>
  private List<ExerciseDto> CompleteConvert(string TypeKey, List<string> list) => CompleteProcess(TypeKey, list);
  /// <summary>
  /// Builds fill-in-the-blank exercises: every <c>【answer】</c> span inside the question
  /// is replaced by a numbered underline placeholder and the captured answers are appended
  /// to the exercise's answer list.
  /// </summary>
  /// <param name="TypeKey">Question type key passed on to <c>ConvertTestInfo</c>.</param>
  /// <param name="tests">Raw HTML fragments, one per question.</param>
  /// <returns>The exercises produced by <c>ConvertTestInfo</c>, updated in place.</returns>
  private List<ExerciseDto> CompleteProcess(string TypeKey, List<string> tests)
  {
      List<ExerciseDto> testInfos = ConvertTestInfo(tests, TypeKey);
      HtmlDocument doc = new HtmlDocument();
      string blankPattern = CompleteStart + "([\\s\\S]*?)" + CompleteEnd;

      foreach (ExerciseDto testInfo in testInfos)
      {
          List<string> ans = new List<string>();
          testInfo.Question = testInfo.Question.Replace(AnalysisTag, "").Replace(SummaryTag, "").Replace(AnswerTag, "");

          // Mask every blank in a single pass. Replacing match by match (instead of
          // String.Replace on the answer text) gives each blank its own index even when
          // two answers are identical, and the clean-up below can no longer rewrite the
          // text of a later blank before it has been substituted.
          int index = 0;
          string masked = Regex.Replace(testInfo.Question, blankPattern, m =>
          {
              index++;
              string an = m.Groups[1].Value;

              // The placeholder width is derived from the tag-free answer text:
              // three &nbsp; per encoded byte.
              doc.LoadHtml(an);
              string anstr = doc.DocumentNode.InnerText;
              int length = System.Text.Encoding.Default.GetBytes(anstr).Length;
              StringBuilder nbsp = new StringBuilder();
              for (int i = 0; i < length * 3; i++)
              {
                  nbsp.Append("&nbsp;");
              }

              ans.Add(an);
              return "<underline data=\"" + index + "\"><u>" + nbsp + "</u></underline>";
          });

          // As before, the question is only cleaned when at least one blank was found.
          if (index > 0)
          {
              testInfo.Question = HtmlHelper.DoUselessTag(masked);
          }

          // The hash input is the final question markup, placeholders included.
          // NOTE(review): the original comment suggests the placeholders were meant to be
          // excluded from the hash; the input is kept unchanged so previously computed
          // ShaCode values stay comparable - confirm before altering.
          string textImg = HtmlHelper.DoTextImg(testInfo.Question);
          testInfo.ShaCode = ShaHashHelper.GetSHA1(textImg);

          // Explanation: drop the section markers, then clean the markup.
          testInfo.Explain = testInfo.Explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          testInfo.Explain = HtmlHelper.DoUselessTag(testInfo.Explain);
          testInfo.Answer.AddRange(ans);
      }
      return testInfos;
  }
  /// <summary>
  /// Builds option-based exercises (used for the "Single", "Multiple" and "Judge" types):
  /// extracts the lettered options from the question, computes the hash over the options
  /// followed by the stem, and splits the answer text into individual characters.
  /// </summary>
  /// <param name="typeKey">Question type key passed on to <c>ConvertTestInfo</c>.</param>
  /// <param name="list">Raw HTML fragments, one per question.</param>
  /// <returns>The exercises produced by <c>ConvertTestInfo</c>, updated in place.</returns>
  private List<ExerciseDto> OptionProcess(string typeKey, List<string> list)
  {
      // Punctuation accepted between an option letter and its text.
      // NOTE(review): the pattern lists '.' and ':' twice; possibly full-width variants
      // were intended - verify against the original file encoding.
      const string separator = "(\\.|\\.|\\、|\\:|\\:)";
      const string lazyBody = "([\\s\\S]*?)";

      string[] optionsKeys = Options.Select(s => s.ToString()).ToArray();
      List<ExerciseDto> testInfos = ConvertTestInfo(list, typeKey);

      foreach (ExerciseDto testInfo in testInfos)
      {
          // Everything from the first option letter up to and including the answer tag.
          string optsHtml = Regex.Match(testInfo.Question, optionsKeys[0] + separator + lazyBody + AnswerTag).Value;

          StringBuilder textImg = new StringBuilder();
          for (int i = 0; i < optionsKeys.Length; i++)
          {
              string optHtml = string.Empty;
              int trailing = 0;

              // First try "<letter><sep> ... <next letter><sep>"; the last letter has no successor.
              if (i + 1 < optionsKeys.Length)
              {
                  optHtml = Regex.Match(optsHtml, optionsKeys[i] + separator + lazyBody + optionsKeys[i + 1] + separator).Value;
                  trailing = 2; // next letter + separator
              }

              // Otherwise this may be the final option, which runs up to the answer tag.
              if (string.IsNullOrEmpty(optHtml))
              {
                  optHtml = Regex.Match(optsHtml, optionsKeys[i] + separator + lazyBody + AnswerTag).Value;
                  trailing = AnswerTag.Length;
              }

              if (string.IsNullOrEmpty(optHtml))
              {
                  continue;
              }

              // Drop the leading "<letter><sep>" and the trailing delimiter.
              optHtml = optHtml.Substring(2, optHtml.Length - 2 - trailing);
              optHtml = HtmlHelper.DoUselessTag(optHtml);
              textImg.Append(HtmlHelper.DoTextImg(optHtml));
              testInfo.Option.Add(new CodeValue { Code = optionsKeys[i], Value = optHtml });
          }

          // Stem: remove the option block and the section markers.
          // String.Replace throws ArgumentException for an empty search string, so the
          // option block is only removed when one was actually found.
          string stem = testInfo.Question;
          if (optsHtml.Length > 0)
          {
              stem = stem.Replace(optsHtml, "");
          }
          testInfo.Question = HtmlHelper.DoUselessTag(stem.Replace(SummaryTag, "").Replace(AnswerTag, ""));
          textImg.Append(HtmlHelper.DoTextImg(testInfo.Question));
          testInfo.ShaCode = ShaHashHelper.GetSHA1(textImg.ToString());

          // Answers: every distinct non-whitespace character of the answer text is one answer.
          HashSet<string> ans = new HashSet<string>();
          foreach (string raw in testInfo.Answer)
          {
              string cleaned = raw.Replace(AnswerTag, "").Replace(AnalysisTag, "").Trim();
              foreach (char c in cleaned)
              {
                  if (!char.IsWhiteSpace(c))
                  {
                      ans.Add(c.ToString());
                  }
              }
          }
          testInfo.Answer = ans.ToList();

          // Explanation: drop the section markers, then clean the markup.
          testInfo.Explain = testInfo.Explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          testInfo.Explain = HtmlHelper.DoUselessTag(testInfo.Explain);
      }
      return testInfos;
  }
  /// <summary>
  /// Builds exercises of the "Subjective" type: strips the section markers from the question,
  /// answers and explanation, cleans their markup and hashes the question.
  /// </summary>
  /// <param name="TypeKey">Question type key passed on to <c>ConvertTestInfo</c>.</param>
  /// <param name="tests">Raw HTML fragments, one per question.</param>
  /// <returns>The exercises produced by <c>ConvertTestInfo</c>, updated in place.</returns>
  private List<ExerciseDto> SubjectiveConvert(string TypeKey, List<string> tests)
  {
      List<ExerciseDto> parsed = ConvertTestInfo(tests, TypeKey);
      foreach (ExerciseDto item in parsed)
      {
          // Question text and its hash.
          string stem = item.Question.Replace(AnalysisTag, "").Replace(SummaryTag, "").Replace(AnswerTag, "");
          item.Question = HtmlHelper.DoUselessTag(stem);
          item.ShaCode = ShaHashHelper.GetSHA1(HtmlHelper.DoTextImg(item.Question) ?? string.Empty);

          // Answers are cleaned in place.
          for (int n = 0; n < item.Answer.Count; n++)
          {
              string answer = item.Answer[n].Replace(AnswerTag, "").Replace(AnalysisTag, "");
              item.Answer[n] = HtmlHelper.DoUselessTag(answer);
          }

          // Explanation.
          string explain = item.Explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          item.Explain = HtmlHelper.DoUselessTag(explain);
      }
      return parsed;
  }
  /// <summary>
  /// Builds exercises of the "Compose" type: the text between the compose start and stem
  /// markers becomes the parent question, and the remainder is parsed recursively into
  /// child exercises.
  /// </summary>
  /// <param name="TypeKey">Question type key stored on each parent exercise.</param>
  /// <param name="list">Raw HTML fragments, one per composite question.</param>
  /// <param name="Lang">Language code forwarded to the recursive <c>AnalyzeWordAsync</c> call.</param>
  /// <returns>One parent exercise per fragment; the hash is only set when children were found.</returns>
  private List<ExerciseDto> ComposeConvert(string TypeKey, List<string> list, string Lang)
  {
      var results = new List<ExerciseDto>();
      string materialPattern = ComposeStart + "([\\s\\S]*?)" + ComposeEnd;

      foreach (string fragment in list)
      {
          var parent = new ExerciseDto() { Type = TypeKey };

          // Shared material of the composite question.
          string material = Regex.Match(fragment, materialPattern).Value;
          material = material.Replace(ComposeStart, "").Replace(ComposeEnd, "");
          parent.Question = HtmlHelper.DoUselessTag(material);

          // Remove the material and turn the "compose-" prefixed tags into plain tags
          // so the nested questions can be parsed like top-level ones.
          string childHtml = Regex.Replace(fragment, materialPattern, "").Replace(ComposeTag, CompleteStart);

          // The parent hash covers the material plus the hash of every child.
          string hashSeed = HtmlHelper.DoTextImg(parent.Question);
          List<ExerciseDto> children = AnalyzeWordAsync(childHtml, Lang);
          if (children.IsNotEmpty())
          {
              foreach (ExerciseDto child in children)
              {
                  hashSeed = hashSeed + child.ShaCode;
              }
              parent.ShaCode = ShaHashHelper.GetSHA1(hashSeed);

              foreach (ExerciseDto child in children)
              {
                  child.PShaCode = parent.ShaCode;
              }
              parent.Children.AddRange(children);
          }
          results.Add(parent);
      }
      return results;
  }
  /// <summary>
  /// Splits each raw question fragment into question, answer and explanation parts.
  /// </summary>
  /// <param name="tests">Raw HTML fragments, one per question.</param>
  /// <param name="TypeKey">Key into the type table; selects the opening tag and the layout.</param>
  /// <returns>
  /// One exercise per fragment. The extracted parts still contain their delimiting tags,
  /// except that the opening type tag is removed from the question. For the "Complete"
  /// type no answer is extracted here.
  /// </returns>
  public static List<ExerciseDto> ConvertTestInfo(List<string> tests, string TypeKey)
  {
      const string lazyBody = "([\\s\\S]*?)";
      var parsed = new List<ExerciseDto>();

      foreach (string html in tests)
      {
          string typeTag = TestType[TypeKey].Split("|")[0];
          bool isComplete = TypeKey.Equals("Complete");
          var test = new ExerciseDto();
          test.Type = TypeKey;

          // Question: from the type tag to the answer tag, or to the analysis tag for
          // "Complete" questions, whose answers are embedded in the question text.
          string summaryEnd = isComplete ? AnalysisTag : AnswerTag;
          Match summary = Regex.Match(html, typeTag + lazyBody + summaryEnd);
          test.Question = summary.Value.Replace(typeTag, "");

          if (!isComplete)
          {
              // Answer: from the answer tag to the analysis tag.
              Match answerMatch = Regex.Match(html, AnswerTag + lazyBody + AnalysisTag);
              string answer = answerMatch.Value;

              // For single choice, multiple choice and judge questions the HTML tags are stripped from the answer.
              if (TypeKey.Equals("Single") || TypeKey.Equals("Multiple") || TypeKey.Equals("Judge"))
              {
                  var doc = new HtmlDocument();
                  doc.LoadHtml(answerMatch.Value);
                  answer = doc.DocumentNode.InnerText;
              }
              test.Answer = new List<string>() { answer };
          }

          // Explanation: from the analysis tag to the end tag.
          test.Explain = Regex.Match(html, AnalysisTag + lazyBody + EndedTag).Value;

          parsed.Add(test);
      }
      return parsed;
  }
  /// <summary>
  /// Groups the question fragments found in the HTML by question type.
  /// </summary>
  /// <param name="testHtml">HTML to scan.</param>
  /// <returns>
  /// A dictionary with one entry per type key of the type table; each value lists the
  /// matched fragments, including their opening and closing tags (empty when none matched).
  /// </returns>
  public static Dictionary<string, List<string>> ConvertTest(string testHtml)
  {
      var grouped = new Dictionary<string, List<string>>();
      foreach (KeyValuePair<string, string> entry in TestType)
      {
          string[] bounds = entry.Value.Split("|");
          string opening = bounds[0];
          string closing = bounds[1];

          var fragments = new List<string>();
          foreach (Match hit in Regex.Matches(testHtml, opening + "([\\s\\S]*?)" + closing))
          {
              fragments.Add(opening + hit.Groups[1].ToString() + closing);
          }
          grouped.Add(entry.Key, fragments);
      }
      return grouped;
  }
  343. }
  /// <summary>
  /// Pairs a text fragment to search for with the text that should replace it.
  /// </summary>
  internal class ReplaceDto
  {
      /// <summary>Text to be replaced.</summary>
      public string oldstr { get; set; }

      /// <summary>Replacement text.</summary>
      public string newstr { get; set; }
  }
  349. }