HtmlAnalyzeService.cs 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. using HtmlAgilityPack;
  2. using System;
  3. using System.Collections.Generic;
  4. using System.Linq;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7. using System.Threading.Tasks;
  8. using TEAMModelOS.Model.Core.Dtos;
  9. using TEAMModelOS.Model.Evaluation.Dtos;
  10. using TEAMModelOS.Model.Evaluation.Dtos.Own;
  11. using TEAMModelOS.SDK.Helper.Common.CollectionHelper;
  12. using TEAMModelOS.SDK.Helper.Common.StringHelper;
  13. using TEAMModelOS.SDK.Helper.Security.ShaHash;
  14. using TEAMModelOS.Service.Core.Implements;
  15. using TEAMModelOS.Service.Evaluation.Interfaces;
  16. namespace TEAMModelOS.Service.Evaluation.Implements
  17. {
  18. class HtmlAnalyzeService: BaseService, IHtmlAnalyzeService
  19. {
  // Marker strings recognised in the converted HTML. None of them is ever
  // reassigned, so they are declared as constants.

  /// <summary>Marker "【题文】"; removed from parsed question stems.</summary>
  private const string SummaryTag = "【题文】";

  /// <summary>Marker "【答案】"; ends the question stem and starts the answer section.</summary>
  private const string AnswerTag = "【答案】";

  /// <summary>Marker "【解析】"; ends the answer section and starts the explanation section.</summary>
  private const string AnalysisTag = "【解析】";

  /// <summary>Marker "【结束】"; ends the explanation section and every non-composite question.</summary>
  private const string EndedTag = "【结束】";

  /// <summary>Option letters, in order, that choice-question parsing looks for.</summary>
  private const string Options = "ABCDEFGHIJ";

  /// <summary>Opening bracket of an inline fill-in-the-blank answer.</summary>
  private const string CompleteStart = "【";

  /// <summary>Closing bracket of an inline fill-in-the-blank answer.</summary>
  private const string CompleteEnd = "】";

  /// <summary>Marker "【综合题】"; opens a composite question and its shared material.</summary>
  private const string ComposeStart = "【综合题】";

  /// <summary>Marker "【综合题-题干】"; closes the shared material of a composite question.</summary>
  private const string ComposeEnd = "【综合题-题干】";

  /// <summary>Prefix "【综合题-" carried by sub-question type tags inside a composite question.</summary>
  private const string ComposeTag = "【综合题-";

  /// <summary>
  /// Maps each question-type key to its "startTag|endTag" pair. The two tags are
  /// separated by '|' and are concatenated into regular expressions as-is.
  /// </summary>
  private static readonly Dictionary<string, string> TestType = new Dictionary<string, string>
  {
      { "Single", "【单选题】|【结束】" },
      { "Multiple", "【多选题】|【结束】" },
      { "Judge", "【判断题】|【结束】" },
      { "Complete", "【填空题】|【结束】" },
      { "Subjective", "【问答题】|【结束】" },
      { "Compose", "【综合题】|【完结】" }
  };
  /// <summary>
  /// Parses tagged question HTML into a flat list of exercises.
  /// </summary>
  /// <remarks>
  /// Despite the name this method is synchronous; the name is kept because it is
  /// part of the public interface.
  /// </remarks>
  /// <param name="html">HTML containing tagged questions. Null or empty yields an empty list.</param>
  /// <param name="Lang">Language code; only forwarded to composite-question parsing.</param>
  /// <returns>
  /// The parsed exercises, grouped in the order of the question types in <c>TestType</c>.
  /// Each returned exercise has <c>PShaCode</c> set to its own <c>ShaCode</c>.
  /// </returns>
  public List<ExerciseDto> AnalyzeWordAsync(string html, string Lang)
  {
      List<ExerciseDto> tests = new List<ExerciseDto>();
      if (string.IsNullOrEmpty(html))
      {
          // Nothing to parse; previously a null input threw from Regex.Replace.
          return tests;
      }

      // Remove class attributes and span tags, turn tabs into spaces and drop dir="ltr".
      html = Regex.Replace(html, "class=\"([^\"]*)\"", "");
      html = Regex.Replace(html, "<span([^>]{0,})>", "");
      html = html.Replace("\t", " ").Replace("<span>", "").Replace("</span>", "").Replace("dir=\"ltr\"", "");

      foreach (KeyValuePair<string, List<string>> entry in ConvertTest(html))
      {
          List<ExerciseDto> exercises;
          switch (entry.Key)
          {
              case "Single":
                  exercises = SingleConvert(entry.Key, entry.Value);
                  break;
              case "Multiple":
                  exercises = MultipleConvert(entry.Key, entry.Value);
                  break;
              case "Judge":
                  exercises = JudgeConvert(entry.Key, entry.Value);
                  break;
              case "Complete":
                  exercises = CompleteConvert(entry.Key, entry.Value);
                  break;
              case "Subjective":
                  exercises = SubjectiveConvert(entry.Key, entry.Value);
                  break;
              case "Compose":
                  exercises = ComposeConvert(entry.Key, entry.Value, Lang);
                  break;
              default:
                  // Unknown type keys are ignored.
                  continue;
          }

          // A top-level exercise is its own parent as far as the hash is concerned.
          foreach (ExerciseDto exercise in exercises)
          {
              exercise.PShaCode = exercise.ShaCode;
          }
          tests.AddRange(exercises);
      }
      return tests;
  }
  /// <summary>
  /// Converts the fragments of the "Single" question type by delegating to
  /// <c>OptionProcess</c>.
  /// </summary>
  /// <param name="TypeKey">Question-type key passed through unchanged.</param>
  /// <param name="list">HTML fragments, one per question.</param>
  /// <returns>Whatever <c>OptionProcess</c> returns for the same arguments.</returns>
  private List<ExerciseDto> SingleConvert(string TypeKey, List<string> list)
  {
      return OptionProcess(TypeKey, list);
  }
  /// <summary>
  /// Converts the fragments of the "Multiple" question type by delegating to
  /// <c>OptionProcess</c>.
  /// </summary>
  /// <param name="TypeKey">Question-type key passed through unchanged.</param>
  /// <param name="list">HTML fragments, one per question.</param>
  /// <returns>Whatever <c>OptionProcess</c> returns for the same arguments.</returns>
  private List<ExerciseDto> MultipleConvert(string TypeKey, List<string> list)
  {
      return OptionProcess(TypeKey, list);
  }
  /// <summary>
  /// Converts the fragments of the "Judge" question type by delegating to
  /// <c>OptionProcess</c>.
  /// </summary>
  /// <param name="TypeKey">Question-type key passed through unchanged.</param>
  /// <param name="list">HTML fragments, one per question.</param>
  /// <returns>Whatever <c>OptionProcess</c> returns for the same arguments.</returns>
  private List<ExerciseDto> JudgeConvert(string TypeKey, List<string> list)
  {
      return OptionProcess(TypeKey, list);
  }
  /// <summary>
  /// Converts the fragments of the "Complete" question type by delegating to
  /// <c>CompleteProcess</c>.
  /// </summary>
  /// <param name="TypeKey">Question-type key passed through unchanged.</param>
  /// <param name="list">HTML fragments, one per question.</param>
  /// <returns>Whatever <c>CompleteProcess</c> returns for the same arguments.</returns>
  private List<ExerciseDto> CompleteConvert(string TypeKey, List<string> list)
  {
      return CompleteProcess(TypeKey, list);
  }
  /// <summary>
  /// Converts the fragments of the "Subjective" question type: strips the section
  /// markers from stem, answers and explanation, cleans each with
  /// <c>HtmlHelper.DoUselessTag</c>, and hashes the stem.
  /// </summary>
  /// <param name="TypeKey">Question-type key, forwarded to <c>ConvertTestInfo</c>.</param>
  /// <param name="tests">HTML fragments, one per question.</param>
  /// <returns>The exercises produced by <c>ConvertTestInfo</c>, updated in place.</returns>
  private List<ExerciseDto> SubjectiveConvert(string TypeKey, List<string> tests)
  {
      List<ExerciseDto> parsed = ConvertTestInfo(tests, TypeKey);
      foreach (ExerciseDto item in parsed)
      {
          // Stem: drop the section markers, then clean the markup.
          string stem = item.Question
              .Replace(AnalysisTag, "")
              .Replace(SummaryTag, "")
              .Replace(AnswerTag, "");
          item.Question = HtmlHelper.DoUselessTag(stem);

          // The hash is taken over DoTextImg's view of the stem; a null result is hashed as "".
          item.ShaCode = ShaHashHelper.GetSHA1(HtmlHelper.DoTextImg(item.Question) ?? string.Empty);

          // Answers: same treatment, entry by entry.
          List<string> answers = item.Answer;
          for (int idx = 0; idx < answers.Count; idx++)
          {
              string answer = answers[idx].Replace(AnswerTag, "").Replace(AnalysisTag, "");
              answers[idx] = HtmlHelper.DoUselessTag(answer);
          }

          // Explanation.
          string explanation = item.Explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          item.Explain = HtmlHelper.DoUselessTag(explanation);
      }
      return parsed;
  }
  /// <summary>
  /// Converts composite-question fragments. Each fragment consists of shared
  /// material followed by sub-questions, which are parsed recursively through
  /// <c>AnalyzeWordAsync</c>.
  /// </summary>
  /// <param name="TypeKey">Question-type key stored on every composite exercise.</param>
  /// <param name="list">HTML fragments, one per composite question.</param>
  /// <param name="Lang">Language code, forwarded to the recursive parse.</param>
  /// <returns>One exercise per fragment, with parsed sub-questions in <c>Children</c>.</returns>
  private List<ExerciseDto> ComposeConvert(string TypeKey, List<string> list, string Lang)
  {
      List<ExerciseDto> exerciseDtos = new List<ExerciseDto>();

      // The shared material sits between the composite start tag and the material end tag.
      string materialPattern = ComposeStart + "([\\s\\S]*?)" + ComposeEnd;

      foreach (string html in list)
      {
          ExerciseDto exercise = new ExerciseDto() { Type = TypeKey };

          Match material = Regex.Match(html, materialPattern);
          exercise.Question = HtmlHelper.DoUselessTag(
              material.Value.Replace(ComposeStart, "").Replace(ComposeEnd, ""));

          // Drop the material, then turn "【综合题-X】" into "【X】" so that the
          // sub-questions carry ordinary type tags for the recursive parse.
          string subQuestionsHtml = Regex.Replace(html, materialPattern, "")
              .Replace(ComposeTag, CompleteStart);

          // Hash input: the material's text/image content followed by every sub-question hash.
          StringBuilder hashInput = new StringBuilder(HtmlHelper.DoTextImg(exercise.Question));
          List<ExerciseDto> dtos = AnalyzeWordAsync(subQuestionsHtml, Lang);
          bool hasChildren = dtos.IsNotEmpty();
          if (hasChildren)
          {
              foreach (ExerciseDto dto in dtos)
              {
                  hashInput.Append(dto.ShaCode);
              }
          }

          // Always assign a hash. Previously a composite question whose sub-questions
          // could not be parsed was returned with a null ShaCode.
          exercise.ShaCode = ShaHashHelper.GetSHA1(hashInput.ToString());

          if (hasChildren)
          {
              foreach (ExerciseDto dto in dtos)
              {
                  dto.PShaCode = exercise.ShaCode;
              }
              // assumes ExerciseDto initialises Children to an empty list — TODO confirm
              exercise.Children.AddRange(dtos);
          }
          exerciseDtos.Add(exercise);
      }
      return exerciseDtos;
  }
  /// <summary>
  /// Converts choice-style fragments (used for the "Single", "Multiple" and "Judge"
  /// types): extracts the lettered options from the stem, cleans stem and
  /// explanation, hashes options plus stem, and splits the answer text into
  /// single-character answers.
  /// </summary>
  /// <param name="typeKey">Question-type key, forwarded to <c>ConvertTestInfo</c>.</param>
  /// <param name="list">HTML fragments, one per question.</param>
  /// <returns>The exercises produced by <c>ConvertTestInfo</c>, updated in place.</returns>
  private List<ExerciseDto> OptionProcess(string typeKey, List<string> list)
  {
      // Separator expected right after an option letter.
      // NOTE(review): the alternation lists "." and ":" twice; the duplicates may once
      // have been full-width variants lost in transcoding — confirm against the
      // authoritative source. Kept byte-for-byte here.
      const string Delimiter = "(\\.|\\.|\\、|\\:|\\:)";
      const string Body = "([\\s\\S]*?)";

      string[] optionsKeys = Options.Select(s => s.ToString()).ToArray();
      List<ExerciseDto> testInfos = ConvertTestInfo(list, typeKey);
      foreach (ExerciseDto testInfo in testInfos)
      {
          // Option region: from the first option letter up to and including the answer tag.
          string optsHtml = Regex.Match(testInfo.Question, optionsKeys[0] + Delimiter + Body + AnswerTag).Value;

          // Options
          StringBuilder textImg = new StringBuilder();
          for (int i = 0; i < optionsKeys.Length; i++)
          {
              // An option ends at the next option letter, or at the answer tag when it is
              // the last one. The very last key has no successor, so only the answer-tag
              // form applies to it (it used to be skipped altogether).
              Match optMatch = Match.Empty;
              if (i + 1 < optionsKeys.Length)
              {
                  optMatch = Regex.Match(optsHtml, optionsKeys[i] + Delimiter + Body + optionsKeys[i + 1] + Delimiter);
              }
              if (!optMatch.Success)
              {
                  optMatch = Regex.Match(optsHtml, optionsKeys[i] + Delimiter + Body + AnswerTag);
              }
              if (!optMatch.Success)
              {
                  continue;
              }

              // Group 2 is the option body, without the leading letter/separator and
              // without the trailing letter/separator or answer tag.
              string optHtml = HtmlHelper.DoUselessTag(optMatch.Groups[2].Value);
              textImg.Append(HtmlHelper.DoTextImg(optHtml));
              // assumes ExerciseDto initialises Option to an empty list — TODO confirm
              testInfo.Option.Add(new CodeValue { Code = optionsKeys[i], Value = optHtml });
          }

          // Stem: remove the option region (if any) and the section markers.
          string question = testInfo.Question;
          if (optsHtml.Length > 0)
          {
              // String.Replace throws ArgumentException for an empty search string,
              // which is what a question without lettered options used to trigger.
              question = question.Replace(optsHtml, "");
          }
          question = question.Replace(SummaryTag, "").Replace(AnswerTag, "");
          testInfo.Question = HtmlHelper.DoUselessTag(question);
          textImg.Append(HtmlHelper.DoTextImg(testInfo.Question));
          testInfo.ShaCode = ShaHashHelper.GetSHA1(textImg.ToString());

          // Answers: every non-whitespace character is one answer. Duplicates are
          // dropped and first-seen order is kept explicitly.
          List<string> ans = new List<string>();
          foreach (string rawAnswer in testInfo.Answer)
          {
              string answerText = rawAnswer.Replace(AnswerTag, "").Replace(AnalysisTag, "").Trim();
              foreach (char c in answerText)
              {
                  if (char.IsWhiteSpace(c))
                  {
                      continue;
                  }
                  string code = c.ToString();
                  if (!ans.Contains(code))
                  {
                      ans.Add(code);
                  }
              }
          }
          testInfo.Answer = ans;

          // Explanation
          testInfo.Explain = testInfo.Explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          testInfo.Explain = HtmlHelper.DoUselessTag(testInfo.Explain);
      }
      return testInfos;
  }
  /// <summary>
  /// Splits each question fragment into stem, answer and explanation sections using
  /// the section markers, without cleaning the markup any further.
  /// </summary>
  /// <remarks>
  /// For the "Complete" type there is no separate answer section: the stem runs up
  /// to the explanation tag and <c>Answer</c> is left as <c>ExerciseDto</c> creates it.
  /// For "Single", "Multiple" and "Judge" the answer is reduced to plain text with
  /// HTML entities decoded; other types keep the raw answer HTML.
  /// </remarks>
  /// <param name="tests">HTML fragments, one per question. Null or empty yields an empty list.</param>
  /// <param name="TypeKey">Question-type key; must be a key of <c>TestType</c> when there are fragments.</param>
  /// <returns>One exercise per fragment, in input order.</returns>
  public static List<ExerciseDto> ConvertTestInfo(List<string> tests, string TypeKey)
  {
      List<ExerciseDto> testInfos = new List<ExerciseDto>();
      if (tests == null || tests.Count == 0)
      {
          return testInfos;
      }

      const string Body = "([\\s\\S]*?)";

      // Everything below depends only on the type, so it is computed once.
      string typeTag = TestType[TypeKey].Split('|')[0];
      bool isComplete = TypeKey.Equals("Complete");
      bool plainTextAnswer = TypeKey.Equals("Single") || TypeKey.Equals("Multiple") || TypeKey.Equals("Judge");
      string summaryPattern = typeTag + Body + (isComplete ? AnalysisTag : AnswerTag);
      string answerPattern = AnswerTag + Body + AnalysisTag;
      string analysisPattern = AnalysisTag + Body + EndedTag;

      foreach (string html in tests)
      {
          ExerciseDto test = new ExerciseDto();
          test.Type = TypeKey;

          // Stem: the closing marker is kept here; callers strip it.
          test.Question = Regex.Match(html, summaryPattern).Value.Replace(typeTag, "");

          if (!isComplete)
          {
              string answer = Regex.Match(html, answerPattern).Value;
              if (plainTextAnswer)
              {
                  // Choice/judge answers are compared as plain characters: strip the tags
                  // and decode entities so that e.g. "&nbsp;" does not end up as literal
                  // answer characters.
                  HtmlDocument doc = new HtmlDocument();
                  doc.LoadHtml(answer);
                  answer = System.Net.WebUtility.HtmlDecode(doc.DocumentNode.InnerText);
              }
              test.Answer = new List<string>() { answer };
          }

          test.Explain = Regex.Match(html, analysisPattern).Value;
          testInfos.Add(test);
      }
      return testInfos;
  }
  /// <summary>
  /// Converts fill-in-the-blank fragments: every bracketed span in the stem is
  /// recorded as an answer and replaced by a numbered underline placeholder.
  /// </summary>
  /// <param name="TypeKey">Question-type key, forwarded to <c>ConvertTestInfo</c>.</param>
  /// <param name="tests">HTML fragments, one per question.</param>
  /// <returns>The exercises produced by <c>ConvertTestInfo</c>, updated in place.</returns>
  private List<ExerciseDto> CompleteProcess(string TypeKey, List<string> tests)
  {
      List<ExerciseDto> testInfos = ConvertTestInfo(tests, TypeKey);
      HtmlDocument doc = new HtmlDocument();
      string blankPattern = CompleteStart + "([\\s\\S]*?)" + CompleteEnd;

      foreach (ExerciseDto testInfo in testInfos)
      {
          List<string> ans = new List<string>();
          string stem = testInfo.Question.Replace(AnalysisTag, "").Replace(SummaryTag, "").Replace(AnswerTag, "");

          // Replace the blanks positionally in a single pass. The previous value-based
          // String.Replace gave identical blanks the same index, and cleaning the markup
          // between replacements could stop later blanks from matching.
          int index = 0;
          stem = Regex.Replace(stem, blankPattern, m =>
          {
              index++;
              string an = m.Groups[1].Value;
              ans.Add(an);

              // Placeholder width: three "&nbsp;" per byte of the answer's plain text.
              doc.LoadHtml(an);
              int length = Encoding.Default.GetBytes(doc.DocumentNode.InnerText).Length;
              StringBuilder nbsp = new StringBuilder();
              for (int i = 0; i < length * 3; i++)
              {
                  nbsp.Append("&nbsp;");
              }
              return "<underline data=\"" + index + "\"><u>" + nbsp + "</u></underline>";
          });

          // Clean once, also when the stem contains no blank at all.
          // NOTE(review): assumes DoUselessTag is idempotent, since it used to run once per blank.
          testInfo.Question = HtmlHelper.DoUselessTag(stem);

          // NOTE(review): as before, the hash is taken over the stem including the
          // underline placeholders. The original comments suggest they were meant to be
          // excluded; changing that would alter existing hashes, so confirm first.
          testInfo.ShaCode = ShaHashHelper.GetSHA1(HtmlHelper.DoTextImg(testInfo.Question));

          // Explanation
          testInfo.Explain = testInfo.Explain.Replace(AnalysisTag, "").Replace(EndedTag, "");
          testInfo.Explain = HtmlHelper.DoUselessTag(testInfo.Explain);

          // ConvertTestInfo does not assign Answer for this type.
          if (testInfo.Answer == null)
          {
              testInfo.Answer = new List<string>();
          }
          testInfo.Answer.AddRange(ans);
      }
      return testInfos;
  }
  /// <summary>
  /// Splits the HTML into question fragments, grouped by question type.
  /// </summary>
  /// <param name="testHtml">HTML containing tagged questions; null is treated as empty.</param>
  /// <returns>
  /// A dictionary with one entry per key of <c>TestType</c>. Each value lists, in
  /// document order, the fragments found for that type, including their start and
  /// end tags; the list is empty when none was found.
  /// </returns>
  public static Dictionary<string, List<string>> ConvertTest(string testHtml)
  {
      Dictionary<string, List<string>> TestInType = new Dictionary<string, List<string>>();
      string source = testHtml ?? string.Empty;

      foreach (KeyValuePair<string, string> type in TestType)
      {
          // "startTag|endTag": lazily match everything between the two tags.
          string[] tags = type.Value.Split('|');
          string pattern = tags[0] + "([\\s\\S]*?)" + tags[1];

          List<string> tests = new List<string>();
          foreach (Match m in Regex.Matches(source, pattern))
          {
              tests.Add(tags[0] + m.Groups[1].Value + tags[1]);
          }
          TestInType.Add(type.Key, tests);
      }
      return TestInType;
  }
  327. }
  328. }