// HTML2ITEMV2Translator.cs
  1. using HTEXLib.COMM.Helpers;
  2. using HTEXLib.DOCX.Models;
  3. using HTEXLib.Helpers.ShapeHelpers;
  4. using HtmlAgilityPack;
  5. using System;
  6. using System.Collections.Generic;
  7. using System.IO;
  8. using System.Linq;
  9. using System.Text;
  10. using System.Text.Json;
  11. using System.Text.RegularExpressions;
  12. using System.Threading.Tasks;
  13. namespace HTEXLib.Translator
  14. {
  15. public class HTML2ITEMV2Translator
  16. {
  // Canonical tag kinds. Translate stores one of these as the first element of every
  // value in dict, and ConvertTest / DoOpenTag switch on them.
  public const string Answer = "Answer";
  public const string Analysis = "Analysis";
  public const string Ended = "Ended";
  public const string Point = "Point";
  public const string Score = "Score";
  public const string Summary = "Summary";
  public const string Filed = "Filed";
  public const string Level = "Level";

  /// <summary>Language configuration that Translate selected for the current document.</summary>
  public LangConfig langConfig { get; set; }

  /// <summary>Scratch HTML document; Translate and BlankTag both load markup into it.</summary>
  public HtmlDocument doc { get; set; } = new HtmlDocument();

  /// <summary>Option labels, built by Translate from <c>langConfig.Item.Options</c> (one string per element).</summary>
  public string[] optionsKeys { get; set; }

  /// <summary>
  /// Lookup from localized tag text to its description: <c>[kind]</c> for fixed tags,
  /// <c>[Summary, typeKey]</c> for question types and <c>[Filed, index]</c> for field tags.
  /// </summary>
  public Dictionary<string, string[]> dict { get; set; }

  /// <summary>Field tag names, obtained by splitting <c>langConfig.Item.Filed</c> on '|'.</summary>
  public string[] Fileds { get; set; }

  /// <summary>All language configurations read by the constructor from LangConfig.json.</summary>
  public List<LangConfig>? _langConfigs { get; set; } = new List<LangConfig>();
  /// <summary>
  /// Creates a translator and loads every language configuration from
  /// <c>LangConfig.json</c> located under <paramref name="configPath"/>.
  /// </summary>
  /// <param name="configPath">Directory that contains <c>LangConfig.json</c>.</param>
  public HTML2ITEMV2Translator(string configPath)
  {
      StringBuilder builder = new StringBuilder();

      // The using statements release the file handle even when reading throws.
      using (FileStream fs = new FileStream(configPath + "/LangConfig.json", System.IO.FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
      using (StreamReader sr = new StreamReader(fs, System.Text.Encoding.UTF8))
      {
          string line;
          // Lines are concatenated without their line breaks.
          while ((line = sr.ReadLine()) != null)
          {
              builder.Append(line);
          }
      }

      // Deserialize yields null for a JSON "null" document; keep the list non-null
      // so later lookups see an empty configuration set instead of failing.
      _langConfigs = JsonSerializer.Deserialize<List<LangConfig>>(builder.ToString()) ?? new List<LangConfig>();
  }
  /// <summary>
  /// Removes whitespace that ended up inside tags and question-type tags, so that each
  /// recognized tag appears in the HTML in its whitespace-free form.
  /// </summary>
  /// <param name="html">HTML to normalize.</param>
  /// <returns>The HTML with whitespace removed from inside recognized tags.</returns>
  public string BlankProcess(string html)
  {
      string start = langConfig.Item.Start;
      string end = langConfig.Item.End;

      // Pattern for the given literal text with optional whitespace between any two
      // characters. Every character is escaped so that regex metacharacters in a
      // configured name are matched literally.
      string Spaced(string literal) =>
          string.Join("\\s*", literal.Select(c => Regex.Escape(c.ToString())));

      // Rewrites each whitespace-tolerant occurrence of the tag to the tag itself.
      // An evaluator is used so a '$' in the tag is not read as a substitution.
      string Canonicalize(string input, string tag) =>
          Regex.Replace(input, Spaced(tag), _ => tag);

      // Field tags are handled first, then the fixed tags in this order.
      foreach (var filed in Fileds)
      {
          html = Canonicalize(html, start + filed + end);
      }

      string[] fixedNames =
      {
          langConfig.Item.Answer,
          langConfig.Item.Analysis,
          langConfig.Item.Ended,
          langConfig.Item.Point,
          langConfig.Item.Score,
          langConfig.Item.Level
      };
      foreach (string name in fixedNames)
      {
          html = Canonicalize(html, start + name + end);
      }

      // Question-type tags carry a number in front of the type name; strip every
      // whitespace character inside each match.
      foreach (string value in langConfig.Item.Type.Values)
      {
          string tag = Regex.Escape(start) + "\\s*\\d+\\s*" + Spaced(value) + "\\s*" + Regex.Escape(end);
          html = Regex.Replace(html, tag, m => Regex.Replace(m.Value, "\\s+", ""));
      }

      return html;
  }
  /// <summary>
  /// Parses tagged HTML into a list of items. The language configuration is taken from
  /// the first <c>{lang}</c> tag in the document that names a known language, falling
  /// back to <paramref name="lang"/>.
  /// </summary>
  /// <param name="html">HTML of the document to parse.</param>
  /// <param name="lang">Fallback language; only consulted when it is a JSON string.</param>
  /// <returns>The items produced by <c>ConvertTest</c> for the whole document.</returns>
  /// <exception cref="NotSupportedException">No known language configuration applies.</exception>
  public List<DOCX.Models.ItemInfo> Translate(string html, JsonElement lang)
  {
      string mathjax = "<script type=\"text/javascript\" src=\"http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML\"></script>";
      html = html.Replace(mathjax, "");
      // Remove class attributes and span tags.
      string classpattern = "class=\"([^\"]*)\"";
      html = Regex.Replace(html, classpattern, "");
      string pattern = "<span([^>]{0,})>";
      html = Regex.Replace(html, pattern, "");
      html = html.Replace(" close=\"\" separators=\" | \">", "");
      html = html.Replace("\t", " ").Replace("<span>", "").Replace("</span>", "").Replace("dir=\"ltr\"", "");
      doc.LoadHtml(html);

      // Pick the language configuration: captured tag texts sit at the odd positions.
      langConfig = null;
      var configArray = Regex.Split(doc.DocumentNode.InnerText, "{([\\S]*?)}");
      for (int i = 1; i < configArray.Length && langConfig == null; i += 2)
      {
          string candidate = configArray[i];
          langConfig = _langConfigs?.FirstOrDefault(x => string.Equals(x.Lang, candidate, StringComparison.CurrentCultureIgnoreCase));
      }
      if (langConfig == null && lang.ValueKind == JsonValueKind.String)
      {
          string requested = lang.GetString();
          langConfig = _langConfigs?.FirstOrDefault(x => x.Lang == requested);
      }
      if (langConfig == null)
      {
          throw new NotSupportedException("No language configuration matches a language tag in the document or the requested language.");
      }

      // Build the tag lookup.
      Fileds = langConfig.Item.Filed.Split('|');
      dict = new Dictionary<string, string[]>
      {
          { langConfig.Item.Answer, new string[] { Answer } },
          { langConfig.Item.Analysis, new string[] { Analysis } },
          { langConfig.Item.Ended, new string[] { Ended } },
          { langConfig.Item.Point, new string[] { Point } },
          { langConfig.Item.Score, new string[] { Score } },
          { langConfig.Item.Level, new string[] { Level } }
      };
      foreach (string key in langConfig.Item.Type.Keys)
      {
          dict.Add(langConfig.Item.Type[key], new string[] { Summary, key });
      }
      for (int i = 0; i < Fileds.Length; i++)
      {
          dict.Add(Fileds[i], new string[] { Filed, $"{i}" });
      }
      optionsKeys = langConfig.Item.Options.Select(s => s.ToString()).ToArray();

      // Remove whitespace inside tags, then drop the language tag itself.
      html = BlankProcess(html);
      html = Regex.Replace(html, "\\{" + Regex.Escape(langConfig.Lang) + "\\}", "", RegexOptions.IgnoreCase);
      var array = Regex.Split(html, "{([\\S]*?)}");

      // Locate compose (composite) questions. Only odd positions hold tag text, so
      // content segments are never mistaken for tags.
      List<KeyValuePair<int[], List<string>>> composeKeys = new List<KeyValuePair<int[], List<string>>>();
      for (int index = 1; index < array.Length; index += 2)
      {
          var tagValue = Regex.Replace(BlankTag(array[index]), @"\d", "");
          if (!dict.TryGetValue(tagValue, out string[] keyInfo) || keyInfo[0] != Summary || !keyInfo[1].Equals("compose"))
          {
              continue;
          }

          // Search for the matching end tag. When it is missing the block is taken to
          // run to the end of the document, so the recorded end always lies past the start.
          int closeTag = array.Length;
          for (int composeIndex = index + 2; composeIndex < array.Length; composeIndex += 2)
          {
              // Digits are stripped from the candidate, as is done for every other tag.
              var conIndex = Regex.Replace(BlankTag(array[composeIndex]), @"\d", "");
              if (conIndex.Equals(langConfig.Item.Ended) || conIndex.Equals(tagValue + langConfig.Item.Ended))
              {
                  closeTag = composeIndex;
                  break;
              }
          }

          List<string> comsArray = array.Skip(index + 1).Take(closeTag - index - 1).ToList();
          // Key[0]: position of the compose tag; Key[1]: first position after the block.
          int resume = Math.Min(closeTag + 1, array.Length);
          composeKeys.Add(new KeyValuePair<int[], List<string>>(new int[] { index, resume }, comsArray));

          // Continue with the tag that follows the end tag (the loop adds 2).
          index = closeTag;
      }

      List<KeyValuePair<int[], List<ItemInfo>>> composeList = new List<KeyValuePair<int[], List<ItemInfo>>>();
      foreach (var conskey in composeKeys)
      {
          List<DOCX.Models.ItemInfo> consInner = ConvertTest(conskey.Value.ToArray(), null);
          int stIndex = conskey.Key[0] + 1;
          if (consInner.IsNotEmpty())
          {
              // NOTE(review): ConvertTest assigns order from a loop that starts at 1, so
              // this condition looks unreachable and stIndex stays Key[0] + 1 — confirm intent.
              stIndex = consInner[0].order <= 0 ? conskey.Key[0] + consInner[0].order : stIndex;
          }
          // Key[0]: compose tag, Key[1]: first position after the block, Key[2]: stIndex.
          KeyValuePair<int[], List<ItemInfo>> innerComposeItem = new KeyValuePair<int[], List<ItemInfo>>(new int[] { conskey.Key[0], conskey.Key[1], stIndex }, consInner);
          composeList.Add(innerComposeItem);
      }

      return ConvertTest(array, composeList);
  }
  /// <summary>
  /// Reduces a tag fragment to its bare text: markup is dropped by taking the inner
  /// text, then braces, whitespace and a few HTML space entities are removed.
  /// </summary>
  /// <param name="tagHtml">Fragment to clean; it is loaded into the shared <c>doc</c>.</param>
  /// <returns>The cleaned text.</returns>
  private string BlankTag(string tagHtml)
  {
      // Take the inner text so markup nested in the tag is dropped.
      doc.LoadHtml(tagHtml);
      string text = doc.DocumentNode.InnerText;

      // Removed one after another, in exactly this order.
      // NOTE(review): "&ensp;" is not in this list — confirm whether it should be.
      string[] removals = { "{", "}", "\n", " ", "\t", "\r", "&nbsp;", "&emsp;" };
      foreach (string piece in removals)
      {
          text = text.Replace(piece, "");
      }

      // Finally drop any whitespace character that is still left.
      return Regex.Replace(text, @"\s", "");
  }
  /// <summary>
  /// Walks the output of the tag split (content at even positions, tag text at odd
  /// positions) and builds one item per question-type tag, feeding the content that
  /// follows each tag into <c>DoOpenTag</c>.
  /// </summary>
  /// <param name="array">Split segments; position 0 is not processed.</param>
  /// <param name="composeList">
  /// Pre-parsed compose blocks: Key[0] is the compose tag position, Key[1] the first
  /// position after the block, Key[2] bounds the compose question text; Value holds the
  /// child items. May be null.
  /// </param>
  /// <returns>The items in document order.</returns>
  public List<DOCX.Models.ItemInfo> ConvertTest(string[] array, List<KeyValuePair<int[], List<ItemInfo>>> composeList)
  {
      List<DOCX.Models.ItemInfo> tests = new List<DOCX.Models.ItemInfo>();
      // Content collected for the tag that is currently open; replaced by a fresh
      // builder each time it has been handed to DoOpenTag.
      StringBuilder content = new StringBuilder();
      DOCX.Models.ItemInfo test = null;
      string openTag = "";
      string openTagVal = "";
      bool openFlag = false;

      for (int index = 1; index < array.Length; index++)
      {
          // Even positions are plain content.
          if (index % 2 == 0)
          {
              content.Append(array[index]);
              continue;
          }

          // Strip braces, whitespace, HTML spaces and digits from the tag text.
          var tagValue = Regex.Replace(BlankTag(array[index]), @"\d", "");
          if (!dict.TryGetValue(tagValue, out string[] keyInfo))
          {
              // Not a known tag: treat the text as content.
              content.Append(array[index]);
              continue;
          }

          switch (keyInfo[0])
          {
              case Summary:
                  // The statements below depend on their order.
                  if (!string.IsNullOrEmpty(openTag) && openFlag && test != null)
                  {
                      DoOpenTag(openTag, openTagVal, openFlag, content, test);
                      content = new StringBuilder();
                  }
                  if (test != null)
                  {
                      tests.Add(test);
                  }
                  if (keyInfo[1].Equals("compose"))
                  {
                      var id = System.Guid.NewGuid().ToString();
                      var compose = new ItemInfo { type = keyInfo[1], objective = false, order = index, id = id };
                      if (composeList.IsNotEmpty())
                      {
                          int composeStart = index;
                          var childItem = composeList.Where(x => x.Key[0] == composeStart).FirstOrDefault();
                          // FirstOrDefault yields an empty pair (null Value) when no entry
                          // exists for this position; skip the child handling in that case.
                          if (childItem.Value != null)
                          {
                              childItem.Value.ForEach(x => x.pid = id);
                              compose.children = childItem.Value;
                              // Clamp the range so a malformed entry cannot read past the array.
                              int from = Math.Min(childItem.Key[0] + 1, array.Length);
                              int count = Math.Max(0, Math.Min(childItem.Key[2] - childItem.Key[0], array.Length - from));
                              compose.question = string.Join("", array, from, count);
                              // Jump past the block, but never backwards: an entry whose end
                              // is not beyond its start would otherwise repeat this tag forever.
                              index = Math.Max(index, childItem.Key[1] - 1);
                          }
                      }
                      tests.Add(compose);
                      openTag = "";
                      openTagVal = "";
                      openFlag = false;
                      test = null;
                  }
                  else
                  {
                      var id = System.Guid.NewGuid().ToString();
                      test = new ItemInfo() { type = keyInfo[1], order = index, id = id };
                      test.objective = keyInfo[1].Equals("single") || keyInfo[1].Equals("multiple") || keyInfo[1].Equals("judge");
                      openTag = Summary;
                      openTagVal = tagValue;
                      openFlag = true;
                  }
                  break;

              case Ended:
                  break;

              case Answer:
              case Analysis:
              case Point:
              case Score:
              case Level:
              case Filed:
                  // Close the previously open tag with the content gathered so far, then
                  // open this one. The statements below depend on their order.
                  if (!string.IsNullOrEmpty(openTag) && openFlag)
                  {
                      DoOpenTag(openTag, openTagVal, openFlag, content, test);
                      content = new StringBuilder();
                  }
                  openTag = keyInfo[0];
                  openTagVal = tagValue;
                  openFlag = true;
                  break;
          }
      }

      // Flush whatever is still open for the last item.
      if (test != null)
      {
          DoOpenTag(openTag, openTagVal, openFlag, content, test);
          tests.Add(test);
      }
      return tests;
  }
  /// <summary>
  /// Splits the option list off a question. Options are recognized as an option key
  /// followed by optional whitespace and a separator (ASCII or full-width period or
  /// colon, or the ideographic comma).
  /// </summary>
  /// <param name="question">Question HTML that may end with an option list.</param>
  /// <returns>
  /// The parsed options together with the question text without the option part; the
  /// option list is null when no option part is found.
  /// </returns>
  public (List<CodeValue> options, string question) OptionProcess(string question)
  {
      if (optionsKeys == null || optionsKeys.Length == 0 || string.IsNullOrEmpty(question))
      {
          return (null, question);
      }

      // Separator after an option key: '.', U+FF0E, U+3001, ':', U+FF1A.
      const string separator = "\\s*[.\uFF0E\u3001:\uFF1A]";

      // Option part: from the first key to the end of that line ('.' stops at a line break).
      string optsHtml = Regex.Match(question, Regex.Escape(optionsKeys[0]) + separator + "([\\s\\S]*?).*").Value;
      if (string.IsNullOrWhiteSpace(optsHtml))
      {
          return (null, question);
      }

      List<CodeValue> options = new List<CodeValue>();
      for (int i = 0; i < optionsKeys.Length - 1; i++)
      {
          string key = Regex.Escape(optionsKeys[i]);
          string nextMarker = Regex.Escape(optionsKeys[i + 1]) + separator;

          // Text from this key up to the next key; when there is no next key, take
          // the rest of the line.
          string optHtml = Regex.Match(optsHtml, key + separator + "([\\s\\S]*?)" + nextMarker).Value;
          if (string.IsNullOrWhiteSpace(optHtml))
          {
              optHtml = Regex.Match(optsHtml, key + separator + ".*").Value;
          }
          if (string.IsNullOrEmpty(optHtml))
          {
              continue;
          }

          optHtml = Regex.Replace(optHtml, nextMarker, "");
          // Remove the leading "key + whitespace + separator" by its real length, so
          // whitespace between key and separator does not leave the separator behind.
          optHtml = Regex.Replace(optHtml, "^" + key + separator, "");
          optHtml = HtmlHelper.DoUselessTag(optHtml);
          optHtml = optHtml.Trim();
          options.Add(new CodeValue { code = optionsKeys[i], value = optHtml });
      }

      return (options, question.Replace(optsHtml, ""));
  }
  /// <summary>
  /// Stores the content collected for a tag on the item, according to the tag kind.
  /// Does nothing when <paramref name="test"/> is null.
  /// </summary>
  /// <param name="openTag">Kind of the tag being closed (one of the class constants).</param>
  /// <param name="openTagVal">Cleaned text of the tag; used to resolve field tags.</param>
  /// <param name="openFlag">Not used by this method.</param>
  /// <param name="content">Content gathered since the tag was opened.</param>
  /// <param name="test">Item that receives the value.</param>
  public void DoOpenTag(string openTag, string openTagVal, bool openFlag, StringBuilder content, DOCX.Models.ItemInfo test)
  {
      if (test == null)
      {
          return;
      }

      string raw = content?.ToString() ?? "";
      bool isChoice = test.type == "single" || test.type == "multiple" || test.type == "judge";

      switch (openTag)
      {
          case Summary:
          {
              if (isChoice)
              {
                  (List<CodeValue> options, string question) = OptionProcess(raw);
                  test.option = options;
                  test.question = HtmlHelper.DoUselessTag(question);
              }
              else
              {
                  test.question = HtmlHelper.DoUselessTag(raw);
              }
              break;
          }

          case Answer:
          {
              if (!isChoice)
              {
                  test.answer = new List<string>() { HtmlHelper.DoUselessTag(raw) };
                  break;
              }

              // One entry per distinct character of the cleaned answer text.
              var anstr = BlankTag(raw);
              test.answer = anstr.Select(s => s.ToString()).Distinct().ToList();

              if (test.type == "judge")
              {
                  string[] judge = langConfig.Item.Judge.Split('|');
                  // Two labels are needed to build the A/B options.
                  if (judge.Length >= 2 && test.answer.Count > 0)
                  {
                      List<CodeValue> option = new List<CodeValue>()
                      {
                          new CodeValue { code = "A", value = judge[0] },
                          new CodeValue { code = "B", value = judge[1] }
                      };

                      // Compare the whole answer first so labels longer than one
                      // character can match; otherwise compare the first character.
                      int hit = -1;
                      for (int i = 0; i < 2 && hit < 0; i++)
                      {
                          if (string.Equals(anstr, judge[i], StringComparison.CurrentCultureIgnoreCase))
                          {
                              hit = i;
                          }
                      }
                      if (hit >= 0)
                      {
                          test.answer = new List<string>() { option[hit].code };
                          test.option = option;
                      }
                      else
                      {
                          for (int i = 0; i < 2; i++)
                          {
                              if (string.Equals(test.answer[0], judge[i], StringComparison.CurrentCultureIgnoreCase))
                              {
                                  test.answer[0] = option[i].code;
                                  test.option = option;
                                  break;
                              }
                          }
                      }
                  }
              }
              break;
          }

          case Analysis:
              test.explain = HtmlHelper.DoUselessTag(raw);
              break;

          case Ended:
              break;

          case Point:
          {
              string points = BlankTag(raw);
              if (!string.IsNullOrWhiteSpace(points))
              {
                  // Split on '.', ':', ',', ';' (ASCII and full-width forms) and U+3001;
                  // empty pieces, e.g. from a trailing separator, are dropped.
                  test.knowledge = Regex.Split(points, "[.\uFF0E\u3001:\uFF1A,\uFF0C;\uFF1B]")
                      .Where(p => p.Length > 0)
                      .Distinct()
                      .ToList();
              }
              break;
          }

          case Score:
          {
              // Leading integer or decimal number of the tag-free text.
              string scores = BlankTag(raw);
              Match scoreMatch = Regex.Match(scores, "^[0-9]+(\\.?[0-9]+)?");
              // Invariant culture: '.' is the decimal point whatever the thread culture is.
              double.TryParse(scoreMatch.Value, System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out double sc);
              test.score = sc;
              break;
          }

          case Level:
          {
              string levels = BlankTag(raw);
              // NOTE(review): the pattern also accepts a decimal part, which then fails
              // the integer parse and gives level 0 — confirm whether that is intended.
              Match levelMatch = Regex.Match(levels, "^[0-9]+(\\.?[0-9]+)?");
              int.TryParse(levelMatch.Value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out int lvl);
              test.level = lvl;
              break;
          }

          case Filed:
              // 1-based position of the tag among the field names; 0 when it is not found.
              test.field = Array.IndexOf(Fileds, openTagVal) + 1;
              break;
      }
  }
  471. }
  472. }