MLService.cs

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace HTEX.Test.Service
{
    public static class MLService
    {
        /// <summary>
        /// Clusters a one-dimensional float array with k-means and groups the input values by predicted cluster.
        /// </summary>
        /// <param name="datas">Input values; zeros should be removed beforehand.</param>
        /// <param name="numberOfClusters">Number of clusters to fit.</param>
        /// <returns>One <see cref="ClusterData"/> per cluster, holding its values, count and average.</returns>
        public static List<ClusterData> KMeans(float[] datas, int numberOfClusters = 2)
        {
            // Wrap each input value in a DataPoint so ML.NET can load it.
            List<DataPoint> data = new List<DataPoint>();
            foreach (var d in datas)
            {
                data.Add(new DataPoint { Feature = d });
            }
            // Build the data view.
            var mlContext = new MLContext();
            var dataView = mlContext.Data.LoadFromEnumerable(data);
            // Define the clustering pipeline: concatenate the single feature column and
            // train k-means with the requested number of clusters.
            var pipeline = mlContext.Transforms.Concatenate("Features", new[] { "Feature" })
                .Append(mlContext.Clustering.Trainers.KMeans(numberOfClusters: numberOfClusters));
            // Train the model.
            var model = pipeline.Fit(dataView);
            // Transform the data to obtain the cluster assignments.
            var predictions = model.Transform(dataView);
            // Pull the predictions into memory.
            var inMemoryCollection = mlContext.Data.CreateEnumerable<ClusterPrediction>(predictions, reuseRowObject: false);
            // Group the input values by predicted cluster, tracking each cluster's count and average.
            int index = 0;
            List<ClusterData> clusterDatas = new List<ClusterData>();
            foreach (var prediction in inMemoryCollection)
            {
                var clusterData = clusterDatas.Find(x => x.ClusterId.Equals(prediction.ClusterId));
                if (clusterData != null)
                {
                    clusterData.count += 1;
                    clusterData.datas.Add(data[index].Feature);
                    clusterData.avg = clusterData.datas.Sum() / clusterData.datas.Count;
                }
                else
                {
                    clusterDatas.Add(new ClusterData { avg = data[index].Feature, count = 1, ClusterId = prediction.ClusterId, datas = new List<float> { data[index].Feature } });
                }
                index++;
            }
            // To find the densest cluster, the results could be analysed further, e.g. by
            // comparing the number of data points per cluster or the density around each
            // centroid; cluster centres and in-cluster variance could also be computed.
            // Here the per-cluster data is simply returned.
            return clusterDatas;
        }
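
        // Illustrative note (assumed sample values, not from the original code): calling
        // KMeans(new[] { 1.0F, 1.2F, 0.9F, 5.0F, 5.1F, 4.9F }, 2) would typically return two
        // ClusterData entries, one grouping the values near 1 and one grouping the values
        // near 5, each carrying its values, count and average.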

        /// <summary>
        /// Clusters the data and picks the "normal" cluster: clusters are ordered by descending
        /// size and scanned from the largest; a cluster is chosen once its average exceeds the
        /// next cluster's, or once the next cluster is smaller by at least <paramref name="dropPercent"/>.
        /// </summary>
        /// <param name="datas">Input values.</param>
        /// <param name="numberOfClusters">Number of clusters to fit.</param>
        /// <param name="dropPercent">If the count drop between the cluster with the highest average and the most populous cluster is less than this fraction (30% by default), the cluster with the higher average prevails.</param>
        /// <returns>The chosen cluster and all clusters ordered by descending size.</returns>
        public static (ClusterData clusterData, List<ClusterData> clusterDatas) GetNormalCluster(float[] datas, int numberOfClusters = 2, double dropPercent = 0.3)
        {
            List<ClusterData> clusterDatas = KMeans(datas, numberOfClusters);
            clusterDatas = clusterDatas.OrderByDescending(dr => dr.count).ToList();
            ClusterData clusterData = FindSatisfactoryRecord(clusterDatas, 0, dropPercent);
            return (clusterData, clusterDatas);
        }
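
        // Worked example of the selection rule used by FindSatisfactoryRecord below
        // (illustrative numbers): with clusters ordered by size as {count=20, avg=72}
        // followed by {count=15, avg=85}, the count drop is (20-15)/20 = 0.25, which is
        // below dropPercent=0.3, so the scan moves on and the smaller but higher-average
        // cluster {count=15, avg=85} is returned.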

        static ClusterData FindSatisfactoryRecord(List<ClusterData> data, int currentIndex, double dropPercent)
        {
            // If the current index is below 0, there is nothing to inspect; return null.
            if (currentIndex < 0) { return null; }
            // Cluster currently under consideration.
            ClusterData current = data[currentIndex];
            if (currentIndex + 1 >= data.Count)
            {
                // Last cluster in the list: nothing left to compare against.
                return current;
            }
            else
            {
                // Check whether the average and the count gap satisfy the conditions.
                ClusterData next = data[currentIndex + 1];
                if (current.avg > next.avg)
                {
                    return current;
                }
                else
                {
                    var d = (current.count - next.count) * 1.0 / current.count;
                    if (d >= dropPercent)
                    {
                        return current;
                    }
                    else
                    {
                        // Recurse and keep comparing with the next cluster.
                        return FindSatisfactoryRecord(data, currentIndex + 1, dropPercent);
                    }
                }
            }
        }
    }

    // Data model for a single input value.
    public class DataPoint
    {
        public float Feature { get; set; }
    }

    // Cluster prediction output.
    public class ClusterPrediction
    {
        [ColumnName("PredictedLabel")]
        public uint ClusterId;
        // Other prediction columns, such as the distances to the cluster centroids, could be added here.
    }

    // Aggregated values, count and average for one cluster.
    public class ClusterData
    {
        public List<float> datas = new List<float>();
        public uint ClusterId { get; set; }
        public int count { get; set; }
        public float avg { get; set; }
    }
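
    // Hypothetical usage sketch (not part of the original service): shows how a caller might
    // feed a batch of values to GetNormalCluster and read back the selected cluster. The
    // class name, method name and sample values below are illustrative assumptions only.
    public static class MLServiceUsageExample
    {
        public static void Run()
        {
            float[] scores = { 61.5F, 58.0F, 90.0F, 88.5F, 87.0F, 89.5F, 91.0F, 86.5F };
            var (best, all) = MLService.GetNormalCluster(scores, numberOfClusters: 2, dropPercent: 0.3);
            Console.WriteLine($"Chosen cluster {best.ClusterId}: count={best.count}, avg={best.avg}");
            foreach (var cluster in all)
            {
                Console.WriteLine($"Cluster {cluster.ClusterId}: count={cluster.count}, avg={cluster.avg}");
            }
        }
    }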
}