KMeansService.cs 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.Linq;
  5. using System.Text;
  6. using System.Threading.Tasks;
  7. namespace HTEX.Lib.ETL
  8. {
  9. public class KMeansService
  10. {
  11. public static Dictionary<double, List<double>> KMeansOptimized(IEnumerable<double> datas, int optimalK = 0, int maxClusters = 10)
  12. {
  13. if (optimalK<=0)
  14. {
  15. // 使用肘部法则找到最佳聚类数目
  16. optimalK = FindOptimalK(datas, maxClusters);
  17. }
  18. // 使用 K-means++ 方法初始化质心
  19. List<double> centroids = InitializeCentroids(datas, optimalK);
  20. Dictionary<double, List<double>> clustersResults = new Dictionary<double, List<double>>();
  21. // 迭代次数
  22. int maxIterations = 100;
  23. for (int iteration = 0; iteration < maxIterations; iteration++)
  24. {
  25. // 将数据点分配给最近的质心
  26. Dictionary<double, List<double>> clusters = new Dictionary<double, List<double>>();
  27. foreach (int point in datas)
  28. {
  29. double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First();
  30. if (!clusters.ContainsKey(nearestCentroid))
  31. {
  32. clusters[nearestCentroid] = new List<double>();
  33. }
  34. clusters[nearestCentroid].Add(point);
  35. }
  36. // 更新质心位置
  37. List<double> newCentroids = new List<double>();
  38. foreach (KeyValuePair<double, List<double>> cluster in clusters)
  39. {
  40. double sum = cluster.Value.Sum();
  41. int count = cluster.Value.Count;
  42. double newCentroid = sum / count; // 计算均值
  43. newCentroids.Add(newCentroid);
  44. }
  45. // 检查质心是否改变
  46. clustersResults = clusters;
  47. if (newCentroids.SequenceEqual(centroids))
  48. {
  49. break;
  50. }
  51. centroids = newCentroids;
  52. }
  53. return clustersResults;
  54. }
  55. private static int FindOptimalK(IEnumerable<double> datas, int maxClusters)
  56. {
  57. double[] silhouetteCoefficients = new double[maxClusters];
  58. for (int k = 2; k <= maxClusters; k++) // 至少需要两个聚类
  59. {
  60. var centroids = InitializeCentroids(datas, k);
  61. var clusters = AssignPointsToClusters(datas, centroids);
  62. silhouetteCoefficients[k - 1] = CalculateSilhouetteCoefficient(clusters);
  63. }
  64. // 寻找最高的轮廓系数对应的聚类数
  65. int optimalK = 0;
  66. double maxCoefficient = double.MinValue;
  67. for (int i = 0; i < silhouetteCoefficients.Length; i++)
  68. {
  69. if (silhouetteCoefficients[i] > maxCoefficient)
  70. {
  71. maxCoefficient = silhouetteCoefficients[i];
  72. optimalK = i + 2; // 因为我们从2开始计算
  73. }
  74. }
  75. return optimalK;
  76. }
  77. private static double CalculateSilhouetteCoefficient(Dictionary<double, List<double>> clusters)
  78. {
  79. double silhouetteCoefficient = 0;
  80. int totalElements = clusters.Sum(cluster => cluster.Value.Count);
  81. foreach (var cluster in clusters)
  82. {
  83. foreach (var element in cluster.Value)
  84. {
  85. double a = AverageDistanceWithinCluster(element, cluster.Value);
  86. double b = Double.PositiveInfinity;
  87. foreach (var otherCluster in clusters.Where(c => !c.Key.Equals(cluster.Key)))
  88. {
  89. double dist = AverageDistanceBetweenClusters(element, otherCluster.Value);
  90. if (dist < b)
  91. {
  92. b = dist;
  93. }
  94. }
  95. silhouetteCoefficient += (b - a) / Math.Max(a, b);
  96. }
  97. }
  98. return silhouetteCoefficient / totalElements;
  99. }
  100. private static double AverageDistanceWithinCluster(double element, List<double> cluster)
  101. {
  102. double sum = 0;
  103. foreach (var otherElement in cluster.Where(e => !e.Equals(element)))
  104. {
  105. sum += Math.Abs(element - otherElement);
  106. }
  107. return sum / (cluster.Count - 1);
  108. }
  109. private static double AverageDistanceBetweenClusters(double element, List<double> otherCluster)
  110. {
  111. double sum = 0;
  112. foreach (var otherElement in otherCluster)
  113. {
  114. sum += Math.Abs(element - otherElement);
  115. }
  116. return sum / otherCluster.Count;
  117. }
  118. /// <summary>
  119. /// 随机方法初始化质心
  120. /// </summary>
  121. /// <param name="datas"></param>
  122. /// <param name="k"></param>
  123. /// <returns></returns>
  124. //private static List<int> InitializeCentroids(IEnumerable<int> datas, int k)
  125. //{
  126. // // K-means++ 初始化
  127. // var centroids = new List<int>();
  128. // var random = new Random();
  129. // // centroids.Add(datas.ElementAt(random.Next(datas.Count())));
  130. // centroids.Add(1);
  131. // while (centroids.Count < k)
  132. // {
  133. // int[] distancesSquared = datas.Select(dataPoint =>
  134. // centroids.Min(centroid => (dataPoint - centroid) * (dataPoint - centroid))).ToArray();
  135. // double totalDistance = distancesSquared.Sum();
  136. // //double r = random.NextDouble() * totalDistance;
  137. // double r = totalDistance;
  138. // double accumulator = 0;
  139. // for (int i = 0; i < distancesSquared.Length; i++)
  140. // {
  141. // accumulator += distancesSquared[i];
  142. // if (accumulator >= r)
  143. // {
  144. // centroids.Add(datas.ElementAt(i));
  145. // break;
  146. // }
  147. // }
  148. // }
  149. // return centroids;
  150. //}
  151. /// <summary>
  152. /// 分位数法 来初始化质心,将数据按照一定的百分位数划分来确定初始质心的位置。确保质心在整个数据范围内的分布更加均匀
  153. /// </summary>
  154. /// <param name="datas"></param>
  155. /// <param name="k"></param>
  156. /// <returns></returns>
  157. //private static List<int> InitializeCentroids(IEnumerable<int> datas, int k)
  158. //{
  159. // // 排序数据
  160. // var sortedData = datas.OrderBy(d => d).ToArray();
  161. // // 计算每个质心的位置
  162. // List<int> centroids = new List<int>();
  163. // for (int i = 0; i < k; i++)
  164. // {
  165. // // 计算第i个质心应该位于哪个位置
  166. // int index = (sortedData.Length * i + 1) / k - 1;
  167. // if (index < 0 || index >= sortedData.Length) continue; // 防止索引越界
  168. // centroids.Add(sortedData[index]);
  169. // }
  170. // return centroids;
  171. //}
  172. private static List<double> InitializeCentroids(IEnumerable<double> datas, int k)
  173. {
  174. // 计算每个数据点的局部密度
  175. var densities = CalculateLocalDensities(datas);
  176. // 选择高密度点作为初始质心
  177. var centroids = new List<double>();
  178. foreach (var density in densities.OrderByDescending(d => d.Value).Take(k))
  179. {
  180. centroids.Add(density.Key);
  181. }
  182. return centroids;
  183. }
  184. private static Dictionary<double, int> CalculateLocalDensities(IEnumerable<double> datas, int radius = 10)
  185. {
  186. // 计算每个数据点周围的邻居数量作为局部密度
  187. var densities = new Dictionary<double, int>();
  188. foreach (var dataPoint in datas)
  189. {
  190. int localDensity = datas.Count(p => Math.Abs(p - dataPoint) <= radius);
  191. densities[dataPoint] = localDensity;
  192. }
  193. return densities;
  194. }
  195. private static Dictionary<double, List<double>> AssignPointsToClusters(IEnumerable<double> datas, List<double> centroids)
  196. {
  197. // 分配数据点到最近的质心
  198. var clusters = new Dictionary<double, List<double>>();
  199. foreach (var point in datas)
  200. {
  201. double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First();
  202. if (!clusters.ContainsKey(nearestCentroid))
  203. {
  204. clusters[nearestCentroid] = new List<double>();
  205. }
  206. clusters[nearestCentroid].Add(point);
  207. }
  208. return clusters;
  209. }
  210. public static Dictionary<double, List<double>> KMeans(IEnumerable<double> datas)
  211. {
  212. // 初始化质心,可以选择数据中的k个随机点作为初始质心
  213. List<double> centroids = new List<double>();
  214. centroids.Add(datas.Min());
  215. centroids.Add(datas.Max());
  216. centroids.Sort(); // 对质心进行排序以避免重复
  217. Dictionary<double, List<double>> clustersResults = new Dictionary<double, List<double>>();
  218. // 迭代次数
  219. int maxIterations = 100;
  220. for (int iteration = 0; iteration < maxIterations; iteration++)
  221. {
  222. // 将数据点分配给最近的质心
  223. Dictionary<double, List<double>> clusters = new Dictionary<double, List<double>>();
  224. foreach (int point in datas)
  225. {
  226. double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First();
  227. if (!clusters.ContainsKey(nearestCentroid))
  228. {
  229. clusters[nearestCentroid] = new List<double>();
  230. }
  231. clusters[nearestCentroid].Add(point);
  232. }
  233. // 更新质心位置
  234. List<double> newCentroids = new List<double>();
  235. foreach (KeyValuePair<double, List<double>> cluster in clusters)
  236. {
  237. double sum = cluster.Value.Sum();
  238. int count = cluster.Value.Count;
  239. double newCentroid = sum *1.0/ count; // 计算均值
  240. newCentroids.Add(newCentroid);
  241. // 输出当前聚类的信息
  242. //Console.WriteLine($"Cluster with centroid {cluster.Key}:");
  243. //Console.WriteLine($"Min: {cluster.Value.Min()}, Max: {cluster.Value.Max()}, Average: {sum / count}");
  244. }
  245. // 检查质心是否改变
  246. newCentroids.Sort();
  247. clustersResults=clusters;
  248. if (newCentroids.SequenceEqual(centroids))
  249. {
  250. break;
  251. }
  252. centroids = newCentroids;
  253. }
  254. return clustersResults;
  255. }
  256. public static Dictionary<double, List<int>> KMeans(IEnumerable<int> datas)
  257. {
  258. // 初始化质心,可以选择数据中的k个随机点作为初始质心
  259. List<double> centroids = new List<double>();
  260. centroids.Add(datas.Min());
  261. centroids.Add(datas.Max());
  262. centroids.Sort(); // 对质心进行排序以避免重复
  263. Dictionary<double, List<int>> clustersResults = new Dictionary<double, List<int>>();
  264. // 迭代次数
  265. int maxIterations = 100;
  266. for (int iteration = 0; iteration < maxIterations; iteration++)
  267. {
  268. // 将数据点分配给最近的质心
  269. Dictionary<double, List<int>> clusters = new Dictionary<double, List<int>>();
  270. foreach (int point in datas)
  271. {
  272. double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First();
  273. if (!clusters.ContainsKey(nearestCentroid))
  274. {
  275. clusters[nearestCentroid] = new List<int>();
  276. }
  277. clusters[nearestCentroid].Add(point);
  278. }
  279. // 更新质心位置
  280. List<double> newCentroids = new List<double>();
  281. foreach (KeyValuePair<double, List<int>> cluster in clusters)
  282. {
  283. int sum = cluster.Value.Sum();
  284. int count = cluster.Value.Count;
  285. double newCentroid = sum *1.0/ count; // 计算均值
  286. newCentroids.Add(newCentroid);
  287. // 输出当前聚类的信息
  288. //Console.WriteLine($"Cluster with centroid {cluster.Key}:");
  289. //Console.WriteLine($"Min: {cluster.Value.Min()}, Max: {cluster.Value.Max()}, Average: {sum / count}");
  290. }
  291. // 检查质心是否改变
  292. newCentroids.Sort();
  293. clustersResults=clusters;
  294. if (newCentroids.SequenceEqual(centroids))
  295. {
  296. break;
  297. }
  298. centroids = newCentroids;
  299. }
  300. return clustersResults;
  301. }
  302. }
  303. }