using System; using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; namespace HTEX.Lib.ETL { public class KMeansService { public static Dictionary> KMeansOptimized(IEnumerable datas, int optimalK = 0, int maxClusters = 10) { if (optimalK<=0) { // 使用肘部法则找到最佳聚类数目 optimalK = FindOptimalK(datas, maxClusters); } // 使用 K-means++ 方法初始化质心 List centroids = InitializeCentroids(datas, optimalK); Dictionary> clustersResults = new Dictionary>(); // 迭代次数 int maxIterations = 100; for (int iteration = 0; iteration < maxIterations; iteration++) { // 将数据点分配给最近的质心 Dictionary> clusters = new Dictionary>(); foreach (int point in datas) { double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First(); if (!clusters.ContainsKey(nearestCentroid)) { clusters[nearestCentroid] = new List(); } clusters[nearestCentroid].Add(point); } // 更新质心位置 List newCentroids = new List(); foreach (KeyValuePair> cluster in clusters) { double sum = cluster.Value.Sum(); int count = cluster.Value.Count; double newCentroid = sum / count; // 计算均值 newCentroids.Add(newCentroid); } // 检查质心是否改变 clustersResults = clusters; if (newCentroids.SequenceEqual(centroids)) { break; } centroids = newCentroids; } return clustersResults; } private static int FindOptimalK(IEnumerable datas, int maxClusters) { double[] silhouetteCoefficients = new double[maxClusters]; for (int k = 2; k <= maxClusters; k++) // 至少需要两个聚类 { var centroids = InitializeCentroids(datas, k); var clusters = AssignPointsToClusters(datas, centroids); silhouetteCoefficients[k - 1] = CalculateSilhouetteCoefficient(clusters); } // 寻找最高的轮廓系数对应的聚类数 int optimalK = 0; double maxCoefficient = double.MinValue; for (int i = 0; i < silhouetteCoefficients.Length; i++) { if (silhouetteCoefficients[i] > maxCoefficient) { maxCoefficient = silhouetteCoefficients[i]; optimalK = i + 2; // 因为我们从2开始计算 } } return optimalK; } private static double CalculateSilhouetteCoefficient(Dictionary> clusters) { double silhouetteCoefficient = 0; int totalElements = clusters.Sum(cluster => cluster.Value.Count); foreach (var cluster in clusters) { foreach (var element in cluster.Value) { double a = AverageDistanceWithinCluster(element, cluster.Value); double b = Double.PositiveInfinity; foreach (var otherCluster in clusters.Where(c => !c.Key.Equals(cluster.Key))) { double dist = AverageDistanceBetweenClusters(element, otherCluster.Value); if (dist < b) { b = dist; } } silhouetteCoefficient += (b - a) / Math.Max(a, b); } } return silhouetteCoefficient / totalElements; } private static double AverageDistanceWithinCluster(double element, List cluster) { double sum = 0; foreach (var otherElement in cluster.Where(e => !e.Equals(element))) { sum += Math.Abs(element - otherElement); } return sum / (cluster.Count - 1); } private static double AverageDistanceBetweenClusters(double element, List otherCluster) { double sum = 0; foreach (var otherElement in otherCluster) { sum += Math.Abs(element - otherElement); } return sum / otherCluster.Count; } /// /// 随机方法初始化质心 /// /// /// /// //private static List InitializeCentroids(IEnumerable datas, int k) //{ // // K-means++ 初始化 // var centroids = new List(); // var random = new Random(); // // centroids.Add(datas.ElementAt(random.Next(datas.Count()))); // centroids.Add(1); // while (centroids.Count < k) // { // int[] distancesSquared = datas.Select(dataPoint => // centroids.Min(centroid => (dataPoint - centroid) * (dataPoint - centroid))).ToArray(); // double totalDistance = distancesSquared.Sum(); // //double r = random.NextDouble() * totalDistance; // double r = totalDistance; // double accumulator = 0; // for (int i = 0; i < distancesSquared.Length; i++) // { // accumulator += distancesSquared[i]; // if (accumulator >= r) // { // centroids.Add(datas.ElementAt(i)); // break; // } // } // } // return centroids; //} /// /// 分位数法 来初始化质心,将数据按照一定的百分位数划分来确定初始质心的位置。确保质心在整个数据范围内的分布更加均匀 /// /// /// /// //private static List InitializeCentroids(IEnumerable datas, int k) //{ // // 排序数据 // var sortedData = datas.OrderBy(d => d).ToArray(); // // 计算每个质心的位置 // List centroids = new List(); // for (int i = 0; i < k; i++) // { // // 计算第i个质心应该位于哪个位置 // int index = (sortedData.Length * i + 1) / k - 1; // if (index < 0 || index >= sortedData.Length) continue; // 防止索引越界 // centroids.Add(sortedData[index]); // } // return centroids; //} private static List InitializeCentroids(IEnumerable datas, int k) { // 计算每个数据点的局部密度 var densities = CalculateLocalDensities(datas); // 选择高密度点作为初始质心 var centroids = new List(); foreach (var density in densities.OrderByDescending(d => d.Value).Take(k)) { centroids.Add(density.Key); } return centroids; } private static Dictionary CalculateLocalDensities(IEnumerable datas, int radius = 10) { // 计算每个数据点周围的邻居数量作为局部密度 var densities = new Dictionary(); foreach (var dataPoint in datas) { int localDensity = datas.Count(p => Math.Abs(p - dataPoint) <= radius); densities[dataPoint] = localDensity; } return densities; } private static Dictionary> AssignPointsToClusters(IEnumerable datas, List centroids) { // 分配数据点到最近的质心 var clusters = new Dictionary>(); foreach (var point in datas) { double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First(); if (!clusters.ContainsKey(nearestCentroid)) { clusters[nearestCentroid] = new List(); } clusters[nearestCentroid].Add(point); } return clusters; } public static Dictionary> KMeans(IEnumerable datas) { // 初始化质心,可以选择数据中的k个随机点作为初始质心 List centroids = new List(); centroids.Add(datas.Min()); centroids.Add(datas.Max()); centroids.Sort(); // 对质心进行排序以避免重复 Dictionary> clustersResults = new Dictionary>(); // 迭代次数 int maxIterations = 100; for (int iteration = 0; iteration < maxIterations; iteration++) { // 将数据点分配给最近的质心 Dictionary> clusters = new Dictionary>(); foreach (int point in datas) { double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First(); if (!clusters.ContainsKey(nearestCentroid)) { clusters[nearestCentroid] = new List(); } clusters[nearestCentroid].Add(point); } // 更新质心位置 List newCentroids = new List(); foreach (KeyValuePair> cluster in clusters) { double sum = cluster.Value.Sum(); int count = cluster.Value.Count; double newCentroid = sum *1.0/ count; // 计算均值 newCentroids.Add(newCentroid); // 输出当前聚类的信息 //Console.WriteLine($"Cluster with centroid {cluster.Key}:"); //Console.WriteLine($"Min: {cluster.Value.Min()}, Max: {cluster.Value.Max()}, Average: {sum / count}"); } // 检查质心是否改变 newCentroids.Sort(); clustersResults=clusters; if (newCentroids.SequenceEqual(centroids)) { break; } centroids = newCentroids; } return clustersResults; } public static Dictionary> KMeans(IEnumerable datas) { // 初始化质心,可以选择数据中的k个随机点作为初始质心 List centroids = new List(); centroids.Add(datas.Min()); centroids.Add(datas.Max()); centroids.Sort(); // 对质心进行排序以避免重复 Dictionary> clustersResults = new Dictionary>(); // 迭代次数 int maxIterations = 100; for (int iteration = 0; iteration < maxIterations; iteration++) { // 将数据点分配给最近的质心 Dictionary> clusters = new Dictionary>(); foreach (int point in datas) { double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First(); if (!clusters.ContainsKey(nearestCentroid)) { clusters[nearestCentroid] = new List(); } clusters[nearestCentroid].Add(point); } // 更新质心位置 List newCentroids = new List(); foreach (KeyValuePair> cluster in clusters) { int sum = cluster.Value.Sum(); int count = cluster.Value.Count; double newCentroid = sum *1.0/ count; // 计算均值 newCentroids.Add(newCentroid); // 输出当前聚类的信息 //Console.WriteLine($"Cluster with centroid {cluster.Key}:"); //Console.WriteLine($"Min: {cluster.Value.Min()}, Max: {cluster.Value.Max()}, Average: {sum / count}"); } // 检查质心是否改变 newCentroids.Sort(); clustersResults=clusters; if (newCentroids.SequenceEqual(centroids)) { break; } centroids = newCentroids; } return clustersResults; } } }