123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343 |
- using System;
- using System.Collections;
- using System.Collections.Generic;
- using System.Linq;
- using System.Text;
- using System.Threading.Tasks;
- namespace HTEX.Lib.ETL
- {
- public class KMeansService
- {
- public static Dictionary<double, List<double>> KMeansOptimized(IEnumerable<double> datas, int optimalK = 0, int maxClusters = 10)
- {
-
- if (optimalK<=0)
- {
- // 使用肘部法则找到最佳聚类数目
- optimalK = FindOptimalK(datas, maxClusters);
- }
- // 使用 K-means++ 方法初始化质心
- List<double> centroids = InitializeCentroids(datas, optimalK);
- Dictionary<double, List<double>> clustersResults = new Dictionary<double, List<double>>();
- // 迭代次数
- int maxIterations = 100;
- for (int iteration = 0; iteration < maxIterations; iteration++)
- {
- // 将数据点分配给最近的质心
- Dictionary<double, List<double>> clusters = new Dictionary<double, List<double>>();
- foreach (int point in datas)
- {
- double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First();
- if (!clusters.ContainsKey(nearestCentroid))
- {
- clusters[nearestCentroid] = new List<double>();
- }
- clusters[nearestCentroid].Add(point);
- }
- // 更新质心位置
- List<double> newCentroids = new List<double>();
- foreach (KeyValuePair<double, List<double>> cluster in clusters)
- {
- double sum = cluster.Value.Sum();
- int count = cluster.Value.Count;
- double newCentroid = sum / count; // 计算均值
- newCentroids.Add(newCentroid);
- }
- // 检查质心是否改变
- clustersResults = clusters;
- if (newCentroids.SequenceEqual(centroids))
- {
- break;
- }
- centroids = newCentroids;
- }
- return clustersResults;
- }
- private static int FindOptimalK(IEnumerable<double> datas, int maxClusters)
- {
- double[] silhouetteCoefficients = new double[maxClusters];
- for (int k = 2; k <= maxClusters; k++) // 至少需要两个聚类
- {
- var centroids = InitializeCentroids(datas, k);
- var clusters = AssignPointsToClusters(datas, centroids);
- silhouetteCoefficients[k - 1] = CalculateSilhouetteCoefficient(clusters);
- }
- // 寻找最高的轮廓系数对应的聚类数
- int optimalK = 0;
- double maxCoefficient = double.MinValue;
- for (int i = 0; i < silhouetteCoefficients.Length; i++)
- {
- if (silhouetteCoefficients[i] > maxCoefficient)
- {
- maxCoefficient = silhouetteCoefficients[i];
- optimalK = i + 2; // 因为我们从2开始计算
- }
- }
- return optimalK;
- }
- private static double CalculateSilhouetteCoefficient(Dictionary<double, List<double>> clusters)
- {
- double silhouetteCoefficient = 0;
- int totalElements = clusters.Sum(cluster => cluster.Value.Count);
- foreach (var cluster in clusters)
- {
- foreach (var element in cluster.Value)
- {
- double a = AverageDistanceWithinCluster(element, cluster.Value);
- double b = Double.PositiveInfinity;
- foreach (var otherCluster in clusters.Where(c => !c.Key.Equals(cluster.Key)))
- {
- double dist = AverageDistanceBetweenClusters(element, otherCluster.Value);
- if (dist < b)
- {
- b = dist;
- }
- }
- silhouetteCoefficient += (b - a) / Math.Max(a, b);
- }
- }
- return silhouetteCoefficient / totalElements;
- }
- private static double AverageDistanceWithinCluster(double element, List<double> cluster)
- {
- double sum = 0;
- foreach (var otherElement in cluster.Where(e => !e.Equals(element)))
- {
- sum += Math.Abs(element - otherElement);
- }
- return sum / (cluster.Count - 1);
- }
- private static double AverageDistanceBetweenClusters(double element, List<double> otherCluster)
- {
- double sum = 0;
- foreach (var otherElement in otherCluster)
- {
- sum += Math.Abs(element - otherElement);
- }
- return sum / otherCluster.Count;
- }
- /// <summary>
- /// 随机方法初始化质心
- /// </summary>
- /// <param name="datas"></param>
- /// <param name="k"></param>
- /// <returns></returns>
- //private static List<int> InitializeCentroids(IEnumerable<int> datas, int k)
- //{
- // // K-means++ 初始化
- // var centroids = new List<int>();
- // var random = new Random();
- // // centroids.Add(datas.ElementAt(random.Next(datas.Count())));
- // centroids.Add(1);
- // while (centroids.Count < k)
- // {
- // int[] distancesSquared = datas.Select(dataPoint =>
- // centroids.Min(centroid => (dataPoint - centroid) * (dataPoint - centroid))).ToArray();
- // double totalDistance = distancesSquared.Sum();
- // //double r = random.NextDouble() * totalDistance;
- // double r = totalDistance;
- // double accumulator = 0;
- // for (int i = 0; i < distancesSquared.Length; i++)
- // {
- // accumulator += distancesSquared[i];
- // if (accumulator >= r)
- // {
- // centroids.Add(datas.ElementAt(i));
- // break;
- // }
- // }
- // }
- // return centroids;
- //}
- /// <summary>
- /// 分位数法 来初始化质心,将数据按照一定的百分位数划分来确定初始质心的位置。确保质心在整个数据范围内的分布更加均匀
- /// </summary>
- /// <param name="datas"></param>
- /// <param name="k"></param>
- /// <returns></returns>
- //private static List<int> InitializeCentroids(IEnumerable<int> datas, int k)
- //{
- // // 排序数据
- // var sortedData = datas.OrderBy(d => d).ToArray();
- // // 计算每个质心的位置
- // List<int> centroids = new List<int>();
- // for (int i = 0; i < k; i++)
- // {
- // // 计算第i个质心应该位于哪个位置
- // int index = (sortedData.Length * i + 1) / k - 1;
- // if (index < 0 || index >= sortedData.Length) continue; // 防止索引越界
- // centroids.Add(sortedData[index]);
- // }
- // return centroids;
- //}
- private static List<double> InitializeCentroids(IEnumerable<double> datas, int k)
- {
- // 计算每个数据点的局部密度
- var densities = CalculateLocalDensities(datas);
- // 选择高密度点作为初始质心
- var centroids = new List<double>();
- foreach (var density in densities.OrderByDescending(d => d.Value).Take(k))
- {
- centroids.Add(density.Key);
- }
- return centroids;
- }
- private static Dictionary<double, int> CalculateLocalDensities(IEnumerable<double> datas, int radius = 10)
- {
- // 计算每个数据点周围的邻居数量作为局部密度
- var densities = new Dictionary<double, int>();
- foreach (var dataPoint in datas)
- {
- int localDensity = datas.Count(p => Math.Abs(p - dataPoint) <= radius);
- densities[dataPoint] = localDensity;
- }
- return densities;
- }
- private static Dictionary<double, List<double>> AssignPointsToClusters(IEnumerable<double> datas, List<double> centroids)
- {
- // 分配数据点到最近的质心
- var clusters = new Dictionary<double, List<double>>();
- foreach (var point in datas)
- {
- double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First();
- if (!clusters.ContainsKey(nearestCentroid))
- {
- clusters[nearestCentroid] = new List<double>();
- }
- clusters[nearestCentroid].Add(point);
- }
- return clusters;
- }
- public static Dictionary<double, List<double>> KMeans(IEnumerable<double> datas)
- {
- // 初始化质心,可以选择数据中的k个随机点作为初始质心
- List<double> centroids = new List<double>();
- centroids.Add(datas.Min());
- centroids.Add(datas.Max());
- centroids.Sort(); // 对质心进行排序以避免重复
- Dictionary<double, List<double>> clustersResults = new Dictionary<double, List<double>>();
- // 迭代次数
- int maxIterations = 100;
- for (int iteration = 0; iteration < maxIterations; iteration++)
- {
- // 将数据点分配给最近的质心
- Dictionary<double, List<double>> clusters = new Dictionary<double, List<double>>();
- foreach (int point in datas)
- {
- double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First();
- if (!clusters.ContainsKey(nearestCentroid))
- {
- clusters[nearestCentroid] = new List<double>();
- }
- clusters[nearestCentroid].Add(point);
- }
- // 更新质心位置
- List<double> newCentroids = new List<double>();
- foreach (KeyValuePair<double, List<double>> cluster in clusters)
- {
- double sum = cluster.Value.Sum();
- int count = cluster.Value.Count;
- double newCentroid = sum *1.0/ count; // 计算均值
- newCentroids.Add(newCentroid);
- // 输出当前聚类的信息
- //Console.WriteLine($"Cluster with centroid {cluster.Key}:");
- //Console.WriteLine($"Min: {cluster.Value.Min()}, Max: {cluster.Value.Max()}, Average: {sum / count}");
- }
- // 检查质心是否改变
- newCentroids.Sort();
- clustersResults=clusters;
- if (newCentroids.SequenceEqual(centroids))
- {
- break;
- }
- centroids = newCentroids;
- }
- return clustersResults;
- }
- public static Dictionary<double, List<int>> KMeans(IEnumerable<int> datas)
- {
- // 初始化质心,可以选择数据中的k个随机点作为初始质心
- List<double> centroids = new List<double>();
- centroids.Add(datas.Min());
- centroids.Add(datas.Max());
- centroids.Sort(); // 对质心进行排序以避免重复
- Dictionary<double, List<int>> clustersResults = new Dictionary<double, List<int>>();
- // 迭代次数
- int maxIterations = 100;
- for (int iteration = 0; iteration < maxIterations; iteration++)
- {
- // 将数据点分配给最近的质心
- Dictionary<double, List<int>> clusters = new Dictionary<double, List<int>>();
- foreach (int point in datas)
- {
- double nearestCentroid = centroids.OrderBy(c => Math.Abs(point - c)).First();
- if (!clusters.ContainsKey(nearestCentroid))
- {
- clusters[nearestCentroid] = new List<int>();
- }
- clusters[nearestCentroid].Add(point);
- }
- // 更新质心位置
- List<double> newCentroids = new List<double>();
- foreach (KeyValuePair<double, List<int>> cluster in clusters)
- {
- int sum = cluster.Value.Sum();
- int count = cluster.Value.Count;
- double newCentroid = sum *1.0/ count; // 计算均值
- newCentroids.Add(newCentroid);
- // 输出当前聚类的信息
- //Console.WriteLine($"Cluster with centroid {cluster.Key}:");
- //Console.WriteLine($"Min: {cluster.Value.Min()}, Max: {cluster.Value.Max()}, Average: {sum / count}");
- }
- // 检查质心是否改变
- newCentroids.Sort();
- clustersResults=clusters;
- if (newCentroids.SequenceEqual(centroids))
- {
- break;
- }
- centroids = newCentroids;
- }
- return clustersResults;
- }
- }
- }
|