实现基于Kmeans的商品价格聚类

# -*- coding: utf-8 -*-
"""
Author: Thinkgamer
Desc:
    Listing 4-9: cluster product prices with K-means.
"""
import random

import numpy as np
import pandas as pd


class kMeans:
    """One-dimensional K-means clustering of product prices."""

    def __init__(self):
        pass

    def loadData(self, file):
        """Read a comma-separated file whose first row is the header.

        Args:
            file: Path of the CSV file to read.

        Returns:
            The file content as a ``pandas.DataFrame``.
        """
        return pd.read_csv(file, header=0, sep=",")

    def filterAnomalyValue(self, data):
        """Remove rows whose ``price`` falls outside the accepted band.

        The band is ``mean +/- 3 * std`` of the ``price`` column, except that
        the upper bound is never below 5000 and the lower bound is never
        below 1. Both bounds are exclusive.

        Args:
            data: DataFrame with a numeric ``price`` column.

        Returns:
            A tuple ``(filtered_frame, upper_limit, lower_limit)``.
        """
        prices = data["price"]
        mean = np.mean(prices)
        spread = 3 * np.std(prices)
        upper = mean + spread
        lower = mean - spread
        upper_limit = upper if upper > 5000 else 5000
        lower_limit = lower if lower > 1 else 1
        print("最大异常值为:{},最小异常值为:{}".format(upper_limit, lower_limit))
        # Keep only the rows strictly inside (lower_limit, upper_limit).
        newData = data[(prices < upper_limit) & (prices > lower_limit)]
        return newData, upper_limit, lower_limit

    def initCenters(self, values, K, Cluster):
        """Pick ``K`` initial centers at random (with replacement).

        Args:
            values: Indexable, non-empty sequence of prices.
            K: Number of clusters, at least 1.
            Cluster: Dict that receives one entry per cluster id, each of the
                form ``{"center": price, "values": []}``.

        Returns:
            A tuple ``(centers, Cluster)`` where ``centers`` lists the chosen
            prices in cluster-id order.

        Raises:
            ValueError: If ``values`` is empty or ``K`` is smaller than 1.
        """
        count = len(values)
        if count == 0:
            raise ValueError("values must not be empty")
        if K < 1:
            raise ValueError("K must be at least 1")
        # A private generator with a fixed seed keeps the run reproducible
        # without reseeding the process-wide random state.
        rng = random.Random(100)
        oldCenters = []
        for i in range(K):
            # randrange excludes its upper bound, so the index is always valid
            # (randint(0, count) could return count and overrun the data).
            index = rng.randrange(count)
            entry = Cluster.setdefault(i, {})
            entry["center"] = values[index]
            entry["values"] = []
            oldCenters.append(values[index])
        return oldCenters, Cluster

    def distance(self, price1, price2):
        """Return the Euclidean distance between two prices.

        In one dimension this is simply the absolute difference.
        """
        return np.abs(price1 - price2)

    def kMeans(self, data, K, maxIters):
        """Cluster ``data`` into ``K`` groups.

        Args:
            data: Non-empty sequence of prices.
            K: Number of clusters.
            maxIters: Upper bound on the number of assignment/update passes.
                At least one pass is always performed.

        Returns:
            Dict mapping cluster id to ``{"center": c, "values": [...]}``,
            where ``values`` holds the members from the final pass. A cluster
            that attracted no point keeps its previous center and an empty
            ``values`` list.
        """
        Cluster = dict()
        oldCenters, Cluster = self.initCenters(data, K, Cluster)
        print("初始的簇类中心为:{}".format(oldCenters))
        i = 0  # number of completed passes before the current one
        while True:
            # Assignment step: attach every price to its nearest center
            # (the lowest cluster id wins ties).
            for price in data:
                minDistance = np.inf
                minIndex = -1
                for key, cluster in Cluster.items():
                    dis = self.distance(price, cluster["center"])
                    if dis < minDistance:
                        minDistance = dis
                        minIndex = key
                Cluster[minIndex]["values"].append(price)

            # Update step: move each center to the mean of its members.
            newCenters = []
            for cluster in Cluster.values():
                # An empty cluster keeps its center; np.mean([]) would be nan,
                # and nan never compares equal, which would hide convergence.
                if cluster["values"]:
                    cluster["center"] = np.mean(cluster["values"])
                newCenters.append(cluster["center"])
            print("第{}次迭代后的簇类中心为:{}".format(i, newCenters))

            if oldCenters == newCenters or i + 1 >= maxIters:
                break
            oldCenters = newCenters
            i += 1
            # Forget the memberships before the next assignment pass.
            for cluster in Cluster.values():
                cluster["values"] = []
        return Cluster


if __name__ == "__main__":
    file = "sku-price/skuid_price.csv"
    km = kMeans()
    data = km.loadData(file)
    newData, upper_limit, lower_limit = km.filterAnomalyValue(data)
    Cluster = km.kMeans(newData["price"].values, K=7, maxIters=200)
    print(Cluster)

结果

最大异常值为:5149.081853395541,最小异常值为:1
初始的簇类中心为:[362, 58, 48, 1881, 149, 145, 18]
第0次迭代后的簇类中心为:[639.5957446808511, 76.22099447513813, 42.116883116883116, 2633.59649122807, 194.6044776119403, 123.68, 15.355371900826446]
第1次迭代后的簇类中心为:[803.8540372670808, 78.05555555555556, 43.8034188034188, 3236.0897435897436, 259.305, 127.03703703703704, 13.345794392523365]
第2次迭代后的簇类中心为:[889.9057971014493, 80.45161290322581, 44.208333333333336, 3352.5416666666665, 332.25615763546796, 143.6, 13.345794392523365]
第3次迭代后的簇类中心为:[958.2520661157025, 84.01796407185628, 45.03174603174603, 3429.014705882353, 403.6180904522613, 162.56185567010309, 13.345794392523365]
第4次迭代后的簇类中心为:[1007.8967136150235, 91.15300546448087, 46.857142857142854, 3448.179104477612, 472.7724867724868, 187.87745098039215, 13.345794392523365]
第5次迭代后的簇类中心为:[1056.4972677595629, 99.67692307692307, 50.11842105263158, 3448.179104477612, 538.313829787234, 214.69117647058823, 14.368421052631579]
第6次迭代后的簇类中心为:[1097.975, 111.60765550239235, 53.98816568047337, 3448.179104477612, 579.7908163265306, 239.82872928176795, 15.355371900826446]
第7次迭代后的簇类中心为:[1128.7430555555557, 122.0377358490566, 57.79459459459459, 3448.179104477612, 615.2448979591836, 264.6140350877193, 16.34375]
第8次迭代后的簇类中心为:[1158.4, 132.95890410958904, 62.30890052356021, 3448.179104477612, 651.8952879581152, 296.39877300613495, 18.281690140845072]
第9次迭代后的簇类中心为:[1183.1596638655462, 145.78095238095239, 68.66341463414633, 3448.179104477612, 687.5934065934066, 328.2654320987654, 20.411392405063292]
第10次迭代后的簇类中心为:[1223.0833333333333, 160.47524752475246, 74.29596412556054, 3465.4545454545455, 717.1657458563536, 355.64935064935065, 21.810650887573964]
第11次迭代后的簇类中心为:[1254.0309278350514, 175.13197969543148, 80.17094017094017, 3465.4545454545455, 741.934065934066, 382.4166666666667, 23.704918032786885]
第12次迭代后的簇类中心为:[1303.3658536585365, 191.92513368983958, 85.99604743083005, 3465.4545454545455, 766.9424083769634, 405.6212121212121, 24.942708333333332]
第13次迭代后的簇类中心为:[1419.5694444444443, 204.44919786096256, 91.232, 3536.0967741935483, 798.890052356021, 436.9389312977099, 27.36190476190476]
第14次迭代后的簇类中心为:[1595.0892857142858, 218.5632183908046, 97.68339768339769, 3593.6610169491523, 839.5846153846154, 464.0220588235294, 29.254464285714285]
第15次迭代后的簇类中心为:[1781.9791666666667, 233.53529411764706, 104.02334630350195, 3669.4727272727273, 881.2553191489362, 499.0211267605634, 31.818930041152264]
第16次迭代后的簇类中心为:[1977.2093023255813, 244.36746987951807, 109.40873015873017, 3745.9411764705883, 918.4198895027624, 525.2266666666667, 34.011538461538464]
第17次迭代后的簇类中心为:[2163.2105263157896, 257.71069182389937, 116.08835341365462, 3803.1041666666665, 956.7159090909091, 548.0193548387097, 36.33812949640288]
第18次迭代后的簇类中心为:[2366.189189189189, 271.9483870967742, 122.59016393442623, 3907.767441860465, 1000.3636363636364, 576.1779141104295, 38.63513513513514]
第19次迭代后的簇类中心为:[2474.153846153846, 289.85526315789474, 128.77551020408163, 3998.025641025641, 1029.1633986928105, 603.1144578313254, 40.31715210355987]
第20次迭代后的簇类中心为:[2492.975, 310.91333333333336, 135.5483870967742, 4018.315789473684, 1045.041958041958, 628.8282208588957, 41.90654205607477]
第21次迭代后的簇类中心为:[2511.769230769231, 329.50993377483445, 141.08032128514057, 4018.315789473684, 1059.876811594203, 649.2866242038217, 43.259818731117825]
第22次迭代后的簇类中心为:[2531.2105263157896, 348.44666666666666, 147.96774193548387, 4018.315789473684, 1085.984251968504, 675.9177215189874, 45.05813953488372]
第23次迭代后的簇类中心为:[2550.1794871794873, 369.6808510638298, 158.16935483870967, 4038.5135135135133, 1100.7142857142858, 694.5253164556962, 47.38781163434903]
第24次迭代后的簇类中心为:[2605.625, 382.863309352518, 166.69787234042553, 4081.342857142857, 1117.9912280701753, 708.493670886076, 50.269633507853406]
第25次迭代后的簇类中心为:[2639.1428571428573, 399.94573643410854, 176.1949152542373, 4128.121212121212, 1134.066037735849, 722.3703703703703, 52.09620253164557]
第26次迭代后的簇类中心为:[2693.9024390243903, 411.4047619047619, 181.85470085470087, 4152.09375, 1161.0392156862745, 733.7743902439024, 53.43316831683168]
第27次迭代后的簇类中心为:[2693.9024390243903, 424.46031746031747, 188.5286343612335, 4152.09375, 1177.3473684210526, 748.6, 55.39088729016787]
第28次迭代后的簇类中心为:[2712.9, 441.15748031496065, 194.43805309734512, 4152.09375, 1190.1489361702127, 759.6100628930818, 56.6]
第29次迭代后的簇类中心为:[2712.9, 457.5833333333333, 202.08796296296296, 4152.09375, 1207.896551724138, 779.6474358974359, 58.893181818181816]
第30次迭代后的簇类中心为:[2712.9, 474.89230769230767, 209.56880733944953, 4152.09375, 1236.4675324675325, 800.2704402515724, 59.96420581655481]
第31次迭代后的簇类中心为:[2712.9, 490.43283582089555, 214.5852534562212, 4152.09375, 1283.125, 826.2085889570552, 60.93156732891832]
第32次迭代后的簇类中心为:[2712.9, 508.9136690647482, 220.85185185185185, 4152.09375, 1334.0377358490566, 852.6134969325153, 62.073913043478264]
第33次迭代后的簇类中心为:[2712.9, 519.8, 223.66355140186917, 4152.09375, 1387.9772727272727, 873.5121951219512, 62.747844827586206]
第34次迭代后的簇类中心为:[2748.157894736842, 530.6291390728477, 229.0097087378641, 4152.09375, 1452.4878048780488, 890.4906832298136, 64.4324894514768]
第35次迭代后的簇类中心为:[2804.054054054054, 544.6513157894736, 235.3574879227053, 4174.935483870968, 1505.175, 903.8789808917197, 65.27139874739039]
第36次迭代后的簇类中心为:[2822.3611111111113, 554.5562913907285, 242.00975609756097, 4174.935483870968, 1537.4871794871794, 912.3870967741935, 66.46090534979425]
第37次迭代后的簇类中心为:[2822.3611111111113, 566.5833333333334, 246.95544554455446, 4174.935483870968, 1574.5142857142857, 930.0463576158941, 67.51016260162602]
第38次迭代后的簇类中心为:[2822.3611111111113, 576.1552795031056, 251.76142131979697, 4174.935483870968, 1606.125, 945.374149659864, 68.75350701402806]
第39次迭代后的簇类中心为:[2822.3611111111113, 583.6036585365854, 256.8072916666667, 4174.935483870968, 1655.75, 960.4794520547945, 70.00592885375494]
第40次迭代后的簇类中心为:[2822.3611111111113, 595.3913043478261, 262.92424242424244, 4174.935483870968, 1655.75, 965.9859154929577, 70.1854043392505]
第41次迭代后的簇类中心为:[2822.3611111111113, 606.4076433121019, 270.55223880597015, 4174.935483870968, 1655.75, 970.0575539568346, 70.93150684931507]
第42次迭代后的簇类中心为:[2822.3611111111113, 611.9873417721519, 273.68, 4174.935483870968, 1655.75, 974.1764705882352, 71.50389105058366]
第43次迭代后的簇类中心为:[2822.3611111111113, 615.3270440251572, 277.1326530612245, 4174.935483870968, 1655.75, 976.9104477611941, 72.46628131021194]
第44次迭代后的簇类中心为:[2822.3611111111113, 620.8589743589744, 282.1015228426396, 4174.935483870968, 1655.75, 978.2857142857143, 73.0478927203065]
第45次迭代后的簇类中心为:[2822.3611111111113, 624.2193548387097, 287.7083333333333, 4174.935483870968, 1655.75, 979.6590909090909, 74.40831758034027]
第46次迭代后的簇类中心为:[2822.3611111111113, 627.5921052631579, 293.8042328042328, 4174.935483870968, 1655.75, 979.6590909090909, 75.57570093457944]
第47次迭代后的簇类中心为:[2822.3611111111113, 632.125, 298.5483870967742, 4174.935483870968, 1655.75, 982.3846153846154, 76.57037037037037]
第48次迭代后的簇类中心为:[2822.3611111111113, 635.51677852349, 302.3903743315508, 4174.935483870968, 1655.75, 982.3846153846154, 76.97785977859779]
第49次迭代后的簇类中心为:[2822.3611111111113, 638.9527027027027, 307.9344262295082, 4174.935483870968, 1655.75, 983.7364341085271, 78.1970802919708]
第50次迭代后的簇类中心为:[2822.3611111111113, 643.5479452054794, 313.2142857142857, 4174.935483870968, 1655.75, 985.1015625, 79.0126811594203]
第51次迭代后的簇类中心为:[2822.3611111111113, 649.3793103448276, 316.48369565217394, 4174.935483870968, 1655.75, 987.8333333333334, 79.2242314647378]
第52次迭代后的簇类中心为:[2822.3611111111113, 654.0206896551724, 318.9081081081081, 4174.935483870968, 1655.75, 990.5806451612904, 79.43682310469315]
第53次迭代后的簇类中心为:[2822.3611111111113, 658.6689655172414, 321.3279569892473, 4174.935483870968, 1655.75, 993.360655737705, 79.65225225225225]
第54次迭代后的簇类中心为:[2822.3611111111113, 659.8472222222222, 322.8817204301075, 4174.935483870968, 1655.75, 993.360655737705, 79.86870503597122]
第55次迭代后的簇类中心为:[2822.3611111111113, 661.027972027972, 325.7826086956522, 4174.935483870968, 1655.75, 993.360655737705, 80.51878354203936]
第56次迭代后的簇类中心为:[2822.3611111111113, 664.5633802816901, 328.9402173913044, 4174.935483870968, 1655.75, 994.7355371900826, 80.9536541889483]
第57次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 330.5217391304348, 4174.935483870968, 1655.75, 994.7355371900826, 81.17259786476869]
第58次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 331.20765027322403, 4174.935483870968, 1655.75, 994.7355371900826, 81.39253996447601]
第59次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 332.5911602209945, 4174.935483870968, 1655.75, 994.7355371900826, 81.83362831858408]
第60次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 333.2888888888889, 4174.935483870968, 1655.75, 994.7355371900826, 82.0547703180212]
第61次迭代后的簇类中心为:[2822.3611111111113, 665.7659574468086, 333.2888888888889, 4174.935483870968, 1655.75, 994.7355371900826, 82.0547703180212]
{0: {'center': 2822.3611111111113, 'values': [3308, 2988, 2404, 2360, 2667, 2854, 3311, 3070, 3227, 3444, 2817, 3271, 2528, 2884, 3011, 2707, 2563, 2695, 2545, 2369, 2569, 2966, 2308, 2889, 2709, 2867, 3361, 3290, 2372, 3288, 2402, 2823, 3042, 2381, 2406, 2909]}, 1: {'center': 665.7659574468086, 'values': [609, 757, 644, 758, 638, 768, 521, 802, 747, 781, 621, 761, 536, 705, 647, 601, 797, 707, 777, 781, 645, 594, 698, 723, 606, 692, 754, 626, 706, 505, 718, 676, 592, 602, 634, 524, 733, 623, 721, 593, 582, 794, 589, 541, 563, 777, 700, 706, 576, 669, 569, 522, 532, 566, 717, 596, 817, 646, 707, 584, 580, 742, 555, 618, 566, 726, 593, 658, 751, 681, 582, 585, 664, 768, 635, 743, 632, 640, 735, 545, 692, 627, 763, 560, 681, 515, 683, 731, 726, 808, 732, 635, 600, 684, 670, 523, 812, 643, 598, 735, 557, 618, 821, 732, 725, 567, 809, 553, 821, 803, 732, 536, 669, 818, 695, 500, 509, 814, 596, 509, 659, 674, 684, 778, 581, 741, 789, 589, 653, 788, 626, 793, 672, 567, 827, 579, 736, 640, 607, 658, 755]}, 2: {'center': 333.2888888888889, 'values': [385, 405, 233, 285, 247, 491, 439, 387, 237, 244, 245, 219, 482, 461, 448, 324, 432, 426, 378, 400, 495, 419, 259, 467, 311, 351, 292, 427, 213, 473, 229, 317, 217, 216, 338, 489, 306, 445, 314, 430, 352, 259, 268, 329, 273, 454, 465, 327, 339, 284, 211, 493, 310, 224, 273, 249, 213, 414, 344, 326, 349, 279, 395, 237, 252, 412, 362, 241, 267, 465, 385, 427, 449, 414, 468, 327, 420, 444, 267, 268, 294, 480, 257, 245, 225, 427, 444, 228, 253, 269, 336, 231, 288, 210, 359, 228, 475, 255, 427, 274, 229, 292, 301, 458, 295, 290, 453, 333, 369, 227, 386, 474, 265, 379, 398, 297, 241, 215, 209, 247, 483, 423, 299, 413, 327, 417, 378, 352, 273, 322, 247, 306, 271, 243, 385, 369, 343, 276, 377, 298, 473, 378, 427, 241, 485, 314, 232, 377, 331, 325, 395, 273, 493, 414, 475, 260, 444, 244, 297, 453, 415, 458, 281, 245, 319, 470, 244, 246, 235, 221, 270, 290, 222, 243, 288, 223, 227, 272, 208, 208]}, 3: {'center': 4174.935483870968, 
'values': [4374, 4297, 3848, 3672, 4083, 4653, 3702, 3687, 5114, 5014, 3574, 4835, 4295, 4373, 4284, 3891, 3624, 3734, 4985, 3713, 4399, 4173, 4375, 4512, 4841, 3505, 3999, 3740, 4279, 4299, 3549]}, 4: {'center': 1655.75, 'values': [1760, 1433, 1430, 1326, 1528, 1556, 1516, 1564, 1382, 1934, 2044, 1395, 1409, 1514, 1604, 1447, 2042, 1512, 1773, 1421, 1881, 1810, 1874, 1542, 2062, 2145, 2062, 1395]}, 5: {'center': 994.7355371900826, 'values': [838, 1007, 847, 848, 981, 1061, 1041, 1039, 1063, 1039, 969, 1042, 1207, 1073, 1214, 1059, 865, 1078, 1006, 1121, 1017, 1249, 973, 1230, 849, 1124, 1259, 992, 837, 1268, 1254, 1213, 887, 1170, 1233, 1204, 1165, 1220, 1254, 1127, 1186, 1134, 924, 1027, 985, 861, 943, 921, 927, 834, 844, 974, 980, 871, 918, 1038, 838, 878, 993, 908, 952, 902, 964, 937, 848, 1002, 972, 972, 963, 1077, 999, 1017, 1029, 983, 887, 1000, 956, 991, 947, 863, 990, 999, 1015, 887, 898, 840, 995, 846, 1000, 1130, 909, 1014, 938, 1042, 893, 1103, 1037, 1076, 1043, 992, 894, 937, 843, 860, 1039, 1017, 896, 906, 875, 946, 992, 838, 911, 930, 923, 862, 899, 1069, 927, 1077, 907]}, 6: {'center': 82.0547703180212, 'values': [143, 196, 35, 127, 86, 127, 67, 28, 125, 125, 79, 53, 109, 35, 42, 51, 46, 67, 41, 40, 5, 93, 183, 24, 74, 151, 43, 74, 15, 176, 18, 52, 15, 48, 160, 35, 63, 37, 62, 159, 60, 80, 189, 107, 91, 94, 114, 159, 27, 48, 6, 169, 41, 120, 39, 111, 184, 71, 46, 160, 75, 37, 176, 17, 45, 77, 69, 90, 100, 31, 36, 61, 78, 83, 65, 71, 130, 3, 42, 33, 108, 85, 96, 182, 25, 72, 55, 190, 145, 72, 13, 69, 30, 13, 72, 97, 43, 124, 140, 119, 31, 45, 33, 34, 19, 70, 96, 52, 164, 200, 100, 154, 52, 177, 171, 82, 153, 123, 90, 66, 53, 75, 113, 22, 135, 202, 205, 122, 34, 156, 201, 173, 155, 71, 156, 137, 207, 56, 123, 71, 188, 175, 63, 174, 86, 116, 93, 125, 134, 187, 120, 14, 178, 187, 150, 38, 188, 117, 123, 190, 95, 68, 38, 100, 74, 53, 58, 11, 104, 58, 120, 19, 160, 5, 54, 87, 116, 77, 27, 65, 12, 32, 116, 56, 30, 95, 41, 39, 17, 158, 100, 135, 145, 121, 
11, 116, 66, 74, 170, 6, 36, 7, 171, 94, 201, 63, 18, 143, 15, 86, 79, 96, 171, 3, 144, 206, 85, 126, 178, 92, 8, 50, 65, 3, 48, 58, 58, 7, 172, 177, 98, 74, 6, 89, 52, 4, 10, 101, 137, 70, 26, 55, 114, 31, 134, 84, 26, 53, 116, 20, 76, 154, 11, 5, 16, 18, 34, 79, 17, 82, 118, 76, 32, 62, 53, 7, 157, 58, 9, 3, 76, 61, 182, 79, 36, 39, 143, 33, 149, 157, 10, 63, 98, 88, 6, 6, 13, 103, 142, 92, 24, 99, 46, 4, 124, 69, 100, 6, 40, 90, 50, 180, 151, 102, 204, 146, 84, 142, 117, 3, 16, 168, 111, 70, 8, 96, 102, 74, 179, 155, 30, 133, 23, 64, 83, 24, 88, 23, 141, 50, 17, 59, 137, 49, 30, 27, 57, 84, 11, 54, 80, 14, 107, 135, 52, 71, 39, 143, 161, 77, 30, 39, 9, 48, 20, 53, 92, 143, 18, 59, 108, 56, 74, 88, 25, 105, 65, 64, 69, 14, 117, 14, 52, 70, 124, 94, 133, 72, 64, 142, 79, 50, 108, 117, 63, 121, 4, 81, 11, 92, 17, 14, 33, 48, 19, 127, 188, 3, 21, 12, 48, 6, 117, 124, 22, 63, 190, 139, 159, 103, 58, 12, 101, 16, 74, 37, 67, 193, 26, 69, 91, 40, 203, 178, 16, 35, 39, 60, 36, 96, 39, 166, 50, 40, 41, 125, 9, 55, 116, 63, 54, 36, 40, 123, 83, 92, 199, 58, 104, 142, 93, 12, 144, 61, 42, 66, 58, 152, 147, 189, 100, 63, 7, 79, 93, 112, 70, 157, 133, 89, 124, 60, 25, 97, 7, 79, 166, 9, 66, 93, 30, 84, 46, 118, 147, 107, 40, 50, 41, 63, 115, 175, 124, 7, 10, 201, 81, 9, 101, 105, 128, 83, 126, 45, 40, 100, 115, 147, 98, 22, 6, 99, 30, 36, 88, 141, 184, 91, 136, 188, 37, 70, 63, 206, 2, 22, 12, 143, 131, 174, 45, 19, 101, 71, 112, 3, 102, 95, 178, 172, 25, 31, 64, 32, 197, 148, 10, 106, 3, 175, 6, 21, 147, 10, 65, 51, 14, 166, 76, 44, 95, 113, 62, 65, 2, 11]}}

代码下载链接

实现基于二分-Kmeans的商品价格聚类

# -*- coding: utf-8 -*-
"""
Author: Thinkgamer
Desc:
    Listing 4-10: cluster product prices with bisecting K-means.
"""
import random

import numpy as np
import pandas as pd


class kMeans:
    """One-dimensional K-means and bisecting K-means for product prices."""

    def __init__(self):
        pass

    def loadData(self, file):
        """Read a comma-separated file whose first row is the header.

        Args:
            file: Path of the CSV file to read.

        Returns:
            The file content as a ``pandas.DataFrame``.
        """
        return pd.read_csv(file, header=0, sep=",")

    def filterAnomalyValue(self, data):
        """Remove rows whose ``price`` falls outside the accepted band.

        The band is ``mean +/- 3 * std`` of the ``price`` column, except that
        the upper bound is never below 5000 and the lower bound is never
        below 1. Both bounds are exclusive.

        Args:
            data: DataFrame with a numeric ``price`` column.

        Returns:
            A tuple ``(filtered_frame, upper_limit, lower_limit)``.
        """
        prices = data["price"]
        mean = np.mean(prices)
        spread = 3 * np.std(prices)
        upper = mean + spread
        lower = mean - spread
        upper_limit = upper if upper > 5000 else 5000
        lower_limit = lower if lower > 1 else 1
        print("最大异常值为:{},最小异常值为:{}".format(upper_limit, lower_limit))
        # Keep only the rows strictly inside (lower_limit, upper_limit).
        newData = data[(prices < upper_limit) & (prices > lower_limit)]
        return newData, upper_limit, lower_limit

    def initCenters(self, values, K, Cluster):
        """Pick ``K`` initial centers at random (with replacement).

        Args:
            values: Indexable, non-empty sequence of prices.
            K: Number of clusters, at least 1.
            Cluster: Dict that receives one entry per cluster id, each of the
                form ``{"center": price, "values": []}``.

        Returns:
            A tuple ``(centers, Cluster)`` where ``centers`` lists the chosen
            prices in cluster-id order.

        Raises:
            ValueError: If ``values`` is empty or ``K`` is smaller than 1.
        """
        count = len(values)
        if count == 0:
            raise ValueError("values must not be empty")
        if K < 1:
            raise ValueError("K must be at least 1")
        # A private generator with a fixed seed keeps the run reproducible
        # without reseeding the process-wide random state.
        rng = random.Random(100)
        oldCenters = []
        for i in range(K):
            # randrange excludes its upper bound, so the index is always valid
            # (randint(0, count) could return count and overrun the data).
            index = rng.randrange(count)
            entry = Cluster.setdefault(i, {})
            entry["center"] = values[index]
            entry["values"] = []
            oldCenters.append(values[index])
        return oldCenters, Cluster

    def distance(self, price1, price2):
        """Return the Euclidean distance between two prices.

        In one dimension this is simply the absolute difference.
        """
        return np.abs(price1 - price2)

    def kMeans(self, data, K, maxIters):
        """Cluster ``data`` into ``K`` groups.

        Args:
            data: Non-empty sequence of prices.
            K: Number of clusters.
            maxIters: Upper bound on the number of assignment/update passes.
                At least one pass is always performed.

        Returns:
            Dict mapping cluster id to ``{"center": c, "values": [...]}``,
            where ``values`` holds the members from the final pass. A cluster
            that attracted no point keeps its previous center and an empty
            ``values`` list.
        """
        Cluster = dict()
        oldCenters, Cluster = self.initCenters(data, K, Cluster)
        i = 0  # number of completed passes before the current one
        while True:
            # Assignment step: attach every price to its nearest center
            # (the lowest cluster id wins ties).
            for price in data:
                minDistance = np.inf
                minIndex = -1
                for key, cluster in Cluster.items():
                    dis = self.distance(price, cluster["center"])
                    if dis < minDistance:
                        minDistance = dis
                        minIndex = key
                Cluster[minIndex]["values"].append(price)

            # Update step: move each center to the mean of its members.
            newCenters = []
            for cluster in Cluster.values():
                # An empty cluster keeps its center; np.mean([]) would be nan,
                # and nan never compares equal, which would hide convergence.
                if cluster["values"]:
                    cluster["center"] = np.mean(cluster["values"])
                newCenters.append(cluster["center"])

            if oldCenters == newCenters or i + 1 >= maxIters:
                break
            oldCenters = newCenters
            i += 1
            # Forget the memberships before the next assignment pass.
            for cluster in Cluster.values():
                cluster["values"] = []
        return Cluster

    def SSE(self, data, mean):
        """Return the sum of squared deviations of ``data`` from ``mean``.

        Args:
            data: Sequence of prices (may be empty, giving ``0.0``).
            mean: Reference value, normally the cluster center.

        Returns:
            The sum of squared errors as a ``float``.
        """
        # Plain ndarray arithmetic; np.mat was removed in NumPy 2.0.
        deviations = np.asarray(data, dtype=float) - mean
        return float(np.dot(deviations, deviations))

    def diKMeans(self, data, K=7):
        """Cluster ``data`` with bisecting K-means.

        Starting from one cluster holding all prices, the cluster with the
        largest SSE is repeatedly split in two with ``kMeans`` until ``K``
        clusters exist.

        Args:
            data: Non-empty sequence of prices.
            K: Desired number of clusters.

        Returns:
            Dict mapping cluster id to
            ``{"center": c, "values": [...], "sse": s}``. Fewer than ``K``
            clusters are returned when no remaining cluster contains two
            distinct prices. The initial cluster carries ``sse == inf`` so
            that it is always the first one selected.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError("data must not be empty")
        clusterSSEResult = {
            0: {"values": data, "sse": np.inf, "center": np.mean(data)}
        }
        while len(clusterSSEResult) < K:
            # Select the cluster with the largest SSE (first one wins ties).
            maxSSEKey = max(
                clusterSSEResult, key=lambda key: clusterSSEResult[key]["sse"]
            )
            target = clusterSSEResult[maxSSEKey]["values"]
            # The worst cluster has no spread, so neither has any other one:
            # another split would only produce empty clusters.
            if len(np.unique(target)) < 2:
                break

            # clusterResult: {0: {"center": x, "values": [...]}, 1: {...}}
            clusterResult = self.kMeans(target, K=2, maxIters=200)

            # Replace the split cluster by its first half (re-inserted so it
            # moves to the end of the dict) and append the second half under
            # a fresh id.
            del clusterSSEResult[maxSSEKey]
            maxKey = max(list(clusterSSEResult) + [maxSSEKey]) + 1
            for key, half in ((maxSSEKey, clusterResult[0]), (maxKey, clusterResult[1])):
                clusterSSEResult[key] = {
                    "center": half["center"],
                    "values": half["values"],
                    "sse": self.SSE(half["values"], half["center"]),
                }
        return clusterSSEResult


if __name__ == "__main__":
    file = "../data/sku-price/skuid_price.csv"
    km = kMeans()
    data = km.loadData(file)
    newData, upper_limit, lower_limit = km.filterAnomalyValue(data)
    clusterSSE = km.diKMeans(newData["price"].values, K=7)
    print(clusterSSE)

结果

最大异常值为:5149.081853395541,最小异常值为:1
{3: {'center': 4152.09375, 'values': [4374, 4297, 3848, 3672, 4083, 4653, 3444, 3702, 3687, 5114, 5014, 3574, 4835, 4295, 4373, 4284, 3891, 3624, 3734, 4985, 3713, 4399, 4173, 4375, 4512, 4841, 3505, 3999, 3740, 4279, 4299, 3549], 'sse': 7261498.71875}, 1: {'center': 1127.6285714285714, 'values': [1007, 981, 1061, 1041, 1039, 1760, 1433, 1430, 1063, 1039, 969, 1326, 1042, 1528, 1207, 1073, 1214, 1059, 1078, 1006, 1556, 1121, 1017, 1249, 973, 1230, 1516, 1564, 1124, 1382, 1259, 992, 1268, 1254, 1213, 1170, 1233, 1395, 1409, 1514, 1604, 1204, 1447, 1512, 1165, 1220, 1773, 1254, 1127, 1421, 1186, 1542, 1134, 1395, 924, 1027, 985, 943, 921, 927, 974, 980, 918, 1038, 993, 952, 964, 937, 1002, 972, 972, 963, 1077, 999, 1017, 1029, 983, 1000, 956, 991, 947, 990, 999, 1015, 995, 1000, 1130, 1014, 938, 1042, 1103, 1037, 1076, 1043, 992, 937, 1039, 1017, 946, 992, 930, 923, 1069, 927, 1077], 'sse': 4282336.514285714}, 4: {'center': 707.6685714285715, 'values': [838, 609, 757, 847, 848, 644, 758, 638, 768, 521, 802, 747, 781, 621, 865, 761, 849, 536, 837, 705, 647, 601, 797, 707, 777, 781, 645, 594, 698, 723, 606, 692, 754, 626, 706, 505, 718, 676, 592, 602, 634, 524, 733, 623, 721, 593, 582, 794, 589, 541, 563, 777, 700, 887, 706, 576, 669, 569, 522, 532, 861, 566, 717, 596, 817, 646, 707, 584, 580, 742, 555, 618, 566, 834, 844, 726, 593, 658, 751, 681, 871, 582, 585, 664, 768, 635, 743, 632, 838, 640, 735, 545, 878, 692, 627, 763, 908, 560, 681, 902, 515, 848, 683, 731, 726, 808, 732, 635, 600, 684, 670, 523, 812, 643, 598, 735, 557, 887, 618, 863, 821, 732, 725, 567, 887, 898, 840, 809, 553, 821, 803, 846, 732, 536, 669, 818, 909, 695, 893, 509, 814, 596, 509, 659, 674, 684, 778, 894, 843, 860, 581, 741, 789, 896, 589, 906, 653, 875, 788, 626, 838, 793, 911, 672, 567, 862, 827, 579, 736, 899, 640, 607, 658, 755, 907], 'sse': 2263906.777142857}, 2: {'center': 336.3314606741573, 'values': [385, 405, 233, 285, 247, 491, 439, 387, 237, 244, 245, 219, 482, 461, 448, 324, 432, 
426, 378, 400, 495, 419, 259, 467, 311, 351, 292, 427, 213, 473, 229, 317, 217, 216, 338, 489, 306, 445, 314, 430, 352, 259, 268, 329, 273, 454, 465, 327, 339, 284, 211, 493, 310, 224, 273, 249, 213, 414, 344, 326, 349, 279, 395, 237, 252, 412, 362, 241, 267, 465, 385, 427, 449, 414, 468, 327, 420, 444, 267, 268, 294, 480, 257, 245, 225, 427, 444, 228, 253, 269, 336, 231, 288, 210, 359, 228, 475, 255, 427, 274, 229, 292, 301, 458, 295, 290, 453, 333, 369, 227, 386, 474, 265, 379, 398, 297, 241, 215, 247, 483, 423, 299, 413, 327, 417, 378, 352, 500, 273, 322, 247, 306, 271, 243, 385, 369, 343, 276, 377, 298, 473, 378, 427, 241, 485, 314, 232, 377, 331, 325, 395, 273, 493, 414, 475, 260, 444, 244, 297, 453, 415, 458, 281, 245, 319, 470, 244, 246, 235, 221, 270, 290, 222, 243, 288, 223, 227, 272], 'sse': 1361889.4438202246}, 5: {'center': 82.72056239015818, 'values': [143, 196, 35, 127, 86, 127, 67, 28, 125, 125, 79, 53, 109, 35, 42, 51, 46, 67, 41, 40, 5, 93, 183, 24, 74, 151, 43, 74, 15, 176, 18, 52, 15, 48, 160, 35, 63, 37, 62, 159, 60, 80, 189, 107, 91, 94, 114, 159, 27, 48, 6, 169, 41, 120, 39, 111, 184, 71, 46, 160, 75, 37, 176, 17, 45, 77, 69, 90, 100, 31, 36, 61, 78, 83, 65, 71, 130, 3, 42, 33, 108, 85, 96, 182, 25, 72, 55, 190, 145, 72, 13, 69, 30, 13, 72, 97, 43, 124, 140, 119, 31, 45, 33, 34, 19, 70, 96, 52, 164, 200, 100, 154, 52, 177, 171, 82, 153, 123, 90, 66, 53, 75, 113, 22, 135, 202, 205, 122, 34, 156, 201, 173, 155, 71, 156, 137, 207, 56, 123, 71, 188, 175, 63, 174, 209, 86, 116, 93, 125, 134, 187, 120, 14, 178, 187, 150, 38, 188, 117, 123, 190, 95, 68, 38, 100, 74, 53, 58, 11, 104, 58, 120, 19, 160, 5, 54, 87, 116, 77, 27, 65, 12, 32, 116, 56, 30, 95, 41, 39, 17, 158, 100, 135, 145, 121, 11, 116, 66, 74, 170, 6, 36, 7, 171, 94, 201, 63, 18, 143, 15, 86, 79, 96, 171, 3, 144, 206, 85, 126, 178, 92, 8, 50, 65, 3, 48, 58, 58, 7, 172, 177, 98, 74, 6, 89, 52, 4, 10, 101, 137, 70, 26, 55, 114, 31, 134, 84, 26, 53, 116, 20, 76, 154, 11, 5, 16, 18, 34, 79, 
17, 82, 118, 76, 32, 62, 53, 7, 157, 58, 9, 3, 76, 61, 182, 79, 36, 39, 143, 33, 149, 157, 10, 63, 98, 88, 6, 6, 13, 103, 142, 92, 24, 99, 46, 4, 124, 69, 100, 6, 40, 90, 50, 180, 151, 102, 204, 146, 84, 142, 117, 3, 16, 168, 111, 70, 8, 96, 102, 74, 179, 155, 30, 133, 23, 64, 83, 24, 88, 23, 141, 50, 17, 59, 137, 49, 30, 27, 57, 84, 11, 54, 80, 14, 107, 135, 52, 71, 39, 143, 161, 77, 30, 39, 9, 48, 20, 53, 92, 143, 18, 59, 108, 56, 74, 88, 25, 105, 65, 64, 69, 14, 117, 14, 52, 70, 124, 94, 133, 72, 64, 142, 79, 50, 108, 117, 63, 121, 4, 81, 11, 92, 17, 14, 33, 48, 19, 127, 188, 3, 21, 12, 48, 6, 117, 124, 22, 63, 190, 139, 159, 103, 58, 12, 101, 16, 74, 37, 67, 193, 26, 69, 91, 40, 203, 178, 16, 35, 39, 60, 36, 96, 39, 166, 50, 40, 41, 125, 9, 55, 116, 63, 54, 36, 40, 123, 83, 92, 199, 58, 104, 142, 93, 12, 144, 61, 42, 66, 58, 152, 147, 189, 100, 63, 7, 79, 93, 112, 70, 157, 133, 89, 124, 60, 25, 97, 7, 79, 166, 9, 66, 93, 30, 84, 46, 118, 147, 107, 40, 50, 41, 63, 115, 175, 124, 7, 10, 201, 81, 9, 101, 105, 128, 83, 126, 45, 40, 100, 115, 147, 98, 22, 6, 99, 30, 36, 88, 141, 184, 91, 136, 188, 208, 37, 70, 63, 206, 2, 22, 12, 143, 131, 174, 45, 19, 101, 71, 112, 3, 102, 95, 178, 172, 25, 31, 64, 32, 197, 148, 208, 10, 106, 3, 175, 6, 21, 147, 10, 65, 51, 14, 166, 76, 44, 95, 113, 62, 65, 2, 11], 'sse': 1723360.569420035}, 0: {'center': 2241.0, 'values': [2404, 2360, 2528, 1934, 2044, 2563, 2545, 2369, 2569, 2042, 2308, 2372, 2402, 1881, 1810, 1874, 2062, 2145, 2381, 2062, 2406], 'sse': 1233570.0}, 6: {'center': 2998.0, 'values': [3308, 2988, 2667, 2854, 3311, 3070, 3227, 2817, 3271, 2884, 3011, 2707, 2695, 2966, 2889, 2709, 2867, 3361, 3290, 3288, 2823, 3042, 2909], 'sse': 1134238.0}}

sk-learn中聚类效果评估

# -*- coding: utf-8 -*-
"""
Author: Thinkgamer
Desc:
    Listing 4-11: evaluating clustering quality with scikit-learn.
"""
from sklearn import metrics

labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]

# For every score below, a larger value means the predicted labels agree
# better with the true labels.

# Adjusted Rand index
print(metrics.adjusted_rand_score(labels_true, labels_pred))
# Adjusted mutual information
print(metrics.adjusted_mutual_info_score(labels_true, labels_pred))
# Homogeneity
print(metrics.homogeneity_score(labels_true, labels_pred))
# Completeness
print(metrics.completeness_score(labels_true, labels_pred))
# V-measure: harmonic mean of homogeneity and completeness
print(metrics.v_measure_score(labels_true, labels_pred))
# Fowlkes-Mallows index (FMI)
print(metrics.fowlkes_mallows_score(labels_true, labels_pred))

结果

0.24242424242424243
0.2987924581708901
0.6666666666666669
0.420619835714305
0.5158037429793889
0.4714045207910317

基于Apriori算法实现频繁项集和相关规则挖掘

# -*- coding: utf-8 -*-
"""
Author: Thinkgamer
Desc:
    Listing 4-12: mine frequent itemsets and association rules with Apriori.
"""


class Apriori:
    """Apriori frequent-itemset and association-rule miner."""

    def __init__(self, minSupport, minConfidence, data=None):
        """Store the thresholds and the transactions to mine.

        Args:
            minSupport: Minimum support for an itemset to count as frequent.
            minConfidence: Minimum confidence for a rule to be reported.
            data: Optional list of transactions (each a list of hashable,
                mutually orderable items). The built-in sample from
                ``loadData`` is used when omitted.
        """
        self.minSupport = minSupport
        self.minConfidence = minConfidence
        self.data = self.loadData() if data is None else data

    def loadData(self):
        """Return the built-in sample transactions."""
        return [[1, 5], [2, 3, 4], [2, 3, 4, 5], [2, 3]]

    def createC1(self, data):
        """Build the candidate 1-itemsets C1 (without occurrence counts).

        Args:
            data: List of transactions.

        Returns:
            One ``frozenset`` per distinct item, ordered by item.
        """
        uniqueItems = {item for items in data for item in items}
        # frozenset is immutable and hashable, so itemsets can be dict keys.
        return [frozenset([item]) for item in sorted(uniqueItems)]

    def scanD(self, Ck):
        """Filter the candidates ``Ck`` down to the frequent itemsets Lk.

        Args:
            Ck: Candidate itemsets (frozensets).

        Returns:
            A tuple ``(Lk, supportData)``. ``Lk`` lists the candidates whose
            support reaches ``minSupport``, most recently discovered first.
            ``supportData`` maps every candidate that occurs in at least one
            transaction, frequent or not, to its support.
        """
        transactions = [set(items) for items in self.data]
        CkCount = {}
        # Count the transactions that contain each candidate.
        for items in transactions:
            for one in Ck:
                if one.issubset(items):
                    CkCount[one] = CkCount.get(one, 0) + 1
        numItems = len(transactions)
        Lk = []
        supportData = {}
        for key, count in CkCount.items():
            support = count * 1.0 / numItems
            if support >= self.minSupport:
                Lk.append(key)
            supportData[key] = support
        # Same order as repeatedly inserting at the front, in linear time.
        Lk.reverse()
        return Lk, supportData

    def generateNewCk(self, Lk, k):
        """Join itemsets of size ``k - 1`` into candidates of size ``k``.

        Two itemsets are merged when their first ``k - 2`` items, taken in
        sorted order, are identical.

        Args:
            Lk: Itemsets of size ``k - 1``.
            k: Size of the candidates to produce.

        Returns:
            The list of merged candidate itemsets.
        """
        # Sort before slicing: a frozenset has no defined iteration order, so
        # slicing first would compare arbitrary items instead of the prefix.
        prefixes = [sorted(itemset)[: k - 2] for itemset in Lk]
        nextLk = []
        for i, left in enumerate(Lk):
            for j in range(i + 1, len(Lk)):
                if prefixes[i] == prefixes[j]:
                    nextLk.append(left | Lk[j])
        return nextLk

    def gengrateLK(self):
        """Compute all frequent itemsets of ``self.data``.

        Returns:
            A tuple ``(L, supportData)``. ``L[i]`` holds the frequent
            itemsets of size ``i + 1``; the last element of ``L`` is always
            an empty list. ``supportData`` maps every counted itemset to its
            support.
        """
        C1 = self.createC1(self.data)
        L1, supportData = self.scanD(C1)
        L = [L1]
        k = 2
        while L[-1]:
            # Combine the frequent (k-1)-itemsets into candidates of size k.
            Ck = self.generateNewCk(L[-1], k)
            Lk, supK = self.scanD(Ck)
            supportData.update(supK)
            L.append(Lk)
            k += 1
        return L, supportData

    # Correctly spelled alias; the original name is kept for existing callers.
    generateLK = gengrateLK

    def generateRules(self, L, supportData):
        """Derive the association rules from the frequent itemsets.

        Args:
            L: Frequent itemsets grouped by size, as returned by
                ``gengrateLK``.
            supportData: Support of every counted itemset.

        Returns:
            List of ``(antecedent, consequent, confidence)`` tuples for the
            rules whose confidence reaches ``minConfidence``.
        """
        ruleResult = []
        # Itemsets with a single item (L[0]) cannot form a rule.
        for level in L[1:]:
            for ck in level:
                Cks = [frozenset([item]) for item in ck]
                self.rulesOfMore(ck, Cks, supportData, ruleResult)
        return ruleResult

    def rulesOfTwo(self, ck, Cks, supportData, ruleResult):
        """Evaluate the rules ``ck - c --> c`` for every consequent in ``Cks``.

        Rules reaching ``minConfidence`` are printed and appended to
        ``ruleResult``.

        Returns:
            The consequents whose rule was accepted.
        """
        prunedH = []
        for oneCk in Cks:
            antecedent = ck - oneCk
            conf = supportData[ck] / supportData[antecedent]
            if conf >= self.minConfidence:
                print(antecedent, "-->", oneCk, "Confidence is:", conf)
                ruleResult.append((antecedent, oneCk, conf))
                prunedH.append(oneCk)
        return prunedH

    def rulesOfMore(self, ck, Cks, supportData, ruleResult):
        """Generate the rules of ``ck`` with progressively larger consequents.

        Starts with the consequents in ``Cks`` and, while more than one of
        them survives, merges the survivors into consequents one item larger.
        """
        m = len(Cks[0])
        while len(ck) > m:
            Cks = self.rulesOfTwo(ck, Cks, supportData, ruleResult)
            if len(Cks) <= 1:
                break
            Cks = self.generateNewCk(Cks, m + 1)
            m += 1


if __name__ == "__main__":
    apriori = Apriori(minSupport=0.5, minConfidence=0.6)
    L, supportData = apriori.gengrateLK()
    for size, one in enumerate(L, 1):
        print("项数为 %s 的频繁项集:" % size, one)
    print("supportData:", supportData)
    print("minConf=0.6时:")
    rules = apriori.generateRules(L, supportData)

结果

项数为 1 的频繁项集: [frozenset({4}), frozenset({3}), frozenset({2}), frozenset({5})]
项数为 2 的频繁项集: [frozenset({2, 3}), frozenset({2, 4}), frozenset({3, 4})]
项数为 3 的频繁项集: [frozenset({2, 3, 4})]
项数为 4 的频繁项集: []
supportData: {frozenset({1}): 0.25, frozenset({5}): 0.5, frozenset({2}): 0.75, frozenset({3}): 0.75, frozenset({4}): 0.5, frozenset({3, 4}): 0.5, frozenset({2, 4}): 0.5, frozenset({2, 3}): 0.75, frozenset({4, 5}): 0.25, frozenset({3, 5}): 0.25, frozenset({2, 5}): 0.25, frozenset({2, 3, 4}): 0.5}
minConf=0.6时:
frozenset({3}) --> frozenset({2}) Confidence is: 1.0
frozenset({2}) --> frozenset({3}) Confidence is: 1.0
frozenset({4}) --> frozenset({2}) Confidence is: 1.0
frozenset({2}) --> frozenset({4}) Confidence is: 0.6666666666666666
frozenset({4}) --> frozenset({3}) Confidence is: 1.0
frozenset({3}) --> frozenset({4}) Confidence is: 0.6666666666666666
frozenset({3, 4}) --> frozenset({2}) Confidence is: 1.0
frozenset({2, 4}) --> frozenset({3}) Confidence is: 1.0
frozenset({2, 3}) --> frozenset({4}) Confidence is: 0.6666666666666666
frozenset({4}) --> frozenset({2, 3}) Confidence is: 1.0
frozenset({3}) --> frozenset({2, 4}) Confidence is: 0.6666666666666666
frozenset({2}) --> frozenset({3, 4}) Confidence is: 0.6666666666666666

数据挖掘-数据聚类 python实现相关推荐

  1. python函一维聚类_聚类实战:一维数组数据聚类

    大部分聚类方法针对的是多维数据,现实场景中还有可能存在一维数据的情况,针对一维数组的聚类和多维的数据有很大的不同,今天就来实战演练下: 需求内容:分析订单的价格分布 常见方案:按照100为梯度,分析不 ...

  2. python画罗小黑_python股票数据聚类算法:罗小黑战记的股票_XAC配资之家

    python股票数据聚类算法:罗小黑战记的股票 你好朋友,本文将为你分析python股票数据聚类算法怎么样以及罗小黑战记的股票的相关问题,如有任何疑问和留言咨询站长!有了尊严,我们的人生才会快乐.美好 ...

  3. python实现K-means多维数据聚类代码

    python实现K-means多维数据聚类 #!/usr/bin/env python #-*- coding:utf-8 -*- # author:wanglubao # datetime:2019 ...

  4. 【爬虫+数据可视化毕业设计:英雄联盟数据爬取及可视化分析,python爬虫可视化/数据分析/大数据/大数据屏/数据挖掘/数据爬取,程序开发-哔哩哔哩】

    [爬虫+数据可视化毕业设计:英雄联盟数据爬取及可视化分析,python爬虫可视化/数据分析/大数据/大数据屏/数据挖掘/数据爬取,程序开发-哔哩哔哩] https://b23.tv/TIoy6hj

  5. 【数据聚类】第三章第二节2:K-Means算法及其Python实现(算法实现、结果展示)

    pdf下载(密码:7281) 本文上接:[数据聚类]第三章第二节1:K-Means算法及其Python实现(距离度量方式.目标函数和算法流程) 本文下接:[数据聚类]第三章第二节3:K-Means算法 ...

  6. 【机器学习】—— K-means聚类算法原理详解 以及 二维、三维数据的K-means聚类Python实现

    文章目录 一.K-Means聚类算法原理过程 1.1 K-means聚类的一些细节 1.1.1 样本 $x^{(i)}$ 与中心点 $\mu_k$ 距离的衡量 1.1.2 K-means聚类算法 ...

  7. 【【数据可视化毕业设计:差旅数据可视化分析,python爬虫可视化/数据分析/大数据/大数据屏/数据挖掘/数据爬取,程序开发-哔哩哔哩】-哔哩哔哩】 https://b23.tv/iTt30QG

    [[数据可视化毕业设计:差旅数据可视化分析,python爬虫可视化/数据分析/大数据/大数据屏/数据挖掘/数据爬取,程序开发-哔哩哔哩]-哔哩哔哩] https://b23.tv/iTt30QG ht ...

  8. K-means 算法实现二维数据聚类

    所谓聚类分析,就是给定一个元素集合D,其中每个元素具有n个观测属性,对这些属性使用某种算法将D划分成K个子集,要求每个子集内部的元素之间相似度尽可能高,而不同子集的元素相似度尽可能低.聚类分析是一种无 ...

  9. 大数据聚类算法性能比较及实验报告

    在大数据领域这个聚类算法真是起到了十分重要的作用,只有通过有效地聚类才能得到非常直观的结果. 有一个实验要求对比两种大数据聚类算法的性能,具体的代码也不是由我实现的,我只是改了一部分,主要还是博客大佬 ...

最新文章

  1. oracle数据库gold,Oracle数据库之Oracle GoldenGate 12.2.0.1 安装、升级和删除
  2. HTTP协议的挑战者:RSocket
  3. android第一天
  4. [javaEE] JDBC快速入门
  5. 少儿编程教育是“揠苗助长”还是要培养未来的程序员?
  6. LeetCode 258 Add Digits
  7. python 持续集成部署_Jenkins部署git+python项目实现持续集成
  8. mysql 数据增量备份_MySQL数据库之mysql全量备份、增量备份实现方法
  9. 设计图书馆oracle触发器,Oracle触发器开发与设计
  10. java输出目录节点_节点如果不存在,如何创建目录?
  11. 使用C语言和Java分别实现冒泡排序和选择排序
  12. 记录淘宝里的点点滴滴
  13. eda交通灯控制器波形输入_EDA交通灯控制器设计
  14. 面试时工作经验不足,如何才能打动HR?
  15. 急!!!微信公众号数据迁移后openid无法转换
  16. 2020 校招,我是如何拿到小米、京东、字节大厂前端offer
  17. java学习书籍推荐
  18. 同花顺python_这是真的么 | 学会了用Python预测股票价格
  19. 信息系统项目管理:软件开发生命周期模型的选择比较
  20. 信息检索领域相关资料 (A Guide to Information Retrieval)

热门文章

  1. 北京新能源车指标新申请者或等8年 已排队至2027年
  2. 【深度学习智能手机步态识别】Deep Learning-Based Gait Recognition Using Smartphones in the Wild 论文理解
  3. 2018年北邮计算机考研分数线,2018年北京邮电大学软件学院考研复试分数线
  4. 布艺沙发选购知多少?
  5. 小程序的HTTP请求
  6. 电力监控系统的解决方案有哪些?
  7. windows+d失效问题解决 别看都看着里
  8. 广告学本科-近现代模拟题
  9. 论文阅读笔记:MGAT: Multi-view Graph Attention Networks
  10. linux如何修改用户属性,Linux 修改文件用户属性