Preface: this post collects the code that accompanies another article:
🥵硬啃-Machine-Learning
A notebook version is also available: code link
Data Preprocessing Methods

Standardization (Mean Removal)

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([
    [3.0, -1.0, 2.0],
    [0.0, 4.0, 3.0],
    [1.0, -4.0, 2.0]])
print(raw_samples.mean(axis=0))
print(raw_samples.std(axis=0))

# manual standardization: per column, subtract the mean and divide by the std
std_samples = raw_samples.copy()
for col in std_samples.T:
    col_mean = col.mean()
    col_std = col.std()
    col -= col_mean
    col /= col_std
print(std_samples.mean(axis=0))
print(std_samples.std(axis=0))

# the same transform via sklearn
std_samples = sp.scale(raw_samples)
print(std_samples.mean(axis=0))
print(std_samples.std(axis=0))
[ 1.33333333 -0.33333333  2.33333333]
[1.24721913 3.29983165 0.47140452]
[ 5.55111512e-17  0.00000000e+00 -2.96059473e-16]
[1. 1. 1.]
[ 5.55111512e-17  0.00000000e+00 -2.96059473e-16]
[1. 1. 1.]
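When the same shift and scale must later be applied to unseen data, the statistics can be stored in a scaler object instead of recomputed. A minimal sketch, reusing raw_samples from the cell above (StandardScaler, mean_ and scale_ are standard sklearn API; the new sample is mine):

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([[3.0, -1.0, 2.0],
                        [0.0, 4.0, 3.0],
                        [1.0, -4.0, 2.0]])
scaler = sp.StandardScaler()
scaler.fit(raw_samples)                 # learns per-column mean_ and scale_
print(scaler.mean_, scaler.scale_)
# a new sample is standardized with the *training* statistics
print(scaler.transform(np.array([[2.0, 0.0, 2.5]])))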
Range Scaling (Min-Max)

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([
    [1.0, 2.0, 3.0],
    [4.0, 5.0, 6.0],
    [7.0, 8.0, 9.0]]).astype("float64")

# manual min-max scaling: per column, map [min, max] onto [0, 1]
mms_samples = raw_samples.copy()
for col in mms_samples.T:
    col_min = col.min()
    col_max = col.max()
    col -= col_min
    col /= (col_max - col_min)
print(mms_samples)

# the same transform via sklearn
mms_samples = sp.MinMaxScaler(feature_range=(0, 1)).fit_transform(raw_samples)
print(mms_samples)
[[0.  0.  0. ]
 [0.5 0.5 0.5]
 [1.  1.  1. ]]
[[0.  0.  0. ]
 [0.5 0.5 0.5]
 [1.  1.  1. ]]
Normalization

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([
    [10.0, 20.0, 5.0],
    [8.0, 10.0, 1.0]])

# manual L1 normalization: divide each row by the sum of its absolute values
nor_samples = raw_samples.copy()
for row in nor_samples:
    row /= abs(row).sum()
print(nor_samples)

# the same transform via sklearn
nor_samples = sp.normalize(raw_samples.copy(), norm='l1')
print(nor_samples)
[[0.28571429 0.57142857 0.14285714]
 [0.42105263 0.52631579 0.05263158]]
[[0.28571429 0.57142857 0.14285714]
 [0.42105263 0.52631579 0.05263158]]
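For comparison, norm='l2' divides each row by its Euclidean length instead of its L1 sum. A quick sketch of the difference (same sp.normalize API, same sample values):

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([[10.0, 20.0, 5.0],
                        [8.0, 10.0, 1.0]])
l2_samples = sp.normalize(raw_samples, norm='l2')
print(l2_samples)
# each row now has unit Euclidean length
print(np.sqrt((l2_samples ** 2).sum(axis=1)))   # -> [1. 1.]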
Binarization

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([[65.5, 89.0, 73.0],
                        [55.0, 99.0, 98.5],
                        [45.0, 22.5, 60.0]])

# manual binarization with boolean masks
bin_samples = raw_samples.copy()
mask1 = bin_samples < 60
print(mask1)
mask2 = bin_samples >= 60
bin_samples[mask1] = 0
print(bin_samples)
bin_samples[mask2] = 1
print(bin_samples)

# the same transform via sklearn; Binarizer maps values > threshold to 1,
# so threshold = 60 - 1 reproduces the >= 60 rule for these data
bin_transformer = sp.Binarizer(threshold=60 - 1)
bin_samples = bin_transformer.transform(raw_samples.copy())
print(bin_samples)
[[False False False]
 [ True False False]
 [ True  True False]]
[[65.5 89.  73. ]
 [ 0.  99.  98.5]
 [ 0.   0.  60. ]]
[[1. 1. 1.]
 [0. 1. 1.]
 [0. 0. 1.]]
[[1. 1. 1.]
 [0. 1. 1.]
 [0. 0. 1.]]
One-Hot Encoding Example

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([[1, 3, 2],
                        [7, 5, 4],
                        [1, 8, 6],
                        [7, 3, 9]])
one_hot_encoder = sp.OneHotEncoder(
    sparse=False,        # return a dense array instead of a sparse matrix
    dtype="int32",
    categories="auto")
oh_samples = one_hot_encoder.fit_transform(raw_samples.copy())
print(oh_samples)
print(one_hot_encoder.inverse_transform(oh_samples))
[[1 0 1 0 0 1 0 0 0]
 [0 1 0 1 0 0 1 0 0]
 [1 0 0 0 1 0 0 1 0]
 [0 1 1 0 0 0 0 0 1]]
[[1 3 2]
 [7 5 4]
 [1 8 6]
 [7 3 9]]
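Why nine output columns: input column 0 has 2 distinct values {1, 7}, column 1 has 3 {3, 5, 8}, column 2 has 4 {2, 4, 6, 9}, and 2 + 3 + 4 = 9 one-hot slots. The learned vocabularies can be inspected directly; a small sketch repeating the setup above (categories_ is a standard OneHotEncoder attribute):

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array([[1, 3, 2], [7, 5, 4], [1, 8, 6], [7, 3, 9]])
enc = sp.OneHotEncoder(sparse=False, dtype="int32", categories="auto")
enc.fit(raw_samples)
# one array of sorted distinct values per input column
print(enc.categories_)   # [array([1, 7]), array([3, 5, 8]), array([2, 4, 6, 9])]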
Label Encoding

import numpy as np
import sklearn.preprocessing as sp

raw_samples = np.array(['audi', 'ford', 'audi', 'bmw', 'ford', 'bmw'])
lb_encoder = sp.LabelEncoder()
lb_samples = lb_encoder.fit_transform(raw_samples.copy())
print(lb_samples)
print(lb_encoder.inverse_transform(lb_samples))
[0 2 0 1 2 1]
['audi' 'ford' 'audi' 'bmw' 'ford' 'bmw']
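Note that the integer codes follow the sorted order of the labels ('audi' < 'bmw' < 'ford' maps to 0/1/2), which is visible on the encoder's classes_ attribute; a small sketch:

import numpy as np
import sklearn.preprocessing as sp

lb_encoder = sp.LabelEncoder()
lb_encoder.fit(np.array(['audi', 'ford', 'audi', 'bmw', 'ford', 'bmw']))
print(lb_encoder.classes_)              # ['audi' 'bmw' 'ford']
print(lb_encoder.transform(['bmw']))    # [1]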
Basic Problems

Regression

Linear Regression

import numpy as np
import matplotlib.pyplot as mp
from mpl_toolkits.mplot3d import axes3d
import sklearn.preprocessing as sp

train_x = np.array([0.5, 0.6, 0.8, 1.1, 1.4])
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])
n_epochs = 30    # number of gradient-descent iterations
l_rate = 0.01    # learning rate
epochs = []
losses = []
w0, w1 = [1], [1]    # parameter history, starting from w0 = w1 = 1
for i in range(1, n_epochs + 1):
    epochs.append(i)
    y = w0[-1] + w1[-1] * train_x
    # squared-error loss: sum((y_true - y_pred)^2) / 2
    loss = (((train_y - y)**2).sum()) / 2
    losses.append(loss)
    print("%d: w0=%f, w1=%f, loss=%f" % (i, w0[-1], w1[-1], loss))
    # partial derivatives of the loss w.r.t. w0 and w1
    d0 = -(train_y - y).sum()
    d1 = -(train_x * (train_y - y)).sum()
    w0.append(w0[-1] - (d0 * l_rate))
    w1.append(w1[-1] - (d1 * l_rate))
w0 = np.array(w0[:-1])
w1 = np.array(w1[:-1])

mp.figure("Losses", facecolor="lightgray")
mp.title("epoch", fontsize=20)
mp.ylabel("loss", fontsize=14)
mp.grid(linestyle=":")
mp.plot(epochs, losses, c="blue", label="loss")
mp.legend()
mp.tight_layout()

pred_y = w0[-1] + w1[-1] * train_x
mp.figure("Linear Regression", facecolor="lightgray")
mp.title("Linear Regression", fontsize=20)
mp.xlabel("x", fontsize=14)
mp.ylabel("y", fontsize=14)
mp.grid(linestyle=":")
mp.scatter(train_x, train_y, c="blue", label="Training")
mp.plot(train_x, pred_y, c="red", label="Regression")
mp.legend()

# visualize the loss surface over a grid of (w0, w1) values
arr1 = np.linspace(0, 10, 500)
arr2 = np.linspace(0, 3.5, 500)
grid_w0, grid_w1 = np.meshgrid(arr1, arr2)
flat_w0, flat_w1 = grid_w0.ravel(), grid_w1.ravel()
loss_matrix = train_y.reshape(-1, 1)
outer = np.outer(train_x, flat_w1)
flat_loss = (((flat_w0 + outer - loss_matrix)**2).sum(axis=0)) / 2
grid_loss = flat_loss.reshape(grid_w0.shape)
mp.figure('Loss Function')
ax = mp.axes(projection='3d')
mp.title('Loss Function', fontsize=14)
ax.set_xlabel('w0', fontsize=14)
ax.set_ylabel('w1', fontsize=14)
ax.set_zlabel('loss', fontsize=14)
ax.plot_surface(grid_w0, grid_w1, grid_loss,
                rstride=10, cstride=10, cmap='jet')
ax.plot(w0, w1, losses, 'o-', c='orangered', label='BGD', zorder=5)
mp.legend(loc='lower left')
mp.show()
1: w0=1.000000, w1=1.000000, loss=44.175000
2: w0=1.209000, w1=1.190600, loss=36.538828
3: w0=1.399164, w1=1.363579, loss=30.231687
4: w0=1.572208, w1=1.520546, loss=25.022227
5: w0=1.729693, w1=1.662961, loss=20.719373
6: w0=1.873039, w1=1.792151, loss=17.165309
7: w0=2.003532, w1=1.909325, loss=14.229691
8: w0=2.122345, w1=2.015577, loss=11.804865
9: w0=2.230542, w1=2.111905, loss=9.801916
10: w0=2.329091, w1=2.199215, loss=8.147408
11: w0=2.418871, w1=2.278330, loss=6.780688
12: w0=2.500681, w1=2.349997, loss=5.651660
13: w0=2.575247, w1=2.414898, loss=4.718950
14: w0=2.643230, w1=2.473648, loss=3.948384
15: w0=2.705228, w1=2.526811, loss=3.311740
16: w0=2.761786, w1=2.574896, loss=2.785706
17: w0=2.813402, w1=2.618367, loss=2.351029
18: w0=2.860524, w1=2.657645, loss=1.991807
19: w0=2.903561, w1=2.693114, loss=1.694907
20: w0=2.942886, w1=2.725122, loss=1.449482
21: w0=2.978836, w1=2.753985, loss=1.246572
22: w0=3.011719, w1=2.779990, loss=1.078777
23: w0=3.041814, w1=2.803399, loss=0.939987
24: w0=3.069373, w1=2.824449, loss=0.825153
25: w0=3.094629, w1=2.843355, loss=0.730107
26: w0=3.117790, w1=2.860315, loss=0.651405
27: w0=3.139046, w1=2.875507, loss=0.586204
28: w0=3.158572, w1=2.889091, loss=0.532154
29: w0=3.176523, w1=2.901216, loss=0.487315
30: w0=3.193044, w1=2.912016, loss=0.450086
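As a sanity check on where gradient descent is heading, the least-squares optimum can also be computed in closed form; a sketch using np.polyfit (standard NumPy, not from the cell above):

import numpy as np

train_x = np.array([0.5, 0.6, 0.8, 1.1, 1.4])
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])
# degree-1 least-squares fit returns [slope, intercept]
w1_opt, w0_opt = np.polyfit(train_x, train_y, 1)
print(w0_opt, w1_opt)   # ~4.107, ~2.219 -- the minimum BGD converges toward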
import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp

train_x = np.array([[0.5], [0.6], [0.8], [1.1], [1.4]])
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])
line_model = lm.LinearRegression()
line_model.fit(train_x, train_y)
pred_y = line_model.predict(train_x)
print("coef_:", line_model.coef_)
print("intercept_:", line_model.intercept_)

mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(train_x, train_y, c='blue', alpha=0.8, s=60, label='Sample')
mp.plot(train_x, pred_y, c='orangered', label='Regression Line')
mp.legend()
mp.show()
coef_: [2.2189781]
intercept_: 4.107299270072994
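sklearn.metrics is imported above (as sm) but never used in the cell; a hedged sketch of how it would score this fit, assuming line_model, train_x, train_y and pred_y from the cell above:

import sklearn.metrics as sm

# assumes train_y and pred_y from the cell above
print(sm.mean_absolute_error(train_y, pred_y))
print(sm.mean_squared_error(train_y, pred_y))
print(sm.r2_score(train_y, pred_y))   # 1.0 would be a perfect fit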
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model, model_selection

X, y = datasets.load_diabetes(return_X_y=True)
print(X.shape)
print(X[0])
X = X[:, np.newaxis, 2]   # keep a single feature (BMI) for a 2-D plot
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33)
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
plt.show()
(442, 10)
[ 0.03807591  0.05068012  0.06169621  0.02187235 -0.0442235  -0.03482076
 -0.04340085 -0.00259226  0.01990842 -0.01764613]
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

pumpkins = pd.read_csv('./_data_set/US-pumpkins.csv')
# keep only rows whose package is measured in bushels
pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel',
                                                     case=True,
                                                     regex=True)]
pumpkins = pumpkins.drop([
    c for c in pumpkins.columns if c not in [
        'Package', 'Variety', 'City Name', 'Month', 'Low Price', 'High Price',
        'Date'
    ]
], axis=1)
price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2
new_pumpkins = pd.DataFrame({
    'Month': pd.DatetimeIndex(pumpkins['Date']).month,
    'DayOfYear': pd.to_datetime(pumpkins['Date']).apply(
        lambda dt: (dt - datetime(dt.year, 1, 1)).days),
    'Variety': pumpkins['Variety'],
    'City': pumpkins['City Name'],
    'Package': pumpkins['Package'],
    'Low Price': pumpkins['Low Price'],
    'High Price': pumpkins['High Price'],
    'Price': price
})
# convert prices to a per-bushel basis for fractional packages
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'),
                 'Price'] = price / (1 + 1 / 9)
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'),
                 'Price'] = price / (1 / 2)

plt.scatter(new_pumpkins.Price, new_pumpkins.Month)
plt.xlabel('Price')
plt.ylabel('Month')
plt.show()

new_pumpkins.groupby(['Month'])['Price'].mean().plot(kind='bar')
plt.ylabel('Price')

ax = None
colors = ['red', 'blue', 'green', 'yellow']
for i, var in enumerate(new_pumpkins['Variety'].unique()):
    ax = new_pumpkins[new_pumpkins['Variety'] == var].plot.scatter(
        'DayOfYear', 'Price', ax=ax, c=colors[i], label=var)
Polynomial Regression

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

pumpkins = pd.read_csv('./_data_set/US-pumpkins.csv')
pumpkins = pumpkins[pumpkins['Package'].str.contains('bushel',
                                                     case=True,
                                                     regex=True)]
pumpkins = pumpkins.drop([
    c for c in pumpkins.columns if c not in [
        'Package', 'Variety', 'City Name', 'Month', 'Low Price', 'High Price',
        'Date'
    ]
], axis=1)
price = (pumpkins['Low Price'] + pumpkins['High Price']) / 2
new_pumpkins = pd.DataFrame({
    'Month': pd.DatetimeIndex(pumpkins['Date']).month,
    'DayOfYear': pd.to_datetime(pumpkins['Date']).apply(
        lambda dt: (dt - datetime(dt.year, 1, 1)).days),
    'Variety': pumpkins['Variety'],
    'City': pumpkins['City Name'],
    'Package': pumpkins['Package'],
    'Low Price': pumpkins['Low Price'],
    'High Price': pumpkins['High Price'],
    'Price': price
})
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1 1/9'),
                 'Price'] = price / 1.1
new_pumpkins.loc[new_pumpkins['Package'].str.contains('1/2'),
                 'Price'] = price * 2

pie_pumpkins = new_pumpkins[new_pumpkins['Variety'] == 'PIE TYPE']
X_train, X_test, y_train, y_test = train_test_split(
    pie_pumpkins['DayOfYear'].to_numpy().reshape(-1, 1),
    pie_pumpkins['Price'],
    test_size=0.2,
    random_state=0)
# degree-2 polynomial features feeding a linear model
pipeline = make_pipeline(PolynomialFeatures(2), LinearRegression())
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print(f'Mean error: {rmse:3.3} ({rmse/np.mean(pred)*100:3.3}%)')
score = pipeline.score(X_train, y_train)
print('Model determination: ', score)
plt.scatter(X_test, y_test)
plt.plot(sorted(X_test), pipeline.predict(sorted(X_test)))
Mean error: 2.73 (17.0%)
Model determination:  0.07639977655280084
[<matplotlib.lines.Line2D at 0x19b75ee76d0>]
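A natural follow-up is to vary the polynomial degree; a hedged sketch of that comparison loop (same pipeline pieces as above, the loop itself is mine), assuming X_train/X_test/y_train/y_test from the cell above:

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# assumes X_train, X_test, y_train, y_test from the cell above
for degree in (1, 2, 3, 4):
    pipe = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    pipe.fit(X_train, y_train)
    # train vs. test R^2 shows when extra degrees stop helping
    print(degree, pipe.score(X_train, y_train), pipe.score(X_test, y_test))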
Decision Trees and Random Forests

Decision Tree Classification

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pydotplus

mpl.rcParams['font.sans-serif'] = ['simHei']
mpl.rcParams['axes.unicode_minus'] = False

iris_feature_E = 'sepal length', 'sepal width', 'petal length', 'petal width'
iris_feature = '花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度'
iris_class = 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'

path = './_data_set/iris_classification/iris.data'
data = pd.read_csv(path, header=None)
x = data[list(range(4))]
y = LabelEncoder().fit_transform(data[4])
x = x[[0, 1]]   # keep only sepal length/width so the boundary can be plotted
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

model = DecisionTreeClassifier(criterion='entropy')
model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)
print('训练集正确率:', accuracy_score(y_train, y_train_pred))
y_test_hat = model.predict(x_test)
print('测试集正确率:', accuracy_score(y_test, y_test_hat))

# export the fitted tree as .dot, .pdf and .png
tree.export_graphviz(model,
                     out_file='./_data_set/iris_classification/iris.dot',
                     feature_names=iris_feature_E[0:2],
                     class_names=iris_class,
                     filled=True,
                     rounded=True,
                     special_characters=True)
dot_data = tree.export_graphviz(model,
                                out_file=None,
                                feature_names=iris_feature_E[0:2],
                                class_names=iris_class,
                                filled=True,
                                rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf('./_data_set/iris_classification/iris.pdf')
f = open('./_data_set/iris_classification/iris.png', 'wb')
f.write(graph.create_png())
f.close()

# evaluate the model on a dense grid to draw the decision regions
N, M = 50, 50
x1_min, x2_min = x.min()
x1_max, x2_max = x.max()
t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)
x1, x2 = np.meshgrid(t1, t2)
x_show = np.stack((x1.flat, x2.flat), axis=1)
print(x_show.shape)
print('x_show = \n', x_show)
cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
y_show_hat = model.predict(x_show)
print(y_show_hat.shape)
print(y_show_hat)
y_show_hat = y_show_hat.reshape(x1.shape)
print(y_show_hat)

plt.figure(facecolor='w')
plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light, shading='auto')
plt.scatter(x_test[0], x_test[1], c=y_test.ravel(), edgecolors='k',
            s=100, zorder=10, cmap=cm_dark, marker='*')
plt.scatter(x[0], x[1], c=y.ravel(), edgecolors='k', s=20, cmap=cm_dark)
plt.xlabel(iris_feature[0], fontsize=13)
plt.ylabel(iris_feature[1], fontsize=13)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(b=True, ls=':', color='#606060')
plt.title('鸢尾花数据的决策树分类', fontsize=15)
plt.show()

y_test = y_test.reshape(-1)
print(y_test_hat)
print(y_test)
result = (y_test_hat == y_test)
acc = np.mean(result)
print('准确度: %.2f%%' % (100 * acc))

# error rate vs. tree depth, to show overfitting
depth = np.arange(1, 15)
err_train_list = []
err_test_list = []
clf = DecisionTreeClassifier(criterion='entropy')
for d in depth:
    clf.set_params(max_depth=d)
    clf.fit(x_train, y_train)
    y_train_pred = clf.predict(x_train)
    err_train = 1 - accuracy_score(y_train, y_train_pred)
    err_train_list.append(err_train)
    y_test_pred = clf.predict(x_test)
    err_test = 1 - accuracy_score(y_test, y_test_pred)
    err_test_list.append(err_test)
    print(d, ' 测试集错误率: %.2f%%' % (100 * err_test))

plt.figure(facecolor='w')
plt.plot(depth, err_test_list, 'ro-', markeredgecolor='k', lw=2,
         label='测试集错误率')
plt.plot(depth, err_train_list, 'go-', markeredgecolor='k', lw=2,
         label='训练集错误率')
plt.xlabel('决策树深度', fontsize=13)
plt.ylabel('错误率', fontsize=13)
plt.legend(loc='lower left', fontsize=13)
plt.title('决策树深度与过拟合', fontsize=15)
plt.grid(b=True, ls=':', color='#606060')
plt.show()
训练集正确率: 0.9523809523809523
测试集正确率: 0.6222222222222222
(2500, 2)
x_show =
 [[4.3        2.        ]
 [4.37346939 2.        ]
 [4.44693878 2.        ]
 ...
 [7.75306122 4.4       ]
 [7.82653061 4.4       ]
 [7.9        4.4       ]]
(2500,)
[0 0 0 ... 2 2 2]
[[0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 1 1 1]
 ...
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]
 [0 0 0 ... 2 2 2]]
[0 1 2 0 2 2 1 0 0 2 2 0 1 2 1 0 2 1 0 0 1 0 2 0 2 1 0 0 1 1 2 2 2 2 1 0 1
 0 2 1 2 0 1 1 1]
[0 1 1 0 2 1 2 0 0 2 1 0 2 1 1 0 1 1 0 0 1 1 1 0 2 1 0 0 1 2 1 2 1 2 2 0 1
 0 1 2 2 0 2 2 1]
准确度: 62.22%
1  测试集错误率: 44.44%
2  测试集错误率: 40.00%
3  测试集错误率: 20.00%
4  测试集错误率: 24.44%
5  测试集错误率: 24.44%
6  测试集错误率: 28.89%
7  测试集错误率: 37.78%
8  测试集错误率: 40.00%
9  测试集错误率: 37.78%
10  测试集错误率: 40.00%
11  测试集错误率: 37.78%
12  测试集错误率: 37.78%
13  测试集错误率: 40.00%
14  测试集错误率: 37.78%
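The manual depth sweep above can also be phrased as a grid search; a hedged sketch using sklearn's GridSearchCV (standard API; the cv value is my choice), assuming x_train/y_train from the cell above:

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# assumes x_train, y_train from the cell above
gs = GridSearchCV(DecisionTreeClassifier(criterion='entropy'),
                  param_grid={'max_depth': range(1, 15)},
                  cv=5)
gs.fit(x_train, y_train)
# cross-validated, so it may pick a different depth than the single split above
print(gs.best_params_, gs.best_score_)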
Random Forest

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'
path = './_data_set/iris_classification/iris.data'
data = pd.read_csv(path, header=None)
x_prime = data[list(range(4))]
y = pd.Categorical(data[4]).codes
x_prime_train, x_prime_test, y_train, y_test = train_test_split(
    x_prime, y, train_size=0.7, random_state=0)

feature_pairs = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
plt.figure(figsize=(8, 6), facecolor='#FFFFFF')
for i, pair in enumerate(feature_pairs):
    # train one tree on each pair of features
    x_train = x_prime_train[pair]
    x_test = x_prime_test[pair]
    model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
    model.fit(x_train, y_train)

    N, M = 500, 500
    x1_min, x2_min = x_train.min()
    x1_max, x2_max = x_train.max()
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)
    x_show = np.stack((x1.flat, x2.flat), axis=1)

    y_train_pred = model.predict(x_train)
    acc_train = accuracy_score(y_train, y_train_pred)
    y_test_pred = model.predict(x_test)
    acc_test = accuracy_score(y_test, y_test_pred)
    print('特征:', iris_feature[pair[0]], ' + ', iris_feature[pair[1]])
    print('\t训练集准确率: %.4f%%' % (100 * acc_train))
    print('\t测试集准确率: %.4f%%\n' % (100 * acc_test))

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    y_hat = model.predict(x_show)
    y_hat = y_hat.reshape(x1.shape)
    plt.subplot(2, 3, i + 1)
    plt.contour(x1, x2, y_hat, colors='k', levels=[0, 1],
                antialiased=True, linewidths=1)
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light, shading='auto')
    plt.scatter(x_train[pair[0]], x_train[pair[1]], c=y_train, s=20,
                edgecolors='k', cmap=cm_dark, label=u'训练集')
    plt.scatter(x_test[pair[0]], x_test[pair[1]], c=y_test, s=80,
                marker='*', edgecolors='k', cmap=cm_dark, label=u'测试集')
    plt.xlabel(iris_feature[pair[0]], fontsize=12)
    plt.ylabel(iris_feature[pair[1]], fontsize=12)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.grid(b=True, ls=':', color='#606060')
plt.suptitle(u'决策树对鸢尾花数据两特征组合的分类结果', fontsize=15)
plt.tight_layout(1, rect=(0, 0, 1, 0.94))  # (left, bottom, right, top)
plt.show()
特征: 花萼长度  +  花萼宽度
	训练集准确率: 85.7143%
	测试集准确率: 71.1111%
特征: 花萼长度  +  花瓣长度
	训练集准确率: 96.1905%
	测试集准确率: 91.1111%
特征: 花萼长度  +  花瓣宽度
	训练集准确率: 96.1905%
	测试集准确率: 86.6667%
特征: 花萼宽度  +  花瓣长度
	训练集准确率: 97.1429%
	测试集准确率: 95.5556%
特征: 花萼宽度  +  花瓣宽度
	训练集准确率: 96.1905%
	测试集准确率: 84.4444%
特征: 花瓣长度  +  花瓣宽度
	训练集准确率: 98.0952%
	测试集准确率: 97.7778%
C:\Users\utsuk\AppData\Local\Temp\ipykernel_25000\2108356862.py:83: MatplotlibDeprecationWarning: Passing the pad parameter of tight_layout() positionally is deprecated since Matplotlib 3.3; the parameter will become keyword-only two minor releases later.
  plt.tight_layout(1, rect=(0, 0, 1, 0.94))  # (left, bottom, right, top)
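Despite the section title, the cell above fits one single decision tree per feature pair. For reference, a minimal random-forest sketch on the same data (RandomForestClassifier is standard sklearn; the split setup mirrors the cell above, the hyperparameters are my choice):

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

data = pd.read_csv('./_data_set/iris_classification/iris.data', header=None)
x = data[list(range(4))]
y = pd.Categorical(data[4]).codes
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7,
                                                    random_state=0)
# an ensemble of entropy trees instead of a single tree
rf = RandomForestClassifier(n_estimators=100, criterion='entropy')
rf.fit(x_train, y_train)
print(accuracy_score(y_test, rf.predict(x_test)))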
Decision Tree / Random Forest Regression

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

N = 100
x = np.random.rand(N) * 6 - 3   # 100 points on [-3, 3)
x.sort()
y = np.sin(x) + np.random.randn(N) * 0.05   # noisy sine wave
x = x.reshape(-1, 1)
print(x)
print(y)

model = RandomForestRegressor(n_estimators=20, criterion='mse', max_depth=10)
model.fit(x, y)
x_test = np.linspace(-3, 3, 50).reshape(-1, 1)
y_hat = model.predict(x_test)

mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(facecolor='w')
plt.plot(x, y, 'r*', markersize=10, markeredgecolor='k', label='实际值')
plt.plot(x_test, y_hat, 'g-', linewidth=2, label='预测值')
plt.legend(loc='upper left', fontsize=12)
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(b=True, ls=':', color='#606060')
plt.title('决策树-随机森林回归', fontsize=15)
plt.tight_layout(2)
plt.show()

# the effect of maximum depth on the fitted curve
depth = [2, 4, 6, 8, 10]
clr = 'rgbmy'
model = RandomForestRegressor(n_estimators=20, criterion='mse')
plt.figure(facecolor='w')
plt.plot(x, y, 'ro', ms=5, mec='k', label='实际值')
x_test = np.linspace(-3, 3, 50).reshape(-1, 1)
for d, c in zip(depth, clr):
    model.set_params(max_depth=d)
    model.fit(x, y)
    y_hat = model.predict(x_test)
    plt.plot(x_test, y_hat, '-', color=c, linewidth=2,
             markeredgecolor='k', label='Depth=%d' % d)
plt.legend(loc='upper left', fontsize=12)
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(b=True, ls=':', color='#606060')
plt.title('决策树-随机森林回归', fontsize=15)
plt.tight_layout(2)
plt.show()
[[-2.95013857]
 [-2.91406777]
 [-2.90709403]
 [-2.90375999]
 [-2.87745415]
 [-2.86685497]
 [-2.84070033]
 [-2.81375697]
 [-2.77943784]
 [-2.77745775]
 [-2.66347611]
 [-2.65986762]
 [-2.64217075]
 [-2.63778267]
 [-2.63639243]
 [-2.60165162]
 [-2.48896096]
 [-2.43041085]
 [-2.39941506]
 [-2.38779079]
 [-2.37856289]
 [-2.32809632]
 [-2.29426051]
 [-2.25524085]
 [-2.23914776]
 [-2.16578847]
 [-2.0055294 ]
 [-1.9885004 ]
 [-1.73199798]
 [-1.61893958]
 [-1.58908223]
 [-1.54966302]
 [-1.49074428]
 [-1.46530843]
 [-1.40362808]
 [-1.3935955 ]
 [-1.3928819 ]
 [-1.37684247]
 [-1.3377295 ]
 [-1.33035029]
 [-1.24577529]
 [-1.15561278]
 [-1.12976361]
 [-0.92771942]
 [-0.91536325]
 [-0.83851709]
 [-0.54828961]
 [-0.54587447]
 [-0.54456521]
 [-0.51610774]
 [-0.38078143]
 [-0.27298529]
 [-0.20898971]
 [-0.20866835]
 [ 0.01703485]
 [ 0.03203468]
 [ 0.06107388]
 [ 0.07358949]
 [ 0.07361575]
 [ 0.10676155]
 [ 0.15818226]
 [ 0.20123638]
 [ 0.3410772 ]
 [ 0.45680655]
 [ 0.52384169]
 [ 0.65741898]
 [ 0.68306354]
 [ 0.82845395]
 [ 0.83952908]
 [ 0.99278446]
 [ 1.04865533]
 [ 1.16809926]
 [ 1.21294563]
 [ 1.44659934]
 [ 1.47606149]
 [ 1.48031876]
 [ 1.54476213]
 [ 1.54542061]
 [ 1.60452852]
 [ 1.85706958]
 [ 1.9814776 ]
 [ 2.07801869]
 [ 2.08420295]
 [ 2.08974078]
 [ 2.09458999]
 [ 2.21547939]
 [ 2.2354401 ]
 [ 2.2824592 ]
 [ 2.39313024]
 [ 2.4822308 ]
 [ 2.5275393 ]
 [ 2.56392915]
 [ 2.63696096]
 [ 2.65318554]
 [ 2.66286196]
 [ 2.67531127]
 [ 2.69337798]
 [ 2.78329004]
 [ 2.83409465]
 [ 2.94897411]]
[-0.16517944 -0.2651016  -0.26383212 -0.2408083  -0.21137157 -0.30729908
 -0.24489794 -0.42738593 -0.39536257 -0.34762803 -0.55252409 -0.40629331
 -0.47658554 -0.47459876 -0.50897455 -0.58933727 -0.57144697 -0.61564276
 -0.76543047 -0.64009055 -0.79612233 -0.77625958 -0.82824783 -0.74228023
 -0.6882561  -0.85479709 -0.9210133  -0.88824326 -0.94228155 -1.04205908
 -1.01456756 -1.0150173  -1.01319155 -0.93942161 -1.0101109  -1.00949492
 -1.00669835 -1.02762794 -0.90526463 -1.04873446 -0.98356087 -0.96345712
 -0.90790305 -0.78617335 -0.8215092  -0.7756252  -0.62805774 -0.61416394
 -0.55438357 -0.54877048 -0.24409899 -0.24654351 -0.22139215 -0.31816396
 -0.00948097  0.03731195  0.07871564  0.06089414  0.091661    0.10130855
  0.05916354  0.284293    0.32301758  0.47390064  0.43774825  0.59439442
  0.62285701  0.78479681  0.7888438   0.91498224  0.91654605  0.84583033
  0.90598404  0.99494553  1.05048367  0.97256267  1.04768316  1.09246729
  0.92367061  0.97097779  0.98148804  0.86766162  0.87249721  0.81545132
  0.80772307  0.75639662  0.73792794  0.75909448  0.71239606  0.63892773
  0.6373865   0.53978973  0.47836416  0.47544309  0.4156747   0.46233128
  0.43384357  0.32001956  0.2980857   0.16211947]
C:\Users\utsuk\AppData\Local\Temp\ipykernel_25000\2984870596.py:33: MatplotlibDeprecationWarning: Passing the pad parameter of tight_layout() positionally is deprecated since Matplotlib 3.3; the parameter will become keyword-only two minor releases later.
  plt.tight_layout(2)
C:\Users\utsuk\AppData\Local\Temp\ipykernel_25000\2984870596.py:60: MatplotlibDeprecationWarning: Passing the pad parameter of tight_layout() positionally is deprecated since Matplotlib 3.3; the parameter will become keyword-only two minor releases later.
  plt.tight_layout(2)
Classification

Logistic Regression

Binary Classification

import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as mp

x = np.array([[3, 1], [2, 5], [1, 8], [6, 4],
              [5, 2], [3, 5], [4, 7], [4, -1]])
y = np.array([0, 1, 1, 0, 0, 1, 1, 0])
model = lm.LogisticRegression()
model.fit(x, y)
test_x = np.array([[3, 9], [6, 1]])
test_y = model.predict(test_x)
print(test_y)

# classify every point of a fine grid to visualize the decision boundary
left = x[:, 0].min() - 1
right = x[:, 0].max() + 1
bottom = x[:, 1].min() - 1
top = x[:, 1].max() + 1
grid_x, grid_y = np.meshgrid(np.arange(left, right, 0.01),
                             np.arange(bottom, top, 0.01))
print("grid_x.shape:", grid_x.shape)
print("grid_y.shape:", grid_y.shape)
mesh_x = np.column_stack((grid_x.ravel(), grid_y.ravel()))
print("mesh_x.shape:", mesh_x.shape)
mesh_z = model.predict(mesh_x)
mesh_z = mesh_z.reshape(grid_x.shape)

mp.figure('Logistic Regression', facecolor='lightgray')
mp.title('Logistic Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x, grid_y, mesh_z, cmap='gray', shading='auto')
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='brg', s=80)
mp.scatter(test_x[:, 0], test_x[:, 1], c="red", marker='s', s=80)
mp.show()
[1 0]
grid_x.shape: (1100, 700)
grid_y.shape: (1100, 700)
mesh_x.shape: (770000, 2)
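Beyond hard labels, logistic regression also exposes per-class probabilities; a small sketch (predict_proba is standard LogisticRegression API), assuming model and test_x from the cell above:

# assumes model and test_x from the cell above
proba = model.predict_proba(test_x)
print(proba)                   # one row per sample: [P(class 0), P(class 1)]
print(proba.argmax(axis=1))    # recovers the labels printed by predict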
Multi-class Classification

import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as mp

x = np.array([[4, 7], [3.5, 8], [3.1, 6.2], [0.5, 1], [1, 2], [1.2, 1.9],
              [6, 2], [5.7, 1.5], [5.4, 2.2]])
y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
# a large C means weak regularization
model = lm.LogisticRegression(C=200)
model.fit(x, y)

left = x[:, 0].min() - 1
right = x[:, 0].max() + 1
h = 0.005
bottom = x[:, 1].min() - 1
top = x[:, 1].max() + 1
v = 0.005
grid_x, grid_y = np.meshgrid(np.arange(left, right, h),
                             np.arange(bottom, top, v))
mesh_x = np.column_stack((grid_x.ravel(), grid_y.ravel()))
mesh_z = model.predict(mesh_x)
mesh_z = mesh_z.reshape(grid_x.shape)

mp.figure('Logistic Classification', facecolor='lightgray')
mp.title('Logistic Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x, grid_y, mesh_z, cmap='gray', shading='auto')
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='brg', s=80)
mp.show()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.pipeline import make_pipeline
import seaborn as sns

pumpkins = pd.read_csv('./_data_set/US-pumpkins.csv')
new_pumpkins = pumpkins.drop([
    c for c in pumpkins.columns if c not in
    ['Color', 'Origin', 'Item Size', 'Variety', 'City Name', 'Package']
], axis=1)
new_pumpkins.dropna(inplace=True)
new_pumpkins = new_pumpkins.apply(LabelEncoder().fit_transform)
sns.PairGrid(new_pumpkins).map(sns.scatterplot)
sns.catplot(x="Color", y="Item Size", kind="violin", data=new_pumpkins)
<seaborn.axisgrid.FacetGrid at 0x19b7ed39fd0>
Support Vector Machines (SVM)

Kernel functions:
Linear
Radial basis function (Gaussian); a short sketch of the RBF kernel follows below.
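As a quick illustration, the Gaussian kernel value K(x, z) = exp(-gamma * ||x - z||^2) can be computed by hand and checked against sklearn's pairwise helper; a hedged sketch (rbf_kernel is standard sklearn.metrics.pairwise API, the sample points are mine):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x = np.array([[0.0, 0.0], [1.0, 2.0]])
gamma = 1.0
# manual K(x0, x1) = exp(-gamma * ||x0 - x1||^2)
manual = np.exp(-gamma * ((x[0] - x[1]) ** 2).sum())
print(manual)
print(rbf_kernel(x, gamma=gamma))   # off-diagonal entries match `manual`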
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from time import time

iris_feature = '花萼长度', '花萼宽度', '花瓣长度', '花瓣宽度'
path = './_data_set/iris_classification/iris.data'
data = pd.read_csv(path, header=None)
x, y = data[[0, 1]], pd.Categorical(data[4]).codes
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1,
                                                    test_size=0.4)

svm_clf = svm.SVC(C=3, kernel='linear')
print('GridSearchCV begin...')   # note: a plain SVC is fitted here, not a grid search
t = time()
clf = svm_clf
clf.fit(x_train, y_train.ravel())
t_end = time()
print('耗时:%d秒' % (t_end - t))
print(clf.score(x_train, y_train))
print('训练集准确率:', accuracy_score(y_train, clf.predict(x_train)))
print(clf.score(x_test, y_test))
print('测试集准确率:', accuracy_score(y_test, clf.predict(x_test)))
print(x_train[:5])
print('decision_function:\n', clf.decision_function(x_train))
print('\npredict:\n', clf.predict(x_train))

# decision regions over a 300x300 grid
x1_min, x2_min = x.min()
x1_max, x2_max = x.max()
x1, x2 = np.mgrid[x1_min:x1_max:300j, x2_min:x2_max:300j]
grid_test = np.stack((x1.flat, x2.flat), axis=1)
grid_hat = clf.predict(grid_test)
grid_hat = grid_hat.reshape(x1.shape)

mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
plt.figure(facecolor='w')
plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, shading='auto')
plt.scatter(x[0], x[1], c=y, edgecolors='k', s=50, cmap=cm_dark)
plt.scatter(x_test[0], x_test[1], s=120, facecolors='none', zorder=10)
plt.xlabel(iris_feature[0], fontsize=13)
plt.ylabel(iris_feature[1], fontsize=13)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.title('鸢尾花SVM二特征分类', fontsize=16)
plt.grid(b=True, ls=':')
plt.tight_layout(pad=1.5)
plt.show()
GridSearchCV begin...
耗时:0秒
0.7888888888888889
训练集准确率: 0.7888888888888889
0.7833333333333333
测试集准确率: 0.7833333333333333
       0    1
11   4.8  3.4
113  5.7  2.5
123  6.3  2.7
12   4.8  3.0
2    4.7  3.2
decision_function:
 [[ 2.27077043  0.77466667 -0.23050192]
 [-0.26084184  2.25751125  1.0560141 ]
 [-0.28293421  2.25843306  1.22796515]
 [ 2.24308998  0.92588576 -0.2355186 ]
 [ 2.26643883  0.80879936 -0.24170145]
 [-0.28069428  1.23581332  2.24817619]
 [ 2.28018898  0.76457517 -0.24714914]
 [-0.25832462  1.21350955  2.20811726]
 [-0.27289547  1.20094518  2.249947  ]
 [-0.28479994  1.2318958   2.26058535]
 [-0.23449614  2.22389578  1.08207816]
 [-0.27824598  1.19362022  2.2618816 ]
 [-0.24940836  1.20624284  2.19142888]
 [ 2.2732283  -0.2577432   0.82271427]
 [-0.2872948   2.27977562  1.1680301 ]
 [-0.28016055  1.24346593  2.23969255]
 [-0.28069428  1.23581332  2.24817619]
 [-0.17539161  2.25364746  0.77515212]
 [-0.25325305  1.14281388  2.23566431]
 [-0.2421088   2.25159667  0.90072452]
 [ 2.24374936 -0.23761503  0.93581753]
 [ 2.24150985  0.87804111 -0.2241573 ]
 [ 2.28041834 -0.27382086  0.86293615]
 [-0.27824598  1.19362022  2.2618816 ]
 [ 2.27361996  0.84008337 -0.26180365]
 [-0.26919985  2.24943754  1.18347413]
 [-0.28629221  2.26101768  1.23745359]
 [-0.23808639  1.19803786  2.16960812]
 [ 2.24670891 -0.21840299  0.83780678]
 [ 2.23817757  0.82105098 -0.19112107]
 [-0.2549617   2.24213564  1.1247623 ]
 [ 2.24150985  0.87804111 -0.2241573 ]
 [-0.30409023  1.27536123  2.28319903]
 [-0.27073104  1.23503534  2.2197061 ]
 [-0.30441277  1.26883084  2.28803158]
 [-0.28752494  1.24411663  2.25991942]
 [-0.29292012  2.28000741  1.22212062]
 [-0.23176304  1.04104242  2.22722296]
 [-0.28694643  1.15125962  2.28088265]
 [-0.19956441  2.19653578  1.01742803]
 [ 2.26878977 -0.27918692  1.16597018]
 [-0.2549617   2.24213564  1.1247623 ]
 [ 2.18519501  1.23494087 -0.26146238]
 [-0.2512831   2.2597216   0.89408741]
 [-0.29066599  1.240849    2.26933123]
 [ 2.25729439 -0.2129225   0.79411601]
 [-0.29441953  1.26382422  2.26345809]
 [ 2.26377705 -0.22976306  0.79617407]
 [ 2.27077043  0.77466667 -0.23050192]
 [-0.2445374   1.07681969  2.23683943]
 [-0.21840757  2.21786592  1.00447438]
 [-0.28524498  1.22203776  2.26581127]
 [ 2.24947915  0.81045802 -0.20787766]
 [ 2.2684482  -0.24915018  0.81973595]
 [-0.20907334  1.12013512  2.17603067]
 [ 2.26728139  0.82893904 -0.24984081]
 [-0.2549617   2.24213564  1.1247623 ]
 [ 2.25207771  0.85867453 -0.23431088]
 [-0.28069428  1.23581332  2.24817619]
 [ 1.05055292  2.22662497 -0.23240122]
 [ 2.2722284   0.7997718  -0.24896247]
 [-0.26723747  1.19201787  2.24286157]
 [ 2.27827649  0.82682975 -0.26630655]
 [-0.17333043  2.16553945  1.02939988]
 [ 2.26890439  0.89311197 -0.26243597]
 [-0.2512831   2.2597216   0.89408741]
 [-0.23341849  2.27309387  0.7709214 ]
 [ 2.25617562 -0.22437834  0.81410784]
 [ 2.22986475  0.96247553 -0.22562355]
 [ 0.93771503  2.24006469 -0.23365567]
 [ 2.28631446 -0.25590025  0.75474384]
 [-0.23808639  1.19803786  2.16960812]
 [-0.2646401   1.23019768  2.20613126]
 [ 2.25729439 -0.2129225   0.79411601]
 [-0.28479994  1.2318958   2.26058535]
 [-0.22087216  2.20491832  1.08971052]
 [-0.29260626  2.28238001  1.20719635]
 [-0.29101034  1.23272406  2.27341138]
 [-0.2549617   2.24213564  1.1247623 ]
 [ 2.19974418 -0.17699012  0.91119813]
 [ 2.21209755  1.01040036 -0.21350121]
 [-0.27849238  2.26059076  1.19977637]
 [-0.26997469  2.24280567  1.20408124]
 [-0.29874557  1.25977449  2.27874043]
 [-0.25377271  2.24886239  1.06528288]
 [-0.27961589  2.2500049   1.22933155]
 [-0.27219081  1.21466357  2.24182934]
 [-0.2884762   2.2710352   1.22512174]
 [-0.28479994  1.2318958   2.26058535]
 [ 2.24150985  0.87804111 -0.2241573 ]]
predict:
 [0 1 1 0 0 2 0 2 2 2 1 2 2 0 1 2 2 1 2 1 0 0 0 2 0 1 1 2 0 0 1 0 2 2 2 2 1
 2 2 1 0 1 0 1 2 0 2 0 0 2 1 2 0 0 2 0 1 0 2 1 0 2 0 1 0 1 1 0 0 1 0 2 2 0
 2 1 1 2 1 0 0 1 1 2 1 1 2 1 2 0]
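With the default decision_function_shape='ovr', the three columns printed above are per-class scores, and in this run the predicted label is simply the column with the largest score. A one-line check (argmax agreement is typical but not formally guaranteed in tie cases), assuming clf and x_train from the cell above:

import numpy as np

# assumes clf and x_train from the cell above
print((np.argmax(clf.decision_function(x_train), axis=1)
       == clf.predict(x_train)).all())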
import numpy as np
from sklearn import svm
from scipy import stats
from sklearn.metrics import accuracy_score
import matplotlib as mpl
import matplotlib.pyplot as plt


def extend(a, b, r=0.01):
    # widen the interval [a, b] by a factor r on each side
    return a * (1 + r) - b * r, -a * r + b * (1 + r)


np.random.seed(0)
N = 200
x = np.empty((4 * N, 2))
means = [(-1, 1), (1, 1), (1, -1), (-1, -1)]
sigmas = [
    np.eye(2), 2 * np.eye(2),
    np.diag((1, 2)),
    np.array(((3, 2), (2, 3)))
]
# sample four Gaussian clusters, one per class
for i in range(4):
    mn = stats.multivariate_normal(means[i], sigmas[i] * 0.07)
    x[i * N:(i + 1) * N, :] = mn.rvs(N)
a = np.array((0, 1, 2, 3)).reshape((-1, 1))
y = np.tile(a, N).flatten()

clf = svm.SVC(C=1, kernel='rbf', gamma=1, decision_function_shape='ovo')
clf.fit(x, y)
y_hat = clf.predict(x)
acc = accuracy_score(y, y_hat)
np.set_printoptions(suppress=True)
print('预测正确的样本个数:%d,正确率:%.2f%%' % (round(acc * 4 * N), 100 * acc))
print(clf.decision_function(x))
print(y_hat)

x1_min, x2_min = np.min(x, axis=0)
x1_max, x2_max = np.max(x, axis=0)
x1_min, x1_max = extend(x1_min, x1_max)
x2_min, x2_max = extend(x2_min, x2_max)
x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
x_test = np.stack((x1.flat, x2.flat), axis=1)
y_test = clf.predict(x_test)
y_test = y_test.reshape(x1.shape)

cm_light = mpl.colors.ListedColormap(
    ['#FF8080', '#80FF80', '#8080FF', '#F0F080'])
cm_dark = mpl.colors.ListedColormap(['r', 'g', 'b', 'y'])
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
plt.figure(facecolor='w')
plt.pcolormesh(x1, x2, y_test, cmap=cm_light, shading='auto')
plt.contour(x1, x2, y_test, levels=(0, 1, 2), colors='k', linestyles='--')
plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm_dark, edgecolors='k',
            alpha=0.7)
plt.xlabel('$X_1$', fontsize=11)
plt.ylabel('$X_2$', fontsize=11)
plt.xlim((x1_min, x1_max))
plt.ylim((x2_min, x2_max))
plt.grid(b=True)
plt.tight_layout(pad=2.5)
plt.title('SVM多分类方法:One/One or One/Other', fontsize=14)
plt.show()
预测正确的样本个数:793,正确率:99.12%
[[ 1.30403817  1.18371967  1.61069453  0.53379555  0.10282667 -0.6639782 ]
 [ 1.20484592  1.00041165  1.13023042  0.32840742  0.16888308 -0.33559223]
 [ 1.28448754  1.15305262  1.24310512  0.59725054 -0.31474389 -0.97622623]
 ...
 [-0.23584035 -0.08224918 -1.09483656  0.1554822  -1.12200744 -1.12840424]
 [ 0.2447751   0.34444513 -1.55255237  0.17064062 -1.24348982 -1.41973039]
 [-0.03070327 -0.15566364 -1.9254549  -0.09600454 -1.23897289 -1.44257329]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
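The decision function printed above has six columns because decision_function_shape='ovo' yields one score per class pair: with four classes that is C(4, 2) = 6 pairwise classifiers (0-1, 0-2, 0-3, 1-2, 1-3, 2-3). A quick check, assuming clf and x from the cell above:

# assumes clf and x from the cell above
print(clf.decision_function(x).shape)   # (800, 6): one column per class pair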
Naive Bayes

class NBClassify(object):
    def __init__(self, fillNa=1):
        self.fillNa = 1

    def train(self, trainSet):
        # count occurrences of each tag (class label)
        dictTag = {}
        for subTuple in trainSet:
            dictTag[str(subTuple[1])] = 1 if str(
                subTuple[1]) not in dictTag.keys(
                ) else dictTag[str(subTuple[1])] + 1
        # prior probability of each tag
        tagProbablity = {}
        totalFreq = sum([value for value in dictTag.values()])
        for key, value in dictTag.items():
            tagProbablity[key] = value / totalFreq
        self.tagProbablity = tagProbablity

        # count every feature value across the whole training set
        dictFeaturesBase = {}
        for subTuple in trainSet:
            for key, value in subTuple[0].items():
                if key not in dictFeaturesBase.keys():
                    dictFeaturesBase[key] = {value: 1}
                else:
                    if value not in dictFeaturesBase[key].keys():
                        dictFeaturesBase[key][value] = 1
                    else:
                        dictFeaturesBase[key][value] += 1

        # build per-tag feature-value tables, initialized to None
        dictFeatures = {}.fromkeys([key for key in dictTag])
        for key in dictFeatures.keys():
            dictFeatures[key] = {}.fromkeys([key for key in dictFeaturesBase])
        for key, value in dictFeatures.items():
            for subkey in value.keys():
                value[subkey] = {}.fromkeys(
                    [x for x in dictFeaturesBase[subkey].keys()])

        # count feature values per tag
        for subTuple in trainSet:
            for key, value in subTuple[0].items():
                dictFeatures[subTuple[1]][key][value] = 1 if dictFeatures[
                    subTuple[1]][key][value] == None else dictFeatures[
                        subTuple[1]][key][value] + 1
        # smoothing: feature values never seen with a tag get a count of 1
        for tag, featuresDict in dictFeatures.items():
            for featureName, fetureValueDict in featuresDict.items():
                for featureKey, featureValues in fetureValueDict.items():
                    if featureValues == None:
                        fetureValueDict[featureKey] = 1
        # convert counts to conditional probabilities P(feature=value | tag)
        for tag, featuresDict in dictFeatures.items():
            for featureName, fetureValueDict in featuresDict.items():
                totalCount = sum(
                    [x for x in fetureValueDict.values() if x != None])
                for featureKey, featureValues in fetureValueDict.items():
                    fetureValueDict[
                        featureKey] = featureValues / totalCount if featureValues != None else None
        self.featuresProbablity = dictFeatures

    def classify(self, featureDict):
        resultDict = {}
        # posterior score = prior * product of conditional probabilities
        for key, value in self.tagProbablity.items():
            iNumList = []
            for f, v in featureDict.items():
                if self.featuresProbablity[key][f][v]:
                    iNumList.append(self.featuresProbablity[key][f][v])
            conditionPr = 1
            for iNum in iNumList:
                conditionPr *= iNum
            resultDict[key] = value * conditionPr
        # return the tag with the highest score
        resultList = sorted(resultDict.items(),
                            key=lambda x: x[1],
                            reverse=True)
        return resultList[0][0]


if __name__ == '__main__':
    trainSet = [
        ({"症状": "打喷嚏", "职业": "护士"}, "感冒"),
        ({"症状": "打喷嚏", "职业": "农夫"}, "过敏"),
        ({"症状": "头痛", "职业": "建筑工人"}, "脑震荡"),
        ({"症状": "头痛", "职业": "建筑工人"}, "感冒"),
        ({"症状": "打喷嚏", "职业": "教师"}, "感冒"),
        ({"症状": "头痛", "职业": "教师"}, "脑震荡"),
    ]
    monitor = NBClassify()
    monitor.train(trainSet)
    result = monitor.classify({"症状": "打喷嚏", "职业": "建筑工人"})
    print(result)
感冒
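Tracing the classification by hand (my arithmetic, following the smoothing rule in train()): the priors are P(感冒) = 3/6, P(过敏) = 1/6, P(脑震荡) = 2/6. For the query {症状: 打喷嚏, 职业: 建筑工人}, the scores are roughly 0.5 × 2/3 × 1/4 ≈ 0.083 for 感冒, 1/6 × 1/2 × 1/4 ≈ 0.021 for 过敏, and 1/3 × 1/3 × 1/4 ≈ 0.028 for 脑震荡, so 感冒 wins.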
Multi-class Problem: Cuisine Classification

Data Loading and Cleaning

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from imblearn.over_sampling import SMOTE

df = pd.read_csv('./_data_set/cuisines_classification/cuisines.csv')
df.head()
   Unnamed: 0 cuisine  almond  angelica  anise  anise_seed  apple  apple_brandy  apricot  armagnac  ...  whiskey  white_bread  white_wine  whole_grain_wheat_flour  wine  wood  yam  yeast  yogurt  zucchini
0          65  indian       0         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       0         0
1          66  indian       1         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       0         0
2          67  indian       0         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       0         0
3          68  indian       0         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       0         0
4          69  indian       0         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       1         0
5 rows × 385 columns
def create_ingredient_df(df):
    # sum each ingredient column and keep the non-zero ones, most frequent first
    ingredient_df = df.T.drop(['cuisine',
                               'Unnamed: 0']).sum(axis=1).to_frame('value')
    ingredient_df = ingredient_df[(ingredient_df.T != 0).any()]
    ingredient_df = ingredient_df.sort_values(by='value',
                                              ascending=False,
                                              inplace=False)
    return ingredient_df


for i in ["thai", "japanese", "chinese", "indian", "korean"]:
    ingredient_df = create_ingredient_df(df[(df.cuisine == i)])
    ingredient_df.head(10).plot.barh().set_title(i)

# drop the most common shared ingredients before training
feature_df = df.drop(['cuisine', 'Unnamed: 0', 'rice', 'garlic', 'ginger'],
                     axis=1)
labels_df = df.cuisine
# oversample minority cuisines with SMOTE so all classes are balanced
transformed_feature_df, transformed_label_df = SMOTE().fit_resample(
    feature_df, labels_df)
print(
    f'===================== old label count: ===================== \n{labels_df.value_counts()}'
    f'\n===================== new label count: ===================== \n{transformed_label_df.value_counts()}')
transformed_df = pd.concat([transformed_label_df, transformed_feature_df],
                           axis=1,
                           join='outer')
transformed_df.info()
transformed_df.to_csv("./_data_set/cuisines_classification/cleaned_cuisines.csv")
===================== old label count: =====================
korean      799
indian      598
chinese     442
japanese    320
thai        289
Name: cuisine, dtype: int64
===================== new label count: =====================
indian      799
thai        799
chinese     799
japanese    799
korean      799
Name: cuisine, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3995 entries, 0 to 3994
Columns: 381 entries, cuisine to zucchini
dtypes: int64(380), object(1)
memory usage: 11.6+ MB
Classification, Part 1

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, precision_recall_curve
from sklearn.svm import SVC
import numpy as np

cuisines_df = pd.read_csv(
    "./_data_set/cuisines_classification/cleaned_cuisines.csv")
cuisines_df.head()
   Unnamed: 0 cuisine  almond  angelica  anise  anise_seed  apple  apple_brandy  apricot  armagnac  ...  whiskey  white_bread  white_wine  whole_grain_wheat_flour  wine  wood  yam  yeast  yogurt  zucchini
0           0  indian       0         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       0         0
1           1  indian       1         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       0         0
2           2  indian       0         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       0         0
3           3  indian       0         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       0         0
4           4  indian       0         0      0           0      0             0        0         0  ...        0            0           0                        0     0     0    0      0       1         0
5 rows × 382 columns
cuisines_label_df = cuisines_df['cuisine']
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df,
                                                    cuisines_label_df,
                                                    test_size=0.3)
"""
1137      korean
428         thai
191       indian
1336      korean
3947        thai
          ...
3335    japanese
3539        thai
436         thai
1875      indian
3079    japanese
Name: cuisine, Length: 2796, dtype: object
['korean' 'thai' 'indian' ... 'thai' 'indian' 'japanese']
"""
model = LogisticRegression(multi_class='ovr',
                           solver='liblinear').fit(X_train, np.ravel(y_train))
accuracy = model.score(X_test, y_test)
print(f"Accuracy is {accuracy}")
Accuracy is 0.7906588824020017
line_num = 50
# show the non-zero ingredients of one test row and its true cuisine
print(f'ingredients: {X_test.iloc[line_num][X_test.iloc[line_num]!=0].keys()}')
print(f'cuisine: {y_test.iloc[line_num]}')
test = X_test.iloc[line_num].values.reshape(-1, 1).T
proba = model.predict_proba(test)
classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)
topPrediction = resultdf.T.sort_values(by=[0], ascending=[False])
topPrediction.head()
ingredients: Index(['butter', 'cayenne', 'cheese', 'cinnamon', 'coriander', 'onion',
       'tomato', 'turmeric'],
      dtype='object')
cuisine: indian
D:\Scoop\apps\anaconda3\current\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(
                 0
indian    0.962110
thai      0.021403
korean    0.011124
japanese  0.003710
chinese   0.001653
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support
     chinese       0.76      0.68      0.72       242
      indian       0.91      0.93      0.92       239
    japanese       0.72      0.76      0.74       222
      korean       0.84      0.80      0.82       264
        thai       0.77      0.84      0.80       232
    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.80      0.80      0.80      1199
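confusion_matrix is imported in this section but never called; a hedged sketch of how it complements the report, assuming model, X_test and y_test from the cells above:

from sklearn.metrics import confusion_matrix

# assumes model, X_test, y_test from the cells above
y_pred = model.predict(X_test)
# rows = true cuisines, columns = predicted cuisines, in model.classes_ order
print(model.classes_)
print(confusion_matrix(y_test, y_pred))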
Classification, Part 2

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, precision_recall_curve
import numpy as np
import pandas as pd

cuisines_df = pd.read_csv(
    "./_data_set/cuisines_classification/cleaned_cuisines.csv")
cuisines_label_df = cuisines_df['cuisine']
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df,
                                                    cuisines_label_df,
                                                    test_size=0.3)
C = 10
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True, random_state=0),
    'KNN classifier': KNeighborsClassifier(C),   # C doubles as n_neighbors here
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}
for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%%" % (name, accuracy * 100))
    print(classification_report(y_test, y_pred))
Accuracy (train) for Linear SVC: 80.2%
              precision    recall  f1-score   support
     chinese       0.73      0.76      0.74       250
      indian       0.86      0.89      0.88       235
    japanese       0.78      0.78      0.78       245
      korean       0.88      0.74      0.80       227
        thai       0.78      0.84      0.81       242
    accuracy                           0.80      1199
   macro avg       0.81      0.80      0.80      1199
weighted avg       0.81      0.80      0.80      1199
Accuracy (train) for KNN classifier: 73.6%
              precision    recall  f1-score   support
     chinese       0.71      0.73      0.72       250
      indian       0.80      0.86      0.83       235
    japanese       0.67      0.80      0.73       245
      korean       0.91      0.51      0.65       227
        thai       0.69      0.77      0.73       242
    accuracy                           0.74      1199
   macro avg       0.76      0.73      0.73      1199
weighted avg       0.75      0.74      0.73      1199
Accuracy (train) for SVC: 82.8%
              precision    recall  f1-score   support
     chinese       0.77      0.81      0.79       250
      indian       0.88      0.93      0.90       235
    japanese       0.82      0.76      0.79       245
      korean       0.90      0.77      0.83       227
        thai       0.79      0.88      0.83       242
    accuracy                           0.83      1199
   macro avg       0.83      0.83      0.83      1199
weighted avg       0.83      0.83      0.83      1199
Accuracy (train) for RFST: 84.5%
              precision    recall  f1-score   support
     chinese       0.82      0.81      0.82       250
      indian       0.85      0.93      0.89       235
    japanese       0.86      0.81      0.83       245
      korean       0.90      0.83      0.86       227
        thai       0.81      0.85      0.83       242
    accuracy                           0.84      1199
   macro avg       0.85      0.85      0.85      1199
weighted avg       0.85      0.84      0.84      1199
Accuracy (train) for ADA: 69.3%
              precision    recall  f1-score   support
     chinese       0.64      0.42      0.51       250
      indian       0.81      0.85      0.83       235
    japanese       0.65      0.61      0.63       245
      korean       0.62      0.82      0.71       227
        thai       0.73      0.79      0.76       242
    accuracy                           0.69      1199
   macro avg       0.69      0.70      0.69      1199
weighted avg       0.69      0.69      0.68      1199
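A single random split can flatter one model over another; cross_val_score (imported above but unused) averages over several folds instead. A hedged sketch, assuming cuisines_feature_df and cuisines_label_df from the loading cell:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# assumes cuisines_feature_df, cuisines_label_df from the loading cell
scores = cross_val_score(RandomForestClassifier(n_estimators=100),
                         cuisines_feature_df, cuisines_label_df, cv=5)
print(scores.mean(), scores.std())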
Clustering

Getting Started with Clustering: Data Distribution

import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv("./_data_set/Clustering/nigerian-songs.csv")
df.info()
df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530 entries, 0 to 529
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   name              530 non-null    object
 1   album             530 non-null    object
 2   artist            530 non-null    object
 3   artist_top_genre  530 non-null    object
 4   release_date      530 non-null    int64
 5   length            530 non-null    int64
 6   popularity        530 non-null    int64
 7   danceability      530 non-null    float64
 8   acousticness      530 non-null    float64
 9   energy            530 non-null    float64
 10  instrumentalness  530 non-null    float64
 11  liveness          530 non-null    float64
 12  loudness          530 non-null    float64
 13  speechiness       530 non-null    float64
 14  tempo             530 non-null    float64
 15  time_signature    530 non-null    int64
dtypes: float64(8), int64(4), object(4)
memory usage: 66.4+ KB
       release_date         length  popularity  danceability  acousticness      energy
count    530.000000     530.000000  530.000000    530.000000    530.000000  530.000000
mean    2015.390566  222298.169811   17.507547      0.741619      0.265412    0.760623
std        3.131688   39696.822259   18.992212      0.117522      0.208342    0.148533
min     1998.000000   89488.000000    0.000000      0.255000      0.000665    0.111000
25%     2014.000000  199305.000000    0.000000      0.681000      0.089525    0.669000
50%     2016.000000  218509.000000   13.000000      0.761000      0.220500    0.784500
75%     2017.000000  242098.500000   31.000000      0.829500      0.403000    0.875750
max     2020.000000  511738.000000   73.000000      0.966000      0.954000    0.995000

       instrumentalness    liveness    loudness  speechiness       tempo  time_signature
count        530.000000  530.000000  530.000000   530.000000  530.000000      530.000000
mean           0.016305    0.147308   -4.953011     0.130748  116.487864        3.986792
std            0.090321    0.123588    2.464186     0.092939   23.518601        0.333701
min            0.000000    0.028300  -19.362000     0.027800   61.695000        3.000000
25%            0.000000    0.075650   -6.298750     0.059100  102.961250        4.000000
50%            0.000004    0.103500   -4.558500     0.097950  112.714500        4.000000
75%            0.000234    0.164000   -3.331000     0.177000  125.039250        4.000000
max            0.910000    0.811000    0.582000     0.514000  206.007000        5.000000
import seaborn as sns

def draw(df):
    top = df['artist_top_genre'].value_counts()
    plt.figure(figsize=(10, 7))
    sns.barplot(x=top[:5].index, y=top[:5].values)
    plt.xticks(rotation=45)
    plt.title('Top genres', color='blue')

draw(df)

new_df = df[df['artist_top_genre'] != 'Missing']
draw(new_df)

featured_df = new_df[((new_df['artist_top_genre'] == 'afro dancehall')
                      | (new_df['artist_top_genre'] == 'afropop')
                      | (new_df['artist_top_genre'] == 'nigerian pop'))
                     & (new_df['popularity'] > 0)]
draw(featured_df)

corrmat = featured_df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)
<AxesSubplot:>
K-Means and Clustering Metrics
Getting started: music classification

!pip install seaborn
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df = pd.read_csv("./_data_set/Clustering/nigerian-songs.csv")
df.head()
   name                      album                         artist               artist_top_genre
0  Sparky                    Mandy & The Jungle            Cruel Santino        alternative r&b
1  shuga rush                EVERYTHING YOU HEARD IS TRUE  Odunsi (The Engine)  afropop
2  LITT!                     LITT!                         AYLØ                 indie r&b
3  Confident / Feeling Cool  Enjoy Your Life               Lady Donli           nigerian pop
4  wanted you                rare.                         Odunsi (The Engine)  afropop

   release_date  length  popularity  danceability  acousticness  energy  instrumentalness  liveness  loudness  speechiness    tempo  time_signature
0          2019  144000          48         0.666        0.8510   0.420          0.534000    0.1100    -6.699       0.0829  133.015               5
1          2020   89488          30         0.710        0.0822   0.683          0.000169    0.1010    -5.640       0.3600  129.993               3
2          2018  207758          40         0.836        0.2720   0.564          0.000537    0.1100    -7.127       0.0424  130.005               4
3          2019  175135          14         0.894        0.7980   0.611          0.000187    0.0964    -4.961       0.1130  111.087               4
4          2018  152049          25         0.702        0.1160   0.833          0.910000    0.3480    -6.044       0.0447  105.115               4
featured_df = df[((df['artist_top_genre'] == 'afro dancehall')
                  | (df['artist_top_genre'] == 'afropop')
                  | (df['artist_top_genre'] == 'nigerian pop'))
                 & (df['popularity'] > 0)]
top = featured_df['artist_top_genre'].value_counts()
plt.figure(figsize=(10, 7))
sns.barplot(x=top.index, y=top.values)
plt.xticks(rotation=45)
plt.title('Top genres', color='blue')

# One boxplot per audio feature; this loop replaces twelve copy-pasted
# plt.subplot(...) / sns.boxplot(...) calls.
plt.figure(figsize=(20, 20), dpi=200)
for i, v in enumerate([
        "popularity", "acousticness", "energy", "instrumentalness", "liveness",
        "loudness", "speechiness", "tempo", "time_signature", "danceability",
        "length", "release_date"
]):
    plt.subplot(4, 3, i + 1)
    sns.boxplot(x=v, data=df)
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn import metrics

le = LabelEncoder()
X = df.loc[:, ['artist_top_genre', 'popularity', 'danceability',
               'acousticness', 'loudness', 'energy']]
X['artist_top_genre'] = le.fit_transform(X['artist_top_genre'])
y = le.transform(df['artist_top_genre'])

nclusters = 3
seed = 0
km = KMeans(n_clusters=nclusters, random_state=seed).fit(X)
y_cluster_kmeans = km.predict(X)
print(y_cluster_kmeans)
score = metrics.silhouette_score(X, y_cluster_kmeans)
print(score)
[2 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 2 0 0 1 0 0 2 1 1 2 2 2 0
 2 0 1 2 2 2 0 1 1 2 1 0 0 0 0 1 1 1 1 1 2 0 2 1 0 2 0 1 2 1 2 1 1 1 1 2 1
 0 2 2 0 0 0 1 0 0 1 1 1 1 0 0 1 1 1 2 1 0 0 1 1 1 0 0 1 0 1 2 0 1 1 2 0 1
 1 1 1 1 1 1 0 1 0 1 0 1 0 2 1 2 0 1 1 1 0 0 1 1 0 2 0 1 1 0 0 0 1 1 0 1 0
 2 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 1 0 2 1 1 1 1 1 0
 1 1 1 1 0 2 0 0 0 0 0 0 1 1 2 1 1 1 1 1 2 1 1 0 0 1 1 1 1 1 0 0 1 0 1 0 0
 1 2 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 2 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1
 2 0 0 1 1 0 0 0 1 0 0 1 1 1 2 1 1 2 0 0 1 0 1 1 0 1 1 1 0 2 1 1 2 2 2 0 2
 2 2 2 2 2 2 2 1 2 2 1 2 1 2 1 2 2 2 0 1 0 1 1 0 1 2 1 1 2 1 0 2 2 0 0 0 1
 0 1 0 1 1 1 1 2 1 1 2 2 1 0 2 0 2 1 1 2 2 2 0 0 2 2 1 1 0 0 0 1 1 1 1 1 1
 1 1 1 1 2 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 0 0 0 1 2 1 1 2 1 0 0 0 2 2 2 0 2
 2 2 0 2 2 0 1 0 1 2 0 1 1 1 1 1 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1
 0 1 1 1 1 1 1 0 1 1 1 2 0 0 1 1 1 1 2 1 0 2 1 1 1 0 1 1 2 1 0 0 0 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 1]
0.5918299843029218
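For reference, the silhouette score printed above is the mean over all samples of s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is the average distance from sample i to the other members of its own cluster and b(i) is the smallest average distance from i to the points of any other cluster. Values near 1 indicate compact, well-separated clusters and values near 0 indicate overlap, so 0.59 here is a moderately good separation.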
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 5))
# Keyword arguments avoid the seaborn 0.12 positional-argument FutureWarning.
sns.lineplot(x=list(range(1, 11)), y=wcss, marker='o', color='red')
plt.title('Elbow')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
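The inertia_ collected above is the within-cluster sum of squares, WCSS = Σ_k Σ_{x ∈ C_k} ||x - μ_k||². WCSS always decreases as the number of clusters grows, so the elbow heuristic looks for the k after which the marginal decrease flattens out.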
kmeans = KMeans(n_clusters=3).fit(X)
labels = kmeans.predict(X)
plt.scatter(df['popularity'], df['danceability'], c=labels)
plt.xlabel('popularity')
plt.ylabel('danceability')
plt.show()

correct_labels = sum(y == kmeans.labels_)
print("%d/%d samples were correctly labeled." % (correct_labels, y.size))
print('Accuracy score: {0:0.2f}'.format(correct_labels / float(y.size)))
93/530 samples were correctly labeled.
Accuracy score: 0.18
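Note that the 0.18 "accuracy" above compares raw cluster IDs with genre labels, but k-means assigns arbitrary cluster numbers, so even a perfect partition can score near zero. A fairer check first matches each cluster to its best label; a minimal sketch using scipy's Hungarian assignment, assuming the y and kmeans.labels_ from the cells above:

import numpy as np
from scipy.optimize import linear_sum_assignment

def matched_accuracy(y_true, y_pred):
    # Contingency table: rows = true labels, columns = cluster IDs.
    n = max(y_true.max(), y_pred.max()) + 1
    count = np.zeros((n, n), dtype=np.int64)
    for t, p in zip(y_true, y_pred):
        count[t, p] += 1
    # The Hungarian algorithm finds the cluster-to-label mapping with maximum overlap.
    rows, cols = linear_sum_assignment(count, maximize=True)
    return count[rows, cols].sum() / y_true.size

# print(matched_accuracy(np.asarray(y), kmeans.labels_))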
sample-2

import numpy as np
import matplotlib.colors
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, \
    adjusted_mutual_info_score, adjusted_rand_score, silhouette_score
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

def expand(a, b):
    d = (b - a) * 0.1
    return a - d, b + d

N = 400
centers = 4
data, y = ds.make_blobs(N, n_features=2, centers=centers, random_state=2)
data2, y2 = ds.make_blobs(N,
                          n_features=2,
                          centers=centers,
                          cluster_std=(1, 2.5, 0.5, 2),
                          random_state=2)
# Deliberately unbalanced cluster sizes: 100 / 50 / 20 / 5 samples.
data3 = np.vstack(
    (data[y == 0][:], data[y == 1][:50], data[y == 2][:20], data[y == 3][:5]))
y3 = np.array([0] * 100 + [1] * 50 + [2] * 20 + [3] * 5)
# A linear transform that rotates and stretches the original blobs.
m = np.array(((1, 1), (1, 3)))
data_r = data.dot(m)

matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # only needed for CJK labels
matplotlib.rcParams['axes.unicode_minus'] = False
cm = matplotlib.colors.ListedColormap(list('rgbm'))
data_list = data, data, data_r, data_r, data2, data2, data3, data3
y_list = y, y, y, y, y2, y2, y3, y3
titles = ('Original data', 'KMeans++ clustering',
          'Rotated data', 'Rotated, KMeans++ clustering',
          'Unequal variance data', 'Unequal variance, KMeans++ clustering',
          'Unequal size data', 'Unequal size, KMeans++ clustering')

model = KMeans(n_clusters=4, init='k-means++', n_init=5)
fig = plt.figure(figsize=(8, 9), facecolor='w')
for i, (x, y, title) in enumerate(zip(data_list, y_list, titles), start=1):
    ax = fig.add_subplot(4, 2, i)
    plt.title(title)
    if i % 2 == 1:
        y_pred = y            # odd panels show the ground truth
    else:
        y_pred = model.fit_predict(x)
    print(i)
    print('Homogeneity:', homogeneity_score(y, y_pred))
    print('completeness:', completeness_score(y, y_pred))
    print('V measure:', v_measure_score(y, y_pred))
    print('AMI:', adjusted_mutual_info_score(y, y_pred))
    print('ARI:', adjusted_rand_score(y, y_pred))
    print('Silhouette:', silhouette_score(x, y_pred), '\n')
    ax.scatter(x[:, 0], x[:, 1], s=10, c=y_pred, cmap=cm, edgecolors='none')
    ax.grid(b=True, ls=':')
plt.tight_layout(pad=2, rect=(0, 0, 1, 0.95))  # pad by keyword: the positional form is deprecated
plt.suptitle('Effect of data distribution on KMeans clustering', fontsize=18)
plt.show()
1
Homogeneity: 1.0
completeness: 1.0
V measure: 1.0
AMI: 1.0
ARI: 1.0
Silhouette: 0.616436816839852
2
Homogeneity: 0.9898828240244267
completeness: 0.9899006758819153
V measure: 0.9898917498726852
AMI: 0.9898081557479033
ARI: 0.9933165272203728
Silhouette: 0.6189656317733315
3
Homogeneity: 1.0
completeness: 1.0
V measure: 1.0
AMI: 1.0
ARI: 1.0
Silhouette: 0.5275987244664399
4
Homogeneity: 0.7296158940840607
completeness: 0.7315285272632114
V measure: 0.7305709588584066
AMI: 0.7283397010755561
ARI: 0.6783811042853299
Silhouette: 0.5366236044449266
5
Homogeneity: 1.0
completeness: 1.0
V measure: 1.0
AMI: 1.0
ARI: 1.0
Silhouette: 0.4790725752982868
6
Homogeneity: 0.7449364376693913
completeness: 0.7755445167472191
V measure: 0.7599323988656883
AMI: 0.757903292819801
ARI: 0.7113213508090338
Silhouette: 0.5737260449304202
7
Homogeneity: 1.0
completeness: 1.0
V measure: 1.0
AMI: 1.0
ARI: 1.0
Silhouette: 0.5975066093204152
8
Homogeneity: 0.9776347312784609
completeness: 0.9728632742060752
V measure: 0.975243166591057
AMI: 0.9745709993295113
ARI: 0.9906840043816505
Silhouette: 0.6013877858619149
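A note on the init='k-means++' used throughout these examples: instead of drawing all initial centers uniformly at random, k-means++ picks the first center at random and each subsequent center with probability proportional to D(x)², the squared distance from candidate x to its nearest already-chosen center. This spreads the initial centers apart and makes the runs above far less sensitive to initialization.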
Hierarchical clustering

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
import sklearn.datasets as ds
import warnings

def expand(a, b, r):
    d = (b - a) * r
    return a - d, b + d

if __name__ == '__main__':
    warnings.filterwarnings(action='ignore', category=UserWarning)
    np.set_printoptions(suppress=True)
    np.random.seed(0)
    n_clusters = 4
    N = 400
    data1, y1 = ds.make_blobs(n_samples=N,
                              n_features=2,
                              centers=((-1, 1), (1, 1), (1, -1), (-1, -1)),
                              cluster_std=(0.1, 0.2, 0.3, 0.4),
                              random_state=0)
    data1 = np.array(data1)
    # Add 10% uniform noise points spanning the data's bounding box.
    n_noise = int(0.1 * N)
    r = np.random.rand(n_noise, 2)
    data_min1, data_min2 = np.min(data1, axis=0)
    data_max1, data_max2 = np.max(data1, axis=0)
    r[:, 0] = r[:, 0] * (data_max1 - data_min1) + data_min1
    r[:, 1] = r[:, 1] * (data_max2 - data_min2) + data_min2
    data1_noise = np.concatenate((data1, r), axis=0)
    y1_noise = np.concatenate((y1, [4] * n_noise))
    data2, y2 = ds.make_moons(n_samples=N, noise=.05)
    data2 = np.array(data2)
    n_noise = int(0.1 * N)
    r = np.random.rand(n_noise, 2)
    data_min1, data_min2 = np.min(data2, axis=0)
    data_max1, data_max2 = np.max(data2, axis=0)
    r[:, 0] = r[:, 0] * (data_max1 - data_min1) + data_min1
    r[:, 1] = r[:, 1] * (data_max2 - data_min2) + data_min2
    data2_noise = np.concatenate((data2, r), axis=0)
    y2_noise = np.concatenate((y2, [3] * n_noise))
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # only needed for CJK labels
    mpl.rcParams['axes.unicode_minus'] = False
    cm = mpl.colors.ListedColormap(['r', 'g', 'b', 'm', 'c'])
    plt.figure(figsize=(10, 8), facecolor='w')
    plt.cla()
    linkages = ("ward", "complete", "average")
    for index, (n_clusters, data, y) in enumerate(
        ((4, data1, y1), (4, data1_noise, y1_noise), (2, data2, y2),
         (2, data2_noise, y2_noise))):
        plt.subplot(4, 4, 4 * index + 1)
        plt.scatter(data[:, 0], data[:, 1], c=y, s=12, edgecolors='k', cmap=cm)
        plt.title('Prime', fontsize=12)
        plt.grid(b=True, ls=':')
        data_min1, data_min2 = np.min(data, axis=0)
        data_max1, data_max2 = np.max(data, axis=0)
        plt.xlim(expand(data_min1, data_max1, 0.05))
        plt.ylim(expand(data_min2, data_max2, 0.05))
        connectivity = kneighbors_graph(data,
                                        n_neighbors=7,
                                        mode='distance',
                                        metric='minkowski',
                                        p=2,
                                        include_self=True)
        connectivity = 0.5 * (connectivity + connectivity.T)  # symmetrize the k-NN graph
        for i, linkage in enumerate(linkages):
            ac = AgglomerativeClustering(n_clusters=n_clusters,
                                         affinity='euclidean',
                                         connectivity=connectivity,
                                         linkage=linkage)
            ac.fit(data)
            y_pred = ac.labels_
            plt.subplot(4, 4, i + 2 + 4 * index)
            plt.scatter(data[:, 0], data[:, 1], c=y_pred, s=12,
                        edgecolors='k', cmap=cm)
            plt.title(linkage, fontsize=12)
            plt.grid(b=True, ls=':')
            plt.xlim(expand(data_min1, data_max1, 0.05))
            plt.ylim(expand(data_min2, data_max2, 0.05))
    plt.suptitle('Agglomerative clustering with different linkage strategies', fontsize=15)
    plt.tight_layout(pad=0.5, rect=(0, 0, 1, 0.95))
    plt.show()
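The three linkage strategies compared above differ only in how the distance between two clusters A and B is scored before each merge: 'complete' uses the maximum pairwise distance between A and B, 'average' uses the mean pairwise distance, and 'ward' merges the pair whose union causes the smallest increase in total within-cluster variance, which is why ward tends to produce the most evenly sized clusters.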
Dimensionality Reduction
Missing value ratio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train = pd.read_csv("./_data_set/降维问题/train_v9rqX0R.csv")
a = train.isnull().sum() / len(train) * 100  # percentage of missing values per column
variable = []
for i in range(0, 12):
    if a[i] <= 20:  # keep columns with at most 20% missing values
        variable.append(train.columns[i])
print(variable)
['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']
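The same filter can be written without the explicit index loop. A minimal vectorized sketch, assuming the same train DataFrame as above:

# Percentage of missing values per column (mean of the boolean mask, times 100).
missing_pct = train.isnull().mean() * 100
# Keep the columns with at most 20% missing values.
variable = train.columns[missing_pct <= 20].tolist()
print(variable)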
Low variance filter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

train = pd.read_csv("./_data_set/降维问题/train_v9rqX0R.csv")
train['Item_Weight'].fillna(train['Item_Weight'].median(), inplace=True)
train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0], inplace=True)
print(train.isnull().sum() / len(train) * 100)
numeric = train[[
    'Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year'
]]
var = numeric.var()
numeric = numeric.columns
variable = []
for i in range(0, len(var)):
    if var[i] >= 10:  # keep columns whose variance is at least 10
        variable.append(numeric[i])
print(variable)
Item_Identifier              0.0
Item_Weight                  0.0
Item_Fat_Content             0.0
Item_Visibility              0.0
Item_Type                    0.0
Item_MRP                     0.0
Outlet_Identifier            0.0
Outlet_Establishment_Year    0.0
Outlet_Size                  0.0
Outlet_Location_Type         0.0
Outlet_Type                  0.0
Item_Outlet_Sales            0.0
dtype: float64
['Item_Weight', 'Item_MRP', 'Outlet_Establishment_Year']
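scikit-learn ships a transformer for exactly this filter. A minimal sketch with sklearn.feature_selection.VarianceThreshold, assuming the same train DataFrame (note that a threshold of 10 only makes sense on unscaled features, and that VarianceThreshold uses the population variance rather than pandas' sample variance):

from sklearn.feature_selection import VarianceThreshold

numeric_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP',
                'Outlet_Establishment_Year']
selector = VarianceThreshold(threshold=10)  # drop features with variance below 10
selector.fit(train[numeric_cols])
kept = [col for col, keep in zip(numeric_cols, selector.get_support()) if keep]
print(kept)  # expected to match the loop above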
Code Notes
Saving and loading a model

import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
import pickle

x = np.array([[0.5], [0.6], [0.8], [1.1], [1.4]])
y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])
model = lm.LinearRegression()
model.fit(x, y)
print("Training finished.")
with open('linear_model.pkl', 'wb') as f:
    pickle.dump(model, f)
    print("Model saved.")
with open('linear_model.pkl', 'rb') as f:
    model = pickle.load(f)
    print("Model loaded.")
pred_y = model.predict(x)
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(x, y, c='blue', alpha=0.8, s=60, label='Sample Points')
mp.plot(x, pred_y, c='orangered', label='Regression')
mp.legend()
mp.show()
Training finished.
Model saved.
Model loaded.
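For estimators that wrap large NumPy arrays, the scikit-learn docs suggest joblib over plain pickle for serialization. A minimal sketch of the same round trip; the file name linear_model.joblib is just an example:

import joblib

joblib.dump(model, 'linear_model.joblib')   # serialize the fitted estimator
model = joblib.load('linear_model.joblib')  # restore it later
print(model.predict([[1.0]]))               # sanity check on the reloaded model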
Web App
Part 1: train a logistic regression model, package it with pickle, then load it.
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import pickle

ufos = pd.read_csv('./_data_set/web-app/1/ufos.csv')
ufos = pd.DataFrame({
    'Seconds': ufos['duration (seconds)'],
    'Country': ufos['country'],
    'Latitude': ufos['latitude'],
    'Longitude': ufos['longitude']
})
ufos.Country.unique()
ufos.dropna(inplace=True)
ufos = ufos[(ufos['Seconds'] >= 1) & (ufos['Seconds'] <= 60)]
ufos['Country'] = LabelEncoder().fit_transform(ufos['Country'])
ufos.head()

x = ufos[['Seconds', 'Latitude', 'Longitude']]
y = ufos['Country']
X_train, X_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print('Predicted labels: ', y_pred)
print('Accuracy: ', accuracy_score(y_test, y_pred))

model_name = './_data_set/web-app/1/ufo-model.pkl'
pickle.dump(model, open(model_name, 'wb'))
              precision    recall  f1-score   support
           0       1.00      1.00      1.00        41
           1       0.85      0.47      0.60       250
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00       131
           4       0.97      1.00      0.98      4743
    accuracy                           0.97      5173
   macro avg       0.96      0.89      0.92      5173
weighted avg       0.97      0.97      0.97      5173
Predicted labels:  [4 4 4 ... 3 4 4]
Accuracy:  0.9702300405953992
model_load = pickle.load(open(model_name, 'rb'))
# Predicting from a bare list works but lacks feature names, so sklearn
# emits a harmless "X does not have valid feature names" warning.
print(model_load.predict([[50, 44, -12]]))
[3]
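To actually serve the pickled UFO model from a web app, a minimal Flask sketch could look like the following; app.py, the /predict route, and the JSON field names are illustrative assumptions, not part of the original project:

# app.py - hypothetical minimal server around ufo-model.pkl
import pickle
import numpy as np
from flask import Flask, request, jsonify

app = Flask(__name__)
model = pickle.load(open('./_data_set/web-app/1/ufo-model.pkl', 'rb'))

@app.route('/predict', methods=['POST'])
def predict():
    # Expects JSON like {"seconds": 50, "latitude": 44, "longitude": -12}.
    data = request.get_json()
    features = np.array([[data['seconds'], data['latitude'], data['longitude']]])
    country = int(model.predict(features)[0])
    return jsonify({'country': country})

if __name__ == '__main__':
    app.run(debug=True)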
Part 2

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report

data = pd.read_csv('./_data_set/cuisines_classification/cleaned_cuisines.csv')
X = data.iloc[:, 2:]
y = data[['cuisine']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = SVC(kernel='linear', C=10, probability=True, random_state=0)
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support
     chinese       0.73      0.68      0.70       242
      indian       0.89      0.88      0.88       246
    japanese       0.80      0.79      0.79       262
      korean       0.82      0.75      0.78       229
        thai       0.75      0.88      0.81       220
    accuracy                           0.79      1199
   macro avg       0.79      0.80      0.79      1199
weighted avg       0.80      0.79      0.79      1199
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

initial_type = [('float_input', FloatTensorType([None, 380]))]
options = {id(model): {'nocl': True, 'zipmap': False}}
onx = convert_sklearn(model, initial_types=initial_type, options=options)
with open("./_data_set/web-app/2/model.onnx", "wb") as f:
    f.write(onx.SerializeToString())
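Once exported, model.onnx can be scored without scikit-learn at all. A minimal inference sketch with the onnxruntime package, assuming a 380-dimensional ingredient vector as input (the all-zero row here is only a shape check):

import numpy as np
import onnxruntime as rt

sess = rt.InferenceSession("./_data_set/web-app/2/model.onnx")
input_name = sess.get_inputs()[0].name        # 'float_input' from the export above
X_row = np.zeros((1, 380), dtype=np.float32)  # one 380-dimensional ingredient vector
pred = sess.run(None, {input_name: X_row})
print(pred[0])                                # predicted cuisine label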
Information Theory
Entropy of a two-point distribution

import numpy as np
import matplotlib.pyplot as plt

eps = 1e-5
p = np.linspace(eps, 1 - eps, 100)
h = -(1 - p) * np.log2(1 - p) - p * np.log2(p)
plt.plot(p, h, label='Information entropy', color='red', lw=3)
plt.xlabel('Probability', fontsize=16)
plt.ylabel('Entropy', fontsize=16)
plt.legend(loc='best', fontsize=16)
plt.grid(True)
plt.show()
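The quantity plotted here is the entropy of a Bernoulli (two-point) distribution, H(p) = -p·log2(p) - (1-p)·log2(1-p). It is 0 when the outcome is certain (p = 0 or p = 1) and peaks at exactly 1 bit at p = 0.5; the eps clipping in the code only avoids evaluating log2(0).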
Gini index vs. entropy

import numpy as np
import matplotlib.pyplot as plt

eps = 1e-4
p = np.linspace(eps, 1 - eps, 100)
h = -(1 - p) * np.log2(1 - p) - p * np.log2(p)
gini = 2 * (1 - p) * p
plt.plot(p, gini, 'r-', lw=3)
plt.plot(p, h / 2, 'g-', lw=3)  # half the entropy, for scale comparison
plt.title('Gini(p) / Entropy(p)', fontsize=16)
plt.xlabel('p', fontsize=14)
plt.ylabel('H', fontsize=14)
plt.legend(['Gini', 'Entropy/2'], loc='best', fontsize=14)
plt.show()
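For two classes the Gini impurity reduces to Gini(p) = 1 - p² - (1-p)² = 2p(1-p), which is why the code plots h / 2: scaled this way, both curves peak at 0.5 when p = 0.5, showing the Gini index as a cheap quadratic approximation to (half) the entropy. This closeness is the usual justification for CART decision trees using Gini in place of entropy.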
References
[1] https://discover304.top/
[2] Machine Learning + Deep Learning, a joint course by Shanghai Jiao Tong University and Tencent
[3] The Ultimate Guide to 12 Dimensionality Reduction Techniques (with Python codes)
[4] https://github.com/microsoft/ML-For-Beginners