# Install the third-party packages imported by the examples below.
# These are shell commands: run them in a terminal, not inside the Python
# interpreter.
pip install numpy
pip install pandas
pip install scikit-learn
pip install matplotlib
Iris 也称鸢尾花卉数据集,是一个常用于多变量分析的数据集。数据集包含 150 个样本,分为 3 类,每类 50 个样本,每个样本包含 4 个属性(花萼长度、花萼宽度、花瓣长度、花瓣宽度)。可通过这 4 个属性预测鸢尾花属于山鸢尾、杂色鸢尾、维吉尼亚鸢尾 3 个种类中的哪一类。
import  numpy as  np
import  pandas as  pd
from  sklearn. model_selection import  train_test_split
from  sklearn. preprocessing import  StandardScaler
from  sklearn. neighbors import  KNeighborsClassifier
from  sklearn. metrics import  accuracy_score
from  sklearn. datasets import  load_iris
import  matplotlib. pyplot as  plt
from  matplotlib. colors import  ListedColormap
# --- Iris: train and evaluate a 3-nearest-neighbour classifier ---
# Load the dataset and keep only the first two feature columns so the data
# can be drawn on a 2-D plane by the plotting code further down.
iris = load_iris()
X = iris.data[:, :2]
y = iris.target

# Hold out 30% of the samples for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardise the features. The scaler is fitted on the training split only
# and then re-used for the test split, so test-set statistics never influence
# the transformation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The format spec must be exactly ".2f": stray whitespace after the type
# character (e.g. " .2f ") makes float.__format__ raise ValueError.
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")
# --- Iris: side-by-side scatter plots of the training and test splits ---
# Select a CJK-capable sans-serif font so the Chinese titles and labels
# render. SimHei stays first; the remaining entries are fallbacks for systems
# where SimHei is not installed (matplotlib uses the first family it finds).
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
# Draw the minus sign on tick labels as an ASCII hyphen so it renders with
# the CJK font selected above.
plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(12, 6))

# X_train / X_test were passed through StandardScaler earlier in the script,
# so the plotted values are standardised scores, not centimetres; the axis
# labels state that explicitly.
# Left panel: training samples coloured by their true class.
plt.subplot(1, 2, 1)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.Paired, edgecolors='k', s=30)
plt.title("训练集 - 花萼长度 vs 花萼宽度")
plt.xlabel("花萼长度 (标准化)")
plt.ylabel("花萼宽度 (标准化)")

# Right panel: test samples coloured by the class the model predicted
# (y_pred), not by the true label.
plt.subplot(1, 2, 2)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, cmap=plt.cm.Paired, edgecolors='k', s=30)
plt.title("测试集 - 花萼长度 vs 花萼宽度")
plt.xlabel("花萼长度 (标准化)")
plt.ylabel("花萼宽度 (标准化)")

plt.tight_layout()
plt.show()
# --- Iris: KNN decision regions with the test samples drawn on top ---
# X_train / X_test already contain exactly two (standardised) columns, so the
# slices below are no-ops on the current data.
X_train_2d = X_train[:, :2]
X_test_2d = X_test[:, :2]
knn_2d = KNeighborsClassifier(n_neighbors=3)
knn_2d.fit(X_train_2d, y_train)

# Build a regular grid covering the training data with a margin of 1 on every
# side and a step of 0.1, then classify every grid point to obtain the
# decision regions.
x_min, x_max = X_train_2d[:, 0].min() - 1, X_train_2d[:, 0].max() + 1
y_min, y_max = X_train_2d[:, 1].min() - 1, X_train_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
Z = knn_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# One colour per class, shared by the filled regions and the scatter points
# so both use the same class-to-colour mapping.
class_cmap = ListedColormap(['red', 'green', 'blue'])

# Select a CJK-capable sans-serif font so the Chinese title and labels
# render. SimHei stays first; the remaining entries are fallbacks for systems
# where SimHei is not installed.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
plt.rcParams['axes.unicode_minus'] = False

# A single axes is used: requesting slot 1 of a 1x2 grid without ever filling
# slot 2 left half of the figure blank.
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=class_cmap)
plt.scatter(X_test_2d[:, 0], X_test_2d[:, 1], c=y_test, edgecolors='k', cmap=class_cmap)
plt.title('KNN决策边界 - 测试集')
# The data were standardised before training, so the axes are in standardised
# units rather than centimetres.
plt.xlabel('花萼长度 (标准化)')
plt.ylabel('花萼宽度 (标准化)')
plt.tight_layout()
plt.show()
load_wine 加载的是葡萄酒数据集,是一个常用于多变量分析的数据集。数据集包含 178 个样本,分为 3 类,第一类 59 个样本,第二类 71 个样本,第三类 48 个样本,每个样本包含 13 个化学特征(包括酸度、灰分、酒精浓度等)。可通过这些化学特征预测葡萄酒属于 class_0、class_1、class_2(对应 3 个不同的葡萄栽培品种)中的哪一类。
from  sklearn. datasets import  load_wine
from  sklearn. model_selection import  train_test_split
from  sklearn. neighbors import  KNeighborsClassifier
import  matplotlib. pyplot as  plt
# --- Wine: 5-nearest-neighbour classifier on the first two features ---
wine = load_wine()
X = wine.data[:, :2]
y = wine.target

# Hold out 30% of the samples for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# NOTE(review): unlike the Iris section, the features are not standardised
# before fitting KNN here - confirm that this is intended.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The format spec must be exactly ".2f": stray whitespace after the type
# character (e.g. " .2f ") makes float.__format__ raise ValueError.
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")

plt.figure(figsize=(8, 6))

# Classify a 100 x 100 grid spanning the full data range (plus a margin of 1
# on every side) to obtain the decision regions.
xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 100),
    np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 100),
)
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled decision regions first, then the test samples (coloured by their
# true class) on top.
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.Paired)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, edgecolors='k', cmap=plt.cm.Paired)
plt.title('KNN Classification (Wine Dataset)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
load_breast_cancer 加载的是乳腺癌肿瘤数据集,是一个常用于多变量分析的数据集。数据集包含 569 个样本,分为 2 类,其中良性 357 个样本,恶性 212 个样本,每个样本包含 30 个属性(包括肿瘤的半径、纹理、对称性等)。可通过这些属性预测肿瘤属于良性(B)、恶性(M)2 个种类中的哪一类。
from  sklearn. datasets import  load_breast_cancer
from  sklearn. model_selection import  train_test_split
from  sklearn. neighbors import  KNeighborsClassifier
import  matplotlib. pyplot as  plt
# --- Breast cancer: 5-nearest-neighbour classifier on the first two features ---
cancer = load_breast_cancer()
X = cancer.data[:, :2]
y = cancer.target

# Hold out 30% of the samples for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# NOTE(review): unlike the Iris section, the features are not standardised
# before fitting KNN here - confirm that this is intended.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The format spec must be exactly ".2f": stray whitespace after the type
# character (e.g. " .2f ") makes float.__format__ raise ValueError.
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")

plt.figure(figsize=(8, 6))

# Classify a 100 x 100 grid spanning the full data range (plus a margin of 1
# on every side) to obtain the decision regions.
xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 100),
    np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 100),
)
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled decision regions first, then the test samples (coloured by their
# true class) on top.
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdBu)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, edgecolors='k', cmap=plt.cm.RdBu)
plt.title('KNN Classification (Breast Cancer Dataset)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()