准备工作
pip install numpy
pip install pandas
pip install scikit-learn
pip install matplotlib
KNN算法
示例一: 鸢尾花数据集分类
Iris也称鸢尾花卉数据集,是一个用于多变量分析的经典数据集。数据集包含150个数据样本,分为3类,每类50个样本,每个样本包含4个属性(花萼长度、花萼宽度、花瓣长度、花瓣宽度)。可通过这4个属性预测鸢尾花卉属于山鸢尾(Setosa)、杂色鸢尾(Versicolour)、维吉尼亚鸢尾(Virginica)3个种类中的哪一类。
import numpy as np
import pandas as pd
from sklearn. model_selection import train_test_split
from sklearn. preprocessing import StandardScaler
from sklearn. neighbors import KNeighborsClassifier
from sklearn. metrics import accuracy_score
from sklearn. datasets import load_iris
import matplotlib. pyplot as plt
from matplotlib. colors import ListedColormap
# Example 1, step 1: train and evaluate a 3-nearest-neighbour classifier on Iris.

# Keep only the first two feature columns so the data can be drawn in 2-D later.
iris = load_iris()
X = iris.data[:, :2]
y = iris.target

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training split only, then apply the same transform to
# the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The format spec must be exactly ".2f": stray spaces around it (": .2f ")
# raise "ValueError: Invalid format specifier" at runtime.
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")
# Example 1, step 2: side-by-side scatter plots of the training and test splits.

# Use a CJK-capable font so the Chinese titles render, and keep the ASCII minus
# sign so negative tick labels are not drawn as missing glyphs.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(12, 6))

# (title, points, colours): the training panel is coloured by the true labels,
# the test panel by the model's predictions.
panels = [
    ("训练集 - 花萼长度 vs 花萼宽度", X_train, y_train),
    ("测试集 - 花萼长度 vs 花萼宽度", X_test, y_pred),
]
for position, (panel_title, points, point_classes) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(
        points[:, 0],
        points[:, 1],
        c=point_classes,
        cmap=plt.cm.Paired,
        edgecolors='k',
        s=30,
    )
    plt.title(panel_title)
    # X_train / X_test were standardized by StandardScaler, so the axes show
    # z-scores rather than centimetres.
    plt.xlabel("花萼长度 (标准化)")
    plt.ylabel("花萼宽度 (标准化)")

plt.tight_layout()
plt.show()
# Example 1, step 3: draw the KNN decision regions with the test points on top.

# The slices keep the first two columns; a dedicated classifier is fitted on them.
X_train_2d = X_train[:, :2]
X_test_2d = X_test[:, :2]
knn_2d = KNeighborsClassifier(n_neighbors=3)
knn_2d.fit(X_train_2d, y_train)

# Build a grid covering the training data (plus a margin of 1) at step 0.1 and
# classify every grid node to obtain the decision regions.
x_min, x_max = X_train_2d[:, 0].min() - 1, X_train_2d[:, 0].max() + 1
y_min, y_max = X_train_2d[:, 1].min() - 1, X_train_2d[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.1),
    np.arange(y_min, y_max, 0.1),
)
Z = knn_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One colormap shared by the regions and the points so the class colours agree.
class_cmap = ListedColormap(['red', 'green', 'blue'])

# A single full-size axes: the previous 1x2 grid only ever drew the first
# panel, leaving half of the figure blank.
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=class_cmap)
plt.scatter(
    X_test_2d[:, 0],
    X_test_2d[:, 1],
    c=y_test,
    edgecolors='k',
    cmap=class_cmap,
)
plt.title('KNN决策边界 - 测试集')
# The features were standardized earlier, so the axes are z-scores, not cm.
plt.xlabel('花萼长度 (标准化)')
plt.ylabel('花萼宽度 (标准化)')
plt.tight_layout()
plt.show()
示例二:葡萄酒数据集分类
load_wine是一个葡萄酒数据集,是一个用于多变量分析的数据集。数据集包含178个数据样本,分为3类,第一类59个样本,第二类71个样本,第三类48个样本,每个样本包含13个化学特征(这些化学特征包括酸度、灰分、酒精浓度等)。可通过酸度、灰分、酒精浓度等化学特征预测葡萄酒属于3个栽培品种(class_0、class_1、class_2)中的哪一类。
from sklearn. datasets import load_wine
from sklearn. model_selection import train_test_split
from sklearn. neighbors import KNeighborsClassifier
import matplotlib. pyplot as plt
# Example 2, step 1: train and evaluate a 5-nearest-neighbour classifier on Wine.

# This example's own import list omits accuracy_score; import it here so the
# example also runs on its own.
from sklearn.metrics import accuracy_score

# Keep only the first two feature columns so the result can be plotted in 2-D.
wine = load_wine()
X = wine.data[:, :2]
y = wine.target

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The format spec must be exactly ".2f": stray spaces around it (": .2f ")
# raise "ValueError: Invalid format specifier" at runtime.
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")
# Example 2, step 2: draw the KNN decision regions with the test points on top.

# This example's own import list omits numpy; import it here so the example
# also runs on its own.
import numpy as np

plt.figure(figsize=(8, 6))

# 100 x 100 grid spanning the full data range of both features, plus a margin of 1.
xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 100),
    np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 100),
)

# Classify every grid node, then restore the grid shape for contourf.
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# The reshape and the contourf call must be separate statements; fused on one
# line they are a SyntaxError.
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.Paired)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, edgecolors='k', cmap=plt.cm.Paired)
plt.title('KNN Classification (Wine Dataset)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
示例三:乳腺癌肿瘤数据集分类
load_breast_cancer是一个乳腺癌肿瘤数据集,是一个用于多变量分析的数据集。数据集包含569个数据样本,分为2类,其中良性357个样本,恶性212个样本,每个样本包含30个属性(这些属性包括肿瘤的半径、纹理、对称性等)。可通过半径、纹理、对称性等属性预测肿瘤属于良性(B)、恶性(M)2个种类中的哪一类。
from sklearn. datasets import load_breast_cancer
from sklearn. model_selection import train_test_split
from sklearn. neighbors import KNeighborsClassifier
import matplotlib. pyplot as plt
# Example 3, step 1: train and evaluate a 5-nearest-neighbour classifier on
# the breast-cancer dataset.

# This example's own import list omits accuracy_score; import it here so the
# example also runs on its own.
from sklearn.metrics import accuracy_score

# Keep only the first two feature columns so the result can be plotted in 2-D.
cancer = load_breast_cancer()
X = cancer.data[:, :2]
y = cancer.target

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# The format spec must be exactly ".2f": stray spaces around it (": .2f ")
# raise "ValueError: Invalid format specifier" at runtime.
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")
# Example 3, step 2: draw the KNN decision regions with the test points on top.

# This example's own import list omits numpy; import it here so the example
# also runs on its own.
import numpy as np

plt.figure(figsize=(8, 6))

# 100 x 100 grid spanning the full data range of both features, plus a margin of 1.
xx, yy = np.meshgrid(
    np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 100),
    np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 100),
)

# Classify every grid node, then restore the grid shape for contourf.
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# The reshape and the contourf call must be separate statements; fused on one
# line they are a SyntaxError.
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.RdBu)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, edgecolors='k', cmap=plt.cm.RdBu)
plt.title('KNN Classification (Breast Cancer Dataset)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()