在上面的過程中,使用拒絕採樣(rejection sampling)的目的是為了確保n(label數)可以大於2,而且文件的長度不等於0。同樣,也排除已經選過的類別。被同時標註為2種類別的檔案會以雙重顏色的圈圈表示。
為了進行可視化,先藉由PCA (Principal Component Analysis 主成分分析) 和CCA (Canonical Correlation Analysis 典型相關分析) 找到前兩個主要成分,將數據projecting(投影)到這兩個成分上,之後再執行分類。分類時使用sklearn.multiclass.OneVsRestClassifier這個metaclassifier(元分類器),以兩個帶有線性內核的SVC來學習每個類別的discriminative model(判別模型)。
import numpy as np
import matplotlib . pyplot as plt
from sklearn . datasets import make_multilabel_classification
from sklearn . multiclass import OneVsRestClassifier
from sklearn . svm import SVC
from sklearn . decomposition import PCA
from sklearn . cross_decomposition import CCA
def plot_hyperplane(clf, min_x, max_x, linestyle, label):
    """Draw the 2-D decision boundary of a fitted linear classifier.

    The boundary is the line ``w[0] * x + w[1] * y + b = 0`` where
    ``w = clf.coef_[0]`` and ``b = clf.intercept_[0]``.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``coef_`` and ``intercept_``. Only
        ``coef_[0][0]``, ``coef_[0][1]`` and ``intercept_[0]`` are read.
    min_x, max_x : float
        Extent of the data along the first axis; the line is drawn from
        ``min_x - 5`` to ``max_x + 5``.
    linestyle : str
        Format string forwarded to ``plt.plot`` (e.g. ``'k--'``).
    label : str
        Legend label for the line.
    """
    w = clf.coef_[0]
    intercept = clf.intercept_[0]

    # Extend past the data on both sides so the line is long enough to
    # cross the whole visible area.
    span = np.linspace(min_x - 5, max_x + 5)

    if w[1] == 0:
        if w[0] == 0:
            # Both weights are zero: there is no boundary line to draw.
            return
        # Vertical boundary: the slope form below would divide by zero, so
        # draw the line x = -b / w[0] directly over the same extent.
        plt.plot(np.full_like(span, -intercept / w[0]), span, linestyle,
                 label=label)
        return

    # Solve w[0] * x + w[1] * y + b = 0 for y.
    slope = -w[0] / w[1]
    yy = slope * span - intercept / w[1]
    plt.plot(span, yy, linestyle, label=label)
def plot_subfigure(X, Y, subplot, title, transform):
    """Project ``X`` to two components, fit one linear SVC per label, plot.

    Parameters
    ----------
    X : array
        Feature matrix handed to PCA or CCA.
    Y : array
        Label matrix; column 0 and column 1 are read as membership flags
        for "Class 1" and "Class 2".
    subplot : int
        Position of this panel inside the 2 x 2 subplot grid.
    title : str
        Panel title.
    transform : {"pca", "cca"}
        Which projection to apply before classification.

    Raises
    ------
    ValueError
        If ``transform`` is neither ``"pca"`` nor ``"cca"``.
    """
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError(
            f"transform must be 'pca' or 'cca', got {transform!r}"
        )

    min_x = np.min(X[:, 0])
    max_x = np.max(X[:, 0])

    min_y = np.min(X[:, 1])
    max_y = np.max(X[:, 1])

    classif = OneVsRestClassifier(SVC(kernel='linear'))
    classif.fit(X, Y)

    plt.subplot(2, 2, subplot)
    plt.title(title)

    # Boolean row masks, one per label column.
    in_class_1 = np.asarray(Y[:, 0], dtype=bool)
    in_class_2 = np.asarray(Y[:, 1], dtype=bool)

    # Every sample as a gray dot; class membership is drawn on top as
    # hollow rings of different size, so a sample carrying both labels
    # ends up surrounded by two rings.
    plt.scatter(X[:, 0], X[:, 1], s=40, c='gray', edgecolors=(0, 0, 0))
    plt.scatter(X[in_class_1, 0], X[in_class_1, 1], s=160, edgecolors='b',
                facecolors='none', linewidths=2, label='Class 1')
    plt.scatter(X[in_class_2, 0], X[in_class_2, 1], s=80,
                edgecolors='orange', facecolors='none', linewidths=2,
                label='Class 2')

    plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--',
                    'Boundary\nfor class 1')
    plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.',
                    'Boundary\nfor class 2')
    plt.xticks(())
    plt.yticks(())

    # Padding is half of the maximum on each axis.
    # NOTE(review): this presumably relies on the projected data being
    # centred around zero (max > 0) - confirm before reusing elsewhere.
    plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x)
    plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y)

    # Axis labels and the legend are drawn on the second panel only.
    if subplot == 2:
        plt.xlabel('First principal component')
        plt.ylabel('Second principal component')
        plt.legend(loc="upper left")
# Build the 2 x 2 comparison figure: one row per dataset (with / without
# unlabeled samples), one column per projection (CCA, PCA).
plt.figure(figsize=(8, 6))

X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
                                      allow_unlabeled=True,
                                      random_state=1)

plot_subfigure(X, Y, 1, "With unlabeled samples + CCA", "cca")
plot_subfigure(X, Y, 2, "With unlabeled samples + PCA", "pca")

# Same generator settings and seed; only allow_unlabeled differs.
X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
                                      allow_unlabeled=False,
                                      random_state=1)

plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca")
plot_subfigure(X, Y, 4, "Without unlabeled samples + PCA", "pca")

plt.subplots_adjust(left=.04, bottom=.02, right=.97, top=.94,
                    wspace=.09, hspace=.2)
plt.show()
print ( __doc__ )
import numpy as np
import matplotlib . pyplot as plt
from sklearn . datasets import make_multilabel_classification
from sklearn . multiclass import OneVsRestClassifier
from sklearn . svm import SVC
from sklearn . decomposition import PCA
from sklearn . cross_decomposition import CCA
def plot_hyperplane(clf, min_x, max_x, linestyle, label):
    """Draw the 2-D decision boundary of a fitted linear classifier.

    The boundary is the line ``w[0] * x + w[1] * y + b = 0`` where
    ``w = clf.coef_[0]`` and ``b = clf.intercept_[0]``.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``coef_`` and ``intercept_``. Only
        ``coef_[0][0]``, ``coef_[0][1]`` and ``intercept_[0]`` are read.
    min_x, max_x : float
        Extent of the data along the first axis; the line is drawn from
        ``min_x - 5`` to ``max_x + 5``.
    linestyle : str
        Format string forwarded to ``plt.plot`` (e.g. ``'k--'``).
    label : str
        Legend label for the line.
    """
    w = clf.coef_[0]
    intercept = clf.intercept_[0]

    # Extend past the data on both sides so the line is long enough to
    # cross the whole visible area.
    span = np.linspace(min_x - 5, max_x + 5)

    if w[1] == 0:
        if w[0] == 0:
            # Both weights are zero: there is no boundary line to draw.
            return
        # Vertical boundary: the slope form below would divide by zero, so
        # draw the line x = -b / w[0] directly over the same extent.
        plt.plot(np.full_like(span, -intercept / w[0]), span, linestyle,
                 label=label)
        return

    # Solve w[0] * x + w[1] * y + b = 0 for y.
    slope = -w[0] / w[1]
    yy = slope * span - intercept / w[1]
    plt.plot(span, yy, linestyle, label=label)
def plot_subfigure(X, Y, subplot, title, transform):
    """Project ``X`` to two components, fit one linear SVC per label, plot.

    Parameters
    ----------
    X : array
        Feature matrix handed to PCA or CCA.
    Y : array
        Label matrix; column 0 and column 1 are read as membership flags
        for "Class 1" and "Class 2".
    subplot : int
        Position of this panel inside the 2 x 2 subplot grid.
    title : str
        Panel title.
    transform : {"pca", "cca"}
        Which projection to apply before classification.

    Raises
    ------
    ValueError
        If ``transform`` is neither ``"pca"`` nor ``"cca"``.
    """
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError(
            f"transform must be 'pca' or 'cca', got {transform!r}"
        )

    min_x = np.min(X[:, 0])
    max_x = np.max(X[:, 0])

    min_y = np.min(X[:, 1])
    max_y = np.max(X[:, 1])

    classif = OneVsRestClassifier(SVC(kernel='linear'))
    classif.fit(X, Y)

    plt.subplot(2, 2, subplot)
    plt.title(title)

    # Boolean row masks, one per label column.
    in_class_1 = np.asarray(Y[:, 0], dtype=bool)
    in_class_2 = np.asarray(Y[:, 1], dtype=bool)

    # Every sample as a gray dot; class membership is drawn on top as
    # hollow rings of different size, so a sample carrying both labels
    # ends up surrounded by two rings.
    plt.scatter(X[:, 0], X[:, 1], s=40, c='gray', edgecolors=(0, 0, 0))
    plt.scatter(X[in_class_1, 0], X[in_class_1, 1], s=160, edgecolors='b',
                facecolors='none', linewidths=2, label='Class 1')
    plt.scatter(X[in_class_2, 0], X[in_class_2, 1], s=80,
                edgecolors='orange', facecolors='none', linewidths=2,
                label='Class 2')

    plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--',
                    'Boundary\nfor class 1')
    plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.',
                    'Boundary\nfor class 2')
    plt.xticks(())
    plt.yticks(())

    # Padding is half of the maximum on each axis.
    # NOTE(review): this presumably relies on the projected data being
    # centred around zero (max > 0) - confirm before reusing elsewhere.
    plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x)
    plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y)

    # Axis labels and the legend are drawn on the second panel only.
    if subplot == 2:
        plt.xlabel('First principal component')
        plt.ylabel('Second principal component')
        plt.legend(loc="upper left")
# Build the 2 x 2 comparison figure. Each entry pairs one dataset setting
# with the two panels (CCA then PCA) drawn from that dataset.
plt.figure(figsize=(8, 6))

panel_plan = (
    (True, ((1, "With unlabeled samples + CCA", "cca"),
            (2, "With unlabeled samples + PCA", "pca"))),
    (False, ((3, "Without unlabeled samples + CCA", "cca"),
             (4, "Without unlabeled samples + PCA", "pca"))),
)

for keep_unlabeled, panels in panel_plan:
    # Same generator settings and seed for both datasets; only
    # allow_unlabeled differs.
    X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=keep_unlabeled,
                                          random_state=1)
    for position, panel_title, projection in panels:
        plot_subfigure(X, Y, position, panel_title, projection)

plt.subplots_adjust(left=.04, bottom=.02, right=.97, top=.94,
                    wspace=.09, hspace=.2)
plt.show()