%matplotlib inlinefrom__future__import divisionimport numpy as npimport matplotlib.pyplot as pltfrom sklearn.datasets import make_blobsfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysisn_train =20# samples for trainingn_test =200# samples for testingn_averages =50# how often to repeat classificationn_features_max =75# maximum number of featuresstep =4# step size for the calculationdefgenerate_data(n_samples,n_features): X, y =make_blobs(n_samples=n_samples, n_features=1, centers=[[-2], [2]])# add non-discriminative featuresif n_features >1: X = np.hstack([X, np.random.randn(n_samples, n_features -1)])return X, y
# Draw a tiny sample and display it as a table: only X0 should track y.
X, y = generate_data(10, 5)

import pandas as pd

# Use the fully-qualified option name: the bare 'precision' alias was
# deprecated in pandas 1.0 and removed in pandas 2.0.
pd.set_option('display.precision', 2)

df = pd.DataFrame(np.hstack([y.reshape(10, 1), X]))
# BUG FIX: the original labelled two columns 'X2' and skipped 'X3',
# giving the 6-column frame a duplicate column name.
df.columns = ['y', 'X0', 'X1', 'X2', 'X3', 'X4']
print(df)
結果顯示如下。我們可以看到只有 X 的第一行特徵資料(X0)與目標數值 y 有明確的對應關係:當 y 為 1 時,X0 的數值較大。
from__future__import divisionimport numpy as npimport matplotlib.pyplot as pltfrom sklearn.datasets import make_blobsfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysisn_train =20# samples for trainingn_test =200# samples for testingn_averages =50# how often to repeat classificationn_features_max =75# maximum number of featuresstep =4# step size for the calculationdefgenerate_data(n_samples,n_features):"""Generate random blob-ish data with noisy features. This returns an array of input data with shape `(n_samples, n_features)` and an array of `n_samples` target labels. Only one feature contains discriminative information, the other features contain only noise. """ X, y =make_blobs(n_samples=n_samples, n_features=1, centers=[[-2], [2]])# add non-discriminative featuresif n_features >1: X = np.hstack([X, np.random.randn(n_samples, n_features -1)])return X, yacc_clf1, acc_clf2 = [], []n_features_range =range(1, n_features_max +1, step)for n_features in n_features_range: score_clf1, score_clf2 =0,0for _ inrange(n_averages): X, y =generate_data(n_train, n_features) clf1 =LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto').fit(X, y) clf2 =LinearDiscriminantAnalysis(solver='lsqr', shrinkage=None).fit(X, y) X, y =generate_data(n_test, n_features) score_clf1 += clf1.score(X, y) score_clf2 += clf2.score(X, y) acc_clf1.append(score_clf1 / n_averages) acc_clf2.append(score_clf2 / n_averages)features_samples_ratio = np.array(n_features_range)/ n_trainplt.plot(features_samples_ratio, acc_clf1, linewidth=2, label="Linear Discriminant Analysis with shrinkage", color='r')plt.plot(features_samples_ratio, acc_clf2, linewidth=2, label="Linear Discriminant Analysis", color='g')plt.xlabel('n_features / n_samples')plt.ylabel('Classification accuracy')plt.legend(loc=1, prop={'size': 12})plt.suptitle('Linear Discriminant Analysis vs. \shrinkage Linear Discriminant Analysis (1 discriminative feature)')plt.show()