"""Compare random-forest scores on complete, filtered, and imputed data.

The script loads the Boston housing data and reports the mean
cross-validation score of a ``RandomForestRegressor`` in three settings:

1. the full dataset with no missing values,
2. only the samples that were not selected to receive a missing value,
3. the full dataset where one feature per selected sample is blanked out
   (set to 0) and then filled in by a mean imputer inside a pipeline.
"""
import numpy as np

# NOTE(review): these import paths match older scikit-learn releases. Newer
# releases expose ``cross_val_score`` from ``sklearn.model_selection``,
# replace ``Imputer`` with ``sklearn.impute.SimpleImputer`` and no longer
# ship ``load_boston`` -- confirm the scikit-learn version this targets.
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

rng = np.random.RandomState(0)

dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
n_samples, n_features = X_full.shape[0], X_full.shape[1]

# Estimate the score on the entire dataset, with no missing values
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)

# Add missing values in 75% of the lines
missing_rate = 0.75
# np.floor returns a float; array sizes must be integers, so convert once.
n_missing_samples = int(np.floor(n_samples * missing_rate))
# Boolean mask with exactly n_missing_samples True entries, shuffled so the
# affected rows are spread over the dataset.
missing_samples = np.hstack((
    np.zeros(n_samples - n_missing_samples, dtype=bool),
    np.ones(n_missing_samples, dtype=bool),
))
rng.shuffle(missing_samples)
# One column index per affected row: the feature that will be blanked out.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Estimate the score without the lines containing missing values
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)

# Estimate the score after imputation of the missing values
X_missing = X_full.copy()
# 0 is used as the "missing" marker; the imputer below is configured with
# missing_values=0, so any entry that is genuinely 0 is imputed as well.
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
estimator = Pipeline([
    ("imputer", Imputer(missing_values=0, strategy="mean", axis=0)),
    ("forest", RandomForestRegressor(random_state=0, n_estimators=100)),
])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)
results:
Score with the entire dataset = 0.56
Score without the samples containing missing values = 0.48
Score after imputation of the missing values = 0.55