# Build a deliberately "broken" copy of the feature matrix. np.array() copies
# its input by default, so the in-place division below does not modify x.
x_broken = np.array(x)
# Divide every second column (indices 0, 2, 4, ...) by 10 so that the
# features no longer share a common scale.
x_broken[:, ::2] /= 10
estimator = KNeighborsClassifier()

# Cross-validated accuracy on the untouched features.
original_scores = cross_val_score(estimator, x, y, scoring='accuracy')
print("The original average accuracy for is {0:.1f}%".format(np.mean(original_scores) * 100))

# Cross-validated accuracy on the rescaled ("broken") features.
broken_scores = cross_val_score(estimator, x_broken, y, scoring='accuracy')
print("The broken average accuracy for is {0:.1f}%".format(np.mean(broken_scores) * 100))
The original average accuracy for is 82.3%
The broken average accuracy for is 71.5%
我们发现在破坏后的数据集中算法的准确率显著下降。
因此我们需要对特征值进行规范化，避免特征值范围差异过大导致算法准确率下降。
from sklearn.preprocessing import MinMaxScaler
# Fit a min-max scaler on x and rescale x with it in a single step.
x_transformed = MinMaxScaler().fit_transform(
    x,
)
3 组装
# Rescale the "broken" features with a min-max scaler so all columns are
# brought back onto a comparable range before classification.
x_transformed = MinMaxScaler().fit_transform(x_broken)
estimator = KNeighborsClassifier()

# Cross-validated accuracy on the rescaled features.
transformed_scores = cross_val_score(estimator, x_transformed, y, scoring='accuracy')
print("The average accuracy for is {0:.1f}%".format(np.mean(transformed_scores) * 100))
from sklearn.pipeline import Pipeline
# Two-step pipeline: min-max scaling ('scale') followed by a k-nearest
# neighbours classifier ('predict').
scaling_pipeline = Pipeline(
    steps=[
        ('scale', MinMaxScaler()),
        ('predict', KNeighborsClassifier()),
    ],
)
使用流水线:
# Cross-validate the whole pipeline on the "broken" features; the scaler is
# one of the pipeline steps, so no separate pre-scaling call is needed here.
scores = cross_val_score(scaling_pipeline, x_broken, y, scoring='accuracy')
# Report the pipeline's own scores (the original printed transformed_scores,
# i.e. the result of the earlier manual-scaling experiment).
print("The pipeline scored an average accuracy for is {0:.1f}%".format(np.mean(scores) * 100))
The pipeline scored an average accuracy for is 82.3%