cross_val_score in classification tasks
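This excerpt uses variables (x_balanced, y_balanced, x_unbalanced, y_unbalanced, y_hat, cv) and pylab-style plotting that are set up earlier in the notebook. A minimal sketch of that setup, assuming synthetic data (the sample sizes and class proportions are guesses, so the numeric outputs below will not reproduce exactly):

from pylab import *  # enables the bare plot()/hist()/xlabel() calls used below
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.metrics import (accuracy_score, precision_recall_curve, f1_score,
                             roc_auc_score, roc_curve, log_loss)

# Balanced dataset: the classes are roughly 50/50.
x_balanced, y_balanced = make_classification(n_samples=2000, weights=[0.5, 0.5], random_state=0)
# Unbalanced dataset: the positive class makes up only a few percent.
x_unbalanced, y_unbalanced = make_classification(n_samples=2000, weights=[0.975, 0.025], random_state=0)

cv = StratifiedKFold(10)
# Out-of-fold predicted probabilities of the positive class on the balanced data.
y_hat = cross_val_predict(LogisticRegression(), x_balanced, y_balanced, cv=cv, method='predict_proba')[:, 1]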
print(accuracy_score(y_balanced, np.zeros_like(y_balanced)))
print(accuracy_score(y_unbalanced, np.zeros_like(y_unbalanced)))
0.480499719708
0.974691358025

The constant all-zeros "classifier" is right only about half the time on the balanced data, yet scores above 97% accuracy on the unbalanced data: with skewed classes, accuracy alone says almost nothing about quality.
# precision_recall_curve returns one more precision/recall value than thresholds,
# so prepend a zero threshold to align the arrays for plotting.
precision, recall, thresholds = precision_recall_curve(y_balanced, y_hat)
thresholds = [0] + thresholds.tolist()
plot(thresholds, precision, label='precision');
plot(thresholds, recall, label='recall');
legend(); xlabel('Threshold'); ylabel('Precision or recall');
title('Precision and recall for balanced dataset');
# Sweep the decision threshold and compute the F-score at each value.
thresholds = np.linspace(0, 1, 100)
scores = [f1_score(y_balanced, (y_hat > t).astype(int)) for t in thresholds]
plot(thresholds, scores, label='F-score');
scatter(thresholds[np.argmax(scores)], np.max(scores), label='Maximum');
legend(); xlabel('Threshold'); ylabel('F-score');
title('F-score for balanced dataset');
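The same optimum can be read straight off the precision_recall_curve output instead of scanning a grid; a quick sketch (the 1e-12 term guarding against 0/0 is my addition, not part of the original code):

p, r, t = precision_recall_curve(y_balanced, y_hat)
f1 = 2 * p * r / (p + r + 1e-12)  # elementwise F-score for every threshold
print(t[np.argmax(f1[:-1])])      # the last p/r pair has no threshold, so drop it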
roc_auc_score(y_balanced, y_hat)
0.70524418709213299
print(roc_auc_score(y_balanced, y_hat))
print(roc_auc_score(y_balanced, 100 * y_hat + 5))
print(roc_auc_score(y_balanced, y_hat ** 2))
0.705244187092
0.705244187092
0.705244187092

ROC AUC depends only on how the predictions order the objects, so any monotone transformation of the scores (here scaling, shifting and squaring) leaves it unchanged.
# Distribution of ROC AUC when the "classifier" outputs random scores.
hist([roc_auc_score(y_balanced, np.random.random(size=y_balanced.shape[0]))
      for i in range(1000)], bins=25);
xlabel('AUC');
title('Distribution of AUC for random classifier');
fpr, tpr, _ = roc_curve(y_balanced, y_hat)
plot(fpr, tpr, lw=2);
plot([0, 1], [0, 1], linestyle='--');
xlim([0.0, 1.0]); ylim([0.0, 1.05]);
xlabel('False Positive Rate'); ylabel('True Positive Rate');
title('ROC for balanced dataset');
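The "area under the curve" is literal: numerically integrating the ROC points reproduces roc_auc_score, which is a handy sanity check (not part of the original notebook):

from sklearn.metrics import auc
print(auc(fpr, tpr))       # matches roc_auc_score(y_balanced, y_hat)
print(np.trapz(tpr, fpr))  # same value via trapezoidal integration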
print(np.mean(cross_val_score(LogisticRegression(), x_unbalanced, y_unbalanced, cv=StratifiedKFold(10))))
print(roc_auc_score(y_unbalanced, np.zeros_like(y_unbalanced)))
0.9771648334
0.5

Cross-validated accuracy barely beats the constant classifier (0.9747 above), while the constant classifier's ROC AUC is exactly 0.5: unlike accuracy, AUC exposes the absence of any ranking power.
# eps=0 disables probability clipping (note: the eps argument has been deprecated/removed in newer scikit-learn versions)
log_loss(y_balanced, y_hat, eps=0)
0.62378626588930719
# Make a single prediction confidently wrong: one probability of exactly 0 or 1 on the wrong side makes the unclipped log-loss infinite.
y_hat[0] = 1 - y_balanced[0]
log_loss(y_balanced, y_hat, eps=0)
inf
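In practice predicted probabilities are clipped away from 0 and 1 before scoring; the 1e-15 bound below mirrors scikit-learn's historical default and is only an illustrative choice:

print(log_loss(y_balanced, np.clip(y_hat, 1e-15, 1 - 1e-15)))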
np.mean(y_unbalanced)
0.025308641975308643

Only about 2.5% of the objects belong to the positive class.
print(np.mean(cross_val_score(LogisticRegression(), x_unbalanced, y_unbalanced, cv=cv, scoring='accuracy')))
print(np.mean(cross_val_score(LogisticRegression(), x_unbalanced, y_unbalanced, cv=cv, scoring='roc_auc')))
0.981562067774
0.993632917551
np.mean(cross_val_score(LogisticRegression(), x_unbalanced, y_unbalanced, cv=cv, scoring='f1'))
0.63985523564465452
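The next plots rely on alphas, scores and a get_score helper that are defined elsewhere in the notebook; a plausible reconstruction, assuming get_score is the F-score at probability threshold alpha (the helper name and grid are guesses matching the later usage):

def get_score(alpha, y, y_hat):
    # F-score when objects with predicted probability above alpha are labelled positive
    return f1_score(y, (y_hat > alpha).astype(int))

y_hat = cross_val_predict(LogisticRegression(), x_unbalanced, y_unbalanced, cv=cv, method='predict_proba')[:, 1]
alphas = np.linspace(0.01, 0.99, 99)
scores = [get_score(a, y_unbalanced, y_hat) for a in alphas]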
plot(alphas, scores, label='F-score');
scatter(0.5, get_score(0.5, y_unbalanced, y_hat), label='0.5');
scatter(alphas[np.argmax(scores)], np.max(scores), label='optimal');
xlabel('alpha'); ylabel('F-score'); legend();
Many classifiers in scikit-learn have a class_weight parameter, which accepts either a dict {label: weight} or the string "balanced" (weights inversely proportional to class frequencies).
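The dict form makes the cost of errors explicit, e.g. penalising mistakes on the rare class 40 times more heavily (the exact ratio and the name clf_weighted are illustrative assumptions, not from the notebook):

clf_weighted = LogisticRegression(class_weight={0: 1, 1: 40})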
clf = LogisticRegression(class_weight='balanced')
y_hat = cross_val_predict(clf, x_unbalanced, y_unbalanced, cv=cv, method='predict_proba')[:, 1]
plot(alphas, scores, label='F-score');
scatter(0.5, get_score(0.5, y_unbalanced, y_hat), label='0.5');
scatter(alphas[np.argmax(scores)], np.max(scores), label='optimal');
xlabel('alpha'); ylabel('F-score'); legend();
Reducing the size of the dominant class (undersampling; see the sketch right after this list)
Increasing the size of the minority class (oversampling)
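For undersampling, imbalanced-learn provides RandomUnderSampler; a minimal sketch, assuming the same data and cv as above (the class is real, but this particular usage is not from the original notebook):

from imblearn import under_sampling, pipeline

clf_under = pipeline.make_pipeline(under_sampling.RandomUnderSampler(), LogisticRegression())
y_hat_under = cross_val_predict(clf_under, x_unbalanced, y_unbalanced, cv=cv, method='predict_proba')[:, 1]

For oversampling, the notebook uses SMOTE, which synthesises new minority-class points by interpolating between nearest neighbours: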
# scikit-learn's own Pipeline cannot hold resampling steps, hence imbalanced-learn's pipeline
from imblearn import over_sampling, pipeline
clf = pipeline.make_pipeline(over_sampling.SMOTE(), LogisticRegression())
y_hat = cross_val_predict(clf, x_unbalanced, y_unbalanced, cv=cv, method='predict_proba')[:, 1]
plot(alphas, scores, label='F-score');
scatter(0.5, get_score(0.5, y_unbalanced, y_hat), label='0.5');
scatter(alphas[np.argmax(scores)], np.max(scores), label='optimal');
xlabel('alpha'); ylabel('F-score'); legend();
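To see what SMOTE does to the class balance, the sampler can be applied directly, outside the pipeline (a sketch; fit_resample is the current imbalanced-learn API):

x_resampled, y_resampled = over_sampling.SMOTE().fit_resample(x_unbalanced, y_unbalanced)
print(np.mean(y_unbalanced), np.mean(y_resampled))  # the positive share rises from ~0.025 to 0.5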