The following snippet is a reusable, multicore, repeated K-Fold cross-validation routine for scikit-learn classifiers. The return value is an array of cross-validation scores of length N_SPLITS * N_REPEATS. The sklearn library already provides a simple interface for multicore cross validation through the cross_val_score function, but it does not make it convenient to repeat the cross-validation process or to carry a sample-weight series through both fitting and scoring. I typically like to repeat cross validation in order to analyze the distribution of scores rather than rely on a single estimate.
Check out this GitHub file for an example implementation.
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RepeatedKFold
    from sklearn.base import clone
    from joblib import Parallel, delayed

    # Number of jobs to run in parallel.
    # Set to the number of computer cores to use.
    N_JOBS = 10
    N_SPLITS = 5
    N_REPEATS = 4


    def _fit_and_score(classifier, X, y, w, train_index, test_index, i):
        # The function used by joblib to split, train, and score one fold.
        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]
        w_train = w.iloc[train_index]
        w_test = w.iloc[test_index]
        classifier.fit(X_train, y_train, w_train)
        score = classifier.score(X_test, y_test, w_test)
        return score


    def repeated_k_fold(classifier, X, y, w):
        # Perform repeated k-fold cross validation on a classifier,
        # spreading the fitting jobs over multiple computer cores.
        _k_fold = RepeatedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS)
        parallel = Parallel(n_jobs=N_JOBS)
        scores = parallel(
            delayed(_fit_and_score)(
                clone(classifier), X, y, w, train_index, test_index, i
            )
            for i, (train_index, test_index) in enumerate(_k_fold.split(X))
        )
        return np.array(scores)
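Here is a hypothetical usage sketch. It assumes X is a pandas DataFrame and y and w are pandas Series (the helper indexes with .iloc, so plain NumPy arrays will not work); the synthetic data and the uniform weights are placeholders, not part of the original snippet.

    import numpy as np
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    # Hypothetical usage: synthetic data and uniform weights, purely
    # illustrative. X, y, and w must be pandas objects because the
    # helper indexes them with .iloc.
    features, labels = make_classification(
        n_samples=1000, n_features=20, random_state=0
    )
    X = pd.DataFrame(features)
    y = pd.Series(labels)
    w = pd.Series(np.ones(len(y)))  # uniform sample weights

    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    scores = repeated_k_fold(clf, X, y, w)
    print(scores.shape)               # (20,) == N_SPLITS * N_REPEATS
    print(scores.mean(), scores.std())

Note that each fold fits a clone of the classifier, so fitted state never leaks between folds, and joblib can pickle each job out to its own worker process.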