The following snippet is a reusable multicore repeated K-Fold cross-validation helper for
scikit-learn models. The return value is an array of cross validation scores of length
N_SPLITS * N_REPEATS. The
sklearn library already provides a simple interface for multicore cross validation through the
cross_val_score function, but it does not provide a facility for repeating the cross validation process. I typically like to repeat the cross validation process in order to analyze the distribution of scores.
An example implementation follows below.
"""Multicore repeated K-Fold cross validation for scikit-learn estimators."""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.base import clone
from joblib import Parallel, delayed

# Number of jobs to run in parallel.
# Set to the number of computer cores to use.
N_JOBS = 10
# Default cross-validation geometry: each of N_REPEATS passes yields N_SPLITS scores.
N_SPLITS = 5
N_REPEATS = 4


def _fit_and_score(classifier, X, y, w, train_index, test_index):
    """Fit ``classifier`` on one train split and return its weighted test score.

    Executed inside a joblib worker. ``classifier`` should already be a
    fresh clone so workers never share a fitted estimator.

    Parameters
    ----------
    classifier : estimator whose ``fit``/``score`` accept a sample-weight argument.
    X, y, w : pandas objects (indexed via ``.iloc``) — features, labels, weights.
    train_index, test_index : positional index arrays for this fold.
    """
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    w_train, w_test = w.iloc[train_index], w.iloc[test_index]
    classifier.fit(X_train, y_train, w_train)
    return classifier.score(X_test, y_test, w_test)


def repeated_k_fold(classifier, X, y, w,
                    n_splits=N_SPLITS, n_repeats=N_REPEATS,
                    n_jobs=N_JOBS, random_state=None):
    """Perform repeated k-fold cross validation on a classifier.

    Fitting is spread over multiple computer cores via joblib.

    Parameters
    ----------
    classifier : scikit-learn estimator; cloned once per fold, never mutated.
    X, y, w : pandas objects (must support ``.iloc``) — features, labels, weights.
    n_splits, n_repeats, n_jobs : optional overrides of the module defaults.
    random_state : forwarded to ``RepeatedKFold`` so splits are reproducible.

    Returns
    -------
    numpy.ndarray
        One score per fold, length ``n_splits * n_repeats``.
    """
    k_fold = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats,
                           random_state=random_state)
    parallel = Parallel(n_jobs=n_jobs)
    scores = parallel(
        # clone() gives every worker an unfitted, independent estimator.
        delayed(_fit_and_score)(clone(classifier), X, y, w,
                                train_index, test_index)
        for train_index, test_index in k_fold.split(X)
    )
    return np.array(scores)