
This post is simply some practice code for fitting SVM and Random Forest classifiers, with parameter selection via cross-validation, on three made-up datasets. Everything is done with the scikit-learn library in Python.
Code is adapted from this webpage.
It looks a bit long, but that’s mainly due to the complexity of plotting in matplotlib.pyplot.
First, import the necessary functions. (I’ve commented out certain lines from the original webpage that are not used here.)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
#from sklearn.neural_network import MLPClassifier
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
#from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from numpy.random import PCG64
Make up the example datasets.
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.default_rng(PCG64(12345))
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable]
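Each dataset is a (features, labels) tuple. As a quick sanity check (my addition, not in the original), the shapes can be confirmed like so:

for X_ds, y_ds in datasets:
    # all three generators default to 100 samples in 2 dimensions
    print(X_ds.shape, y_ds.shape)  # e.g. (100, 2) (100,)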
Initialize the classifiers and specify the parameter grids to search over.
names = ['Linear SVM', 'RBF SVM', 'RF']
# classifiers
clf_svc_linear = SVC(kernel='linear')
clf_svc_rbf = SVC(kernel='rbf')
clf_rf = RandomForestClassifier(max_features=1)
classifiers = [clf_svc_linear, clf_svc_rbf, clf_rf]
# search grids
svc_linear_grid = {'C': [0.1, 1, 10]}
svc_rbf_grid = {'gamma': [0.1, 1, 10]}
rf_grid = {'max_depth': [3, 4, 5], 'n_estimators': [5, 10, 20]}
grids = [svc_linear_grid, svc_rbf_grid, rf_grid]
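For reference, here is a minimal standalone sketch (not part of the main script; the variable names X_demo, y_demo, and search are just illustrative) of what GridSearchCV does: it fits every parameter combination with k-fold cross-validation and exposes the winner through best_params_.

X_demo, y_demo = datasets[0]  # the moons dataset
search = GridSearchCV(SVC(kernel='rbf'), svc_rbf_grid, cv=5)
search.fit(X_demo, y_demo)
print(search.best_params_)  # the selected gamma, e.g. {'gamma': 1}
print(search.best_score_)   # mean CV accuracy of the best parameters

After fitting, the GridSearchCV object itself predicts and scores using the best parameters found, which is what the main loop below relies on.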
Go over the datasets, do CV selection, and predict with the best parameters selected.
The accuracy score on the test set is shown in the lower-right corner of each plot.
n_classifiers = len(classifiers)
figure = plt.figure(figsize=(n_classifiers * 3, 9))
h = 0.1  # step size for the mesh grid (for plotting)
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.3, random_state=42)

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # just plot the dataset first
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
               edgecolors='k')
    # Plot the testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,
               edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1

    # iterate over classifiers
    for j, clf in enumerate(classifiers):
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        # do CV parameter selection
        clf_cv = GridSearchCV(clf, grids[j])
        clf_cv.fit(X_train, y_train)
        # get the prediction score on the test set (using best parameters)
        score = clf_cv.score(X_test, y_test)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf_cv, "decision_function"):
            Z = clf_cv.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf_cv.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   edgecolors='k', alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        if ds_cnt == 0:
            ax.set_title(names[j])
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1

plt.tight_layout()
plt.show()
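A side note: RandomizedSearchCV is imported above but never used. As a sketch (my swap-in, not from the original webpage), the random forest’s grid search could be replaced by a randomized search that samples parameter combinations from distributions, which scales better as the grid grows. The distribution dict rf_dist below is hypothetical; scipy.stats.randint gives a discrete uniform on [low, high).

from scipy.stats import randint

# sample 10 random (max_depth, n_estimators) combinations
# instead of exhaustively trying all 9 grid points
rf_dist = {'max_depth': randint(3, 6), 'n_estimators': randint(5, 21)}
clf_rf_cv = RandomizedSearchCV(RandomForestClassifier(max_features=1),
                               rf_dist, n_iter=10, random_state=0)

It is fit and scored exactly like the GridSearchCV objects above, i.e. clf_rf_cv.fit(X_train, y_train) followed by clf_rf_cv.score(X_test, y_test).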