# How to do stratified sampling so that SVM can work well on training set for skewed data

After doing some studies (https://datascience.stackexchange.com/questions/1107/quick-guide-into-training-highly-imbalanced-data-sets), I found out that weighted SVM is a classification approach for handling the class-imbalance problem. My data set has two classes - a rare event (minority class, labeled 1) and the remaining majority class (labeled 0). So I implemented the supervised weighted SVM classification technique. The weights are given as the inverse of the class frequency. I added additional tuning for the C parameter (boxconstraint). The training is done using a 5-fold cross-validation approach where I have ensured that there is at least one representative of the minority class. The method works well on the training set, and I got good performance after the training. I can see this by looking at the confusion matrix for the training data:

```
cmMatrix =
1442 28
0 30
```

**PROBLEM**: If I give a highly imbalanced, unseen new data set (this set is never used by the model and is the test set) to the trained SVM model, the prediction on this data is totally biased towards the majority class! The code works for prediction on the training set but not on the test set. Even if I use the same training set for testing but with the rows shuffled, I get poor performance. However, if I use the exact same training set in the same order as used to train the SVM, I get very good results.

The confusion matrix for the test data prediction is

```
cmMatrix_TestData =
97 3
5 0
```

I fail to understand why the weighted SVM cannot predict correctly on the new data set. Then what is the point of using the weighted SVM?

I don't know the concept of stratified sampling, and I remember reading somewhere that we need to split the data using this approach. Maybe during the hyperparameter-optimization stage there is no example belonging to the minority class, even though I have tried to ensure that there is. I don't know what the problem is in my implementation. I would really appreciate it if somebody could help with the implementation and point out where I have gone wrong. Here is the full code: it is a toy example only, where the data is generated randomly.

It can be clearly seen that the SVM performs poorly on the test data with no examples classified as rare (minority class).

```
% ---- Simulate a skewed two-class training set ---------------------------
% Majority class (label 0): standard-normal features.
% Minority class (label 1): standard-normal features shifted by t.*s, so
% the classes are separable in expectation.
clear all
rng('default');
featSize = 3;     % number of features per example
s = 1:featSize;   % per-feature shift direction for the rare class
t = 0.105;        % shift magnitude for the rare class
a = 1470;         % number of majority-class examples (label 0)
b = 30;           % number of minority (rare event) examples (label 1)
% Vectorized generation replaces the original grow-in-a-loop pattern,
% which reallocated data1/data2 on every iteration. NOTE: this consumes
% the RNG stream in a different order than the per-row loop, so the exact
% values differ from the original run, but the distribution is identical.
data1 = randn(a, featSize);                        % majority class
data2 = randn(b, featSize) + repmat(t.*s, b, 1);   % minority class, shifted
allData = [data1; data2];
% Column featSize+1 holds the class label: 0 = normal, 1 = rare event.
allData(:,featSize+1) = [zeros(1,a) ones(1,b)];
targets = allData(:,featSize+1); % these are the labels
RARE_DATA   = allData(allData(:,end)==1,:);
NORMAL_DATA = allData(allData(:,end)==0,:);
aClass = size(NORMAL_DATA,1); % size of the normal class (label 0)
bClass = size(RARE_DATA,1);   % size of the rare class (label 1)
% Data is ordered: rows 1..aClass are normal, rows aClass+1..end are rare.
Data = [NORMAL_DATA; RARE_DATA];
% Column featSize+2 holds the per-example weight: inverse class frequency,
% so each class contributes equally to the weighted SVM objective.
Data(:,featSize+2) = [1/aClass*ones(1,aClass) 1/bClass*ones(1,bClass)];
weight = Data(:,featSize+2);
kFolds = 5; % number of outer folds
% Best model found across all folds / feature subsets / C values.
bestSVM = struct('SVMModel', NaN, ...
    'C', NaN, 'FeaturesIdx', NaN, 'Score', Inf);
% Outer loop: each iteration draws a fresh 90/10 train/validation split,
% runs feature selection plus a grid search for C on the training part,
% and scores the winner on the held-out 10%.
for k = 1:kFolds
    r = randperm(numel(targets));
    tot = floor(numel(targets)*0.9); % 90% of data for training, 10% for validation
    trainIdx = r(1:tot);
    trainData = Data(trainIdx, 1:featSize);
    trainTarg = Data(trainIdx, featSize+1);
    weightTrain = weight(trainIdx);
    % BUG FIX: the original used r(tot+3:end), silently dropping the first
    % two hold-out examples; tot+1 keeps the full 10% validation split.
    testIdx = r(tot+1:end);
    % Force at least two minority-class examples into the validation set so
    % it is never empty of rare events. Rare rows occupy indices
    % aClass+1 .. aClass+bClass in Data (see how Data was assembled above).
    rarray = randperm(bClass) + aClass;
    testIdx = [testIdx rarray(1) rarray(2)];
    % BUG FIX: read the labels from Data instead of appending hard-coded
    % zeros. The two forced rare examples carry label 1; the original
    % mislabeled them as 0, corrupting every validation score used to pick
    % the model.
    testTarg = Data(testIdx, featSize+1);
    testData = Data(testIdx, 1:featSize);
    size_trainingData = 100*(length(trainData)/size(Data,1));
    size_testTarg = 100*(length(testTarg)/size(Data,1));
    disp(['TrainingSet Size: ',num2str(size_trainingData), '%;', ' CV size: = ', num2str(size_testTarg), '%'])
    % Forward feature selection: try every non-empty subset of features.
    bestFeatScore = inf;
    bestFeatCombo = struct('SVM', NaN, 'feat', NaN, 'C', NaN);
    % NOTE: loop variable renamed from b -- the original reused b and
    % thereby clobbered the minority-class count defined earlier.
    for fBits = 1:(2^featSize) - 1
        % Decode the bitmask into feature indices, e.g. [1 0 0] selects
        % the first of three features.
        featCombo = find(de2bi(fBits, featSize));
        % Grid search for the BoxConstraint (C).
        bestCScore = inf;
        bestC = NaN;
        gridC = 2.^(-5:2:15);
        for C = gridC
            % Inner 5-fold cross-validation scores this candidate C.
            kIdxC = crossvalind('Kfold', length(trainTarg), kFolds);
            L = zeros(1, kFolds);
            for kC = 1:kFolds
                trainDataC = trainData(kIdxC~=kC, :);
                trainTargC = trainTarg(kIdxC~=kC);
                testDataC = trainData(kIdxC==kC, :);
                testTargC = trainTarg(kIdxC==kC);
                anSVMModel = fitcsvm(trainDataC(:, featCombo), trainTargC, ...
                    'KernelFunction', 'RBF', 'KernelScale', 'auto', ...
                    'BoxConstraint', C, 'Weight', weightTrain(kIdxC~=kC));
                L(kC) = loss(anSVMModel, testDataC(:, featCombo), testTargC);
            end
            L = mean(L);
            if L < bestCScore
                bestCScore = L;
                bestC = C;
            end
        end
        % Retrain on the whole training split with the winning C and score
        % it on the held-out validation split.
        bestCSVM = fitcsvm(trainData(:, featCombo), trainTarg, ...
            'KernelFunction', 'RBF', 'KernelScale', 'auto', ...
            'BoxConstraint', bestC, 'Weight', weightTrain);
        bestCScore = loss(bestCSVM, testData(:, featCombo), testTarg);
        % Keep the best feature subset; on an exact score tie prefer the
        % SVM that uses fewer features.
        if (bestCScore < bestFeatScore) || ...
                ((bestCScore == bestFeatScore) && ...
                (length(featCombo) < length(bestFeatCombo.feat)))
            bestFeatScore = bestCScore;
            bestFeatCombo.SVM = bestCSVM;
            bestFeatCombo.feat = featCombo;
            bestFeatCombo.C = bestC;
        end
    end
    % Keep the best model found across all outer folds.
    if bestFeatScore < bestSVM.Score
        bestSVM.SVMModel = bestFeatCombo.SVM;
        bestSVM.C = bestFeatCombo.C;
        bestSVM.FeaturesIdx = bestFeatCombo.feat;
        bestSVM.Score = bestFeatScore;
    end
end
% So, when you have a test set that has been untouched during modelling and
% kept aside for the final prediction, you use the training and validation set as a
% whole and run the tuned model on this dataset.
% Then predict on the untouched test data set
% Refit with the tuned settings (best feature subset, best C) on ALL of
% Data, keeping the inverse-class-frequency weights.
finalSVM = fitcsvm(Data(:,bestSVM.FeaturesIdx),Data(:,featSize+1), ...
'KernelFunction', 'RBF', 'KernelScale', 'auto', ...
'BoxConstraint', bestSVM.C,'Weight',weight );
% NOTE(review): the confusion matrix below is computed on the SAME rows the
% model was just fit on, so it is a resubstitution (training-set) estimate
% and is optimistic -- it says little about generalization to unseen data.
final_predict_lb = predict(finalSVM,Data(:,bestSVM.FeaturesIdx));
% grp2idx maps the labels {0,1} to group indices {1,2} for confusionmat.
actual_lbl=grp2idx(Data(:,featSize+1));
YPred = grp2idx(final_predict_lb);
% No trailing semicolon: the matrix is intentionally echoed to the console.
[cmMatrix]= confusionmat(actual_lbl,YPred )
%TEST SET simulate new set
% Build a fresh, similarly skewed data set that the tuned model never saw,
% using the same generating process as the training data.
a = 100; % majority-class (normal) examples
b = 5;   % minority-class (rare event) examples
data1 = [];
data2 = [];
for i = 1:a
    data1 = [data1; randn(1,featSize)];        % normal: standard Gaussian
end
for i = 1:b
    data2 = [data2; randn(1,featSize) + t.*s]; % rare: shifted Gaussian
end
TestData = [data1; data2];
% Label column: 0 for the normal rows, 1 for the rare rows.
TestData(:,featSize+1) = [zeros(1,a) ones(1,b)];
Test_targets = TestData(:,featSize+1);
% Score the unseen set with the tuned model; grp2idx maps labels {0,1}
% to group indices {1,2} for confusionmat. The final line has no
% semicolon so the matrix is echoed to the console.
test_pred = predict(finalSVM, TestData(:, bestSVM.FeaturesIdx));
test_lbl = grp2idx(Test_targets);
YTest = grp2idx(test_pred);
[cmMatrix_TestData] = confusionmat(test_lbl, YTest)
```