Weighted SVM trained with stratified cross-validation (`cvpartition`) performs poorly on unseen test data

I have read that a weighted SVM is a classification approach for handling the class-imbalance problem. My data set is highly imbalanced: the rare event is the minority class (labeled 1) and the rest is the majority class (labeled 0). So I implemented a supervised weighted SVM with stratified cross-validation, since both techniques are supposed to handle class imbalance, and I added tuning of the C parameter (`BoxConstraint`). Training is done with 5-fold cross-validation. The method works well on the training set; I can tell from the confusion matrix after training:

cmMatrix =

        1443          27
           0          30
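
In essence, the whole approach boils down to inverse-class-frequency observation weights passed to fitcsvm inside a stratified cvpartition. A simplified sketch of just that core idea (X and targets here are stand-ins for my feature matrix and 0/1 labels):

w = zeros(size(targets));
w(targets == 0) = 1/sum(targets == 0);   % majority class gets small weights
w(targets == 1) = 1/sum(targets == 1);   % minority class gets large weights
cvp = cvpartition(targets, 'KFold', 5);  % stratified folds when given class labels
mdl = fitcsvm(X(cvp.training(1),:), targets(cvp.training(1)), ...
    'KernelFunction', 'RBF', 'KernelScale', 'auto', ...
    'Weights', w(cvp.training(1)));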

It is generally recommended to re-train using the optimized hyperparameters, so I re-trained the model on the entire dataset and then predicted on that same dataset.

PROBLEM: When I give the trained SVM a new, highly imbalanced, unseen data set (a test set the model has never touched), the predictions are completely biased towards the majority class, as shown below:

cmMatrix_TestData =

        98     2
         5     0

Even if I use the same training set for testing but shuffle its rows, I get the same poor performance. However, if I feed in the exact same training set in the same row order that was used to train the SVM, I get very good results.
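
Since predict works row by row, the row order should not matter at all. This is the kind of order-invariance check I would expect to pass (trainX is just a stand-in for the training feature matrix, i.e. the columns in bestSVM.FeaturesIdx):

perm = randperm(size(trainX, 1));
p1 = predict(finalSVM, trainX);           % original row order
p2 = predict(finalSVM, trainX(perm, :));  % shuffled row order
isequal(p1(perm), p2)                     % should print 1 (true)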

Where did I go wrong? Please help; at this point practically no method for the class-imbalance problem works for me, even though several articles and suggestions recommend exactly these two approaches. Here is the MATLAB code; it is only a toy example in which the data are generated randomly.

clear all
rng('default');
data1 =[];
data2 =[];
allData =[];
featSize=3;
% random simulation of some data
y = zeros(1,featSize);
s = 1:featSize;
t = 0.105;
a=1470; %number of examples of the majority class
b=30;   %number of examples of the minority (rare event) class
for i = 1:a
    x = randn(1,featSize);
    data1 = [data1; x];
end

for i = 1:b

    y = randn(1,featSize) + t.*s;

    data2 = [data2; y];
end

allData=[data1; data2];   

% label the data, gives 0 to Normal data and 1 to abnormal data
allData(:,featSize+1) = [zeros(1,a) ones(1,b)];
targets = allData(:,featSize+1); % these are the labels
RARE_DATA = allData(allData(:,end)==1,:);
NORMAL_DATA = allData(allData(:,end)==0,:);
aClass = size(NORMAL_DATA,1) ;%the size of  Normal class label 0
bClass = size(RARE_DATA,1);%the size of abnormal class label 1
Data  = [NORMAL_DATA;RARE_DATA] ;
%assign weights as the inverse of class frequency
Data(:,featSize+2) = [1/aClass*ones(1,aClass) 1/bClass*ones(1,bClass)];

weight = Data(:,featSize+2);

kFolds = 5;     % this is where you specify your number of folds
bestSVM = struct('SVMModel', NaN, ...     % this is to store the best SVM
    'C', NaN, 'FeaturesIdx', NaN, 'Score', Inf);

CVO = cvpartition(targets, 'KFold', kFolds, 'Stratify', true);
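% With 1470 vs 30 examples, stratification keeps roughly a 294:6 split in every
% fold, i.e. the same ~2% minority share as the full data set.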
% The main outer loop runs for as many folds as specified in kFolds and
% prepares a training set and a test set for each fold
for k = 1:CVO.NumTestSets
    trainIdx = CVO.training(k);
    testIdx = CVO.test(k);
    trainData=Data(trainIdx,1:featSize);
    trainTarg = Data(trainIdx,featSize+1 );
    weightTrain = weight(trainIdx);


    testTarg = Data(testIdx,featSize+1);
    weightTest = weight(testIdx);
    testData = Data(testIdx,1:featSize);
    size_training  =  size(trainTarg,1);
    size_testTarg =  size(testTarg,1);  
    disp(['TrainingSet Size: ',num2str(size_training ),   ' CV size: = ', num2str(size_testTarg) ])
    % forward feature selection starts, prepare for feature selection.
    bestFeatScore = inf;
    bestFeatCombo = struct('SVM', NaN, 'feat', NaN, 'C', NaN);
    for b = 1:(2^featSize) - 1
        % choose a feature subset, e.g. [1 0 0] selects only the first of the
        % three features (de2bi is from the Communications Toolbox)
        featCombo = find(de2bi(b, featSize));

        % this is the grid search for the BoxConstraint
        bestCScore = inf;
        bestC = NaN;
        gridC = 2.^(-5:2:15);
        for C = gridC
            % another 5-fold cross-validation for parameter selection, C

            kIdxC = cvpartition(trainTarg, 'KFold', kFolds, 'Stratify', true);
            L = zeros(1, kFolds);
            for kC = 1:kIdxC.NumTestSets
                trIdx = kIdxC.training(kC);
                teIdx = kIdxC.test(kC);
                trainDataC = trainData(trIdx, :);
                trainTargC = trainTarg(trIdx);
                testDataC = trainData(teIdx, :);
                testTargC = trainTarg(teIdx);
                anSVMModel = fitcsvm(trainDataC(:, featCombo), trainTargC, ...
                    'KernelFunction', 'RBF', 'KernelScale', 'auto', ...
                    'BoxConstraint', C, 'Weights', weightTrain(trIdx));
                L(kC) = loss(anSVMModel,testDataC(:, featCombo), testTargC);
            end
            L = mean(L);
            if L < bestCScore
                bestCScore = L;
                bestC = C;
            end
        end
        % we need to retrain here and save the SVM for the best C
        bestCSVM = fitcsvm(trainData(:, featCombo), trainTarg, ...
            'KernelFunction', 'RBF', 'KernelScale', 'auto', ...
            'BoxConstraint', bestC, 'Weights', weightTrain);
        bestCScore = loss(bestCSVM,testData(:, featCombo), testTarg);
        % saving the best SVM on feature selection
        % if the current SVM performs (scores) the same as the best so far,
        %but the current SVM has a smaller set of features that give the same performance,
        %I choose to replace the best so far with an SVM that uses less features.
        if (bestCScore < bestFeatScore) || ...
                ((bestCScore == bestFeatScore) && ...
                (length(featCombo) < length(bestFeatCombo.feat)))
            bestFeatScore = bestCScore;
            bestFeatCombo.SVM = bestCSVM;
            bestFeatCombo.feat = featCombo;
            bestFeatCombo.C = bestC;
        end
     end

    % saving the best SVM over all folds
    if bestFeatScore < bestSVM.Score
        bestSVM.SVMModel = bestFeatCombo.SVM;
        bestSVM.C = bestFeatCombo.C;
        bestSVM.FeaturesIdx = bestFeatCombo.feat;
        bestSVM.Score = bestFeatScore;
    end
end
% When a test set has been kept untouched during modelling for the final
% prediction, you re-train the tuned model on the whole training + validation
% data and then predict on the untouched test set.

finalSVM = fitcsvm(Data(:,bestSVM.FeaturesIdx), Data(:,featSize+1), ...
    'KernelFunction', 'RBF', 'KernelScale', 'auto', ...
    'BoxConstraint', bestSVM.C, 'Weights', weight);
final_predict_lb = predict(finalSVM, Data(:,bestSVM.FeaturesIdx));
actual_lbl = grp2idx(Data(:,featSize+1));
YPred = grp2idx(final_predict_lb);
cmMatrix = confusionmat(actual_lbl, YPred)



% TEST SET: simulate a new, unseen data set
a=100; % number of examples of normal
b=5; % number of examples of the rare (abnormal) event
data1 =[];
data2 =[];
for i = 1:a
    x = randn(1,featSize);
    data1 = [data1; x];
end

for i = 1:b

    y = randn(1,featSize) + t.*s;

    data2 = [data2; y];
end

TestData = [data1; data2];    % test data is created

% label the data, gives 0 to Normal data and 1 to abnormal data
TestData(:,featSize+1) = [zeros(1,a) ones(1,b)];
Test_targets = TestData(:,featSize+1);
ind = randperm(numel(Test_targets)); % shuffle the test rows before predicting


test_pred = predict(finalSVM, TestData(ind, bestSVM.FeaturesIdx));
test_lbl = grp2idx(Test_targets(ind));
YTest = grp2idx(test_pred);
cmMatrix_TestData = confusionmat(test_lbl, YTest)