Higher Accuracy using SimpleTransformers vs Transformers Library with BERT

I am working on a project for text classification using BERT

I am getting ca. 90% accuracy by using simple transformers. But I only get like 60% using my own training for-loop (not published here) or using the trainer module from the transformers library. Both are done with the default parameters of simple transformers.

I am really struggling to understand why there is such a difference in performance

Dataset is from Kaggle: https://www.kaggle.com/ankurzing/sentiment-analysis-for-financial-news

Imports:

from transformers import BertForSequenceClassification, AdamW, BertTokenizer, get_linear_schedule_with_warmup, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd
from pathlib import Path
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from torch.nn import functional as F
from collections import defaultdict
import random
from simpletransformers.classification import ClassificationModel

Data Pre-Processing:

#loading phrase bank dataset
phrase_bank_dataset = "all-data.csv"
phrase_bank_dataset_file = Path(phrase_bank_dataset)
file_loaded = False
while not file_loaded:
  if phrase_bank_dataset_file.exists():
    phrase_bank_dataset = pd.read_csv(phrase_bank_dataset, encoding='latin-1')
    phrase_bank_dataset = phrase_bank_dataset.values.tolist()
    file_loaded = True
    print("Dataset Loaded")
  else:
    print("File not Found")
#correcting the format of phrase bank dataset
phrase_dataset = pd.DataFrame(columns=["news", "sentiment"])
for ele in phrase_bank_dataset:
  news = ele[1]
  #converting sentiment text into numbers
  sentiment = 0 if ele[0] == 'negative' else 1 if ele[0] == 'neutral' else 2
  row = [news, sentiment]
  phrase_dataset.loc[len(phrase_dataset)] = row
print(phrase_dataset)

Simple Transformers Code:

model = ClassificationModel('bert', 'bert-base-cased', num_labels=3,use_cuda=True)
train,eva = train_test_split(labeled_dataset,test_size = 0.2)

train_df = pd.DataFrame({
    'text': train['news'],
    'label': train['sentiment']
})

eval_df = pd.DataFrame({
    'text': eva['news'],
    'label': eva['sentiment']
})

model.train_model(train_df)

result, model_outputs, wrong_predictions = model.eval_model(eval_df)

lst = []
for arr in model_outputs:
    lst.append(np.argmax(arr))
true = eval_df['label'].tolist()
predicted = lst
sklearn.metrics.accuracy_score(true,predicted)

Transformers Trainer Code:

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', stride = 0.8)
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels = 3)

if torch.cuda.is_available():
  print("\nUsing: ", torch.cuda.get_device_name(0))
  device = torch.device('cuda')
else:
  print("\nUsing: CPU")
  device = torch.device('cpu')
model = model.to(device)

#custom dataset class

class NewsSentimentDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

  def __getitem__(self, idx):
      item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
      item['labels'] = torch.tensor(self.labels[idx])
      return item

  def __len__(self):
      return len(self.labels)

#method for tokenizing dataset list

def tokenize_headlines(headlines, labels, tokenizer):

  encodings = tokenizer.batch_encode_plus(
      headlines,
      add_special_tokens = True,
      pad_to_max_length = True,
      return_attention_mask = True
  )

  dataset = NewsSentimentDataset(encodings, labels)
  return dataset

#splitting dataset into training and validation set
all_headlines = phrase_dataset['news'].tolist()
all_labels = phrase_dataset['sentiment'].tolist()

train_headlines, val_headlines, train_labels, val_labels = train_test_split(phrase_headlines, phrase_labels, test_size=.2)

val_dataset = tokenize_headlines(val_headlines, val_labels, tokenizer)
train_dataset = tokenize_headlines(train_headlines, val_labels, tokenizer)

#data loader
train_batch_size = 8
val_batch_size = 8

train_data_loader = DataLoader(train_dataset, batch_size = train_batch_size, sampler=RandomSampler(train_dataset))
val_data_loader = DataLoader(val_dataset, batch_size = val_batch_size, sampler=SequentialSampler(val_dataset))

#optimizer and scheduler
num_epochs = 1
num_steps = len(train_data_loader) * num_epochs
optimizer = AdamW(model.parameters(), lr=4e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=30, num_training_steps=num_steps)

#training and evaluation with trainer moduel from huggingfaces

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    weight_decay=0,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset  ,          # evaluation dataset
    compute_metrics=compute_metrics           
)

trainer.train()
trainer.evaluate()