Python: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same

I'm developing a speech-to-text model. However, while training, I get the following error:

***** Running training *****
  Num examples = 531
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 134
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-72-9f65de613925> in <module>()
     12 
     13 # with torch.nn.DataParallel():
---> 14 trainer.train()

13 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/conv.py in _conv_forward(self, input, weight, bias)
    297                             _single(0), self.dilation, self.groups)
    298         return F.conv1d(input, weight, bias, self.stride,
--> 299                         self.padding, self.dilation, self.groups)
    300 
    301     def forward(self, input: Tensor) -> Tensor:

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

I understand it is complaining that the input is on the CPU while the model is on the GPU. Can someone please help me fix this? I'm not sure where to move the input values onto CUDA in this setup.
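To check my understanding of the error itself, I reproduced it with a bare Conv1d (the same op the traceback ends in). This standalone sketch is just my mental model of `.to(device)`, not part of my training code:

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

conv = nn.Conv1d(in_channels=1, out_channels=4, kernel_size=3).to(device)  # weights on the GPU
x = torch.randn(8, 1, 100)   # a plain CPU FloatTensor, like my input_values
# conv(x) raises the same RuntimeError when device is cuda
out = conv(x.to(device))     # moving the input to the weights' device fixes it

So `.to(device)` is clearly the mechanism; what I can't see is where to apply it when the batches are produced by the data collator and fed to the model by `Trainer`.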

############# Training ####################
import torch
torch.cuda.empty_cache()
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


from transformers import Trainer
torch.cuda.empty_cache()
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=processor.feature_extractor,
)

# with torch.nn.DataParallel():
trainer.train()
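
In case it's useful, this is the sanity check I ran just before trainer.train() to confirm the mismatch (it assumes train_data[0] is a dict with input_values and labels keys, which is how I prepared the dataset):

# Debugging only: where do the weights and a collated batch live?
print(next(model.parameters()).device)      # device of the model weights
sample = data_collator([train_data[0]])
print(sample["input_values"].device)        # device of a collated input batch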

Link to notebook: https://colab.research.google.com/drive/1b5UmblSssdSfusXh-3R5PJ6y8phV8U3x?usp=sharing
