Custom sklearn Pipeline to transform both X and y

I created my own custom transformer for text processing. Inside its .transform() method, I want to remove the corresponding target (y) row whenever a document has no tokens left after preprocessing.

class SpacyVectorizer(BaseEstimator, TransformerMixin):
  """Convert raw text documents into spaCy vectors after token filtering.

  Each document is tokenized with the loaded spaCy pipeline, filtered and
  normalised according to the constructor flags, re-joined into a single
  string, and that string's ``.vector`` is emitted. Documents that end up
  with no tokens produce no output row.

  Parameters:
    alpha_only: skip tokens whose ``is_alpha`` flag is false.
    lemmatize: use ``token.lemma_`` instead of the token's own text.
    remove_stopwords: skip tokens whose ``is_stop`` flag is true.
    case_fold: apply ``str.casefold()`` to every kept token.
  """

  def __init__(
      self, 
      alpha_only: bool = True,
      lemmatize: bool = True, 
      remove_stopwords: bool = True, 
      case_fold: bool = True,
    ):
    self.alpha_only = alpha_only
    self.lemmatize = lemmatize
    self.remove_stopwords = remove_stopwords
    self.case_fold = case_fold
    # Parser and NER are disabled; only token attributes and vectors are used.
    self.nlp = spacy.load(
      name='en_core_web_sm', 
      disable=["parser", "ner"]
    )

  def fit(self, X, y=None):
    """Do nothing and return ``self``; this transformer has no fitted state."""
    return self

  def _kept_words(self, doc):
    """Return the filtered, normalised token strings of one spaCy document."""
    words = []
    for token in doc:
      if self.alpha_only and not token.is_alpha:
        continue
      if self.remove_stopwords and token.is_stop:
        continue
      # Always end up with a plain string so that ' '.join() works even when
      # both lemmatize and case_fold are switched off.
      text = token.lemma_ if self.lemmatize else str(token)
      if self.case_fold:
        text = text.casefold()
      words.append(text)
    return words

  def transform(self, X, y=None):
    """Vectorize the documents in ``X``, skipping those left without tokens.

    Parameters:
      X: iterable of raw text documents.
      y: optional target container exposing ``.index`` and
        ``.drop(..., inplace=True)`` (presumably a pandas Series or
        DataFrame aligned row-for-row with ``X`` -- confirm with caller).
        When given, the labels of skipped documents are dropped from it
        IN PLACE. It defaults to ``None`` so the method can also be called
        as ``transform(X)``; scikit-learn's ``Pipeline`` calls it that way.

    Returns:
      A list with one vector per retained document, in input order. It is
      shorter than ``X`` whenever documents were skipped, so a caller that
      does not pass ``y`` must realign its targets itself.
    """
    bow_matrix = []
    empty_positions = []

    for position, doc in enumerate(self.nlp.pipe(X)):
      words = self._kept_words(doc)
      if not words:
        # Remember the position; dropping inside the loop would shift the
        # positions of all later rows.
        empty_positions.append(position)
        continue
      # NOTE(review): the original comment called this vector L2-normalized;
      # confirm against spaCy's Doc.vector documentation.
      bow_matrix.append(self.nlp(' '.join(words)).vector)

    if y is not None and empty_positions:
      # Resolve all positions against the untouched index, then drop once.
      # NOTE(review): drop() works by label, so duplicate index labels would
      # remove every matching row -- assumes a unique index.
      y.drop(y.index[empty_positions], inplace=True)

    return bow_matrix

Unfortunately, because I cannot pass the y vector to the .transform() method, it does not work.

How can I force the pipeline to pass both X and y to .transform()? Is there any other workaround? I don't want to pass y via .fit_transform(), because the test data shouldn't be fitted.

1 answer

  • answered 2022-03-02 10:44 raghav Aggarwal

    def transform(self, X, y=None):
    

    Here you have written y=None, which means that if you don't pass a y value, it defaults to None.

    In order to force the pipeline to pass a y value, you should write:

    # y has no default value here, so every call must supply it explicitly;
    # calling transform(X) without y raises a TypeError.
    def transform(self, X, y):
         pass
    

    If you do this, you have to pass a y value; otherwise, the call will raise an error.

    This is the spacing (indentation) problem I am talking about:

    class SpacyVectorizer:
        """Convert raw text documents into spaCy vectors after token filtering.

        Each document is tokenized, filtered and normalised according to the
        constructor flags, re-joined into one string, and that string's
        ``.vector`` is emitted. Documents left with no tokens produce no
        output row, and their labels are removed from ``y``.
        """

        def __init__(
          self, 
          alpha_only: bool = True,
          lemmatize: bool = True, 
          remove_stopwords: bool = True, 
          case_fold: bool = True,
        ):
            self.alpha_only = alpha_only
            self.lemmatize = lemmatize
            self.remove_stopwords = remove_stopwords
            self.case_fold = case_fold
            # Parser and NER are disabled; only token attributes and vectors
            # are used below.
            self.nlp = spacy.load(
              name='en_core_web_sm', 
              disable=["parser", "ner"]
            )

        def _kept_words(self, doc):
            """Return the filtered, normalised token strings of one document."""
            words = []
            for token in doc:
                if self.alpha_only and not token.is_alpha:
                    continue
                if self.remove_stopwords and token.is_stop:
                    continue
                # Always produce a plain string so ' '.join() works even when
                # both lemmatize and case_fold are switched off.
                text = token.lemma_ if self.lemmatize else str(token)
                if self.case_fold:
                    text = text.casefold()
                words.append(text)
            return words

        def transform(self, X, y):
            """Vectorize the documents in ``X``, skipping those without tokens.

            Parameters:
              X: iterable of raw text documents.
              y: target container exposing ``.index`` and
                ``.drop(..., inplace=True)`` (presumably a pandas Series or
                DataFrame aligned row-for-row with ``X`` -- confirm with
                caller). Labels of skipped documents are dropped IN PLACE.

            Returns:
              A list with one vector per retained document, in input order.
            """
            bow_matrix = []
            empty_positions = []

            # Use the pipeline stored on the instance; there is no global nlp.
            for position, doc in enumerate(self.nlp.pipe(X)):
                words = self._kept_words(doc)
                if not words:
                    # Remember the position; dropping inside the loop would
                    # shift the positions of all later rows.
                    empty_positions.append(position)
                    continue
                bow_matrix.append(self.nlp(' '.join(words)).vector)

            if empty_positions:
                # Resolve all positions against the untouched index, then
                # drop once. NOTE(review): drop() works by label, so this
                # assumes a unique index.
                y.drop(y.index[empty_positions], inplace=True)

            return bow_matrix
    

    The error you are getting might be caused by the spacing problem: self might be receiving the X value, and the X parameter might be receiving the y value.

How many English words
do you know?
Test your English vocabulary size, and measure
how many words do you know
Online Test
Powered by Examplum