How to pass a user-defined function to TfidfVectorizer.fit_transform()

I have a function for text preprocessing that simply removes stopwords:

def text_preprocessing():
    """Strip stopwords from the 'text' column of the module-level ``df``.

    Each entry is tokenized with ``word_tokenize``, tokens found in
    ``stopwords`` are dropped, and the remaining tokens are re-joined with
    single spaces. ``df['text']`` is overwritten in place and the updated
    column is returned.
    """
    df['text'] = df['text'].apply(word_tokenize)
    df['text'] = df['text'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stopwords]
    )
    # Collapse every token list back into one space-separated string.
    df['text'] = [" ".join(str(tok) for tok in tokens) for tokens in df['text']]
    return df['text']

I want to pass text_preprocessing() into another function, tf_idf(), which builds the feature matrix. This is essentially what I did:

def tf_idf():
    """Fit a TfidfVectorizer and return the TF-IDF features as a DataFrame.

    The returned frame is built from the dense feature array, with one
    column per name reported by the fitted vectorizer.
    """
    tfidf = TfidfVectorizer()
    # NOTE(review): `text_preprocessing` is passed here as a function object
    # (it is never called), so fit_transform receives a function rather than
    # the preprocessed text. This matches the reported
    # "TypeError: 'function' object is not iterable".
    feature_array = tfidf.fit_transform(text_preprocessing)
    # Densify the result and label the columns with the vectorizer's feature names.
    keywords_data=pd.DataFrame(feature_array.toarray(), columns=tfidf.get_feature_names())
    return keywords_data

I got the following error: TypeError: 'function' object is not iterable

1 answer

  • answered 2018-07-15 19:49 KRKirov

    Rather than building additional functions for stop-word removal, you can simply pass a custom list of stop words to TfidfVectorizer via its stop_words parameter. As the example below shows, "test" is successfully excluded from the TF-IDF vocabulary.

    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    # Setting up: a small demo frame with a text column and a random integer column
    numbers = np.random.randint(1, 5, 3)
    text = ['This is a test.', 'Is this working?', "Let's see."]
    df = pd.DataFrame({'text': text, 'numbers': numbers})
    
    # Define custom stop words and instantiate TfidfVectorizer with them
    my_stopwords = ['test'] # the list can be longer
    tfidf = TfidfVectorizer(stop_words=my_stopwords)
    text_tfidf = tfidf.fit_transform(df['text'])
    
    # Optional - concatenating tfidf with df
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # get_feature_names_out() yields the same terms in the same column order.
    df_tfidf = pd.DataFrame(text_tfidf.toarray(), columns=tfidf.get_feature_names_out())
    df = pd.concat([df, df_tfidf], axis=1)
    
    # Initial df
    df
    Out[133]: 
       numbers              text
    0        2   This is a test.
    1        4  Is this working?
    2        3        Let's see.
    
    tfidf.vocabulary_
    Out[134]: {'this': 3, 'is': 0, 'working': 4, 'let': 1, 'see': 2}
    
    # Final df
    df
    Out[136]: 
       numbers              text        is       let       see      this   working
    0        2   This is a test.  0.707107  0.000000  0.000000  0.707107  0.000000
    1        4  Is this working?  0.517856  0.000000  0.000000  0.517856  0.680919
    2        3        Let's see.  0.000000  0.707107  0.707107  0.000000  0.000000