Why does my Webscraper built using python return an empty list when it should return scraped data?

I am trying to scrape product details such as product name,price,category,color from https://nike.co.in Despite giving the correct Xpath to the script, It does not seem to be scraping the details and it gives an empty list. Here's my complete script:

import time
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager


def scrape_nike(shop_by_category):
    website_address = ['https://nike.co.in']
    options = webdriver.ChromeOptions()
    options.add_argument('start-maximized')
    options.add_argument("window-size=1200x600")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    browser = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    delays = [7, 4, 6, 2, 10, 19]
    delay = np.random.choice(delays)
    for crawler in website_address:
        browser.get(crawler)
        time.sleep(2)
        time.sleep(delay)

        browser.find_element_by_xpath('//*[@id="VisualSearchInput"]').send_keys(shop_by_category, Keys.ENTER)
        product_price = browser.find_elements_by_xpath('//*[@id="Wall"]/div/div[5]/div/main/section/div/div[1]/div/figure/div/div[3]/div/div/div/div')
        product_price_list = [elem.text for elem in product_price]
        product_category = browser.find_elements_by_xpath('//*[@id="Wall"]/div/div[5]/div/main/section/div/div[1]/div/figure/div/div[1]/div/div[2]')
        product_category_list = [elem.text for elem in product_category]
        product_name = browser.find_elements_by_xpath('//*[@id="Nike Air Zoom Vomero 15"]')
        product_name_list = [elem.text for elem in product_name]
        product_colors = browser.find_elements_by_xpath('//*[@id="Wall"]/div/div[5]/div/main/section/div/div[4]/div/figure/div/div[2]/div/button/div')
        product_colors_list = [elem.text for elem in product_colors]
        print(product_price_list)
        print(product_category_list)
        print(product_name_list)
        print(product_colors_list)


if __name__ == '__main__':
    category_name_list = ['running']
    for category in category_name_list:
        scrape_nike(category)

The output that I want is something like:

[Rs 1000, Rs 2990, Rs 3000,....]
[Mens running shoes, Womens running shoes, ...]
[Nike Air Zoom Pegasus, Nike Quest 3, ...]
[5 colors, 1 colors, 3 colors, ...]

But the output that I am getting right now is:

[]
[]
[]
[]

What is the exact issue because of which I am getting empty lists? I do not understand. Please help!!

EDIT: I am now able to get just a single product details in my lists, whereas I want all products, here's my change in the code

product_price = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="Wall"]/div/div[5]/div/main/section/div/div[1]/div/figure/div/div[3]/div/div/div/div')))
        product_price_list = [elem.text for elem in product_price]
        product_category = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="Wall"]/div/div[5]/div/main/section/div/div[1]/div/figure/div/div[1]/div/div[2]')))
        product_category_list = [elem.text for elem in product_category]
        product_name = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="Nike Air Zoom Vomero 15"]')))
        product_name_list = [elem.text for elem in product_name]
        product_colors = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="Wall"]/div/div[5]/div/main/section/div/div[4]/div/figure/div/div[2]/div/button/div')))
        product_colors_list = [elem.text for elem in product_colors]

This gives:

['₹13,495']
["Men's Running Shoe"]
['Nike Air Zoom Vomero 15']
['5 Colours']

I want multiple such entries

EDIT-2*: I have also tried using beautifulsoup4 but that also returned an empty output.

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import pandas as pd


def adidas(shop_by_category):
    driver = webdriver.Chrome("F:\\chromedriver\chromedriver.exe")

    titles = []  # List to store name of the product
    prices = []  # List to store price of the product
    category = []  # List to store category of the product
    colors = []  # List to store the no of colors of the product

    # URL to fetch from Can be looped over / crawled multiple urls
    driver.get('https://nike.co.in')
    driver.find_element_by_xpath('//*[@id="VisualSearchInput"]').send_keys(shop_by_category, Keys.ENTER)
    content = driver.page_source
    soup = BeautifulSoup(content, features="lxml")

    # Parsing content
    for div in soup.findAll('div', attrs={'class': 'product-card__body'}):
        name = div.find('div', attrs={'class': 'product-card__title'})
        price = div.find('div', attrs={'class': 'product-price css-11s12ax is-current-price'})
        subtitle = div.find('div', attrs={'class': 'product-card__subtitle'})
        color = div.find('div', attrs={'class': 'product-card__product-count'})
        titles.append(name.text)
        prices.append(price.text)
        category.append(subtitle.text)
        colors.append(color.text)

    # Storing scraped content
    df = pd.DataFrame({'Product Name': titles, 'Price': prices, 'Category': category, 'Colors': colors})
    df.to_csv('adidas.csv', index=False, encoding='utf-8')


if __name__ == '__main__':
    category_name_list = ['running']
    for category in category_name_list:
        adidas(category)

1 answer

  • answered 2021-05-05 11:56 Lucan

    You can get all of the information you require by using the CLASS_NAME selector as each product card is helpfully given a descriptive class.

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from webdriver_manager.chrome import ChromeDriverManager
    
    driver = webdriver.Chrome(ChromeDriverManager().install())
    
    try:
        # Set the URL explicitly for the example
        driver.get("https://www.nike.com/in/w?q=running&vst=running")
    
        # Click away the blocking popup requesting cookie permissions
        # This is not the way to do it properly. This is to keep the sample short.
        driver.implicitly_wait(10)
        popup = driver.find_element(By.ID, 'hf_cookie_text_cookieAccept')
        popup.click()
        driver.implicitly_wait(10)
    
        # Begin scraping elements
        product_cards_container = driver.find_element(By.CLASS_NAME, "product-grid__items")
        product_cards = product_cards_container.find_elements(By.CLASS_NAME, "product-card")
        for card in product_cards:
            title = card.find_element(By.CLASS_NAME, "product-card__title")
            category = card.find_element(By.CLASS_NAME, "product-card__subtitle")
            colors = card.find_element(By.CLASS_NAME, "product-card__product-count")
            price = card.find_element(By.CLASS_NAME, "product-price")
            print(title.text)
            print(category.text)
            print(colors.text)
            print(price.text)
    
    except Exception as e:
        print(e)
    finally:
        driver.quit()
    

    Which returns for one element:

    Nike Revolution 5 FlyEase
    Men's Running Shoe
    1 Colour
    ₹3,695
    

    Notice how product_cards uses the plural find_elements, this allows us to iterate over its child elements which in this case will contain the product cards. Once we have our card WebElement, we can find our data within the context of individual cards.

    You've used an explicit wait in your edit to the question so I assume that you understand why that is better than random time.sleep()'s, however, I'll link to the documentation on explicit waits as it will be beneficial for this task where the cards are 'lazy loaded'. You may also need to scroll to the bottom of the page to collect all product cards, you can see how to do that from the documentation or from a previous answer of mine to a similar question.