web scraping - get prices on Amazon
I want to target the aria-hidden attribute on Amazon. I found that the markup for a product's current price can vary from product to product, and the element with aria-hidden looks like a reliable way to get what I want.
1 answer
-
answered 2022-01-23 02:33
vinalti
The CSS selector should be:

.a-price > span[aria-hidden]

If this is not enough to select the right price, check the class="..." of the parent elements and add them to the selector (selector for a class => .xxxx, for an ID => #xxx). For instance:

// This is a jQuery selector
$("#container .a-price > span[aria-hidden]")
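For readers scraping from Python rather than the browser console, here is a minimal sketch of the same selector with requests and BeautifulSoup; the product URL is a placeholder and Amazon often blocks bare HTTP clients, so a real scraper may need a browser driver instead:

# Sketch: apply the suggested CSS selector with BeautifulSoup.
# The URL below is a hypothetical placeholder, and the User-Agent header
# is a minimal hedge; Amazon may still block plain HTTP requests.
import requests
from bs4 import BeautifulSoup

url = "https://www.amazon.com/dp/B000000000"  # hypothetical ASIN
headers = {"User-Agent": "Mozilla/5.0"}
soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")

# select_one() takes the same CSS selector syntax as the jQuery example
price = soup.select_one(".a-price > span[aria-hidden]")
print(price.get_text(strip=True) if price else "price element not found")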
See also questions close to this topic
-
ValueError: All arrays must be of the same length when scraping
I am trying to input different zip codes and scrape information for Target products. However, the code results in this error: ValueError: All arrays must be of the same length, and there is nothing in my CSV file. I guess that is because I did not successfully scrape all the information. Can anyone give me some suggestions on how to improve the code? I appreciate any help. Thanks.
Following is my code:
import concurrent.futures
from datetime import datetime
from time import sleep

import pandas as pd
import pytz
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Target Url list
urlList = [
    'https://www.target.com/p/pataday-once-daily-relief-extra-strength-drops-0-085-fl-oz/-/A-83775159?preselect=81887758#lnk=sametab',
    'https://www.target.com/p/kleenex-ultra-soft-facial-tissue/-/A-84780536?preselect=12964744#lnk=sametab',
    'https://www.target.com/p/claritin-24-hour-non-drowsy-allergy-relief-tablets-loratadine/-/A-80354268?preselect=14351285#lnk=sametab',
    'https://www.target.com/p/opti-free-pure-moist-rewetting-drops-0-4-fl-oz/-/A-14358641#lnk=sametab',
    'https://www.target.com/p/allegra-24-hour-allergy-relief-tablets-fexofenadine-hydrochloride/-/A-15068699?preselect=14042732#lnk=sametab',
    'https://www.target.com/p/nasacort-allergy-relief-spray-triamcinolone-acetonide/-/A-15143450?preselect=15503329#lnk=sametab',
    'https://www.target.com/p/genexa-dextromethorphan-kids-39-cough-and-chest-congestion-suppressant-4-fl-oz/-/A-80130848#lnk=sametab',
    'https://www.target.com/p/zyrtec-24-hour-allergy-relief-tablets-cetirizine-hcl/-/A-15075280?preselect=79847258#lnk=sametab',
    'https://www.target.com/p/pataday-twice-daily-eye-allergy-itch-and-redness-relief-drops-0-17-fl-oz/-/A-78780978#lnk=sametab',
    'https://www.target.com/p/systane-gel-drops-lubricant-eye-gel-0-33-fl-oz/-/A-14523072#lnk=sametab'
]
zipCodeList = [3911, 4075, 4467, 96970, 96960, 49220, 49221, 49224, 48001, 49227, 48101, 48002, 48003, 48004]

while True:
    priceArray = []
    nameArray = []
    zipCodeArray = []
    GMTArray = []
    TCIN = []
    UPC = []

    def ScrapingTarget(url):
        wait_imp = 10
        CO = webdriver.ChromeOptions()
        CO.add_experimental_option('useAutomationExtension', False)
        CO.add_argument('--ignore-certificate-errors')
        CO.add_argument('--start-maximized')
        wd = webdriver.Chrome(r'D:\chromedriver\chromedriver_win32new\chromedriver_win32 (2)\chromedriver.exe', options=CO)
        wd.get(url)
        wd.implicitly_wait(wait_imp)

        # needed to click onto the "Show more" to get the tcin and upc
        xpath = '//*[@id="tabContent-tab-Details"]/div/button'
        element_present = EC.presence_of_element_located((By.XPATH, xpath))
        WebDriverWait(wd, 5).until(element_present)
        showMore = wd.find_element(by=By.XPATH, value=xpath)
        sleep(3)
        showMore.click()
        # showMore = wd.find_element(by=By.XPATH, value="//*[@id='tabContent-tab-Details']/div/button")
        # sleep(2)
        # showMore.click()

        soup = BeautifulSoup(wd.page_source, 'html.parser')
        try:
            # gets a list of all elements under "Specifications"
            div = soup.find("div", {"class": "styles__StyledCol-sc-ct8kx6-0 iKGdHS h-padding-h-tight"})
            list = div.find_all("div")
            for a in range(len(list)):
                list[a] = list[a].text
            # locates the elements in the list
            tcin = [v for v in list if v.startswith("TCIN")]
            upc = [v for v in list if v.startswith("UPC")]
        except:
            tcin = "Error"
            upc = "Error"
        TCIN.append(tcin)
        UPC.append(upc)

        for zipcode in zipCodeList:
            try:
                # click the delivery address
                address = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[1]/button[2]")
                address.click()
                # click the Edit location
                editLocation = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[2]/button")
                editLocation.click()
            except:
                # directly click the Edit location
                editLocation = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div[1]/div/div[1]/button")
                editLocation.click()

            # input ZipCode
            inputZipCode = wd.find_element(by=By.XPATH, value="//*[@id='enter-zip-or-city-state']")
            inputZipCode.clear()
            inputZipCode.send_keys(zipcode)

            # click submit
            clickSubmit = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[2]/div/div/div[3]/div/button[1]")
            clickSubmit.click()

            # start scraping
            name = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[1]/h1/span").text
            nameArray.append(name)
            price = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[1]/span").text
            priceArray.append(price)
            currentZipCode = zipcode
            zipCodeArray.append(currentZipCode)
            tz = pytz.timezone('Europe/London')
            GMT = datetime.now(tz)
            GMTArray.append(GMT)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(ScrapingTarget, urlList)

    data = {'prod-name': nameArray, 'Price': priceArray, 'currentZipCode': zipCodeArray,
            "Tcin": TCIN, "UPC": UPC, "GMT": GMTArray}
    df = pd.DataFrame(data, columns=['prod-name', 'Price', 'currentZipCode', "Tcin", "UPC", "GMT"])
    df.to_csv(r'C:\Users\12987\PycharmProjects\python\Network\priceingAlgoriCoding\export_Target_dataframe.csv',
              mode='a', index=False, header=True)
    sleep(20)
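A note on the error itself: pd.DataFrame raises ValueError: All arrays must be of the same length whenever the dict's value lists differ in length, and in the code above TCIN and UPC receive one entry per URL while nameArray, priceArray, zipCodeArray, and GMTArray receive one entry per (URL, zip code) pair. A minimal sketch of one way to keep the columns aligned, with illustrative values, is to collect one complete record per row:

# Sketch: appending one complete dict per scraped (product, zip code)
# pair guarantees every column ends up the same length.
import pandas as pd

rows = []
# inside the zip-code loop, append the whole record at once, repeating
# the per-product fields (tcin/upc) on every row:
rows.append({"prod-name": "example product", "Price": "$9.99",
             "currentZipCode": 48001, "Tcin": "TCIN: 123",
             "UPC": "UPC: 456", "GMT": "2022-01-23 02:33"})  # illustrative values
df = pd.DataFrame(rows)  # rows share the same keys, so lengths always match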
-
Scraping .aspx page with Python yields 404
I'm a web-scraping beginner and am trying to scrape this webpage: https://profiles.doe.mass.edu/statereport/ap.aspx
I'd like to be able to put in some settings at the top (like District, 2020-2021, Computer Science A, Female) and then download the resulting data for those settings.
Here's the code I'm currently using:
import requests
from bs4 import BeautifulSoup

url = 'https://profiles.doe.mass.edu/statereport/ap.aspx'

with requests.Session() as s:
    s.headers['User-Agent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:100.0) Gecko/20100101 Firefox/100.0"
    r = s.get('https://profiles.doe.mass.edu/statereport/ap.aspx')
    soup = BeautifulSoup(r.text, "lxml")
    data = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    data["ctl00$ContentPlaceHolder1$ddReportType"] = "DISTRICT"
    data["ctl00$ContentPlaceHolder1$ddYear"] = "2021"
    data["ctl00$ContentPlaceHolder1$ddSubject"] = "COMSCA"
    data["ctl00$ContentPlaceHolder1$ddStudentGroup"] = "F"
    p = s.post(url, data=data)
When I print out p.text, I get a page with the title '\t404 - Page Not Found\r\n' and the message:

<h2>We are unable to locate information at: <br /><br />http://profiles.doe.mass.edu:80/statereport/ap.aspxp?ASP.NET_SessionId=bxfgao54wru50zl5tkmfml00</h2>
Here's what data looks like before I modify it:

{'__EVENTVALIDATION': '/wEdAFXz4796FFICjJ1Xc5ZOd9SwSHUlrrW+2y3gXxnnQf/b23Vhtt4oQyaVxTPpLLu5SKjKYgCipfSrKpW6jkHllWSEpW6/zTHqyc3IGH3Y0p/oA6xdsl0Dt4O8D2I0RxEvXEWFWVOnvCipZArmSoAj/6Nog6zUh+Jhjqd1LNep6GtJczTu236xw2xaJFSzyG+xo1ygDunu7BCYVmh+LuKcW56TG5L0jGOqySgRaEMolHMgR0Wo68k/uWImXPWE+YrUtgDXkgqzsktuw0QHVZv7mSDJ31NaBb64Fs9ARJ5Argo+FxJW/LIaGGeAYoDphL88oao07IP77wrmH6t1R4d88C8ImDHG9DY3sCDemvzhV+wJcnU4a5qVvRziPyzqDWnj3tqRclGoSw0VvVK9w+C3/577Gx5gqF21UsZuYzfP4emcqvJ7ckTiBk7CpZkjUjM6Z9XchlxNjWi1LkzyZ8QMP0MaNCP4CVYJfndopwFzJC7kI3W106YIA/xglzXrSdmq6/MDUCczeqIsmRQGyTOkQFH724RllsbZyHoPHYvoSAJilrMQf6BUERVN4ojysx3fz5qZhZE7DWaJAC882mXz4mEtcevFrLwuVPD7iB2v2mlWoK0S5Chw4WavlmHC+9BRhT36jtBzSPRROlXuc6P9YehFJOmpQXqlVil7C9OylT4Kz5tYzrX9JVWEpeWULgo9Evm+ipJZOKY2YnC41xTK/MbZFxsIxqwHA3IuS10Q5laFojoB+e+FDCqazV9MvcHllsPv2TK3N1oNHA8ODKnEABoLdRgumrTLDF8Lh+k+Y4EROoHhBaO3aMppAI52v3ajRcCFET22jbEm/5+P2TG2dhPhYgtZ8M/e/AoXht29ixVQ1ReO/6bhLIM+i48RTmcl76n1mNjfimB8r3irXQGYIEqCkXlUHZ/SNlRYyx3obJ6E/eljlPveWNidFHOaj+FznOh264qDkMm7fF78WBO2v0x+or1WGijWDdQtRy9WRKXchYxUchmBlYm15YbBfMrIB7+77NJV+M6uIVVnCyiDRGj+oPXcTYxqSUCLrOMQyzYKJeu8/hWD0gOdKeoYUdUUJq4idIk+bLYy76sI/N2aK+aXZo/JPQ+23gTHzIlyi4Io7O6kXaULPs8rfo8hpkH1qXyKb/rP2VJBNWgyp8jOMx9px+m4/e2Iecd86E4eN4Rk6OIiwqGp+dMdgntXu5ruRHb1awPlVmDw92dL1P0b0XxJW7EGfMzyssMDhs1VT6K6iMUTHbuXkNGaEG1dP1h4ktnCwGqDLVutU6UuzT6i4nfqnvFjGK9+7Ze8qWIl8SYyhmvzmgpLjdMuF9CYMQ2Aa79HXLKFACsSSm0dyiU1/ZGyII2Fvga9o+nVV1jZam3LkcAPaXEKwEyJXfN/DA7P4nFAaQ+QP+2bSgrcw+/dw+86OhPyG88qyJwqZODEXE1WB5zSOUywGb1/Xed7wq9WoRs6v8rAK5c/2iH7YLiJ4mUVDo+7WCKrzO5+Hsyah3frMKbheY1acRmSVUzRgCnTx7jvcLGR9Jbt6TredqZaWZBrDFcntdg7EHd7imK5PqjUld3iCVjdyO+yLKUkMKiFD85G3vEferg/Q/TtfVBqeTU0ohP9d+CsKOmV/dxVYWEtBcfa9KiN6j4N8pP7+3iUOhajojZ8jV98kxT0zPZlzkpqI4SwR6Ys8d2RjIi5K+oQul4pL5u+zZvX0lsLP9Jl7FeVTfBvST67T6ohz8dl9gBfmmbwnT23SyuFSUGd6ZGaKE+9kKYmuImW7w3ePs7C70yDWHpIpxP/IJ4GHb36LWto2g3Ld3goCQ4fXPu7C4iTiN6b5WUSlJJsWGF4eQkJue8=', '__VIEWSTATE': '/wEPDwUKLTM0NzY4OTQ4NmRkDwwPzTpuna+yxVhQxpRF4n2+zYKQtotwRPqzuCkRvyU=', '__VIEWSTATEGENERATOR': '2B6F8D71', 'ctl00$ContentPlaceHolder1$btnViewReport': 'View Report', 'ctl00$ContentPlaceHolder1$hfExport': 'ViewReport', 'leftNavId': '11241', 'quickSearchValue': '', 'runQuickSearch': 'Y', 'searchType': 'QUICK', 'searchtext': ''}
Following suggestions from similar questions, I've tried playing around with the parameters, editing data in various ways (to emulate the POST request that I see in my browser when I navigate the site myself), and specifying an ASP.NET_SessionId, but to no avail. How can I access the information from this website?
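One detail worth checking, reusing the variables from the snippet above: the 404 URL carries a cookieless ASP.NET_SessionId segment, which suggests the server rewrote the form's action URL, so posting back to the form's own action rather than the bare .aspx address is a reasonable experiment (this assumes the page contains a single <form>):

# Sketch: post to the form's own action URL, which ASP.NET may have
# rewritten to embed the cookieless session id. Assumes one <form>
# on the page and reuses soup, r, s, url, and data from above.
from urllib.parse import urljoin

form = soup.find("form")
action = form.get("action") or url   # fall back to the bare URL
post_url = urljoin(r.url, action)    # resolve a relative action
p = s.post(post_url, data=data)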
-
Get specific information from wikipedia on google spreadsheet (not the entire table)
I have a table of "Lead rolling actors" from Wikipedia, and I want to add some columns to the table with the date of birth, years active, etc. for every actor.
It's the first time I've used the IMPORTXML formula, but for Robert Downey Jr I am trying the following:

Born: =IMPORTXML(G1!,"//span[@class='bday']")
which should match <span class="bday">1965-04-04</span>

Years Active: =IMPORTXML(G1!,"//td[@class='infobox-data']")
which should match <td class="infobox-data">1970–present</td>
In both cases it gives me errors. What am I doing wrong? I looked at https://www.benlcollins.com/spreadsheets/google-sheet-web-scraper/ for guidance, but I can't find my error.
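Two things worth verifying here. In Sheets, the first argument to IMPORTXML is a plain cell reference, written G1 with no trailing !. And the XPath expressions themselves can be tested outside Sheets, for example with requests and lxml; the sketch below assumes the cell points at Robert Downey Jr's Wikipedia article:

# Sketch: check the question's XPath expressions against the live page.
# The URL is an assumption based on the Robert Downey Jr example.
import requests
from lxml import html

page = html.fromstring(requests.get(
    "https://en.wikipedia.org/wiki/Robert_Downey_Jr.").content)
print(page.xpath("//span[@class='bday']/text()"))             # expect ['1965-04-04']
print(page.xpath("//td[@class='infobox-data']//text()")[:5])  # first infobox values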
-
Amazon policy on purchase from web within FireTV app
I have created an Amazon Fire TV app in which we sell content. A user can purchase the content from a website; we don't have any functionality to purchase content within the Fire TV app.
We have now submitted the app to the Amazon app store, and Amazon has rejected it. The reason is given below -
Is there any Amazon policy that requires the use of Amazon IAP if we are selling content in a Fire TV app? If anyone has any idea, please help.
-
Amazon SP-API How To Get All Variations Including Out Of Stock amazon-sp-api
I'm trying to take an ASIN and get all the variations but, currently, it only returns the in-stock items.
Anything else is excluded.
Is there a way to get all variations regardless?
Here is my code (I'm using amazon-sp-api to do this):
let sellingPartner = new SellingPartnerAPI({
    region: 'na',
    refresh_token: a.amazon_refresh_token,
    credentials: {
        SELLING_PARTNER_APP_CLIENT_ID: process.env.SELLING_PARTNER_APP_CLIENT_ID,
        SELLING_PARTNER_APP_CLIENT_SECRET: process.env.SELLING_PARTNER_APP_CLIENT_SECRET,
        AWS_ACCESS_KEY_ID: process.env.AWS_ACCESS_KEY_ID,
        AWS_SECRET_ACCESS_KEY: process.env.AWS_SECRET_ACCESS_KEY,
        AWS_SELLING_PARTNER_ROLE: process.env.AWS_SELLING_PARTNER_ROLE
    }
})

let response = await a.sellingPartner.callAPI({
    operation: 'catalogItems.getCatalogItem',
    path: { asin: asin },
    query: { MarketplaceId: process.env.MarketplaceId },
    options: { version: 'v0' }
});
-
How do we make Amazon-like search suggestions using Elasticsearch? (note: Amazon uses Elasticsearch)
I am using Elasticsearch and my suggester is working, but I need search suggestions like Amazon's. Is that possible with Elasticsearch?
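Amazon's actual implementation is not public, but autocomplete of this kind is commonly built in Elasticsearch with a completion suggester. Here is a minimal sketch using the official Python client; the index name, field name, and sample document are invented for illustration:

# Sketch: completion suggester queried through the official Python client.
# Index/field names and the sample document are illustrative only.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

# one-time setup: the suggest field must be mapped as type "completion"
es.indices.create(index="products", mappings={
    "properties": {"suggest": {"type": "completion"}}})
es.index(index="products", document={"suggest": "wireless headphones"},
         refresh=True)

resp = es.search(index="products", suggest={
    "product-suggest": {"prefix": "wirel",
                        "completion": {"field": "suggest"}}})
for option in resp["suggest"]["product-suggest"][0]["options"]:
    print(option["text"])  # -> "wireless headphones"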