Python BeautifulSoup selenium scraper - python

I'm using the following python script for scraping info from Amazon pages.
At some point, it stopped returning page results. The script starts and browses through the keywords/pages, but I only get the headers as output:
Keyword Rank Title ASIN Score Reviews Prime Date
I suspect that the problem is in the following line as this tag doesn't exist anymore and the results var doesn't get any value:
results = soup.findAll('div', attrs={'class': 's-item-container'})
This is the full code:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
import datetime
from collections import deque
import logging
import csv
class AmazonScaper(object):
    """Collect Amazon UK search-result data for a list of keywords and save it as CSV.

    Each keyword is loaded in a Selenium-driven Chrome browser, the page source
    is parsed with BeautifulSoup, and one row per search result is accumulated
    in ``self.results`` until ``csv_output`` writes the rows out.
    """

    def __init__(self, keywords, output_file='example.csv', sleep=2):
        """Start the browser and queue the keywords.

        Args:
            keywords: iterable of search terms; each is inserted verbatim into
                the search URL.
            output_file: path of the CSV file that ``csv_output`` appends to.
            sleep: seconds to pause after each page load.
        """
        self.browser = webdriver.Chrome(executable_path='/Users/willcecil/Dropbox/Python/chromedriver')  # Add path to your Chromedriver
        self.keyword_queue = deque(keywords)  # Keywords still waiting to be searched
        self.output_file = output_file
        self.sleep = sleep
        self.results = []

    def get_page(self, keyword):
        """Load the search page for ``keyword``; return its HTML, or None on failure."""
        try:
            self.browser.get('https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords={a}'.format(a=keyword))
            return self.browser.page_source
        except Exception as e:
            logging.exception(e)
            return None

    def get_soup(self, html):
        """Parse ``html`` with the lxml parser; return None when there is no HTML."""
        if html is None:
            return None
        return BeautifulSoup(html, 'lxml')

    def get_data(self, soup, keyword):
        """Extract one row per search result and append it to ``self.results``.

        Each row is stored as ``{keyword: [keyword, rank, title, ASIN, score,
        reviews, prime, date]}``. Always returns 1.
        """
        try:
            # NOTE: 's-item-container' is the result-container class this scraper
            # was written against; if the page markup no longer uses it, no rows
            # are found and only the CSV header is produced.
            results = soup.findAll('div', attrs={'class': 's-item-container'})
        except Exception as e:
            print(e)
            return 1

        for position, item in enumerate(results, start=1):
            # Errors are handled per item so that one malformed result does not
            # discard every remaining result on the page.
            try:
                title = item.find('h2').text

                try:
                    link = item.find('a', attrs={'class': 'a-link-normal a-text-normal'})
                    url = re.sub(r'/ref=.*', '', str(link['href']))
                except (TypeError, KeyError):  # no link tag, or no href on it
                    url = "None"

                # Extract the ASIN from the URL - ASIN is the breaking point to
                # filter out if the position is sponsored
                ASIN = re.sub(r'.*amazon.co.uk.*/dp/', '', str(url))

                # The score span is looked up by a name attribute equal to the ASIN
                score_tag = item.find('span', attrs={'name': ASIN})
                if score_tag is None:
                    score = "None"
                else:
                    # Keep only the text before the first space
                    score = re.sub(r' .*', '', score_tag.text.strip('\n'))

                # Number of reviews: the link pointing at #customerReviews
                reviews_tag = item.find('a', href=re.compile(r'.*#customerReviews'))
                reviews = "None" if reviews_tag is None else reviews_tag.text

                # Prime badge
                prime_tag = item.find('i', attrs={'aria-label': 'Prime'})
                PRIME = "None" if prime_tag is None else prime_tag.text

                today = datetime.datetime.today().strftime("%B %d, %Y")
                data = {keyword: [keyword, str(position), title, ASIN, score, reviews, PRIME, today]}
                self.results.append(data)
            except Exception as e:
                print(e)
        return 1

    def csv_output(self):
        """Append the collected rows to ``self.output_file`` as fully quoted CSV.

        The header row is written only when the file is still empty, so repeated
        runs against the same file do not insert extra header lines.
        """
        keys = ['Keyword', 'Rank', 'Title', 'ASIN', 'Score', 'Reviews', 'Prime', 'Date']
        print(self.results)
        with open(self.output_file, 'a', encoding='utf-8', newline='') as outputfile:
            # csv.writer escapes embedded quote characters, which manual
            # '"' + field + '"' concatenation does not.
            writer = csv.writer(outputfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
            if outputfile.tell() == 0:  # append mode starts at end of file; 0 means empty
                writer.writerow(keys)
            for item in self.results:
                for value in item.values():
                    print(".".join(value))
                    writer.writerow(value)

    def run_crawler(self):
        """Process every queued keyword, close the browser, then write the CSV."""
        try:
            while self.keyword_queue:  # If we have keywords to check
                keyword = self.keyword_queue.popleft()  # We grab a keyword from the left of the list
                html = self.get_page(keyword)
                soup = self.get_soup(html)
                time.sleep(self.sleep)  # Wait for the specified time
                if soup is not None:  # If we have soup - parse and save data
                    self.get_data(soup, keyword)
        finally:
            # Close the browser even if a keyword fails unexpectedly
            self.browser.quit()
        self.csv_output()  # Save the object data to csv
if __name__ == "__main__":
    # Read one keyword per line and replace spaces with '+' for the search URL.
    # The context manager closes the file as soon as the list is built.
    with open('keywords.txt') as keyword_file:
        keywords = [line.rstrip('\n').replace(' ', '+') for line in keyword_file]
    ranker = AmazonScaper(keywords)  # Create the object
    ranker.run_crawler()  # Run the rank checker
The output should look like this (I have trimmed the Titles for clarity).
Keyword Rank Title ASIN Score Reviews Prime Date
Blue+Skateboard 3 Osprey Complete
Beginn B00IL1JMF4 3.7 40 Prime February 21, 2019
Blue+Skateboard 4 ENKEEO Complete Mini
C B078J9Y1DG 4.5 42 Prime February 21, 2019 Blue+Skateboard 5 skatro -
Mini Cruiser B00K93PIXM 4.8 223 Prime February 21, 2019
Blue+Skateboard 7 Vinsani Retro Cruiser
B00CSV72AK 4.4 8 Prime February 21, 2019 Blue+Skateboard 8 Ridge
Retro Cruiser Bo B00CA33ISQ 4.1 207 Prime February 21, 2019
Blue+Skateboard 9 Xootz Kids Complete
Be B01B2YNSJM 3.6 32 Prime February 21, 2019 Blue+Skateboard 10 Enuff
Pyro II Skateboa B00MGRGX2Y 4.3 68 Prime February 21, 2019

The following shows some changes you could make. I have changed to using css selectors at some points.
The main result set to loop over is retrieved by soup.select('.s-result-list [data-asin]'). This selects elements that have a data-asin attribute and are descendants of an element with class s-result-list. It currently matches the 60 items on the page.
I swapped the PRIME selection to using an attribute = value selector
Headers are now h5 i.e. header = soup.select_one('h5').
The PRIME selector is soup.select_one('[aria-label="Amazon Prime"]').
Example code:
import datetime
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
# Example: scrape the first results page for one keyword and print one row per result.
keyword = 'blue+skateboard'
driver = webdriver.Chrome()
url = 'https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords={}'
try:
    driver.get(url.format(keyword))
    soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    # The page source is all that is needed, so release the browser right away
    driver.quit()

# Every element carrying a data-asin attribute inside the result list
results = soup.select('.s-result-list [data-asin]')
for a, b in enumerate(results):
    header = b.select_one('h5')
    if header is None:
        # Not a product entry with a title; nothing to report for it
        continue
    result = a + 1
    title = header.text.strip()

    try:
        link = b.select_one('h5 > a')
        product_url = re.sub(r'/ref=.*', '', str(link['href']))
    except (TypeError, KeyError):  # no link tag, or no href on it
        product_url = "None"

    if product_url != '/gp/slredirect/picassoRedirect.html':
        ASIN = re.sub(r'.*/dp/', '', str(product_url))
    else:
        # Sponsored redirect links carry no /dp/<ASIN> part; give ASIN an explicit
        # value instead of reusing the previous row's (or hitting a NameError).
        ASIN = "None"

    score_tag = b.select_one('.a-icon-alt')
    if score_tag is None:
        score = "None"
    else:
        # Keep only the text before the first space
        score = re.sub(r' .*', '', score_tag.text.strip('\n'))

    # Attribute selectors need both square brackets
    reviews_tag = b.select_one("[href*='#customerReviews']")
    reviews = "None" if reviews_tag is None else reviews_tag.text.strip()

    prime_tag = b.select_one('[aria-label="Amazon Prime"]')
    PRIME = "None" if prime_tag is None else prime_tag['aria-label']

    data = {keyword: [keyword, str(result), title, ASIN, score, reviews, PRIME, datetime.datetime.today().strftime("%B %d, %Y")]}
    print(data)
Example output:

Related

Looping until max results

I'm pretty new to web scraping but enjoying it so far so thought I'd test myself!
I've written this query to scrape this website but just wondering is there a way of making it more efficient? At the moment, I've had to set the max page to 87 as this is the last page that guitars appear on. However, amps only have 15 pages of results but I'm still looping through 87. Any ideas appreciated!
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []
# Upper bound on pages fetched per category (page numbers 1..n-1); every
# category is crawled up to this limit regardless of how many pages it has.
n = 88
# Product data for every category
for category in ['guitars/electric/','guitars/bass/','amps/','guitars/acoustic/','pedals/']:
    for x in range(1, n):
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(x)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
        # [:-1] drops the last character of each price string
        prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
        # zip pairs the three lists without index arithmetic and cannot raise
        # IndexError if one list is shorter than the product list
        for product, price, avail in zip(products, prices, avails):
            guitar_products.append({
                'product': product,
                'price': price,
                'avail': avail,
            })
guitar_data = pd.DataFrame(guitar_products)
# Raw string: '\d' in a normal string literal is an invalid escape sequence
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Thanks
Try the following approach:
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []
# Product data for every category
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    page_number = 1
    while True:
        url = f"https://www.guitarguitar.co.uk/{category}page-{page_number}"
        print(url)
        page_number += 1
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        # Each product sits in its own <div class="product-inner">, so the three
        # fields are read from the same container and always belong together.
        for div_product in soup.find_all('div', class_="product-inner"):
            product = div_product.find('h3', {'class': 'qa-product-list-item-title'}).get_text(strip=True)
            price = div_product.find('span', {'class': 'js-pounds'}).get_text(strip=True)
            avail = div_product.find('div', {'class': 'availability'}).get_text(strip=True)
            guitar_products.append({'product': product, 'price': price, 'avail': avail})
        # Is there a next button? If not, this was the category's last page.
        if not soup.find('a', class_="next-page-button"):
            print("No more")
            break
guitar_data = pd.DataFrame(guitar_products)
# Raw string: '\d' in a normal string literal is an invalid escape sequence
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))
Improvements:
This checks each page for a Next button and, when there is none, stops paging and moves on to the next category.
It locates the <div> holding each product and then uses a single find to get each product detail. This avoids the need to build multiple lists and then join them.
Build the URL using a Python f string.
You can check H1:
soup = BeautifulSoup(page.content, 'html.parser')
if soup.find('h1').contents[0] == 'Page Not Found':
break
or change the loop from for to while:
is_page = True
x = 0
while is_page:
x = x + 1
. . .
if soup.find('h1').contents[0] == 'Page Not Found':
is_page = False
break
This is probably not the most elegant solution, but it is functional and straightforward. An infinite loop which ends if no product is found.
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []
# Product data for every category
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
    n = 1  # page counter, restarted for every category
    while True:
        url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(n)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
        # [:-1] drops the last character of each price string
        prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
        avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
        if not products:
            # A page without products ends this category
            break
        for product, price, avail in zip(products, prices, avails):
            guitar_products.append({
                'product': product,
                'price': price,
                'avail': avail,
            })
        n += 1
guitar_data = pd.DataFrame(guitar_products)
# Raw string: '\d' in a normal string literal is an invalid escape sequence
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace(r'[^\d.]', '', regex=True))

BeautifulSoup (Python): how grab text-string next to a tag (that may or may not exist)?

I think my title explains pretty well the problem I am facing. Let's look at a picture of the problem. (You can find the web page at this address; however, it has probably changed.)
I have highlighted the text that I want to grab in blue; this is the model year 2008. Now, the seller is not required to submit the model year, so it may or may not exist. But when it does exist, it always follows the <i> tag with class="fa fa-calendar". My solution so far has been to grab all the text within <p class="result-details"> ... </p> (this then becomes a list) and then choose the second element, on the condition that <i class="fa fa-calendar"> ... </i> exists. Otherwise I do not grab anything.
Now, it seems this does not work in general, since the text that comes before the second element can be split into more than one element if it has whitespace in it. So, is there any way (any function) to grab a text string that neighbours another tag, as seen in my picture?
PS: if I have made myself unclear, I just want to fetch the year 2008 from the post on the web page if it exists.
Edit
In this situation my code erroneously gives me the word "Hjulvältar" (bulldozer in English) instead of the year 2008.
CODE
from bs4 import BeautifulSoup
from datetime import date
import requests
url_avvikande = ['bomliftar','teleskop-bomliftar','kompakta-sjalvgaende-bomlyftar','bandschaktare','reachstackers','staplare']
today = date.today().isoformat()
url_main = 'https://www.mascus.se'
produktgrupper = ['lantbruksmaskiner','transportfordon','skogsmaskiner','entreprenadmaskiner','materialhantering','gronytemaskiner']
kategorier = {
'lantbruksmaskiner': ['traktorer','sjalvgaende-falthackar','skordetroskor','atv','utv:er','snoskotrar'],
'transportfordon': ['fordonstruckar','elektriska-fordon','terrangfordon'],
'skogsmaskiner': ['skog-skordare','skog-gravmaskiner','skotare','drivare','fallare-laggare','skogstraktorer','lunnare','terminal-lastare'],
'entreprenadmaskiner': ['gravlastare','bandgravare','minigravare-7t','hjulgravare','midigravmaskiner-7t-12t','atervinningshanterare','amfibiska-gravmaskiner','gravmaskiner-med-frontskopa','gravmaskiner-med-lang-rackvidd','gravmaskiner-med-slapskopa','rivningsgravare','specialgravmaskiner','hjullastare','kompaktlastare','minilastmaskiner','bandlastare','teleskopiska-hjullastare','redaskapshallare','gruvlastare','truckar-och-lastare-for-gruvor','bergborriggar','teleskoplastare','dumprar','minidumprar','gruvtruckar','banddumprar','specialiserade-dragare','vaghyvlar','vattentankbilar','allterrangkranar','terrangkranar-grov-terrang','-bandgaende-kranar','saxliftar','bomliftar','teleskop-bomliftar','personhissar-och-andra-hissar','kompakta-sjalvgaende-bomlyftar','krossar','mobila-krossar','sorteringsverk','mobila-sorteringsverk','bandschaktare','asfaltslaggningsmaskiner','--asfaltskallfrasmaskiner','tvavalsvaltar','envalsvaltar','jordkompaktorer','pneumatiska-hjulvaltar','andra-valtar','kombirullar','borrutrustning-ytborrning','horisontella-borrutrustning','trenchers-skar-gravmaskin'],
'materialhantering': ['dieseltruckar','eldrivna-gaffeltruckar','lpg-truckar','gaffeltruckar---ovriga','skjutstativtruck','sidlastare','teleskopbomtruckar','terminaltraktorer','reachstackers','ovriga-materialhantering-maskiner','staplare-led','staplare','plocktruck-laglyftande','plocktruck-hoglyftande','plocktruck-mediumlyftande','dragtruck','terrangtruck','4-vagstruck','smalgangstruck','skurborsttorkar','inomhus-sopmaskiner','kombinationsskurborstar'],
'gronytemaskiner': ['kompakttraktorer','akgrasklippare','robotgrasklippare','nollsvangare','plattformsklippare','sopmaskiner','verktygsfraktare','redskapsbarare','golfbilar','fairway-grasklippare','green-grasklippare','grasmattevaltar','ovriga-gronytemaskiner']
}
url = 'https://www.mascus.se'
mappar = ['Lantbruk', 'Transportfordon', 'Skogsmaskiner', 'Entreprenad', 'Materialhantering', 'Grönytemaskiner']
index = -1
status = True
# Crawl every sub-category of every product group and write one fixed-width
# text line per listing (group, sub-category, model, model year, price).
for produktgrupp in kategorier:
# index advances once per product group and selects the matching entry of mappar
index += 1
mapp = mappar[index]
save_path = f'/home/protector.local/vika99/webscrape_mascus/Annonser/{mapp}'
underkategorier = kategorier[produktgrupp]
for underkategori in underkategorier:
# NOTE
# While status is still True, every sub-category other than
# 'borrutrustning-ytborrning' is skipped; once that one is reached status
# becomes False and nothing is skipped any more.
if underkategori != 'borrutrustning-ytborrning' and status:
continue
else:
status = False
# NOTE
# The listing URL uses a different prefix (none, 'begagnat-', 'begagnad-' or
# 'begagnade-') depending on the sub-category.
if underkategori in url_avvikande:
url = f'{url_main}/{produktgrupp}/{underkategori}'
elif underkategori == 'gravmaskiner-med-frontskopa':
url = f'{url_main}/{produktgrupp}/begagnat-{underkategori}'
elif underkategori == 'borrutrustning-ytborrning':
url = f'{url_main}/{produktgrupp}/begagnad-{underkategori}'
else:
url = f'{url_main}/{produktgrupp}/begagnade-{underkategori}'
file_name = f'{save_path}/{produktgrupp}_{underkategori}_{today}.txt'
# sida is the number of the results page currently being read
sida = 1
print(url)
with open(file_name, 'w') as f:
while True:
print(sida)
html_text = None
soup = None
links = None
# Re-request the page until a <ul class="page-numbers"> element is found;
# there is no retry limit, so this repeats for as long as that element is absent.
while links == None:
html_text = requests.get(url).text
soup = BeautifulSoup(html_text, 'lxml')
links = soup.find('ul', class_ = 'page-numbers')
annonser = soup.find_all('li', class_ = 'col-row single-result')
for annons in annonser:
modell = annons.find('a', class_ = 'title-font').text
# When the calendar icon is present, the model year is taken as the second
# space-separated token of the result-details text; this picks the wrong
# token whenever the text before the year itself contains a space.
if annons.p.find('i', class_ = 'fa fa-calendar') != None:
tillverkningsar = annons.find('p', class_ = 'result-details').text.strip().split(" ")[1]
else:
tillverkningsar = 'Ej angiven'
# The price is read from the 'no-ws-wrap' span; when that span is missing,
# .text on None raises AttributeError and the 'no-price' span is used instead.
try:
pris = annons.find('span', class_ = 'title-font no-ws-wrap').text
except AttributeError:
pris = annons.find('span', class_ = 'title-font no-price').text
f.write(f'{produktgrupp:<21}{underkategori:25}{modell:<70}{tillverkningsar:<13}{pris:>14}\n')
url_part = None
sida += 1
# Look for the pagination link whose text is the next page number; when there
# is none, find() returns None and subscripting it raises TypeError, which
# ends this sub-category.
try:
url_part = links.find('a', text = f'{sida}')['href']
except TypeError:
print(f'Avläsning av underkategori klar.')
break
url = f'{url_main}{url_part}'
As you loop over the listings, you can test whether the calendar icon class is present; if it is, grab its next_sibling.
import requests
from bs4 import BeautifulSoup as bs
# Fetch one listings page and print, for every listing, the node that directly
# follows its calendar icon (or a placeholder when the icon is absent).
r = requests.get('https://www.mascus.se/entreprenadmaskiner/begagnade-pneumatiska-hjulvaltar')
soup = bs(r.content, 'lxml')
listings = soup.select('.single-result')
for listing in listings:
    calendar = listing.select_one('.fa-calendar')
    if calendar is None:
        print('Not present')
        continue
    print(calendar.next_sibling)

how to scrape texts from votesmart via beautifulsoup

I am trying to scrape some statements made by U.S. politicians on votesmart.org
The code runs, but I am getting errors when extracting the texts.
The code that I am using is as follow:
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import os
def main():
    """Scrape public statements for every current senator not yet in ``corpus``.

    Loads the current-legislators CSV, keeps senators that have a votesmart_id,
    drops those whose full name already appears as an entry in the ``corpus``
    directory, shuffles the remainder and scrapes each one.
    """
    legislators = pd.read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
    senators = legislators[legislators.type == 'sen']
    senators = senators[~senators.votesmart_id.isna()]
    already_done = os.listdir('corpus')
    print("{} senators".format(len(senators)))
    pending = senators[~senators.full_name.isin(already_done)]
    print("{} after some already done".format(len(pending)))
    # sample(frac=1) returns all rows in random order
    pending = pending.sample(frac=1)
    pending.apply(scrape_politician_speeches, axis=1)
def scrape_politician_speeches(row):
    """Scrape every public statement listed for one legislator.

    Args:
        row: a record exposing ``full_name`` and ``votesmart_id``.

    Walks the paginated statement list (statements from 2020/01/01 on) and
    calls ``scrape_speech`` for every link found in the list's table body.
    """
    print('Scraping {}...'.format(row.full_name))
    vs_url = 'https://justfacts.votesmart.org/candidate/public-statements/{}'.format(int(row.votesmart_id))
    # The response's URL is used as the base for the paged list URLs below
    vs_page = requests.get(vs_url)
    n_pages = 1
    page_num = 1
    while page_num <= n_pages:
        print("\tPage {} of {}".format(page_num, n_pages))
        speeches_url = vs_page.url + '/?s=date&start=2020/01/01&end=&p={}'.format(page_num)
        speeches_page = requests.get(speeches_url)
        soup = BeautifulSoup(speeches_page.content, features="lxml")

        # Prefer the body of the statements table; if the table id is not
        # present, fall back to the first <tbody> anywhere on the page.
        speech_table = soup.find('table', {'id': 'statementsObjectsTables'})
        container = speech_table if speech_table is not None else soup
        table_body = container.find('tbody')
        if table_body is None:
            # No table body at all: nothing to scrape for this legislator
            print("\tNo statements table found")
            break

        for link in table_body.find_all('a', href=True):
            scrape_speech(person=row.full_name, speech_url=link.get('href'))

        # The last word of the <h7> element's text is read as the page count
        try:
            n_pages = int(soup.find('h7').text.split()[-1])
        except (AttributeError, IndexError, ValueError):
            print("\tNo page numbers")
        page_num += 1
        sleep(1)
def scrape_speech(person, speech_url):
    """Download one statement page and save its text under ``corpus/<person>/``.

    Args:
        person: name of the directory (inside ``corpus``) the file goes into.
        speech_url: URL of the statement page.

    Any failure (request error, missing page element, file error) is reported
    with a one-line message and the statement is skipped.
    """
    try:
        # exist_ok avoids a separate existence check before creating the folder
        os.makedirs('corpus/{}'.format(person), exist_ok=True)
        speech_page = requests.get(speech_url)
        soup = BeautifulSoup(speech_page.content, features="lxml")
        title = soup.find('h3').text
        date = soup.find('span', {'itemprop': 'datePublished'}).text
        location = soup.find('span', {'itemprop': 'contentLocation'}).text
        body = soup.find('div', {'class': "main clear"})
        speech_text = '\n\n'.join(p.text for p in body.find_all('p'))
        full_text = '{}\n\n\n{}'.format(title, speech_text)
        file_name = '{}, {}, {}.txt'.format(title.split(',')[0], date, location)
        file_name = file_name.replace('/', ' ')  # '/' would be read as a path separator
        # Explicit UTF-8 so the result does not depend on the platform's default encoding
        with open('corpus/{}/{}'.format(person, file_name), 'w', encoding='utf-8') as f:
            f.write(full_text)
    except Exception:
        # Exception (rather than a bare except) lets Ctrl-C and SystemExit through
        print("\tError with {}".format(speech_url))
if __name__=='__main__':
main()
The errors are looking like this:
95 senators
95 after some already done
Scraping Tammy Duckworth...
Page 1 of 1
Error with https://votesmart.org/public-statement/1570841/durbin-duckworth-announce-135-million-for-springfield-rail-improvement-project
Error with https://votesmart.org/public-statement/1570825/durbin-duckworth-statement-on-nomination-of-ladon-reynolds-to-serve-as-us-marshal-for-the-northern-district-of-illinois
Error with https://votesmart.org/public-statement/1570826/durbin-duckworth-announce-16-million-in-telehealth-funding-for-illinois-health-care-providers
Thank you so much for your time and attention. I hope to learn more from this wonderful community.
scrape_speech is outdated: the pages' design has probably changed since the script was written. There is no <div class="main clear"> in the HTML, no <span itemprop="datePublished">, and so on. You need to rewrite it using current CSS selectors.

Python BeautifulSoup page drill down

I have a python script which scrapes information from an Amazon page using a list of keywords stored in a .txt file. I have almost all the information I need in the page below:
'https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords={a}'.format(a=keyword)
The bit missing is the seller info (for example: by ZETA) for which I need to drill down in all product pages as the one below:
https://www.amazon.co.uk/Stroller-Pushchair-Colours-Available-Raincover/dp/B073B2D7CL/ref=sr_1_9?keywords=Pushchair&qid=1555063828&s=gateway&sr=8-9
I guess I need a while loop inside get_data function but I'm not sure how to implement this. See below for the code:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
import datetime
from collections import deque
import logging
import csv
class AmazonScaper(object):
    """Collect Amazon UK search-result data for a list of keywords and save it as CSV.

    Each keyword is loaded in a Selenium-driven Chrome browser, the page source
    is parsed with BeautifulSoup, and one row per search result is accumulated
    in ``self.results`` until ``csv_output`` writes the rows out.
    """

    def __init__(self, keywords, output_file='example.csv', sleep=2):
        """Start the browser and queue the keywords.

        Args:
            keywords: iterable of search terms; each is inserted verbatim into
                the search URL.
            output_file: path of the CSV file that ``csv_output`` appends to.
            sleep: seconds to pause after each page load.
        """
        self.browser = webdriver.Chrome(executable_path='chromedriver.exe')  # Add path to your Chromedriver
        self.keyword_queue = deque(keywords)  # Keywords still waiting to be searched
        self.output_file = output_file
        self.sleep = sleep
        self.results = []

    def get_page(self, keyword):
        """Load the search page for ``keyword``; return its HTML, or None on failure."""
        try:
            self.browser.get('https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords={a}'.format(a=keyword))
            return self.browser.page_source
        except Exception as e:
            logging.exception(e)
            return None

    def get_soup(self, html):
        """Parse ``html`` with the lxml parser; return None when there is no HTML."""
        if html is None:
            return None
        return BeautifulSoup(html, 'lxml')

    def get_data(self, soup, keyword):
        """Extract one row per search result and append it to ``self.results``.

        Each row is stored as ``{keyword: [keyword, rank, seller, title, ASIN,
        score, reviews, prime, date]}``. Always returns 1.
        """
        try:
            # Every element carrying a data-asin attribute inside the result list
            results = soup.select('.s-result-list [data-asin]')
        except Exception as e:
            print(e)
            return 1

        for position, item in enumerate(results, start=1):
            # Errors are handled per item so that one malformed result does not
            # discard every remaining result on the page.
            try:
                title = item.find('h5').text.strip()

                try:
                    link = item.find('a', attrs={'class': 'a-link-normal a-text-normal'})
                    url = re.sub(r'/ref=.*', '', str(link['href']))
                except (TypeError, KeyError):  # no link tag, or no href on it
                    url = "None"

                # Extract the ASIN from the URL - ASIN is the breaking point to
                # filter out if the position is sponsored
                ASIN = re.sub(r'.*/dp/', '', str(url))

                # Score, e.g. <span class="a-icon-alt">4.3 out of 5 stars</span>
                score_tag = item.select_one('.a-icon-alt')
                if score_tag is None:
                    score = "None"
                else:
                    # Keep only the text before the first space
                    score = re.sub(r' .*', '', score_tag.text.strip('\n'))

                # Number of reviews; attribute selectors need both square brackets
                reviews_tag = item.select_one("[href*='#customerReviews']")
                reviews = "None" if reviews_tag is None else reviews_tag.text.strip()

                # Prime badge:
                # <i class="a-icon a-icon-prime" role="img" aria-label="Amazon Prime"></i>
                prime_tag = item.select_one('[aria-label="Amazon Prime"]')
                PRIME = "None" if prime_tag is None else prime_tag['aria-label']

                # Placeholder: the seller is not available on the search page and
                # the product-page lookup is not implemented here.
                seller = ""

                today = datetime.datetime.today().strftime("%B %d, %Y")
                data = {keyword: [keyword, str(position), seller, title, ASIN, score, reviews, PRIME, today]}
                self.results.append(data)
            except Exception as e:
                print(e)
        return 1

    def csv_output(self):
        """Append the collected rows to ``self.output_file`` as fully quoted CSV.

        The header row is written only when the file is still empty, so repeated
        runs against the same file do not insert extra header lines.
        """
        keys = ['Keyword', 'Rank', 'seller', 'Title', 'ASIN', 'Score', 'Reviews', 'Prime', 'Dates']
        print(self.results)
        with open(self.output_file, 'a', encoding='utf-8', newline='') as outputfile:
            # csv.writer escapes embedded quote characters, which manual
            # '"' + field + '"' concatenation does not.
            writer = csv.writer(outputfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
            if outputfile.tell() == 0:  # append mode starts at end of file; 0 means empty
                writer.writerow(keys)
            for item in self.results:
                for value in item.values():
                    print(".".join(value))
                    writer.writerow(value)

    def run_crawler(self):
        """Process every queued keyword, then write the CSV.

        The browser is left open afterwards (no quit call is made here).
        """
        while self.keyword_queue:  # If we have keywords to check
            keyword = self.keyword_queue.popleft()  # We grab a keyword from the left of the list
            html = self.get_page(keyword)
            soup = self.get_soup(html)
            time.sleep(self.sleep)  # Wait for the specified time
            if soup is not None:  # If we have soup - parse and save data
                self.get_data(soup, keyword)
        self.csv_output()  # Save the object data to csv
if __name__ == "__main__":
    # Read one keyword per line and replace spaces with '+' for the search URL.
    # The context manager closes the file as soon as the list is built.
    with open('keywords.txt') as keyword_file:
        keywords = [line.rstrip('\n').replace(' ', '+') for line in keyword_file]
    ranker = AmazonScaper(keywords)  # Create the object
    ranker.run_crawler()  # Run the rank checker
On the search page, each search item is contained in tags like:
<div data-asin="B0089TV3CS" data-index="1" class="sg-col-4-of-24 sg-col-4-of-12 sg-col-4-of-36 s-result-item sg-col-4-of-28 sg-col-4-of-16 AdHolder sg-col sg-col-4-of-20 sg-col-4-of-32" data-cel-widget="search_result_1">
Look right at the end of the above line: the data-cel-widget attribute follows a pattern shared by all search results. So you can use a regex search on that attribute of the div tags, like so:
# The BeautifulSoup method is find_all; `soup.findall` is not a search method.
search_results = soup.find_all("div", {"data-cel-widget": re.compile(r"search_result_\d")})
Now you can loop through each search result, and extract the links to the individual product pages, noting that the links are contained in tags like:
<a class="a-link-normal a-text-normal" href="/Sterling-Necklace-Infinity-Pendant-Jewellery/dp/B07BPSPD14/ref=sr_1_8?keywords=cross&qid=1555066092&s=gateway&sr=8-8">
I'm not familiar with selenium, but if I were using the requests module, I'd use it to load each product page in the loop, make a BeautifulSoup from it, and then look for the following tag, which is where the seller info is contained:
<a id="bylineInfo" class="a-link-normal" href="/ZETA/b/ref=bl_dp_s_web_1658218031?ie=UTF8&node=1658218031&field-lbr_brands_browse-bin=ZETA">ZETA</a>

Scraping with Python

I have Python code to scrape some data from TripAdvisor (ratings from the reviews). The problem is that whenever I run the code it gives me different rows and never scrapes all the web pages.
The index error that appears is:
Traceback (most recent call last):
File "C:/Users/thimios/PycharmProjects/TripadvisorScrapping/proxiro.py", line 26, in <module>
rating = soup.findAll("div", {'class': 'rating reviewItemInline'})[i]
IndexError: list index out of range
The code is the following:
from bs4 import BeautifulSoup
import os
import urllib.request
# Output CSV on the user's Desktop, opened in binary mode; the header row is written first.
file2 = open(os.path.expanduser(r"~/Desktop/TripAdviser Reviews2.csv"), "wb")
file2.write(b"Organization,Rating" + b"\n")
WebSites = [
"https://www.tripadvisor.com/Hotel_Review-g189400-d198932-Reviews-Hilton_Athens-Athens_Attica.html#REVIEWS"]
# Rows are written only while Checker equals "REVIEWS"; it is later replaced by
# the last seven characters of each next-page link.
Checker ="REVIEWS"
# looping through each site until it hits a break
for theurl in WebSites:
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
#print(soup)
while True:
# Extract ratings from the text reviews
# altarray starts as an empty string and is replaced by a list when the first
# rating is stored.
altarray = ""
# Exactly ten ratings are read by index; indexing the result list raises
# IndexError as soon as the page has fewer than ten matching <div> elements.
for i in range(0,10):
rating = soup.findAll("div", {'class': 'rating reviewItemInline'})[i]
rating1 = rating.find_all("span")[0]
# The rating is the last two characters of the second class name of the first <span>
rating2 = rating1['class'][1][-2:]
print(rating2)
if len(altarray) == 0:
altarray = [rating2]
else:
altarray.append(rating2)
#print(altarray)
#print(len(altarray))
#print(type(altarray))
# Extract Organization,
Organization1 = soup.find(attrs={'class': 'heading_name'})
Organization = Organization1.text.replace('"', ' ').replace('Review of',' ').strip()
#print(Organization)
# Loop through each review on the page
for x in range(0, 10):
Rating = altarray[x]
Rating = str(Rating)
#print(Rating)
#print(type(Rating))
Record2 = Organization + "," + Rating
if Checker == "REVIEWS":
# Characters that cannot be encoded as ASCII are dropped from the record
file2.write(bytes(Record2, encoding="ascii", errors='ignore') + b"\n")
# Elements carrying the "next page" button classes; none means this was the last page
link = soup.find_all(attrs={"class": "nav next rndBtn ui_button primary taLnk"})
#print(link)
#print(link[0])
if len(link) == 0:
break
else:
# Load and parse the next page, then continue from it
soup = BeautifulSoup(urllib.request.urlopen("http://www.tripadvisor.com" + link[0].get('href')),"html.parser")
#print(soup)
#print(Organization)
print(link[0].get('href'))
Checker = link[0].get('href')[-7:]
#print(Checker)
file2.close()
I suppose that TripAdvisor doesn't give full access to the data. Any ideas?
The error occurs when you try to access a list element by an index that does not exist.
I have run your code and it prints:
50
50
50
50
50
50
40
40
40
50
However, the way you are looping is not the most Pythonic way of doing it, and it is also susceptible to index errors.
What you can do is replace this :
for i in range(0,10):
rating = soup.findAll("div", {'class': 'rating reviewItemInline'})[i]
with :
for rating in soup.findAll("div", {'class': 'rating reviewItemInline'}) :
This should resolve the error as well.

Categories