# x, go and input_a (the loop index, result count and search term) are assumed
# to be defined elsewhere in the scraper
from requests import get
from bs4 import BeautifulSoup

while x < go:
    url = "https://www.shoppingwesbite.com/search?=product" + input_a
    headers = {'User-Agent': 'my user agent here'}
    ok = get(url, headers=headers)
    data = BeautifulSoup(ok.content, 'html.parser')
    price = data.find_all('div', {"class": "css-rey619"})[x].get_text()
    title = data.find_all('div', {"class": "css-398hol"})[x].get_text()
    reviews = data.find_all('span', {'class': 'css-402phy'})[x].get_text()
I have included this piece of code from my web scraper; it essentially just pulls the first 10 results on a shopping website for a product entered by the user. Most of the time it works, but sometimes it returns an error saying the index is not callable for the "reviews" variable, I think because it's trying to pull a review for a product that doesn't have a review yet. I don't know how to get around this and would appreciate any suggestions or ideas on what I could try. I was thinking of writing a logic statement that checks whether the listing has a review and outputs it if it does, but I don't know how to achieve this. Thanks!
You can check the length of the reviews; if it is zero, you got an empty review.
for i in range(len(reviews)):
    if len(reviews[i]) == 0:
        print("you got empty review now you can easily remove it")
I've been trying to code a program in Python that returns a list of all the product names on the first page. I have a function that builds the URL based on what you want to search:
def get_url(search_term):
    template = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_1'
    search_term = search_term.replace(' ', '+')
    url = template.format(search_term)
    print(url)
    return url
Then I pass the URL into another function and here is where I need help. Right now my function to retrieve the title and number of reviews is this:
from requests_html import HTMLSession

def getInfo(url):
    r = HTMLSession().get(url)
    r.html.render()
    product = {
        # the adjacent string literals join into the single CSS selector
        # '.a-size-medium.a-color-base.a-text-normal'
        'title': r.html.find('.a-size-medium' '.a-color-base' '.a-text-normal', first=True).text,
        'reviews': r.html.find('.a-size-base', first=True).text
    }
    print(product)
However, the r.html.find part isn't getting the info I need: it either returns [] or, if I add first=True, None. I've tried different ways, like using an XPath or a CSS selector, but none of those seemed to work. Can anyone help me find a way to use the html.find method to find all the product names and save them under 'title' in the product dictionary?
I have a problem with the web scraping code below. The code works, but if the entered product is not just a single word, for example "Playstation 4", it fails. The problem seems to be in the line if product in str(product_name):
I tried many different variations, like product_name.text or product_name.string, but it won't correctly check whether the string product is in the converted object product_name if it is more than one word.
If I use print(product_name.text) I get exactly the result I would expect, so why can't I use the if-in statement correctly with product_name.text or str(product_name)?
import requests
from bs4 import BeautifulSoup

product = input("Please enter product: ")
URL = "http://www.somewebsite.com/search?sSearch=" + product
website = requests.get(URL)
html = BeautifulSoup(website.text, 'html.parser')
product_info = html.find_all('div', class_="product--main")
product_array = []
for product_details in product_info:
    product_name = product_details.find('a', class_="product--title product--title-desktop")
    if product in str(product_name):
        product_array.append(product_name.text.replace('\n', '') + '; ')
        discounted_price = product_details.find('span', class_="price--default is--discount")
        if discounted_price:
            product_array.append(discounted_price.text.replace('\n', '').replace('\xa0€*', '').replace('from', '') + ';\n')
        else:
            regular_price = product_details.find('span', class_="price--default")
            product_array.append(regular_price.text.replace('\n', '').replace('\xa0€*', '').replace('from', '') + ';\n' if regular_price else 'N/A;\n')
with open("data.csv", "w") as text_file:
    text_file.write("product; price;\n")
    for entry in product_array:
        text_file.write(entry)
Why should I use urlencode?
I tried many different variations like product_name.text or product_name.string, but it won't correctly check if the string product is in the converted object product_name if it is not just one word.
URL = "http://www.somewebsite.com/search?sSearch=" + product
Please look at what happens to the query string when you use concatenation:
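For instance, with the question's "Playstation 4" input:

product = "Playstation 4"
URL = "http://www.somewebsite.com/search?sSearch=" + product
print(URL)
# http://www.somewebsite.com/search?sSearch=Playstation 4
# a URL must not contain a raw space; it has to be percent-encoded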
So please consider updating your code like below:
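A minimal sketch of the suggested change, using urllib.parse.urlencode; only the URL construction changes, the rest of the scraping logic stays the same:

import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup

product = input("Please enter product: ")
# urlencode percent-encodes the value, e.g. 'Playstation 4' -> 'sSearch=Playstation+4'
URL = "http://www.somewebsite.com/search?" + urlencode({'sSearch': product})
website = requests.get(URL)
html = BeautifulSoup(website.text, 'html.parser')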
I'm trying to make a function that scrapes book names from Goodreads using Python and BeautifulSoup.
I've realized some Goodreads pages share a common URL of the form
"https://www.goodreads.com/shelf/show/" + category_name + "?page=" + page_number, so I've made a function that receives a category name and a max page range in order to iterate from page 1 to max_pages.
The problem is that on every iteration the program doesn't advance to the next page; instead it gets the first (default) page of the category. I've tried providing the full URL, for example https://www.goodreads.com/shelf/show/art?page=2, but it still doesn't work, so I'm guessing BeautifulSoup might convert the URL I'm passing into another format that isn't working, but I don't know.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def scrape_category(category_name, search_range):
    book_names = []
    for i in range(search_range):
        quote_page = "https://www.goodreads.com/shelf/show/" + category_name + "?page=" + str(i + 1)
        page = urlopen(quote_page)
        soup = BeautifulSoup(page, 'lxml')
        names = soup.find_all('a', attrs={"class": 'bookTitle'})
        for name in names:
            book_name = name.text
            book_name = re.sub(r'\"', '', book_name)
            book_names.append(book_name)
    return book_names
The result of this code is always the book names from the first page of the category I pass as a parameter, never the second, third, ... or nth page from range 1 to max_pages that I'm requesting.
I see the same books when I enter https://www.goodreads.com/shelf/show/art?page=2 and https://www.goodreads.com/shelf/show/art?page=15 in my browser. This is not a problem with BeautifulSoup; this is just how this site was built.
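A quick, illustrative way to confirm that is to compare the titles parsed from two different page values (the shelf_titles helper here is made up for the demonstration):

from urllib.request import urlopen
from bs4 import BeautifulSoup

def shelf_titles(url):
    # parse the book titles from one shelf page
    soup = BeautifulSoup(urlopen(url), 'lxml')
    return [a.text for a in soup.find_all('a', attrs={'class': 'bookTitle'})]

page2 = shelf_titles("https://www.goodreads.com/shelf/show/art?page=2")
page15 = shelf_titles("https://www.goodreads.com/shelf/show/art?page=15")
print(page2 == page15)  # True if the server really ignores the page parameter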
I am trying to scrape multiple pages using BeautifulSoup, but I am only getting the last page's results as output. Please suggest the right way. Below is my code.
from time import sleep, time
from random import randint
from requests import get
from bs4 import BeautifulSoup
from IPython.display import clear_output
import pandas as pd

requests = 0  # request counter
start_time = time()
names, orgs, Years = [], [], []

# For every page
for page in range(0, 8):
    # Make a get request
    response = get('http://nationalacademyhr.org/fellowsdirectory?page=0%2C{}' + format(page))
    # Pause the loop
    sleep(randint(8, 15))
    # Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(requests, requests / elapsed_time))
    clear_output(wait=True)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    all_table_info = html_soup.find('table', class_="views-table cols-4")
    for name in all_table_info.find_all('div', class_="views-field views-field-view"):
        names.append(name.text.replace("\n", " ") if name.text else None)
    for organization in all_table_info.find_all('td', class_="views-field views-field-field-employer"):
        orgs.append(organization.text.strip() if organization.text else None)
    for year in all_table_info.find_all('td', class_="views-field views-field-view-2"):
        Years.append(year.text.strip() if year.text else None)

df = pd.DataFrame({'Name': names, 'Org': orgs, 'year': Years})
print(df)
There is a typo: a plus instead of a dot. You need
'http://nati...ge=0%2C{}'.format(page)
but you wrote
'http://nati...ge=0%2C{}' + format(page)
URLs that keep the literal braces before the page number all end up at the same page.
EDIT:
If I was not clear, you just need to change the line
response = get('http://nationalacademyhr.org/fellowsdirectory?page=0%2C{}' + format(page))
to
response = get('http://nationalacademyhr.org/fellowsdirectory?page=0%2C{}'.format(page))
In the first case the resulting URL also contains the literal substring '{}', which causes the problem.
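To see what the concatenation actually produces (illustrative, with page 3):

url = 'http://nationalacademyhr.org/fellowsdirectory?page=0%2C{}' + format(3)
print(url)  # http://nationalacademyhr.org/fellowsdirectory?page=0%2C{}3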
Note: there are 9 pages on the site, identified by page=0,0 through page=0,8, so your loop should use range(9). Or, even better, load the first page, then get the URL for the next page from the next link, and iterate over all the pages by following the next link until there is no next link on the page; see the sketch below.
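A minimal sketch of that follow-the-next-link approach (the pager selector is an assumption; check the site's actual markup):

from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

url = 'http://nationalacademyhr.org/fellowsdirectory'
while url:
    response = requests.get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    # ... extract the names, organizations and years from this page as before ...
    # assumed markup: Drupal-style pagers typically render the next link as li.pager-next a
    next_link = html_soup.select_one('li.pager-next a')
    url = urljoin(url, next_link['href']) if next_link else None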
Further to xhancar's answer, which identifies the problem, a better way is to avoid string operations when building URLs and instead let requests construct the URL query string for you:
for page in range(9):
    params = {'page': '0,{}'.format(page)}
    response = get('http://nationalacademyhr.org/fellowsdirectory', params=params)
The params argument is passed to requests.get(), which adds the values to the URL query string. The query parameters will be properly encoded, e.g. the ',' replaced with %2C.
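You can confirm the final URL that requests built by printing it (illustrative output):

print(response.url)
# http://nationalacademyhr.org/fellowsdirectory?page=0%2C8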
I'm using the following code to obtain a list of a user's followers on twitter:
import urllib
from BeautifulSoup import BeautifulSoup

# code only looks at one page of followers instead of continuing to all of a user's followers
# decided to only use a small sample
site = "http://mobile.twitter.com/NYTimesKrugman/following"
friends = set()
response = urllib.urlopen(site)
html = response.read()
soup = BeautifulSoup(html)
names = soup.findAll('a', {'href': True})
for name in names:
    a = name.renderContents()
    b = a.lower()
    if ("http://mobile.twitter.com/" + b) == name['href']:
        c = str(b)
        friends.add(c)
for friend in friends:
    print friend
print ("Done!")
However, I get the following results:
NYTimeskrugman
nytimesphoto
rasermus
Warning (from warnings module):
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion14.py", line 42
if ("http://mobile.twitter.com/" + b) == name['href']:
UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
amnesty_norge
zynne_
fredssenteret
oljestudentene
solistkoret
....(and so it continues)
It would appear that I was able to get most of the names of the accounts being followed, but I received a somewhat random error. It didn't stop the code from finishing, however. I was hoping someone could enlighten me as to what happened?
Don't know if my answer will be useful several years later, but I rewrote your code using requests instead of urllib.
I think it's better to make another selection with the class "username" to consider only follower names!
Here's the stuff:
import requests
from bs4 import BeautifulSoup

site = "http://mobile.twitter.com/paulkrugman/followers"
friends = set()
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
names = soup.findAll('a', {'href': True})
for name in names:
    pseudo = name.find("span", {"class": "username"})
    if pseudo:
        pseudo = pseudo.get_text()
        friends.add(pseudo)
for friend in friends:
    print(friend)
print("Done !")

#paulkrugman appears in every set, so don't forget to delete it !
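For example (one illustrative way), it can be dropped from the set before iterating:

friends.discard("paulkrugman")  # remove the scraped account's own handle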