import requests as r
from bs4 import BeautifulSoup as bs
url=r.get("https://www.consumerreports.org/cro/coffee-makers.htm")
soup=bs(url.content)
product=soup.find('div',class_="row product-type-container")
pclass=product.find('div',class_="product-type-item col-xs-4")
pname=pclass.find('div',class_="product-type-info-container").h3.text
print(pname)
I am scraping product names and details, but I can only scrape one product at a time. How can I scrape all of them?
To get the titles of all products in all categories, you can use the following example:
import requests
from bs4 import BeautifulSoup
def get_products(url):
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    out = []
    for title in soup.select(".crux-component-title"):
        out.append(title.get_text(strip=True))
    return out

url = "https://www.consumerreports.org/cro/coffee-makers.htm"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

all_data = []
for category_link in soup.select("h3.crux-product-title a"):
    u = "https://www.consumerreports.org" + category_link["href"]
    print("Getting {}".format(u))
    all_data.extend(get_products(u))

for i, title in enumerate(all_data, 1):
    print("{:<5} {}".format(i, title))
Prints:
1 Bella 14755 with Brew Strength Selector
2 Bella Pro Series 90061
3 Betty Crocker 12-cup Stainless Steel BC-2809CB
4 Black+Decker 12-cup Programmable CM1331S
5 Black+Decker 12-Cup Thermal Programmable CM2046S
6 Black+Decker CM2036S 12-cup Thermal
7 Black+Decker CM4000S
8 Black+Decker DLX1050B
9 Black+Decker Even Stream CM2035B
10 Black+Decker Honeycomb Collection CM1251W
11 Black+Decker Programmable CM1331BS (Walmart Exclusive)
12 Bonavita BV1901TS 8-Cup One-Touch
13 Braun Brew Sense KF7150BK
14 Braun BrewSense 12-cup Programmable KF7150
15 Braun BrewSense 12-cup Programmable KF7000BK
...and so on.
Why is that: find(...) returns only the first element that matches your criteria.
Solution: use the find_all(...) method instead.
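For example, here is a minimal sketch against the question's own markup (class names are copied from the original code, so adjust them if the page has changed):

import requests
from bs4 import BeautifulSoup

url = "https://www.consumerreports.org/cro/coffee-makers.htm"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# find_all returns every matching container, not just the first one
for item in soup.find_all("div", class_="product-type-item col-xs-4"):
    info = item.find("div", class_="product-type-info-container")
    if info and info.h3:
        print(info.h3.get_text(strip=True))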
I'm trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly. I'm new to web scraping. I hope someone can help me find the problem.
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
#print(soup)
film_in= soup.find('tbody').findAll('tr')
#print(film_in)
film = film_in[0]
#print(film)
titre = film.find("a",{'title':'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'})
print(titre.text)
rang = film.find("td",{'class':'ratingColumn imdbRating'}).find('strong').text
#print(rang)
def remove_parentheses(string):
    return string.replace("(", "").replace(")", "")
année = film.find("span",{'class':'secondaryInfo'}).text
#print(année)
imdb =[]
for films in film_in:
    titre = film.find("a", {'title': 'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'})
    rang = film.find("td", {'class': 'ratingColumn imdbRating'}).find('strong').text
    année = remove_parentheses(film.find("span", {'class': 'secondaryInfo'}).text)
    dictionnaire = {'film': film,
                    'rang': rang,
                    'année': année
                    }
    imdb.append(dictionnaire)
df_imdb = pd.DataFrame(imdb)
print(df_imdb)
I'm trying to extract data from a site and then create a DataFrame out of it. The program doesn't work properly, and I need to solve it using urllib; is there a way? Thanks in advance.
I'm new to web scraping.
You can try the following example:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import pandas as pd
url = 'https://www.imdb.com/chart/top/?sort=rk,asc&mode=simple&page=1'
# Alternatively: soup = BeautifulSoup(requests.get(url).text, 'html.parser')
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
imdb = []
film_in = soup.select('table[class="chart full-width"] tr')
for film in film_in[1:]:
    titre = film.select_one('.titleColumn a').get_text(strip=True)
    rang = film.select_one('[class="ratingColumn imdbRating"] > strong').text
    année = film.find("span", {'class': 'secondaryInfo'}).get_text(strip=True)
    dictionnaire = {'titre': titre,
                    'rang': rang,
                    'année': année
                    }
    imdb.append(dictionnaire)
df_imdb = pd.DataFrame(imdb)
print(df_imdb)
Output:
titre rang année
0 The Shawshank Redemption 9.2 (1994)
1 The Godfather 9.2 (1972)
2 The Dark Knight 9.0 (2008)
3 The Godfather Part II 9.0 (1974)
4 12 Angry Men 9.0 (1957)
.. ... ... ...
245 Dersu Uzala 8.0 (1975)
246 Aladdin 8.0 (1992)
247 The Help 8.0 (2011)
248 The Iron Giant 8.0 (1999)
249 Gandhi 8.0 (1982)
[250 rows x 3 columns]
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers= {'User-Agent': 'Mozilla/5.0'}
#put all item in this array
response = requests.get('http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh')
soup = BeautifulSoup(response.content, 'html.parser')
table=soup.find_all('table', class_='expo-table general-color')
for row in table:
    for up in row.find_all('td'):
        text_list = [text for text in up.stripped_strings]
        print(text_list)
This code works and gets me the correct output, but not in the format shown below. I want the output in this format; can you help me?
Indirizzo Bliedinghauserstrasse 27
Città Remscheid
Nazionalità Germania
Sito web www.amannesmann.de
Stand Pad. 3 E14 F11
Telefono +492191989-0
Fax +492191989-201
E-mail sales#mannesmann.de
Membro di Cecimo
Social
pandas has a built-in HTML table scraper, so you can run:
df = pd.read_html('http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh')
This returns a list of all tables on the page as DataFrames; you can access your data with df[0]:
             0                          1
0    Indirizzo   Bliedinghauserstrasse 27
1        Città                  Remscheid
2  Nazionalità                   Germania
3     Sito web         www.amannesmann.de
4        Stand             Pad. 3 E14 F11
5     Telefono               +492191989-0
6          Fax             +492191989-201
7       E-mail        sales#mannesmann.de
8    Membro di                        nan
9       Social                        nan
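For completeness, a minimal sketch of that approach (assuming a parser such as lxml is installed, which read_html needs):

import pandas as pd

url = "http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh"
# read_html returns a list with one DataFrame per <table> found on the page
tables = pd.read_html(url)
df = tables[0]  # the exhibitor details table
print(df)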
You can use the .get_text() method to extract the text, with strip=True to remove surrounding whitespace and separator=" " to put a space between the strings:
# here `table` should be a single Tag, e.g. table = soup.find('table', class_='expo-table general-color')
data = table.find_all("tr")
for i in data:
    print(i.get_text(strip=True, separator=" "))
Output:
Indirizzo Bliedinghauserstrasse 27
Città Remscheid
...
Instead of selecting <td>, select <tr> and use .stripped_strings on it to get the row-wise data, then append the rows to the DataFrame.
Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers= {'User-Agent': 'Mozilla/5.0'}
#put all item in this array
temp = []
response = requests.get('http://smartcatalog.emo-milano.com/it/espositore/a-mannesmann-maschinenfabrik-gmbh')
soup = BeautifulSoup(response.content, 'html.parser')
table=soup.find_all('table', class_='expo-table general-color')
for row in table:
    for up in row.find_all('tr'):
        temp.append([text for text in up.stripped_strings])
df = pd.DataFrame(temp)
print(df)
0 1
0 Indirizzo Bliedinghauserstrasse 27
1 Città Remscheid
2 Nazionalità Germania
3 Sito web www.amannesmann.de
4 Stand Pad. 3 E14 F11
5 Telefono +492191989-0
6 Fax +492191989-201
7 E-mail sales#mannesmann.de
8 Membro di None
9 Social None
I want to get all the products on this page:
nike.com.br/snkrs#estoque
My python code is this:
produtos = []
def aviso():
    print("Started!")
    request = requests.get("https://www.nike.com.br/snkrs#estoque")
    soup = bs4(request.text, "html.parser")
    links = soup.find_all("a", class_="btn", text="Comprar")
    links_filtred = list(set(links))
    for link in links_filtred:
        if(produto not in produtos):
            request = requests.get(f"{link['href']}")
            soup = bs4(request.text, "html.parser")
            produto = soup.find("div", class_="nome-preco-produto").get_text()
            if(code_formated == ""):
                code_formated = "\u200b"
            print(f"Nome: {produto} Link: {link['href']}\n")
            produtos.append(link["href"])
aviso()
Guys, this code gets products from the page, but not all of them; I suspect the content is dynamic. How can I get them all with requests and BeautifulSoup? I don't want to use Selenium or an automation library, and I don't want to change my code much because it's almost done. How do I do that?
DO NOT USE a bare requests.get for repeated requests to the same HOST; reuse a Session instead.
Reason: read-that
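The idea is that a Session reuses the underlying connection (and cookies) across requests to the same host instead of opening a new one each time; a minimal sketch:

import requests

with requests.Session() as session:
    for page in range(1, 6):
        # all five requests share one pooled connection to the host
        r = session.get("https://www.nike.com.br/Snkrs/Feed",
                        params={"p": page, "demanda": "true"})
        print(r.status_code, r.url)

The full example below applies the same Session to the paginated feed: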
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(url):
    allin = []
    with requests.Session() as req:
        for page in range(1, 6):
            params = {
                'p': page,
                'demanda': 'true'
            }
            r = req.get(url, params=params)
            soup = BeautifulSoup(r.text, 'lxml')
            goal = [(x.find_next('h2').get_text(strip=True, separator=" "), x['href'])
                    for x in soup.select('.aspect-radio-box')]
            allin.extend(goal)
    df = pd.DataFrame(allin, columns=['Title', 'Url'])
    print(df)
main('https://www.nike.com.br/Snkrs/Feed')
Output:
Title Url
0 Dunk High x Fragment design Black https://www.nike.com.br/dunk-high-x-fragment-d...
1 Dunk Low Infantil (16-26) City Market https://www.nike.com.br/dunk-low-infantil-16-2...
2 ISPA Flow 2020 Desert Sand https://www.nike.com.br/ispa-flow-2020-153-169...
3 ISPA Flow 2020 Pure Platinum https://www.nike.com.br/ispa-flow-2020-153-169...
4 Nike iSPA Men's Lightweight Packable Jacket https://www.nike.com.br/nike-ispa-153-169-211-...
.. ... ...
115 Air Jordan 1 Mid Hyper Royal https://www.nike.com.br/air-jordan-1-mid-153-1...
116 Dunk High Orange Blaze https://www.nike.com.br/dunk-high-153-169-211-...
117 Air Jordan 5 Stealth https://www.nike.com.br/air-jordan-5-153-169-2...
118 Air Jordan 3 Midnight Navy https://www.nike.com.br/air-jordan-3-153-169-2...
119 Air Max 90 Bacon https://www.nike.com.br/air-max-90-153-169-211...
[120 rows x 2 columns]
To get the data you can send a request to:
https://www.nike.com.br/Snkrs/Estoque?p=<PAGE>&demanda=true
where <PAGE> is a page number between 1 and 5 supplied to the p= parameter in the URL.
For example, to print the links, you can try:
import requests
from bs4 import BeautifulSoup
url = "https://www.nike.com.br/Snkrs/Estoque?p={page}&demanda=true"
for page in range(1, 6):
    response = requests.get(url.format(page=page))
    soup = BeautifulSoup(response.content, "html.parser")
    print(soup.find_all("a", class_="btn", text="Comprar"))
I am having an inconsistent issue that is driving me crazy. I am trying to scrape data about rental units. Let's say we have a webpage with 42 ads; the code works just fine for only 19 ads and then returns:
Traceback (most recent call last):
File "main.py", line 53, in <module>
title = real_state_title.div.h1.text.strip()
AttributeError: 'NoneType' object has no attribute 'div'
If I start the code from a different ad number, say 5, it still processes only the first 19 ads and then raises the same error!
Here is a minimal example that shows the issue. Please note that this code prints the HTML for a functioning ad and also for the one with the error; the two printouts are very different.
Run the code then change the value of i to see the results.
from bs4 import BeautifulSoup as soup # HTML data structure
from urllib.request import urlopen as uReq # Web client
import traceback
page_url = "https://www.kijiji.ca/b-apartments-condos/saint-john/c37l80017?ll=45.273315%2C-66.063308&address=Saint+John%2C+NB&ad=offering&radius=20.0"
# opens the connection and downloads html page from url
uClient = uReq(page_url)
# parses html into a soup data structure to traverse html
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
# finds each ad from Kijiji web page
containers = page_soup.findAll('div', {'class': 'clearfix'})
# Print the number of ads in this web page
print(f'Number of ads in this web page is {len(containers)}')
print_functioning_ad = True
# Loop throw ads
i = 1 # change to start from a different ad (don't put zero)
for container in containers[i:]:
    print(f'Ad No.: {i}\n')
    i += 1
    # Get the link for this specific ad
    ad_link_container = container.find('div', {'class': 'title'})
    ad_link = 'https://kijiji.ca' + ad_link_container.a['href']
    print(ad_link)
    single_ad = uReq(ad_link)
    # parses html into a soup data structure to traverse html
    page_soup2 = soup(single_ad.read(), "html.parser")
    single_ad.close()
    # Title
    real_state_title = page_soup2.find('div', {'class': 'realEstateTitle-1440881021'})
    # Print one functioning ad html
    if print_functioning_ad:
        print_functioning_ad = False
        print(page_soup2)
    print('real state title type', type(real_state_title))
    try:
        title = real_state_title.div.h1.text.strip()
        print(title)
    except Exception:
        print(traceback.format_exc())
        print(page_soup2)
        break
    print('____________________________________________________________')
Edit 1:
In my simple example I want to loop through each ad in the provided link, open it, and get the title. In my actual code I am not only getting the title but also every other piece of info about the ad, so I need to load the data from the link associated with every ad. My code actually does that, but for an unknown reason it only works for 19 ads, regardless of which ad I start with. This is driving me nuts!
To get all pages from the URL you can use the following example:
import requests
from bs4 import BeautifulSoup
page_url = "https://www.kijiji.ca/b-apartments-condos/saint-john/c37l80017?ll=45.273315%2C-66.063308&address=Saint+John%2C+NB&ad=offering&radius=20.0"
page = 1
while True:
print("Page {}...".format(page))
print("-" * 80)
soup = BeautifulSoup(requests.get(page_url).content, "html.parser")
for i, a in enumerate(soup.select("a.title"), 1):
print(i, a.get_text(strip=True))
next_url = soup.select_one('a[title="Next"]')
if not next_url:
break
print()
page += 1
page_url = "https://www.kijiji.ca" + next_url["href"]
Prints:
Page 1...
--------------------------------------------------------------------------------
1 Spacious One Bedroom Apartment
2 3 Bedroom Quispamsis
3 Uptown-two-bedroom apartment for rent - all-inclusive
4 New Construction!! Large 2 Bedroom Executive Apt
5 LARGE 1 BEDROOM UPTOWN $850 HEAT INCLUDED AVAIABLE JULY 1
6 84 Wright St Apt 2
7 310 Woodward Ave (Brentwood Tower) Condo #1502
...
Page 5...
--------------------------------------------------------------------------------
1 U02 - CHFR - Cozy 1 Bedroom + Den - WEST SAINT JOHN
2 2+ Bedroom Historic Renovated Stainless Kitchen
3 2 Bedroom Apartment - 343 Prince Street West
4 2 Bedroom 5th Floor Loft Apartment in South End Saint John
5 Bay of Fundy view from luxury 5th floor 1 bedroom + den suite
6 Suites of The Atlantic - Renting for Fall 2021: 2 bedrooms
7 WOODWARD GARDENS//2 BR/$945 + LIGHTS//MAY//MILLIDGEVILLE//JULY
8 HEATED & SMOKE FREE - Bach & 1Bd Apt - 50% off 1st month's rent
9 Beautiful 2 bedroom apartment in Millidgeville
10 Spacious 2 bedroom in Uptown Saint John
11 3 bedroom apartment at Millidge Ave close to university ave
12 Big Beautiful 3 bedroom apt. in King Square
13 NEWER HARBOURVIEW SUITES UNFURNISHED OR FURNISHED /BLUE ROCK
14 Rented
15 Completely Renovated - 1 Bedroom Condo w/ small den Brentwood
16 1+1 Bedroom Apartment for rent for 2 persons
17 3 large bedroom apt. in King Street East Saint John,NB
18 Looking for a house
19 Harbour View 2 Bedroom Apartment
20 Newer Harbourview suites unfurnished or furnished /Blue Rock Ct
21 LOVELY 2 BEDROOM APARTMENT FOR LEASE 5 WOODHOLLOW PARK EAST SJ
I think I figured out the problem. It seems you can't make many requests in a short period of time, so I added a try/except block that sleeps for 80 seconds when this error occurs, and that fixed my problem!
You may want to change the sleep period to a different value depending on the website you are trying to scrape.
Here is the modified code:
from bs4 import BeautifulSoup as soup # HTML data structure
from urllib.request import urlopen as uReq # Web client
import traceback
import time
page_url = "https://www.kijiji.ca/b-apartments-condos/saint-john/c37l80017?ll=45.273315%2C-66.063308&address=Saint+John%2C+NB&ad=offering&radius=20.0"
# opens the connection and downloads html page from url
uClient = uReq(page_url)
# parses html into a soup data structure to traverse html
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
# finds each ad from Kijiji web page
containers = page_soup.findAll('div', {'class': 'clearfix'})
# Print the number of ads in this web page
print(f'Number of ads in this web page is {len(containers)}')
print_functioning_ad = True
# Loop throw ads
i = 1 # change to start from a different ad (don't put zero)
for container in containers[i:]:
    print(f'Ad No.: {i}\n')
    i = i + 1
    # Get the link for this specific ad
    ad_link_container = container.find('div', {'class': 'title'})
    ad_link = 'https://kijiji.ca' + ad_link_container.a['href']
    print(ad_link)
    single_ad = uReq(ad_link)
    # parses html into a soup data structure to traverse html
    page_soup2 = soup(single_ad.read(), "html.parser")
    single_ad.close()
    # Title
    real_state_title = page_soup2.find('div', {'class': 'realEstateTitle-1440881021'})
    try:
        title = real_state_title.div.h1.text.strip()
        print(title)
    except AttributeError:
        print(traceback.format_exc())
        i = i - 1
        t = 80
        print(f'----------------------------Sleep for {t} seconds!')
        time.sleep(t)
        continue
    print('____________________________________________________________')
I am trying to extract the star rating of each review in a dataframe for sentiment analysis.
https://www.mouthshut.com/product-reviews/Kotak-811-Mobile-Banking-reviews-925917218
This is the webpage I am trying to scrape. I am fairly new to web scraping, so I prefer BeautifulSoup as it is easier to understand.
import requests
from bs4 import BeautifulSoup
import pandas as pd
URL = ""
Final = []
for x in range(0, 8):
    if x == 1:
        URL = "https://www.mouthshut.com/product-reviews/Kotak-811-Mobile-Banking-reviews-925917218"
    else:
        URL = "https://www.mouthshut.com/product-reviews/Kotak-811-Mobile-Banking-reviews-925917218-page-{}".format(x)
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html.parser')
    reviews = []  # a list to store reviews
    # Use a CSS selector to extract all the review containers
    review_divs = soup.select('div.col-10.review')
    for element in review_divs:
        review = {'Review_Title': element.a.text, 'URL': element.a['href'], 'Review': element.find('div', {'class': ['more', 'reviewdata']}).text.strip()}
        reviews.append(review)
    Final.extend(reviews)
df = pd.DataFrame(Final)
I would really appreciate the help.
Thank You
You may add the following entry to your review dictionary to get the number of stars given, which sit under class="rating":
'Stars' : len(element.find('div', "rating").findAll("i", "rated-star"))
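For context, here is a sketch of how that entry slots into the question's loop (the "rating" and "rated-star" class names are taken from the line above):

for element in review_divs:
    review = {
        'Review_Title': element.a.text,
        'URL': element.a['href'],
        'Review': element.find('div', {'class': ['more', 'reviewdata']}).text.strip(),
        # count the filled-in star icons inside the rating container
        'Stars': len(element.find('div', 'rating').findAll('i', 'rated-star')),
    }
    reviews.append(review)

The resulting DataFrame then includes the Stars column: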
Review_Title ... Stars
0 Why need permission for contact, gallery ... 1
1 Very dull marketing for open account ... 1
2 Worst bank ... 1
3 Good interface & can be easily accessible ... 3
4 Best digital Bank account ... 4
5 Better account for everyone ... 4
6 Feature full Mobile banking ... 5
7 Very good bank ... 4
8 Above average online banking experience ... 3
...