web scraping for more pages - python

Currently I am working on scraping a website from which I want the data that is loaded automatically as the page scrolls. I am using BeautifulSoup and requests.
import requests
from bs4 import BeautifulSoup

page = requests.get("https://www.monki.com/en/newin/view-all-new.html")
soup = BeautifulSoup(page.content, 'html.parser')

article_codes = []
for k in soup.findAll('div', attrs={"class": "producttile-details"}):
    article_code = k.find('span', attrs={'class': "articleCode"})
    print(article_code)
    article_codes.append(article_code.text)
With this code I only get the data of the first page, but I want all the data that gets loaded afterwards.

The page is using JavaScript to load the additional pages. You can use the requests module to simulate those requests.
For example:
import requests
from bs4 import BeautifulSoup

url = 'https://www.monki.com/en_eur/newin/view-all-new/_jcr_content/productlisting.products.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
}

with requests.session() as s:
    # initial request to obtain session cookies
    s.get('https://www.monki.com/en_eur/newin/view-all-new.html', headers=headers)
    for page in range(0, 10):  # <-- adjust to required number of pages
        soup = BeautifulSoup(s.get(url, params={'offset': page * 28}, headers=headers).content, 'html.parser')
        for product in soup.select('.o-product'):
            name = product.select_one('.product-name').get_text(strip=True)
            price = product.select_one('.price-tag').get_text(strip=True)
            link = product.select_one('.a-link')['href']
            print('{:<50} {:<10} {}'.format(name, price, link))
Prints all products:
NEW! Maxi smock dress €30 https://www.monki.com/en_eur/clothing/dresses/midi-dresses/product.midi-button-up-shirt-dress-black.0871799004.html
NEW! Retro skater dress €20 https://www.monki.com/en_eur/clothing/dresses/mini-dresses/product.retro-skater-dress-white.0688447029.html
NEW! Mozik block jeans €40 https://www.monki.com/en_eur/clothing/jeans/product.mozik-block-jeans-blue.0874088001.html
NEW! Pack of two scrunchies €6 https://www.monki.com/en_eur/accessories/hair-accessories/product.pack-of-two-scrunchies-beige.0530296078.html
NEW! Mini hand bag €18 https://www.monki.com/en_eur/accessories/bags,-wallets-belts/bags/product.mini-hand-bag-black.0826291006.html
NEW! Fitted crop top €10 https://www.monki.com/en_eur/clothing/tops/t-shirts/product.fitted-crop-top-purple.0906440002.html
NEW! Tiered smock dress €30 https://www.monki.com/en_eur/clothing/dresses/midi-dresses/product.tiered-smock-dress-blue.0895277004.html
NEW! Mini hand bag €18 https://www.monki.com/en_eur/accessories/bags,-wallets-belts/bags/product.mini-hand-bag-beige.0826291008.html
NEW! Fitted t-shirt €10 https://www.monki.com/en_eur/clothing/tops/t-shirts/product.fitted-t-shirt-purple.0905746002.html
NEW! Shoulder pads t-shirt dress €25 https://www.monki.com/en_eur/clothing/dresses/mini-dresses/product.shoulder-pads-t-shirt-dress-beige.0929301002.html
NEW! Yoko mid blue jeans €40 https://www.monki.com/en_eur/clothing/jeans/product.yoko-mid-blue-jeans-blue.0656425001.html
NEW! Yoko classic blue jeans €40 https://www.monki.com/en_eur/clothing/jeans/product.yoko-classic-blue-jeans-blue.0807218001.html
NEW! Pleated midi skirt €25 https://www.monki.com/en_eur/clothing/skirts/midi-skirts/product.pleated-midi-skirt-black.0562278003.html
... and so on.
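If you don't want to hard-code the number of pages, a variation (just a sketch, assuming the endpoint simply returns empty markup past the last product) is to keep increasing the offset until a request yields no products:

import requests
from bs4 import BeautifulSoup

url = 'https://www.monki.com/en_eur/newin/view-all-new/_jcr_content/productlisting.products.html'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

with requests.session() as s:
    # initial request to obtain session cookies
    s.get('https://www.monki.com/en_eur/newin/view-all-new.html', headers=headers)
    offset = 0
    while True:
        soup = BeautifulSoup(s.get(url, params={'offset': offset}, headers=headers).content, 'html.parser')
        products = soup.select('.o-product')
        if not products:  # an empty page means we are past the last product
            break
        for product in products:
            print(product.select_one('.product-name').get_text(strip=True))
        offset += 28  # the listing endpoint serves 28 products per request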

Related

Crawling news articles with BeautifulSoup

I want to crawl maritime news from the Fleetmon.com news page, including the detail pages, and save it to a text file. I tried BeautifulSoup in Python, but it does not work properly.
import requests
from bs4 import BeautifulSoup
import pandas as pd

baseurl = 'https://www.fleetmon.com/maritime-news/'
headers = {'User-Agent': 'Mozilla/5.0'}

newslinks = []  # put all items in this list
for x in range(1):  # set page range
    response = requests.get(
        f'https://www.fleetmon.com/maritime-news/?page={x}')  # url of next page
    soup = BeautifulSoup(response.content, 'html.parser')
    newslist = soup.find_all('article')
    # loop to get all hrefs from the articles
    for item in newslist:
        for link in item.find_all('a', href=True):
            newslinks.append(link['href'])

newslinks = list(set(newslinks))
print(newslinks)

# news details pages
newsdata = []
for link in newslinks:
    print(link)
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    shipName = soup.find('div', {'class': 'uk-article-story'}).text.strip()
    fieldsets = soup.find_all('article')
    row = {'Ship Name': shipName}
    for fieldset in fieldsets:
        dts = fieldset.find_all('h1')
        for dt in dts:
            row.update({dt.text.strip(): dt.find_next('p').text.strip()})
    newsdata.append(row)

# text or csv
df = pd.DataFrame(newsdata)
df.to_csv(r'C:\Users\Usuario\Desktop\news.csv', index=False, header=True)
print(df)
Help me improve my code to get all the data in text form.
Also, is it possible to crawl the data and save it to CSV like this:
Column 1: News_title: value
Column 2: category: accidents
Column 3: publish_date_time: June 28, 2022 at 13:31
Column 4: news: full news here
Go to the details page (here I use req2 to request the details page). I've built the pagination with a for loop and the range function, so you can increase or decrease the number of pages in no time.
P.S.: If you click on any title link you can see the details page; all the required data items are scraped from the details pages.
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
url = 'https://www.fleetmon.com/maritime-news/?page={page}'

data = []
for page in range(1, 11):
    req = requests.get(url.format(page=page), headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')
    for link in soup.select('.news-headline h2 a'):
        link = 'https://www.fleetmon.com' + link.get('href')
        req2 = requests.get(link, headers=headers)
        soup2 = BeautifulSoup(req2.text, 'lxml')
        title = soup2.find('h1', class_="uk-article-title margin-t-0").text
        cat = soup2.select_one('p.uk-article-meta span a strong').text
        date = soup2.select_one('[class="uk-text-nowrap"]:nth-child(3)').text
        details = soup2.select_one('.uk-article-story').get_text(strip=True)

        data.append({
            'title': title,
            'category': cat,
            'date': date,
            'details_news': details
        })

df = pd.DataFrame(data)  # .to_csv('news.csv', index=False)
print(df)
Output:
0 Cruise ship NORWEGIAN SUN hit iceberg, damaged... ... Cruise ship NORWEGIAN SUN hit an iceberg size ...
1 Yang Ming and HMM Were Accused of Collusion to... ... YM WARRANTY by ship spotter phduck2kYM WARRANT...
2 Fire in bulk carrier cargo hold, Florida ... At around 2350 LT Jun 26 firefighters responde...
3 Chlorine gas tank fell on Chinese cargo ship, ... ... Tank with 25 tons of chlorine gas fell onto ca...
4 Heavy vehicle fell onto cargo deck during offl... ... Heavy machinery vehicle (probably mobile crane...
.. ... ...
...
195 Yara Plans 15 Ammonia Bunkering Terminals in S... ... VIKING ENERGY by ship spotter PattayaVIKING EN...
196 World’s Largest Electric Cruise Ship Sets Sail... ... ©Wuxi Saisiyi Electric Technology,©Wuxi Saisiy...
197 The Supply Chain Crisis Brewing at Israeli Ports ... Port Haifa in FleetMon ExplorerPort Haifa in F...
198 CDC Drops Its “Cruise Ship Travel Health Notic... ... AIDADIVA by ship spotter Becks93AIDADIVA by sh...
199 Scorpio Tankers Take the Path of Shipboard Car... ... CORONA UTILITY by ship spotter canonbenqCORONA...
[200 rows x 4 columns]
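To save the result as a CSV with the four column names asked for in the question, you can rename the DataFrame columns and enable the commented-out to_csv call (the labels below simply mirror the requested ones):

# rename the columns to match the requested layout, then write the file
df = df.rename(columns={'title': 'News_title', 'date': 'publish_date_time', 'details_news': 'news'})
df.to_csv('news.csv', index=False)  # write the CSV without the index column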

Extracting company name and other information inside all urls present in a webpage using beautifulsoup

<li>
  <strong>Company Name</strong>
  ":"
  <span itemprop="name">PT ERA MURNI BUSANA</span>
</li>
In the above HTML code, I am trying to extract the company name, which is PT ERA MURNI BUSANA.
If I use a single test link, I can get the name using the one-line code I wrote:
soup.find_all("span",attrs={"itemprop":"name"})[3].get_text()
But I want to extract this information from all such pages linked from a single web page.
So I wrote a for loop, but it does not fetch the right details. I am pasting the part of the code that I have been trying, which needs some modification.
Code:-
for link in supplierlinks:  # links have been extracted and merged with the base url
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    companyname = soup.find_all("span", attrs={"itemprop": "name"})[2].get_text()
Output looks like:
{'Company Name': 'AIRINDO SAKTI GARMENT PT'}
{'Company Name': 'Garments'}
{'Company Name': 'Garments'}
Instead of "Garments" popping up in the output, I need the company name. How do I modify the code within the for loop?
Link: https://idn.bizdirlib.com/node/5290
Try this code:
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0'}
r = requests.get('https://idn.bizdirlib.com/node/5290', headers=headers).text
soup = BeautifulSoup(r, 'html5lib')
print(soup.find_all("span", attrs={"itemprop": "name"})[-1].get_text())

div = soup.find('div', class_="content clearfix")
li_tags = div.div.find_all('fieldset')[1].find_all('div')[-1].ul.find_all('li')
supplierlinks = []
for li in li_tags:
    try:
        supplierlinks.append("https://idn.bizdirlib.com/" + li.a['href'])
    except:
        pass

for link in supplierlinks:
    r = requests.get(link, headers=headers).text
    soup = BeautifulSoup(r, 'html5lib')
    print(soup.find_all("span", attrs={"itemprop": "name"})[-1].get_text())
Output:
PT ERA MURNI BUSANA
PT ELKA SURYA ABADI
PT EMPANG BESAR MAKMUR
PT EMS
PT ENERON
PT ENPE JAYA
PT ERIDANI TOUR AND TRAVEL
PT EURO ASIA TRADE & INDUSTRY
PT EUROKARS CHRISDECO UTAMA
PT EVERAGE VALVES METAL
PT EVICO
This code prints the company names for all of the links on the page.
You can select the sibling element of the <strong> element that contains the text "Company Name" (also, don't forget to set the User-Agent HTTP header):
import requests
from bs4 import BeautifulSoup
url = 'https://idn.bizdirlib.com/node/5290'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
print( soup.select_one('strong:contains("Company Name") + *').text )
Prints:
PT ERA MURNI BUSANA
EDIT: To get contact person:
import requests
from bs4 import BeautifulSoup
url = 'https://idn.bizdirlib.com/node/5290'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
print( soup.select_one('strong:contains("Company Name") + *').text )
print( soup.select_one('strong:contains("Contact") + *').text )
Prints:
PT ERA MURNI BUSANA
Mr. Yohan Kustanto
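Note: in recent versions of Beautiful Soup's selector engine (Soup Sieve), the non-standard :contains() pseudo-class is deprecated in favour of :-soup-contains(), so the same lookup can be written as:

print(soup.select_one('strong:-soup-contains("Company Name") + *').text)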

Why can't I scrape Amazon products by BeautifulSoup?

I am trying to scrape the heading of this Amazon listing. The code I wrote works for some other Amazon listings, but not for the URL mentioned in the code below.
Here is the Python code I've tried:
import requests
from bs4 import BeautifulSoup

url = "https://www.amazon.in/BULLMER-Cotton-Printed-T-shirt-Multicolour/dp/B0892SZX7F/ref=sr_1_4?c=ts&dchild=1&keywords=Men%27s+T-Shirts&pf_rd_i=1968024031&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_p=8b97601b-3643-402d-866f-95cc6c9f08d4&pf_rd_r=EPY70Y57HP1220DK033Y&pf_rd_s=merchandised-search-6&qid=1596817115&refinements=p_72%3A1318477031&s=apparel&sr=1-4&ts_id=1968123031"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0"}

page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, "html.parser")
# print(soup.prettify())

title = soup.find(id="productTitle")
if title:
    title = title.get_text()
else:
    title = "default_title"
print(title)
Output:
200
default_title
HTML code from the inspector tools:
<span id="productTitle" class="a-size-large product-title-word-break">
BULLMER Mens Halfsleeve Round Neck Printed Cotton Tshirt - Combo Tshirt - Pack of 3
</span>
First, as others have commented, use a proxy service. Second, to reach an Amazon product page, the ASIN is enough.
Amazon follows this URL pattern for all product pages:
https://www.amazon.(com/in/fr)/dp/<asin>
import requests
from bs4 import BeautifulSoup

url = "https://www.amazon.in/dp/B0892SZX7F"
headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, "html.parser")

title = soup.find("span", {"id": "productTitle"})
if title:
    title = title.get_text(strip=True)
else:
    title = "default_title"
print(title)
Output:
200
BULLMER Mens Halfsleeve Round Neck Printed Cotton Tshirt - Combo Tshirt - Pack of 3
This worked fine for me:
import requests
from bs4 import BeautifulSoup

url = "https://www.amazon.in/BULLMER-Cotton-Printed-T-shirt-Multicolour/dp/B0892SZX7F/ref=sr_1_4?c=ts&dchild=1&keywords=Men%27s+T-Shirts&pf_rd_i=1968024031&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_p=8b97601b-3643-402d-866f-95cc6c9f08d4&pf_rd_r=EPY70Y57HP1220DK033Y&pf_rd_s=merchandised-search-6&qid=1596817115&refinements=p_72%3A1318477031&s=apparel&sr=1-4&ts_id=1968123031"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0"}

http_proxy = "http://10.10.1.10:3128"
https_proxy = "https://10.10.1.11:1080"
ftp_proxy = "ftp://10.10.1.10:3128"
proxyDict = {
    "http": http_proxy,
    "https": https_proxy,
    "ftp": ftp_proxy
}

page = requests.get(url, headers=headers)
print(page.status_code)
soup = BeautifulSoup(page.content, "lxml")
# print(soup.prettify())

title = soup.find(id="productTitle")
if title:
    title = title.get_text()
else:
    title = "default_title"
print(title)
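Note that, as written, proxyDict is never actually used: to route the request through the proxies you would pass it explicitly, e.g. requests.get(url, headers=headers, proxies=proxyDict), and replace the placeholder 10.10.1.x addresses with a real proxy.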

soup.find returning "none" only sometimes?

I am scraping an Amazon product page and using Beautiful Soup to find the product name and price. For some reason, the "title" variable is sometimes set, and at other times I get the error "'NoneType' object has no attribute 'get_text'".
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Lenovo-ThinkPad-i5-10210U-i7-7500U-Wireless/\
dp/B08BYZD4H9/ref=sr_1_2_sspa?dchild=1&keywords=thinkpad&qid=1595377662&sr=8\
-2-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyMVhTU1BOODg5TlgmZW5jcnlwdGVkS\
WQ9QTAzMTc5MDFMNjhGMUE0VlRHT1gmZW5jcnlwdGVkQWRJZD1BMDY3MDc3MzJPQzc2QkI5UlcwSUE\
md2lkZ2V0TmFtZT1zcF9hdGYmYWN0aW9uPWNsaWNrUmVkaXJlY3QmZG9Ob3RMb2dDbGljaz10cnVl'
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle").get_text()
price = soup.find(id="priceblock_ourprice").get_text()
converted_price = int(price[1:6].replace(',',''))
print(converted_price)
print(title)
Try to specify more HTTP headers, for example User-Agent and Accept-Language. Also, change the parser to lxml or html5lib.
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Accept-Language': 'en-US,en;q=0.5'
}

URL = 'https://www.amazon.com/Lenovo-ThinkPad-i5-10210U-i7-7500U-Wireless/dp/B08BYZD4H9/ref=sr_1_2_sspa?dchild=1&keywords=thinkpad&qid=1595377662&sr=8-2-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyMVhTU1BOODg5TlgmZW5jcnlwdGVkSWQ9QTAzMTc5MDFMNjhGMUE0VlRHT1gmZW5jcnlwdGVkQWRJZD1BMDY3MDc3MzJPQzc2QkI5UlcwSUEmd2lkZ2V0TmFtZT1zcF9hdGYmYWN0aW9uPWNsaWNrUmVkaXJlY3QmZG9Ob3RMb2dDbGljaz10cnVl'
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'lxml')  # <-- change to `lxml` or `html5lib`

title = soup.find(id="productTitle").get_text(strip=True)
price = soup.find(id="priceblock_ourprice").get_text(strip=True)
converted_price = int(price[1:6].replace(',', ''))

print(converted_price)
print(title)
Prints (in my testing always):
1049
2020 Lenovo ThinkPad E15 15.6 Inch FHD 1080P Laptop| Intel 4-Core i5-10210U (Beats i7-7500U)| 16GB RAM| 1TB SSD (Boot) + 500GB HDD| FP Reader| Win10 Pro+ NexiGo Wireless Mouse Bundle
You are getting the error 'NoneType' object has no attribute 'get_text' because the page content changes between responses, and sometimes there is no element with id="productTitle" or no element with id="priceblock_ourprice".
Add some debug statements like this and you will see exactly why the error occurs:
soup = BeautifulSoup(page.content, 'html.parser')
print(soup)
title_soup = soup.find(id="productTitle")
print(title_soup) # <- this might print None
print(title_soup.get_text())
price_soup = soup.find(id="priceblock_ourprice")
print(price_soup) # <- this might print None
print(price_soup.get_text())
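Since Amazon does not always serve the full product markup, a simple workaround (just a sketch, assuming a few retries are enough and reusing the URL and headers from above) is to repeat the request until the element shows up:

import time
import requests
from bs4 import BeautifulSoup

title = None
for attempt in range(5):  # retry a few times; the served markup varies per response
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'lxml')
    title = soup.find(id="productTitle")
    if title:
        print(title.get_text(strip=True))
        break
    time.sleep(2)  # brief pause before retrying
else:
    print("productTitle not found after 5 attempts")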

python nested for to retrieve css tags values

The tags from a web page are as follows:
<div class="lg_col MT5">
  <p>
    <span class="sp starGryB">4.4</span>
  </p>
  <p class="MT5 UC">
    <span class="gd10gb">141 Ratings</span>
  </p>
</div>
I am trying to retrieve the values "4.4" and "141 Ratings" from every div with the class "lg_col MT5".
The nested for loop that I use isn't working as expected. It seems as if the hierarchy of the tags isn't taken into account.
import requests
import sys
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"}

def test_function():
    url = "http://www.burrp.com/chennai/search.html?q=buffet"
    source_code = requests.get(url, headers=HEADERS)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for tag in soup.select('div.lg_col.MT5'):
        for tag1 in soup.select('span.sp.starGryB'):
            try:
                print(tag1.string)
            except KeyError:
                pass
        for tag2 in soup.select('span.gd10gb'):
            try:
                print(tag2.string)
            except KeyError:
                pass

test_function()
The expected output is: 4.4 followed by 141 Ratings for each of the div tags on the webpage.
But the actual output is all the starGryB values followed by all the gd10gb values, over and over again.
Use tag.select instead of soup.select if you want to look in just tag and not the entire soup.
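Applied to the loop above, that looks like this (keeping the original selectors):

for tag in soup.select('div.lg_col.MT5'):
    # search inside this div only, not the whole document
    for tag1 in tag.select('span.sp.starGryB'):
        print(tag1.string)
    for tag2 in tag.select('span.gd10gb'):
        print(tag2.string)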
Not for points.
This is another way to scrape it that avoids having to deal with nested loops.
import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"}

url = "http://www.burrp.com/chennai/search.html?q=buffet"
source_code = requests.get(url, headers=HEADERS)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')

tags_1 = soup.find_all('span', class_='sp starGryB')
tags_2 = [tag.parent.parent.select('span.gd10gb') for tag in tags_1]
tags_3 = [tag.parent.parent.parent.select('a.gr24mb.UC') for tag in tags_1]

scores = [score.get_text() for score in tags_1]
ratings = [rating[0].get_text() if len(rating) > 0 else 'NA' for rating in tags_2]
names = [name[0].get_text().strip() for name in tags_3]

tags = zip(names, scores, ratings)
for a, b, c in tags:
    print(a, b, c)
Result:
Wild Amazon 2.9 27 Ratings
European Buffet NA NA
Flamingo 2.3 17 Ratings
The Holy Smoke 2.9 13 Ratings
Snow Park 2.6 14 Ratings
Dhabba Express 2.7 11 Ratings
The Yellow Chilli 2.7 6 Ratings
The Piano, The Savera Hotel 2.5 6 Ratings
Roasts & Grills, Green Park Hotel 2.3 6 Ratings
