Crawling news articles with BeautifulSoup - python

I want to crawl maritime news from Fleetmon.com, including the detail pages, and save it to a text file. I tried BeautifulSoup in Python, but it does not work properly.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Question script: gather article links from the news listing page(s), then
# fetch each link and build one row of fields per article for a CSV export.
# NOTE(review): the snippet lost its indentation; the nesting below is
# reconstructed from the statement order — confirm against the original post.
baseurl = 'https://www.fleetmon.com/maritime-news/'  # not referenced again in this snippet
headers = {'User-Agent': 'Mozilla/5.0'}
newslinks = [] # accumulates every href found on the listing pages
for x in range(1): # set page range
    # range(1) yields only x == 0, so a single listing URL (?page=0) is
    # requested. Note this request is sent without the `headers` dict.
    response = requests.get(
        f'https://www.fleetmon.com/maritime-news/?page={x}') # url of next page
    soup = BeautifulSoup(response.content, 'html.parser')
    newslist = soup.find_all('article')
    # collect the href of every <a> found inside each <article> element
    for item in newslist:
        for link in item.find_all('a', href=True):
            newslinks.append(link['href'])
# De-duplicate the links; going through set() does not keep the original order.
newslinks = list(set(newslinks))
print(newslinks)
# news details pages
newsdata = []
for link in newslinks:
    print(link)
    # The href is passed to requests.get() unmodified.
    # NOTE(review): presumably this only works when the hrefs are absolute
    # URLs — verify against the markup the site actually returns.
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # find() returns None when nothing matches, in which case `.text` raises
    # AttributeError.
    shipName = soup.find('div', {'class': 'uk-article-story'}).text.strip()
    fieldsets = soup.find_all('article')
    row = {'Ship Name': shipName}
    for fieldset in fieldsets:
        dts = fieldset.find_all('h1')
        for dt in dts:
            # Key: the <h1> text; value: the text of the next <p> after it.
            row.update({dt.text.strip(): dt.find_next('p').text.strip()})
    newsdata.append(row)
#text or csv
df = pd.DataFrame(newsdata)
df.to_csv (r'C:\Users\Usuario\Desktop\news.csv', index = False, header=True)
print(df)
Please help me improve my code so that it gets all the data in text form.
Also, is it possible to crawl the data and save it to a CSV like this:
Column1:News_title:value
column2:category: accidents
column3:publish_date_time:June 28, 2022 at 13:31
column4:news:full news here

Go to the details page (here I use req2 to request the details page). I've implemented the pagination with a for loop and the range function, so you can easily increase or decrease the number of pages.
P/S: If you click on any title link, you will see the details page; all the required data items are scraped from the details pages.
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Answer script: walk listing pages 1-10, open the details page behind every
# headline link, and collect title, category, date and story text per article.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
url = 'https://www.fleetmon.com/maritime-news/?page={page}'
data = []
for page in range(1, 11):
    req = requests.get(url.format(page=page), headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')
    for anchor in soup.select('.news-headline h2 a'):
        # The headline href is prefixed with the site root before the request.
        link = 'https://www.fleetmon.com' + anchor.get('href')
        req2 = requests.get(link, headers=headers)
        soup2 = BeautifulSoup(req2.text, 'lxml')
        # One record per details page; the lookups run in the same order as
        # the dict keys are listed.
        record = {
            'title': soup2.find('h1', class_="uk-article-title margin-t-0").text,
            'category': soup2.select_one('p.uk-article-meta span a strong').text,
            'date': soup2.select_one('[class="uk-text-nowrap"]:nth-child(3)').text,
            'details_news': soup2.select_one('.uk-article-story ').get_text(strip=True),
        }
        data.append(record)
# Chain .to_csv('news.csv', index=False) onto the DataFrame to write a file.
df = pd.DataFrame(data)
print(df)
Output:
Cruise ship NORWEGIAN SUN hit iceberg, damaged... ... Cruise ship NORWEGIAN SUN hit an
iceberg size ...
1 Yang Ming and HMM Were Accused of Collusion to... ... YM WARRANTY by ship spotter phduck2kYM WARRANT...
2 Fire in bulk carrier cargo hold, Florida ... At around 2350 LT Jun 26 firefighters responde...
3 Chlorine gas tank fell on Chinese cargo ship, ... ... Tank with 25 tons of chlorine gas fell onto ca...
4 Heavy vehicle fell onto cargo deck during offl... ... Heavy machinery vehicle (probably mobile crane...
.. ... ...
...
195 Yara Plans 15 Ammonia Bunkering Terminals in S... ... VIKING ENERGY by ship spotter PattayaVIKING EN...
196 World’s Largest Electric Cruise Ship Sets Sail... ... ©Wuxi Saisiyi Electric Technology,©Wuxi Saisiy...
197 The Supply Chain Crisis Brewing at Israeli Ports ... Port Haifa in FleetMon ExplorerPort Haifa in F...
198 CDC Drops Its “Cruise Ship Travel Health Notic... ... AIDADIVA by ship spotter Becks93AIDADIVA by sh...
199 Scorpio Tankers Take the Path of Shipboard Car... ... CORONA UTILITY by ship spotter canonbenqCORONA...
[200 rows x 4 columns]

Related

Web scraping multiple pages in python

So I'm trying to scrape a website that has around 500 pages of used cars, with around 22 cars per page. I managed to extract the first 22 cars from the first page, but how can I make my code iterate through all the pages so I can get all the cars? (I'm a beginner, so sorry if my code is not well structured.)
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
# Question script: read the first search-results page, build a details URL for
# every car card, then visit the first 22 of them for name and model year.
# NOTE(review): the snippet lost its indentation; the nesting below is
# reconstructed from the statement order — confirm against the original post.
website = 'https://ksa.yallamotor.com/used-cars/search'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0'
}
# Only this one URL is requested for the listing; no page parameter is sent.
response = requests.get(website, headers=headers)
links = []
car_name = []
model_year = []
cars = []  # rebound below by find_all()
soup = BeautifulSoup(response.text, 'lxml')
cars = soup.find_all('div', class_='singleSearchCard m24t p12 bg-w border-gray border8')
for c in cars:
    # Prefix the card's href with the site root to form the details URL.
    l = "https://ksa.yallamotor.com/" + c.find('a', class_='black-link')['href']
    links.append(l)
# The count 22 is hard-coded; links[i] raises IndexError if fewer than 22
# links were collected above.
for i in range(0,22):
    url = links[i]
    # A new Session object is created on every iteration.
    session_object = requests.Session()
    result = session_object.get(url, headers=headers)
    soup = BeautifulSoup(result.text, 'lxml')
    name = soup.find('h1', class_="font24")
    car_name.append(name.text)
    # The first element with this class string is taken as the model year.
    y = soup.find_all('div', class_="font14 text-center font-b m2t")[0]
    model_year.append(y.text)
Website is under Cloudflare protection, so you would need something like cloudscraper (pip install cloudscraper). The following code will get you your data (you can further analyse each car, get the details you need, etc):
import cloudscraper
from bs4 import BeautifulSoup
# Answer script: request search pages 1-500 through a cloudscraper session and
# print the link text and href of every car card.
scraper = cloudscraper.create_scraper()
for page_no in range(1, 501):
    listing = scraper.get(f'https://ksa.yallamotor.com/used-cars/search?page={page_no}&sort=updated_desc')
    soup = BeautifulSoup(listing.text, 'html.parser')
    for card in soup.select('.singleSearchCard'):
        anchor = card.select_one('a.black-link')
        print(anchor.get_text(strip=True), anchor['href'])
Result printed in terminal:
Used BMW 7 Series 730Li 2018 /used-cars/bmw/7-series/2018/used-bmw-7-series-2018-jeddah-1294758
Used Infiniti QX80 5.6L Luxe (8 Seats) 2020 /used-cars/infiniti/qx80/2020/used-infiniti-qx80-2020-jeddah-1295458
Used Chevrolet Suburban 5.3L LS 2WD 2018 /used-cars/chevrolet/suburban/2018/used-chevrolet-suburban-2018-jeddah-1302084
Used Chevrolet Silverado 2016 /used-cars/chevrolet/silverado/2016/used-chevrolet-silverado-2016-jeddah-1297430
Used GMC Yukon 5.3L SLE (2WD) 2018 /used-cars/gmc/yukon/2018/used-gmc-yukon-2018-jeddah-1304469
Used GMC Yukon 5.3L SLE (2WD) 2018 /used-cars/gmc/yukon/2018/used-gmc-yukon-2018-jeddah-1304481
Used Chevrolet Impala 3.6L LS 2018 /used-cars/chevrolet/impala/2018/used-chevrolet-impala-2018-jeddah-1297427
Used Infiniti Q70 3.7L Luxe 2019 /used-cars/infiniti/q70/2019/used-infiniti-q70-2019-jeddah-1295235
Used Chevrolet Tahoe LS 2WD 2018 /used-cars/chevrolet/tahoe/2018/used-chevrolet-tahoe-2018-jeddah-1305486
Used Mercedes-Benz 450 SEL 2018 /used-cars/mercedes-benz/450-sel/2018/used-mercedes-benz-450-sel-2018-jeddah-1295830
[...]

Scraping returning None

I am trying to scrape Yellow Pages. Everything works fine except scraping the phone numbers! They are in a div with class = 'popover-phones' that contains an a tag whose href is the phone number. Can anyone assist me, please? yellow pages inspection
import item as item
import requests
from bs4 import BeautifulSoup
import json
from csv import writer
# Question script: print the company name, the address and the phone container
# element for every listing on the category page.
url = 'https://yellowpages.com.eg/en/category/charcoal'
# The User-Agent value is written as two adjacent literals (implicit
# concatenation) so the long string stays valid without spanning a raw newline.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    )
}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
articles = soup.find_all('div', class_='col-xs-12 item-details')
for item in articles:
    address = item.find('a', class_='address-text').text
    company = item.find('a', class_='item-title').text
    # find() returns the matching <div> element, or None when the listing
    # contains no element with this class; the value is printed as-is.
    telephone = item.find('div', class_='popover-phones')
    print(company, address, telephone)
The phone numbers you see are loaded from external URL. To get all phone numbers from the page you can use next example:
import requests
from bs4 import BeautifulSoup
# Answer script: for every element carrying a data-tooltip-phones attribute,
# fetch the JSON behind that attribute and print the listing title followed by
# all phone numbers.
url = "https://yellowpages.com.eg/en/category/charcoal"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
for node in soup.select("[data-tooltip-phones]"):
    phone_url = "https://yellowpages.com.eg" + node["data-tooltip-phones"]
    title = node.find_previous(class_="item-title").text
    phones = requests.get(phone_url).json()
    # The JSON payload is iterated two levels deep and flattened into one list.
    numbers = [number for group in phones for number in group]
    print(title, *numbers)
Prints:
2 Bacco 02-3390-8764
3 A Group International 0120-3530-005 057-2428-449
3 A Group International 0120-3833-500 0120-3530-005
Abdel Karim 0122-3507-461
Abdel Sabour Zidan 03-4864-641
Abou Aoday 0111-9226-536 0100-3958-351
Abou Eid For Charcoal Trading 0110-0494-770
Abou Fares For Charcoal Trade 0128-3380-916
Abou Karim Store 0100-6406-939
Adel Sons 0112-1034-398 0115-0980-776
Afandina 0121-2414-087
Ahmed El Fahham 02-2656-0815
Al Baraka For Charcoal 0114-6157-799 0109-3325-720
Al Ghader For Import & Export 03-5919-355 0111-0162-602 0120-6868-434
Al Mashd For Coal 0101-0013-743 0101-0013-743
Al Zahraa Co. For Exporting Charcoal & Agriculture Products 040-3271-056 0100-0005-174 040-3271-056
Alex Carbon Group 03-3935-902
Alwaha Charcoal Trade Est. 0100-4472-554 0110-1010-810 0100-9210-812
Aly Abdel Rahman For Charcoal Trade 03-4804-440 0122-8220-661
Amy Deluxe Egypt 0112-5444-410

How to get all products from a beautifulsoup page

I want to get all the products on this page:
nike.com.br/snkrs#estoque
My python code is this:
# Links that have already been reported by aviso().
produtos = []
def aviso():
    """Print name and link for each "Comprar" button found on the SNKRS page.

    NOTE(review): the snippet lost its indentation; the nesting below is
    reconstructed from the statement order — confirm against the original post.
    """
    print("Started!")
    request = requests.get("https://www.nike.com.br/snkrs#estoque")
    # NOTE(review): `bs4` is called like a constructor here; the import is not
    # visible in this snippet — presumably an alias for BeautifulSoup, confirm.
    soup = bs4(request.text, "html.parser")
    links = soup.find_all("a", class_="btn", text="Comprar")
    links_filtred = list(set(links))
    for link in links_filtred:
        # NOTE(review): `produto` is read here before the assignment further
        # down; with this nesting the first iteration would raise
        # UnboundLocalError — verify against the original code.
        if(produto not in produtos):
            request = requests.get(f"{link['href']}")
            soup = bs4(request.text, "html.parser")
            produto = soup.find("div", class_="nome-preco-produto").get_text()
            # NOTE(review): `code_formated` is compared before any visible
            # assignment in this function — likely left over from trimmed code.
            if(code_formated == ""):
                code_formated = "\u200b"
            print(f"Nome: {produto} Link: {link['href']}\n")
            # The href (not the product name) is what gets remembered.
            produtos.append(link["href"])
aviso()
Guys, this code gets the products from the page, but not all of them. I suspect that the content is dynamic, but how can I get them all with requests and BeautifulSoup? I don't want to use Selenium or an automation library, and I don't want to change my code a lot because it's almost done. How do I do that?
DO NOT USE requests.get if you are dealing with the same HOST.
Reason: read-that
import requests
from bs4 import BeautifulSoup
import pandas as pd
def main(url):
    """Print a DataFrame of (Title, Url) pairs gathered from pages 1-5 of *url*.

    All page requests go through a single requests.Session, with the page
    number sent as the ``p`` query parameter and ``demanda`` set to ``'true'``.
    """
    rows = []
    with requests.Session() as req:
        for page in range(1, 6):
            query = {'p': page, 'demanda': 'true'}
            r = req.get(url, params=query)
            soup = BeautifulSoup(r.text, 'lxml')
            for box in soup.select('.aspect-radio-box'):
                # The title is the text of the next <h2> after the box.
                heading = box.find_next('h2').get_text(strip=True, separator=" ")
                rows.append((heading, box['href']))
    df = pd.DataFrame(rows, columns=['Title', 'Url'])
    print(df)


main('https://www.nike.com.br/Snkrs/Feed')
Output:
Title Url
0 Dunk High x Fragment design Black https://www.nike.com.br/dunk-high-x-fragment-d...
1 Dunk Low Infantil (16-26) City Market https://www.nike.com.br/dunk-low-infantil-16-2...
2 ISPA Flow 2020 Desert Sand https://www.nike.com.br/ispa-flow-2020-153-169...
3 ISPA Flow 2020 Pure Platinum https://www.nike.com.br/ispa-flow-2020-153-169...
4 Nike iSPA Men's Lightweight Packable Jacket https://www.nike.com.br/nike-ispa-153-169-211-...
.. ... ...
115 Air Jordan 1 Mid Hyper Royal https://www.nike.com.br/air-jordan-1-mid-153-1...
116 Dunk High Orange Blaze https://www.nike.com.br/dunk-high-153-169-211-...
117 Air Jordan 5 Stealth https://www.nike.com.br/air-jordan-5-153-169-2...
118 Air Jordan 3 Midnight Navy https://www.nike.com.br/air-jordan-3-153-169-2...
119 Air Max 90 Bacon https://www.nike.com.br/air-max-90-153-169-211...
[120 rows x 2 columns]
To get the data you can send a request to:
https://www.nike.com.br/Snkrs/Estoque?p=<PAGE>&demanda=true
providing a page number between 1 and 5 for p= in the URL.
For example, to print the links, you can try:
import requests
from bs4 import BeautifulSoup
# Answer script: request stock pages 1-5 and print the "Comprar" buttons found
# on each one.
url = "https://www.nike.com.br/Snkrs/Estoque?p={page}&demanda=true"
for page in range(1, 6):
    page_url = url.format(page=page)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")
    buy_links = soup.find_all("a", class_="btn", text="Comprar")
    print(buy_links)

web scraping for more pages

Currently I am working on scraping a website where I want the data that is loaded automatically as the page loads. I am using BeautifulSoup and requests.
import requests
from bs4 import BeautifulSoup
# Question script: collect the article code of every product tile present in
# the HTML returned by a single request to the listing page.
page = requests.get("https://www.monki.com/en/newin/view-all-new.html")
soup = BeautifulSoup(page.content, 'html.parser')
article_codes=[]
for k in soup.findAll('div',attrs={"class":"producttile-details"}):
    # find() returns None when the tile has no such <span>; `.text` below
    # would then raise AttributeError.
    article_code = k.find('span', attrs={'class':"articleCode"})
    print(article_code)
    article_codes.append(article_code.text)
With this code I only get the data of one page, but I want all the data that appears after the page has loaded.
The page is using JavaScript to load the additional pages. You can use requests module to simulate those requests.
For example:
import requests
from bs4 import BeautifulSoup
# Answer script: request the product-listing fragment at offsets 0, 28, 56, ...
# through one session and print name, price and link of every product.
url = 'https://www.monki.com/en_eur/newin/view-all-new/_jcr_content/productlisting.products.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
}
with requests.session() as s:
    # The regular listing page is requested first and its body discarded.
    # NOTE(review): presumably this primes the session (e.g. cookies) — confirm.
    s.get('https://www.monki.com/en_eur/newin/view-all-new.html', headers=headers).text
    for page in range(0, 10):  # <-- adjust to required number of pages
        fragment = s.get(url, params={'offset': page*28}, headers=headers)
        soup = BeautifulSoup(fragment.content, 'html.parser')
        for product in soup.select('.o-product'):
            name = product.select_one('.product-name').get_text(strip=True)
            price = product.select_one('.price-tag').get_text(strip=True)
            link = product.select_one('.a-link')['href']
            row = '{:<50} {:<10} {}'.format(name, price , link)
            print(row)
Prints all products:
NEW! Maxi smock dress €30 https://www.monki.com/en_eur/clothing/dresses/midi-dresses/product.midi-button-up-shirt-dress-black.0871799004.html
NEW! Retro skater dress €20 https://www.monki.com/en_eur/clothing/dresses/mini-dresses/product.retro-skater-dress-white.0688447029.html
NEW! Mozik block jeans €40 https://www.monki.com/en_eur/clothing/jeans/product.mozik-block-jeans-blue.0874088001.html
NEW! Pack of two scrunchies €6 https://www.monki.com/en_eur/accessories/hair-accessories/product.pack-of-two-scrunchies-beige.0530296078.html
NEW! Mini hand bag €18 https://www.monki.com/en_eur/accessories/bags,-wallets-belts/bags/product.mini-hand-bag-black.0826291006.html
NEW! Fitted crop top €10 https://www.monki.com/en_eur/clothing/tops/t-shirts/product.fitted-crop-top-purple.0906440002.html
NEW! Tiered smock dress €30 https://www.monki.com/en_eur/clothing/dresses/midi-dresses/product.tiered-smock-dress-blue.0895277004.html
NEW! Mini hand bag €18 https://www.monki.com/en_eur/accessories/bags,-wallets-belts/bags/product.mini-hand-bag-beige.0826291008.html
NEW! Fitted t-shirt €10 https://www.monki.com/en_eur/clothing/tops/t-shirts/product.fitted-t-shirt-purple.0905746002.html
NEW! Shoulder pads t-shirt dress €25 https://www.monki.com/en_eur/clothing/dresses/mini-dresses/product.shoulder-pads-t-shirt-dress-beige.0929301002.html
NEW! Yoko mid blue jeans €40 https://www.monki.com/en_eur/clothing/jeans/product.yoko-mid-blue-jeans-blue.0656425001.html
NEW! Yoko classic blue jeans €40 https://www.monki.com/en_eur/clothing/jeans/product.yoko-classic-blue-jeans-blue.0807218001.html
NEW! Pleated midi skirt €25 https://www.monki.com/en_eur/clothing/skirts/midi-skirts/product.pleated-midi-skirt-black.0562278003.html
... and so on.

python nested for to retrieve css tags values

The tags from a web page are as follows:
<div class="lg_col MT5">
<p>
<span class="sp starGryB">4.4</span>
</p>
<p class="MT5 UC">
<span class="gd10gb">141 Ratings</span>
</p>
</div>
I am trying to retrieve the values "4.4", and "141 Ratings" for all the div class values "lg_col MT5".
The nested for loop that I use isn't working as expected. It seems as if the hierarchy of the tags isn't taken into account.
import requests
import sys
from bs4 import BeautifulSoup
# Request headers sent with the search-page request.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"}
def test_function():
    """Print score and rating-count span texts from the buffet search page.

    NOTE(review): the snippet lost its indentation; the nesting below is
    reconstructed from the statement order and the output the post describes.
    """
    url = "http://www.burrp.com/chennai/search.html?q=buffet"
    source_code = requests.get(url, headers=HEADERS)
    plain_text = source_code.text
    # No parser is named, so BeautifulSoup picks one itself.
    soup = BeautifulSoup(plain_text)
    for tag in soup.select('div.lg_col.MT5'):
        # Both inner loops call select() on `soup` (the whole document), not on
        # `tag`, so every matching span on the page is printed once per outer
        # iteration.
        for tag1 in soup.select('span.sp.starGryB'):
            try:
                print(tag1.string)
            except KeyError:
                pass
        for tag2 in soup.select('span.gd10gb'):
            try:
                print(tag2.string)
            except KeyError:
                pass
test_function()
`
The expected output is: 4.4 followed by 141 Ratings for each of the div tags in the webpage.
But the output is: All the starGryB values followed by all the gd10gb values as this happens over and over again.
Use tag.select instead of soup.select if you want to look in just tag and not the entire soup.
Not for points.
This is another way to scrape it to avoid having to deal with loops.
import requests
from bs4 import BeautifulSoup
# Answer script: starting from every score span, climb to its ancestors to
# find the matching rating-count span and restaurant-name link, then print one
# "name score rating" line per score.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"}
url = "http://www.burrp.com/chennai/search.html?q=buffet"
source_code = requests.get(url, headers=HEADERS)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)
tags_1 = soup.find_all('span', class_='sp starGryB')
# Two levels up from the score span: search there for the rating-count span.
tags_2 = [tag.parent.parent.select('span.gd10gb') for tag in tags_1]
# Three levels up from the score span: search there for the name link.
tags_3 = [tag.parent.parent.parent.select('a.gr24mb.UC') for tag in tags_1]
scores = [score.get_text() for score in tags_1]
# select() returns an empty list when nothing matches; fall back to 'NA'.
ratings = [rating[0].get_text() if rating else 'NA' for rating in tags_2]
# Same fallback for names, so a missing link does not raise IndexError.
names = [name[0].get_text().strip() if name else 'NA' for name in tags_3]
tags = zip(names, scores, ratings)
for a, b, c in tags:
    # print() function form: the Python 2 statement form is a SyntaxError
    # under Python 3.
    print(a, b, c)
Result:
Wild Amazon 2.9 27 Ratings
European Buffet NA NA
Flamingo 2.3 17 Ratings
The Holy Smoke 2.9 13 Ratings
Snow Park 2.6 14 Ratings
Dhabba Express 2.7 11 Ratings
The Yellow Chilli 2.7 6 Ratings
The Piano, The Savera Hotel 2.5 6 Ratings
Roasts & Grills, Green Park Hotel 2.3 6 Ratings
[Finished in 0.9s]

Categories