Web scraping multiple pages in Python

So I'm trying to web scrape a website that has around 500 pages of used cars, with around 22 cars per page. I managed to extract the first 22 cars from the first page, but how can I make my code iterate through all the pages so I can get all the cars? (I'm a beginner, so sorry if my code is not well structured.)
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

website = 'https://ksa.yallamotor.com/used-cars/search'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0'
}
response = requests.get(website, headers=headers)

links = []
car_name = []
model_year = []
cars = []

soup = BeautifulSoup(response.text, 'lxml')
cars = soup.find_all('div', class_='singleSearchCard m24t p12 bg-w border-gray border8')
for c in cars:
    l = "https://ksa.yallamotor.com/" + c.find('a', class_='black-link')['href']
    links.append(l)

for i in range(0, 22):
    url = links[i]
    session_object = requests.Session()
    result = session_object.get(url, headers=headers)
    soup = BeautifulSoup(result.text, 'lxml')
    name = soup.find('h1', class_="font24")
    car_name.append(name.text)
    y = soup.find_all('div', class_="font14 text-center font-b m2t")[0]
    model_year.append(y.text)

The website is behind Cloudflare protection, so you need something like cloudscraper (pip install cloudscraper). The following code will get you your data (you can then visit each car's page and extract whatever details you need):
import cloudscraper
from bs4 import BeautifulSoup

scraper = cloudscraper.create_scraper()
for x in range(1, 501):
    r = scraper.get(f'https://ksa.yallamotor.com/used-cars/search?page={x}&sort=updated_desc')
    soup = BeautifulSoup(r.text, 'html.parser')
    cars = soup.select('.singleSearchCard')
    for car in cars:
        url = car.select_one('a.black-link')
        print(url.get_text(strip=True), url['href'])
Result printed in terminal:
Used BMW 7 Series 730Li 2018 /used-cars/bmw/7-series/2018/used-bmw-7-series-2018-jeddah-1294758
Used Infiniti QX80 5.6L Luxe (8 Seats) 2020 /used-cars/infiniti/qx80/2020/used-infiniti-qx80-2020-jeddah-1295458
Used Chevrolet Suburban 5.3L LS 2WD 2018 /used-cars/chevrolet/suburban/2018/used-chevrolet-suburban-2018-jeddah-1302084
Used Chevrolet Silverado 2016 /used-cars/chevrolet/silverado/2016/used-chevrolet-silverado-2016-jeddah-1297430
Used GMC Yukon 5.3L SLE (2WD) 2018 /used-cars/gmc/yukon/2018/used-gmc-yukon-2018-jeddah-1304469
Used GMC Yukon 5.3L SLE (2WD) 2018 /used-cars/gmc/yukon/2018/used-gmc-yukon-2018-jeddah-1304481
Used Chevrolet Impala 3.6L LS 2018 /used-cars/chevrolet/impala/2018/used-chevrolet-impala-2018-jeddah-1297427
Used Infiniti Q70 3.7L Luxe 2019 /used-cars/infiniti/q70/2019/used-infiniti-q70-2019-jeddah-1295235
Used Chevrolet Tahoe LS 2WD 2018 /used-cars/chevrolet/tahoe/2018/used-chevrolet-tahoe-2018-jeddah-1305486
Used Mercedes-Benz 450 SEL 2018 /used-cars/mercedes-benz/450-sel/2018/used-mercedes-benz-450-sel-2018-jeddah-1295830
[...]
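If you also want the details from each car's page (as in your original code), a minimal sketch along the same lines could look like this; it reuses the cloudscraper session, and the h1.font24 / font14 selectors are taken from your own script, so treat them as assumptions that may need adjusting:
import cloudscraper
from bs4 import BeautifulSoup

scraper = cloudscraper.create_scraper()
cars_data = []

for x in range(1, 501):
    r = scraper.get(f'https://ksa.yallamotor.com/used-cars/search?page={x}&sort=updated_desc')
    soup = BeautifulSoup(r.text, 'html.parser')
    for link in soup.select('.singleSearchCard a.black-link'):
        detail_url = 'https://ksa.yallamotor.com' + link['href']
        detail_soup = BeautifulSoup(scraper.get(detail_url).text, 'html.parser')
        # selectors below come from the question's code and may need checking
        name = detail_soup.find('h1', class_='font24')
        year = detail_soup.find('div', class_='font14 text-center font-b m2t')
        cars_data.append({
            'name': name.get_text(strip=True) if name else None,
            'model_year': year.get_text(strip=True) if year else None,
            'url': detail_url,
        })
cars_data can then be turned into a dataframe with pd.DataFrame(cars_data) and saved with .to_csv().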


Pagination not showing up in parsed content (BeautifulSoup)

I am new to Python programming and I have a problem with pagination while using Beautiful Soup: all of the parsed content shows up except the pagination contents. I have highlighted the lines that do not show up.
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
from lxml import html

url = "https://www.yellowpages.lk/Medical.php"
result = requests.get(url)
time.sleep(5)
doc = BeautifulSoup(result.content, "lxml")
time.sleep(5)

Table = doc.find('table', {'id': 'MedicalFacility'}).find('tbody').find_all('tr')
Page = doc.select('.col-lg-10')

C_List = []
D_List = []
N_List = []
A_List = []
T_List = []
W_List = []
V_List = []
M_List = []

print(doc.prettify())
print(Page)

while True:
    for i in range(0, 25):
        Sort = Table[i]
        Category = Sort.find_all('td')[0].get_text().strip()
        C_List.insert(i, Category)
        District = Sort.find_all('td')[1].get_text().strip()
        D_List.insert(i, District)
        Name = Sort.find_all('td')[2].get_text().strip()
        N_List.insert(i, Name)
        Address = Sort.find_all('td')[3].get_text().strip()
        A_List.insert(i, Address)
        Telephone = Sort.find_all('td')[4].get_text().strip()
        T_List.insert(i, Telephone)
        Whatsapp = Sort.find_all('td')[5].get_text().strip()
        W_List.insert(i, Whatsapp)
        Viber = Sort.find_all('td')[6].get_text().strip()
        V_List.insert(i, Viber)
        MoH_Division = Sort.find_all('td')[7].get_text().strip()
        M_List.insert(i, MoH_Division)
I tried using .find() with the class and .select('.class') to see if the pagination contents show up, but so far nothing has worked.
The pagination on that page is more or less superfluous: the full data comes with the initial response, and JavaScript only generates the pagination for display purposes, so Requests will get all of the data in one go.
Here is one way of getting that information in full:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
url = 'https://www.yellowpages.lk/Medical.php'
r = requests.get(url, headers=headers)
soup = bs(r.text, 'html.parser')
table = soup.select_one('table[id="MedicalFacility"]')
df = pd.read_html(str(table))[0]
print(df)
Result in terminal:
Category District Name Address Telephone WhatsApp Viber MoH Division
0 Pharmacy Gampaha A & B Pharmacy 171 Negambo Road Veyangoda 0778081515 9.477808e+10 9.477808e+10 Aththanagalla
1 Pharmacy Trincomalee A A Pharmacy 350 Main Street Kanthale 0755576998 9.475558e+10 9.475558e+10 Kanthale
2 Pharmacy Colombo A Baur & Co Pvt Ltd 55 Grandpass Rd Col 14 0768200100 9.476820e+10 9.476820e+10 CMC
3 Pharmacy Colombo A Colombo Pharmacy Ug 93 97 Peoples Park Colombo 11 0773771446 9.477377e+10 NaN CMC
4 Pharmacy Trincomalee A R Pharmacy Main Street Kinniya-3 0771413838 9.477500e+10 9.477500e+10 Kinniya
... ... ... ... ... ... ... ... ...
1968 Pharmacy Ampara Zam Zam Pharmacy Main Street Akkaraipattu 0672277698 9.477756e+10 9.477756e+10 Akkaraipattu
1969 Pharmacy Batticaloa Zattra Pharmacy Jummah Mosque Rd Oddamawadi-1 0766689060 9.476669e+10 NaN Oddamavady
1970 Pharmacy Puttalam Zeenath Pharmacy Norochcholei 0728431622 NaN NaN Kalpitiya
1971 Pharmacy Puttalam Zidha Pharmacy Norochcholei 0773271222 NaN NaN Kalpitiya
1972 Pharmacy Gampaha Zoomcare Pharmacy & Grocery 182/B/1 Rathdoluwa Seeduwa 0768378112 NaN NaN Seeduwa
1973 rows × 8 columns
See the pandas documentation, the BeautifulSoup documentation, and the Requests documentation.
If you are using pandas, you only need a couple of lines of code to put the entire table into a dataframe. All you need is the pandas.read_html() function, as follows:
Code:
import pandas as pd
df = pd.read_html("https://www.yellowpages.lk/Medical.php")[0]
print(df)
Output: the same dataframe as shown above (1973 rows × 8 columns).

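Once the table is in a dataframe (from either snippet above), you can write it to disk or slice it directly; a small illustration, assuming the column names from the printed output:
# df is the dataframe produced by either snippet above
df.to_csv('medical_facilities.csv', index=False)   # save the full table
colombo = df[df['District'] == 'Colombo']          # filter one district
print(df['District'].value_counts().head())        # rows per district
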
Scraping returning None

I am trying to scrape Yellow Pages. Everything is working fine except scraping the phone numbers. The number sits in a div with class 'popover-phones', which contains an a tag whose href holds the phone number. Can anyone assist me, please?
import requests
from bs4 import BeautifulSoup
import json
from csv import writer

url = 'https://yellowpages.com.eg/en/category/charcoal'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
articles = soup.find_all('div', class_='col-xs-12 item-details')
for item in articles:
    address = item.find('a', class_='address-text').text
    company = item.find('a', class_='item-title').text
    telephone = item.find('div', class_='popover-phones')
    print(company, address, telephone)
The phone numbers you see are loaded from an external URL. To get all phone numbers from the page you can use the following example:
import requests
from bs4 import BeautifulSoup

url = "https://yellowpages.com.eg/en/category/charcoal"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

for p in soup.select("[data-tooltip-phones]"):
    phone_url = "https://yellowpages.com.eg" + p["data-tooltip-phones"]
    title = p.find_previous(class_="item-title").text
    phones = requests.get(phone_url).json()
    print(title, *[b for a in phones for b in a])
Prints:
2 Bacco 02-3390-8764
3 A Group International 0120-3530-005 057-2428-449
3 A Group International 0120-3833-500 0120-3530-005
Abdel Karim 0122-3507-461
Abdel Sabour Zidan 03-4864-641
Abou Aoday 0111-9226-536 0100-3958-351
Abou Eid For Charcoal Trading 0110-0494-770
Abou Fares For Charcoal Trade 0128-3380-916
Abou Karim Store 0100-6406-939
Adel Sons 0112-1034-398 0115-0980-776
Afandina 0121-2414-087
Ahmed El Fahham 02-2656-0815
Al Baraka For Charcoal 0114-6157-799 0109-3325-720
Al Ghader For Import & Export 03-5919-355 0111-0162-602 0120-6868-434
Al Mashd For Coal 0101-0013-743 0101-0013-743
Al Zahraa Co. For Exporting Charcoal & Agriculture Products 040-3271-056 0100-0005-174 040-3271-056
Alex Carbon Group 03-3935-902
Alwaha Charcoal Trade Est. 0100-4472-554 0110-1010-810 0100-9210-812
Aly Abdel Rahman For Charcoal Trade 03-4804-440 0122-8220-661
Amy Deluxe Egypt 0112-5444-410
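If you want the company, address and phone numbers together in a CSV (as in your original script), a rough sketch along the same lines could look like this; it assumes the element carrying data-tooltip-phones sits inside each item-details block, so check that against the page:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://yellowpages.com.eg/en/category/charcoal"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

with open("charcoal.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["company", "address", "phones"])
    for item in soup.find_all("div", class_="col-xs-12 item-details"):
        company = item.find("a", class_="item-title").get_text(strip=True)
        address = item.find("a", class_="address-text").get_text(strip=True)
        phones = []
        tooltip = item.select_one("[data-tooltip-phones]")
        if tooltip:
            # phone numbers come from the separate JSON endpoint shown above
            data = requests.get("https://yellowpages.com.eg" + tooltip["data-tooltip-phones"]).json()
            phones = [b for a in data for b in a]
        w.writerow([company, address, " ".join(phones)])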

Crawling news articles with BeautifulSoup

I want to crawl maritime news from the Fleetmon.com news page, including the detail pages, and save it to a text file. I tried BeautifulSoup in Python but it does not work properly.
import requests
from bs4 import BeautifulSoup
import pandas as pd

baseurl = 'https://www.fleetmon.com/maritime-news/'
headers = {'User-Agent': 'Mozilla/5.0'}

newslinks = []  # put all items in this array
for x in range(1):  # set page range
    response = requests.get(
        f'https://www.fleetmon.com/maritime-news/?page={x}')  # url of next page
    soup = BeautifulSoup(response.content, 'html.parser')
    newslist = soup.find_all('article')
    # loop to get all href from ul
    for item in newslist:
        for link in item.find_all('a', href=True):
            newslinks.append(link['href'])
newslinks = list(set(newslinks))
print(newslinks)

# news details pages
newsdata = []
for link in newslinks:
    print(link)
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    shipName = soup.find('div', {'class': 'uk-article-story'}).text.strip()
    fieldsets = soup.find_all('article')
    row = {'Ship Name': shipName}
    for fieldset in fieldsets:
        dts = fieldset.find_all('h1')
        for dt in dts:
            row.update({dt.text.strip(): dt.find_next('p').text.strip()})
    newsdata.append(row)

# text or csv
df = pd.DataFrame(newsdata)
df.to_csv(r'C:\Users\Usuario\Desktop\news.csv', index=False, header=True)
print(df)
Help me improve my code so I get all the data in text form.
Also, is it possible to crawl the data and save it to CSV like this:
Column 1: News_title: value
Column 2: category: accidents
Column 3: publish_date_time: June 28, 2022 at 13:31
Column 4: news: full news here
Go to the details page (here I use req2 to request each details page). I've built the pagination with a for loop and range(), so you can increase or decrease the number of pages in no time.
P.S. If you click on any title link you can see the details page, and all the required data items are scraped from those details pages.
import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
url = 'https://www.fleetmon.com/maritime-news/?page={page}'
data = []
for page in range(1, 11):
    req = requests.get(url.format(page=page), headers=headers)
    soup = BeautifulSoup(req.text, 'lxml')
    for link in soup.select('.news-headline h2 a'):
        link = 'https://www.fleetmon.com' + link.get('href')
        req2 = requests.get(link, headers=headers)
        soup2 = BeautifulSoup(req2.text, 'lxml')
        title = soup2.find('h1', class_="uk-article-title margin-t-0").text
        cat = soup2.select_one('p.uk-article-meta span a strong').text
        date = soup2.select_one('[class="uk-text-nowrap"]:nth-child(3)').text
        details = soup2.select_one('.uk-article-story ').get_text(strip=True)
        data.append({
            'title': title,
            'category': cat,
            'date': date,
            'details_news': details
        })

df = pd.DataFrame(data)  # .to_csv('news.csv', index=False)
print(df)
Output:
0 Cruise ship NORWEGIAN SUN hit iceberg, damaged... ... Cruise ship NORWEGIAN SUN hit an iceberg size ...
1 Yang Ming and HMM Were Accused of Collusion to... ... YM WARRANTY by ship spotter phduck2kYM WARRANT...
2 Fire in bulk carrier cargo hold, Florida ... At around 2350 LT Jun 26 firefighters responde...
3 Chlorine gas tank fell on Chinese cargo ship, ... ... Tank with 25 tons of chlorine gas fell onto ca...
4 Heavy vehicle fell onto cargo deck during offl... ... Heavy machinery vehicle (probably mobile crane...
.. ... ...
...
195 Yara Plans 15 Ammonia Bunkering Terminals in S... ... VIKING ENERGY by ship spotter PattayaVIKING EN...
196 World’s Largest Electric Cruise Ship Sets Sail... ... ©Wuxi Saisiyi Electric Technology,©Wuxi Saisiy...
197 The Supply Chain Crisis Brewing at Israeli Ports ... Port Haifa in FleetMon ExplorerPort Haifa in F...
198 CDC Drops Its “Cruise Ship Travel Health Notic... ... AIDADIVA by ship spotter Becks93AIDADIVA by sh...
199 Scorpio Tankers Take the Path of Shipboard Car... ... CORONA UTILITY by ship spotter canonbenqCORONA...
[200 rows x 4 columns]
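If you want the CSV columns exactly as you listed them, you can rename the dataframe columns built above and enable the export; a small sketch (the output file names are just examples):
# df is the dataframe built above
df = df.rename(columns={
    'title': 'News_title',
    'date': 'publish_date_time',
    'details_news': 'news',
})
df.to_csv('news.csv', index=False)

# plain-text copy, one article per block
with open('news.txt', 'w', encoding='utf-8') as f:
    for row in df.itertuples(index=False):
        f.write(f"{row.News_title}\n{row.category}\n{row.publish_date_time}\n{row.news}\n\n")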

BeautifulSoup: activate web button and continue scraping on new page

I'm working on a university project and need to get data online. I would like to get some data from this website:
https://www.footballdatabase.eu/en/transfers/-/2020-10-03
For the 3rd of October I managed to get the first 19 rows, but there are 6 pages in total and I'm struggling to activate the button for loading the next page.
This is the html code for the button:
2
My code so far:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
page = "https://www.footballdatabase.eu/en/transfers/-/2020-10-03"
pageTree = requests.get(page, headers=headers)
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

Players = pageSoup.find_all("span", {"class": "name"})
Team = pageSoup.find_all("span", {"class": "firstteam"})
Values = pageSoup.find_all("span", {"class": "transferamount"})
Values[0].text

PlayersList = []
TeamList = []
ValuesList = []

j = 1
for i in range(0, 20):
    PlayersList.append(Players[i].text)
    TeamList.append(Team[i].text)
    ValuesList.append(Values[i].text)
    j = j + 1

df = pd.DataFrame({"Players": PlayersList, "Team": TeamList, "Values": ValuesList})
Thank you very much!
You can use the requests module to simulate the Ajax call that the button makes. For example:
import requests
from bs4 import BeautifulSoup

data = {
    'date': '2020-10-03',
    'pid': 1,
    'page': 1,
    'filter': 'full',
}

url = 'https://www.footballdatabase.eu/ajax_transfers_show.php'

for data['page'] in range(1, 7):  # <--- adjust number of pages here.
    soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
    for line in soup.select('.line'):
        name = line.a.text
        first_team = line.select_one('.firstteam').a.text if line.select_one('.firstteam').a else 'Free'
        second_team = line.select_one('.secondteam').a.text if line.select_one('.secondteam').a else 'Free'
        amount = line.select_one('.transferamount').text
        print('{:<30} {:<20} {:<20} {}'.format(name, first_team, second_team, amount))
Prints:
Bruno Amione Belgrano  Hellas Vérone  1.7 M€
Ismael Gutierrez Betis Deportivo Atlético B 1 M€
Vitaly Janelt Bochum  Brentford  500 k€
Sven Ulreich Bayern Munich  Hambourg SV  500 k€
Salim Ali Al Hammadi Baniyas  Khor Fakkan  Prêt
Giovanni Alessandretti Ascoli U-20 Recanatese  Prêt
Gabriele Bellodi AC Milan U-20 Alessandria  Prêt
Louis Britton Bristol City B Torquay United  Prêt
Juan Brunetta Godoy Cruz  Parme  Prêt
Bobby Burns Barrow  Glentoran  Prêt
Bohdan Butko Shakhtar Donetsk  Lech Poznan  Prêt
Nicolò Casale Hellas Vérone  Empoli  Prêt
Alessio Da Cruz Parme  FC Groningue  Prêt
Dalbert Henrique Inter Milan  Rennes  Prêt
...and so on.
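If you would rather end up with a dataframe like in your original script, collect the rows into a list of dicts instead of printing; a sketch based on the same Ajax call (the column names are only suggestions):
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.footballdatabase.eu/ajax_transfers_show.php'
data = {'date': '2020-10-03', 'pid': 1, 'page': 1, 'filter': 'full'}
rows = []

for data['page'] in range(1, 7):
    soup = BeautifulSoup(requests.post(url, data=data).content, 'html.parser')
    for line in soup.select('.line'):
        first = line.select_one('.firstteam')
        second = line.select_one('.secondteam')
        rows.append({
            'Players': line.a.text,
            'From': first.a.text if first.a else 'Free',
            'To': second.a.text if second.a else 'Free',
            'Values': line.select_one('.transferamount').text,
        })

df = pd.DataFrame(rows)
print(df)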

Python nested for loops to retrieve CSS tag values

The tags from a web page are as follows:
<div class="lg_col MT5">
  <p>
    <span class="sp starGryB">4.4</span>
  </p>
  <p class="MT5 UC">
    <span class="gd10gb">141 Ratings</span>
  </p>
</div>
I am trying to retrieve the values "4.4" and "141 Ratings" for every div with class "lg_col MT5".
The nested for loop that I use isn't working as expected. It seems as if the hierarchy of the tags isn't taken into account.
import requests
import sys
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"}

def test_function():
    url = "http://www.burrp.com/chennai/search.html?q=buffet"
    source_code = requests.get(url, headers=HEADERS)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for tag in soup.select('div.lg_col.MT5'):
        for tag1 in soup.select('span.sp.starGryB'):
            try:
                print(tag1.string)
            except KeyError:
                pass
        for tag2 in soup.select('span.gd10gb'):
            try:
                print(tag2.string)
            except KeyError:
                pass

test_function()
The expected output is: 4.4 followed by 141 Ratings for each of the div tags in the webpage.
But the actual output is all of the starGryB values followed by all of the gd10gb values, repeated over and over again.
Use tag.select instead of soup.select if you want to look in just tag and not the entire soup.
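Applied to your function, that means scoping both inner look-ups to the current div; a minimal sketch:
for tag in soup.select('div.lg_col.MT5'):
    # search only inside this div, not the whole document
    score = tag.select_one('span.sp.starGryB')
    rating = tag.select_one('span.gd10gb')
    if score and rating:
        print(score.string, rating.string)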
Not for points.
This is another way to scrape it to avoid having to deal with loops.
import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"}
url = "http://www.burrp.com/chennai/search.html?q=buffet"
source_code = requests.get(url, headers=HEADERS)
plain_text = source_code.text
soup = BeautifulSoup(plain_text)

tags_1 = soup.find_all('span', class_='sp starGryB')
tags_2 = [tag.parent.parent.select('span.gd10gb') for tag in tags_1]
tags_3 = [tag.parent.parent.parent.select('a.gr24mb.UC') for tag in tags_1]

scores = [score.get_text() for score in tags_1]
ratings = [rating[0].get_text() if len(rating) > 0 else 'NA' for rating in tags_2]
names = [name[0].get_text().strip() for name in tags_3]

tags = zip(names, scores, ratings)
for a, b, c in tags:
    print(a, b, c)
Result:
Wild Amazon 2.9 27 Ratings
European Buffet NA NA
Flamingo 2.3 17 Ratings
The Holy Smoke 2.9 13 Ratings
Snow Park 2.6 14 Ratings
Dhabba Express 2.7 11 Ratings
The Yellow Chilli 2.7 6 Ratings
The Piano, The Savera Hotel 2.5 6 Ratings
Roasts & Grills, Green Park Hotel 2.3 6 Ratings
