I am web scraping indeed.nl for "Junior UX Designer" in "Nederland". The results for that search span 6 pages of vacancies, so at roughly 15 vacancies per page I should get around 90 vacancies in total.
However, when I write the results to a JSON file, I do get about 90 rows, but many of them are duplicates and many job vacancies don't appear in the file at all.
This is the code I'm using:
import requests
from bs4 import BeautifulSoup
import json

jobs_NL = []

for i in range(1, 7):
    url = "https://nl.indeed.com/vacatures?q=junior+ux+designer&l=Nederland&start=" + str(i)
    print("Getting page", i)
    page = requests.get(url)
    html = BeautifulSoup(page.content, "html.parser")
    job_title = html.find_all("table", class_="jobCard_mainContent")
    for item in job_title:
        title = item.find("h2").get_text()
        company = item.find("span", class_="companyName").get_text()
        location = item.find("div", class_="companyLocation").get_text()
        if item.find("div", class_="salary-snippet") != None:
            salary = item.find("div", class_="heading6 tapItem-gutter metadataContainer").get_text()
        else:
            salary = "No salary found"
        vacancy = {
            "title": title,
            "company": company,
            "location": location,
            "salary": salary
        }
        jobs_NL.append(vacancy)
You need to multiply the start variable by 10 to get the correct page:
import requests
import pandas as pd
from bs4 import BeautifulSoup

jobs_NL = []

for i in range(7):
    url = "https://nl.indeed.com/vacatures?q=junior+ux+designer&l=Nederland&start={}".format(
        10 * i
    )
    print("Getting page", i)
    page = requests.get(url)
    html = BeautifulSoup(page.content, "html.parser")
    job_title = html.find_all("table", class_="jobCard_mainContent")
    for item in job_title:
        title = item.find("h2").get_text()
        company = item.find("span", class_="companyName").get_text()
        location = item.find("div", class_="companyLocation").get_text()
        if item.find("div", class_="salary-snippet") != None:
            salary = item.find(
                "div", class_="heading6 tapItem-gutter metadataContainer"
            ).get_text()
        else:
            salary = "No salary found"
        vacancy = {
            "title": title,
            "company": company,
            "location": location,
            "salary": salary,
        }
        jobs_NL.append(vacancy)
df = pd.DataFrame(jobs_NL)
print(df)
Prints:
...
90 UX Designer | SaaS Platform StarApple Amersfoort €3.000 - €4.500 per maand
91 Frontend Developer JustBetter Alkmaar No salary found
92 Software Engineer Infinitas Learning Thuiswerken No salary found
93 UX Researcher Cognizant Technology Solutions Amsterdam No salary found
94 Junior Front End developer StarApple Zeist+1 plaats €2.500 - €3.000 per maand
95 nieuwSenior User Experience Designer Trimble Bodegraven No salary found
96 Senior UX Designer - Research Agency Found Professionals B.V. Amsterdam+1 plaats No salary found
97 HubSpot marketing lead Comaxx Waalre No salary found
98 nieuwJunior Technisch CRO Specialist Finest People Amsterdam West €50.000 per jaar
99 iOS developer Infoplaza Houten No salary found
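If duplicate rows still show up after fixing the pagination, a simple cleanup step is to deduplicate the collected vacancies, for example with pandas; a minimal sketch assuming the jobs_NL list built above:

import pandas as pd

df = pd.DataFrame(jobs_NL)
# keep the first occurrence of each (title, company, location) combination
df = df.drop_duplicates(subset=["title", "company", "location"]).reset_index(drop=True)
print(len(df), "unique vacancies")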
I am trying to scrape this website:
https://media.info/newspapers/titles
This website has a list of newspapers from A to Z. I first have to scrape all the URLs and then scrape some more information from each newspaper.
Below is my code to scrape the URLs of all the newspapers, A to Z:
driver.get('https://media.info/newspapers/titles')
time.sleep(2)

page_title = []
pages = driver.find_elements(By.XPATH, "//div[@class='pages']//a")
for i in pages:
    page_title.append(i.get_attribute("href"))

names = []
for i in page_title:
    driver.get(i)
    time.sleep(1)
    name = driver.find_elements(By.XPATH, "//div[@class='info thumbBlock']//a")
    for i in name:
        names.append(i.get_attribute("href"))
len(names) :-> 1688
names[0:5]
['https://media.info/newspapers/titles/abergavenny-chronicle',
'https://media.info/newspapers/titles/abergavenny-free-press',
'https://media.info/newspapers/titles/abergavenny-gazette-diary',
'https://media.info/newspapers/titles/the-abingdon-herald',
'https://media.info/newspapers/titles/academies-week']
Moving further, I need to scrape some information like owner, postal address, email, etc., and I wrote the code below.
test = []
c = 0

for i in names:
    driver.get(i)
    time.sleep(2)
    r = requests.get(i)
    soup = BeautifulSoup(r.content, 'lxml')
    try:
        name = driver.find_element(By.XPATH, "//*[@id='mainpage']/article/div[3]/h1").text
        try:
            twitter = driver.find_element(By.XPATH, "//*[@id='mainpage']/article/table[3]/tbody/tr/td[1]/a").text
        except:
            twitter = None
        try:
            twitter_followers = driver.find_element(By.XPATH, "//*[@id='mainpage']/article/table[3]/tbody/tr/td[1]/small").text.replace(' followers', '').lstrip('(').rstrip(')')
        except:
            twitter_followers = None
        people = []
        try:
            persons = driver.find_elements(By.XPATH, "//div[@class='columns']")
            for i in persons:
                people.append(i.text)
        except:
            people.append(None)
        try:
            owner = soup.select_one('th:contains("Owner") + td').text
        except:
            owner = None
        try:
            postal_address = soup.select_one('th:contains("Postal address") + td').text
        except:
            postal_address = None
        try:
            Telephone = soup.select_one('th:contains("Telephone") + td').text
        except:
            Telephone = None
        try:
            company_website = soup.select_one('th:contains("Official website") + td > a').get('href')
        except:
            company_website = None
        try:
            main_email = soup.select_one('th:contains("Main email") + td').text
        except:
            main_email = None
        try:
            personal_email = soup.select_one('th:contains("Personal email") + td').text
        except:
            personal_email = None
        r2 = requests.get(company_website)
        soup2 = BeautifulSoup(r2.content, 'lxml')
        try:
            is_wordpress = soup2.find("meta", {"name": "generator"}).get('content')
        except:
            is_wordpress = None
        news_Data = {
            "Name": name,
            "Owner": owner,
            "Postal Address": postal_address,
            "main Email": main_email,
            "Telephone": Telephone,
            "Personal Email": personal_email,
            "Company Wesbite": company_website,
            "Twitter_Handle": twitter,
            "Twitter_Followers": twitter_followers,
            "People": people,
            "Is Wordpress?": is_wordpress
        }
        test.append(news_Data)
        c = c + 1
        print("completed", c)
    except Exception as Argument:
        print(f"There is an exception with {i}")
        pass
I am using both Selenium and BeautifulSoup with requests to scrape the data, and the code fulfils the requirements.
Firstly, is it good practice to use Selenium and BeautifulSoup together in the same code like this?
Secondly, the code is taking too much time. Is there an alternate way to reduce its runtime?
BeautifulSoup is not slow: making requests and waiting for responses is slow.
You do not necessarily need a selenium/chromedriver setup for this task; it's doable with requests (or another Python HTTP library).
Yes, there are ways to speed it up, however keep in mind you are making requests to a server, which might become overwhelmed if you send too many requests at once, or it might block you.
Here is an example without selenium, which will accomplish what you're after:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
}

s = requests.Session()
s.headers.update(headers)

r = s.get('https://media.info/newspapers/titles')
soup = bs(r.text)
letter_links = [x.get('href') for x in soup.select_one('div.pages').select('a')]

newspaper_links = []
for x in tqdm(letter_links):
    soup = bs(s.get(x).text)
    ns_links = soup.select_one('div.columns').select('a')
    for n in ns_links:
        newspaper_links.append((n.get_text(strip=True), 'https://media.info/' + n.get('href')))

detailed_infos = []
for x in tqdm(newspaper_links[:50]):
    soup = bs(s.get(x[1]).text)
    owner = soup.select_one('th:contains("Owner")').next_sibling.select_one('a').get_text(strip=True) if soup.select_one('th:contains("Owner")') else None
    website = soup.select_one('th:contains("Official website")').next_sibling.select_one('a').get_text(strip=True) if soup.select_one('th:contains("Official website")') else None
    detailed_infos.append((x[0], x[1], owner, website))

df = pd.DataFrame(detailed_infos, columns=['Newspaper', 'Info Url', 'Owner', 'Official website'])
print(df)
Result in terminal:
Newspaper Info Url Owner Official website
0 Abergavenny Chronicle https://media.info//newspapers/titles/abergavenny-chronicle Tindle Newspapers abergavenny-chronicle-today.co.uk
1 Abergavenny Free Press https://media.info//newspapers/titles/abergavenny-free-press Newsquest Media Group freepressseries.co.uk
2 Abergavenny Gazette & Diary https://media.info//newspapers/titles/abergavenny-gazette-diary Tindle Newspapers abergavenny-chronicle-today.co.uk/tn/index.cfm
3 The Abingdon Herald https://media.info//newspapers/titles/the-abingdon-herald Newsquest Media Group abingdonherald.co.uk
4 Academies Week https://media.info//newspapers/titles/academies-week None academiesweek.co.uk
5 Accrington Observer https://media.info//newspapers/titles/accrington-observer Reach plc accringtonobserver.co.uk
6 Addlestone and Byfleet Review https://media.info//newspapers/titles/addlestone-and-byfleet-review Reach plc woking.co.uk
7 Admart & North Devon Diary https://media.info//newspapers/titles/admart-north-devon-diary Tindle Newspapers admart.me.uk
8 AdNews Willenhall, Wednesbury and Darlaston https://media.info//newspapers/titles/adnews-willenhall-wednesbury-and-darlaston Reach plc reachplc.com
9 The Advertiser https://media.info//newspapers/titles/the-advertiser DMGT dmgt.co.uk
10 Aintree and Maghull Champion https://media.info//newspapers/titles/aintree-and-maghull-champion Champion Media group champnews.com
11 Airdrie & Coatbridge World https://media.info//newspapers/titles/airdrie-coatbridge-world Reach plc icLanarkshire.co.uk
12 Airdrie and Coatbridge Advertiser https://media.info//newspapers/titles/airdrie-and-coatbridge-advertiser Reach plc acadvertiser.co.uk
13 Aire Valley Target https://media.info//newspapers/titles/aire-valley-target Newsquest Media Group thisisbradford.co.uk
14 Alcester Chronicle https://media.info//newspapers/titles/alcester-chronicle Newsquest Media Group redditchadvertiser.co.uk/news/alcester
15 Alcester Standard https://media.info//newspapers/titles/alcester-standard Bullivant Media redditchstandard.co.uk
16 Aldershot Courier https://media.info//newspapers/titles/aldershot-courier Guardian Media Group aldershot.co.uk
17 Aldershot Mail https://media.info//newspapers/titles/aldershot-mail Guardian Media Group aldershot.co.uk
18 Aldershot News & Mail https://media.info//newspapers/titles/aldershot-news-mail Reach plc gethampshire.co.uk/aldershot
19 Alford Standard https://media.info//newspapers/titles/alford-standard JPI Media skegnessstandard.co.uk
20 Alford Target https://media.info//newspapers/titles/alford-target DMGT dmgt.co.uk
21 Alfreton and Ripley Echo https://media.info//newspapers/titles/alfreton-and-ripley-echo JPI Media jpimedia.co.uk
22 Alfreton Chad https://media.info//newspapers/titles/alfreton-chad JPI Media chad.co.uk
23 All at Sea https://media.info//newspapers/titles/all-at-sea None allatsea.co.uk
24 Allanwater News https://media.info//newspapers/titles/allanwater-news HUB Media allanwaternews.co.uk
25 Alloa & Hillfoots Shopper https://media.info//newspapers/titles/alloa-hillfoots-shopper Reach plc reachplc.com
26 Alloa & Hillfoots Advertiser https://media.info//newspapers/titles/alloa-hillfoots-advertiser Dunfermline Press Group alloaadvertiser.com
27 Alloa and Hillfoots Wee County News https://media.info//newspapers/titles/alloa-and-hillfoots-wee-county-news HUB Media wee-county-news.co.uk
28 Alton Diary https://media.info//newspapers/titles/alton-diary Tindle Newspapers tindlenews.co.uk
29 Andersonstown News https://media.info//newspapers/titles/andersonstown-news Belfast Media Group irelandclick.com
30 Andover Advertiser https://media.info//newspapers/titles/andover-advertiser Newsquest Media Group andoveradvertiser.co.uk
31 Anfield and Walton Star https://media.info//newspapers/titles/anfield-and-walton-star Reach plc icliverpool.co.uk
32 The Anglo-Celt https://media.info//newspapers/titles/the-anglo-celt None anglocelt.ie
33 Annandale Herald https://media.info//newspapers/titles/annandale-herald Dumfriesshire Newspaper Group dng24.co.uk
34 Annandale Observer https://media.info//newspapers/titles/annandale-observer Dumfriesshire Newspaper Group dng24.co.uk
35 Antrim Times https://media.info//newspapers/titles/antrim-times JPI Media antrimtoday.co.uk
36 Arbroath Herald https://media.info//newspapers/titles/arbroath-herald JPI Media arbroathherald.com
37 The Arden Observer https://media.info//newspapers/titles/the-arden-observer Bullivant Media ardenobserver.co.uk
38 Ardrossan & Saltcoats Herald https://media.info//newspapers/titles/ardrossan-saltcoats-herald Newsquest Media Group ardrossanherald.com
39 The Argus https://media.info//newspapers/titles/the-argus Newsquest Media Group theargus.co.uk
40 Argyllshire Advertiser https://media.info//newspapers/titles/argyllshire-advertiser Oban Times Group argyllshireadvertiser.co.uk
41 Armthorpe Community Newsletter https://media.info//newspapers/titles/armthorpe-community-newsletter JPI Media jpimedia.co.uk
42 The Arran Banner https://media.info//newspapers/titles/the-arran-banner Oban Times Group arranbanner.co.uk
43 The Arran Voice https://media.info//newspapers/titles/the-arran-voice Independent News Ltd voiceforarran.com
44 The Art Newspaper https://media.info//newspapers/titles/the-art-newspaper None theartnewspaper.com
45 Ashbourne News Telegraph https://media.info//newspapers/titles/ashbourne-news-telegraph Reach plc ashbournenewstelegraph.co.uk
46 Ashby Echo https://media.info//newspapers/titles/ashby-echo Reach plc reachplc.com
47 Ashby Mail https://media.info//newspapers/titles/ashby-mail DMGT thisisleicestershire.co.uk
48 Ashfield Chad https://media.info//newspapers/titles/ashfield-chad JPI Media chad.co.uk
49 Ashford Adscene https://media.info//newspapers/titles/ashford-adscene DMGT thisiskent.co.uk
You can extract more information for each newspaper, as you wish - the above is just an example, going through the first 50 newspapers. Now if you want a multithreaded/async solution, I recommend you read the following, and apply it to your own scenario:
BeautifulSoup getting href of a list - need to simplify the script - replace multiprocessing
Lastly, Requests docs can be found here: https://requests.readthedocs.io/en/latest/
BeautifulSoup docs: https://beautiful-soup-4.readthedocs.io/en/latest/index.html
For TQDM: https://pypi.org/project/tqdm/
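For example, a minimal multithreaded sketch using concurrent.futures (an illustration of the idea, not taken from the linked answer) that fetches the detail pages collected in newspaper_links above in parallel, reusing the requests, headers, bs and pd names already defined:

from concurrent.futures import ThreadPoolExecutor

def fetch_owner(entry):
    # entry is a (name, url) tuple as collected in newspaper_links above
    name, url = entry
    soup = bs(requests.get(url, headers=headers).text, 'html.parser')
    owner_th = soup.select_one('th:contains("Owner")')
    owner = owner_th.find_next('td').get_text(strip=True) if owner_th else None
    return (name, url, owner)

# a small pool keeps the load on the server modest; raise max_workers with care
with ThreadPoolExecutor(max_workers=5) as pool:
    detailed = list(pool.map(fetch_owner, newspaper_links[:50]))

print(pd.DataFrame(detailed, columns=['Newspaper', 'Info Url', 'Owner']))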
import string

import requests
from bs4 import BeautifulSoup

names = []
for letter in string.ascii_lowercase:
    page = requests.get("https://media.info/newspapers/titles/starting-with/{}".format(letter))
    soup = BeautifulSoup(page.content, "html.parser")
    for i in soup.find_all("a"):
        if i['href'].startswith("/newspapers/titles/"):
            names.append(i['href'])
How can I iterate through the links, then access specific divs' content on each linked page and build something like a table, using Python?
I've only come this far, but the output is not right:
from bs4 import BeautifulSoup
import urllib3

http = urllib3.PoolManager()

base_url = 'http://www.warrencountyschools.org'
url = 'https://www.warrencountyschools.org/district_staff.aspx?action=search&location=29&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data)

# the second tr in the table - index starts at 0
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')

fieldContent = []
for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        abs_link = base_url + link
        profileURL = abs_link
        profilePagResp = http.request('GET', profileURL)
        soup2 = BeautifulSoup(profilePagResp.data)
        flDiv = soup2.findAll('div', {'class', 'field-label'})
        fcDiv = soup2.find('div', {'class', 'field-content'})
        for fl in flDiv:
            fieldContent.append(fcDiv.text)

print(fieldContent)
The output now consists of each name repeated as many times as the loop iterates, while it should be like this:
Name
Email
Website
Phone
Buildings
SomeName
email#
wwww.
78978978
SomeBuildin
@Antonio Santos, all profile data aren't in the same order, so you can only grab the data as follows:
Script
from bs4 import BeautifulSoup
import requests
import pandas as pd

base_url = 'http://www.warrencountyschools.org'
url = 'https://www.warrencountyschools.org/district_staff.aspx?action=search&location=29&department=0'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# the second tr in the table - index starts at 0
table = soup.find('table', {'class': 'content staff-table'})
rows = table.findAll('tr')
for tr in rows:
    cols = tr.findAll('td')
    if len(cols) >= 3:
        link = cols[2].find('a').get('href')
        abs_link = base_url + link
        print(abs_link)
        final_page = requests.get(abs_link)
        soup2 = BeautifulSoup(final_page.text, 'html.parser')
        profile_data = [x.get_text(strip=True) for x in soup2.findAll("div", "field-content")]
        print(profile_data)
Output:
http://www.warrencountyschools.org/staff/13650
['Greg Blewett', 'Greg.blewett@warren.kyschools.us', 'Access Staff Website', '270-746-7205', 'Greg Blewett - Construction-Carpentry - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/25689
['Adrian Boggess', 'Staff', 'adrian.boggess@warren.kyschools.us', 'Tike Barton - Computerized Manufacturing and Machining - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/2403
['Kim Coomer', 'Teacher', 'kim.coomer@warren.kyschools.us', '270-746-7205', 'Kim Coomer - Career Specialist - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/13651
['Rex Cundiff', 'Rex.cundiff@warren.kyschools.us', 'Access Staff Website', '270-746-7205', 'Rex Cundiff - Welding - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/13652
['Susan Devore', 'Susan.devore@warren.kyschools.us', 'Access Staff Website', '270-746-7205', 'Susan Devore - Information Technology - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/13666
['Michael Emberton', 'michael.emberton@warren.kyschools.us', 'Access Staff Website', 'Micheal Emberton - Automotive - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/25684
['Jacob Hildebrant', 'Staff', 'jacob.hildebrant@warren.kyschools.us', 'Greg Blewett - Construction-Carpentry - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/25346
['Jeton Hyseni', 'Staff', 'Jeton.Hyseni@warren.kyschools.us', 'Administrative Assistant - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/25041
['Jesse Muse', 'Staff', 'jesse.muse@warren.kyschools.us', 'Tike Barton - Computerized Manufacturing and Machining - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/2560
['Chris Riggs', 'Staff', 'chris.riggs@warren.kyschools.us', '467-7500', 'Administrative Assistant - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/24757
['Allison Runner', 'Staff', 'allison.runner@warren.kyschools.us', 'Administrative Assistant - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/25881
['Jacob Thomas', 'Staff', 'jacob.thomas@warren.kyschools.us', 'Greg Blewett - Construction-Carpentry - Warren County Area Technology Center']
http://www.warrencountyschools.org/staff/25880
['Brooke Weakly', 'Staff', 'brooke.bruington@warren.kyschools.us', 'Administrative Assistant - Warren County Area Technology Center']
How do I get two DIV's text, so that it becomes a table ...
Your approach to collecting the data is already very good, but to form key value pairs, which can be transferred into a table via pandas, for example, we have to consider the following points:
Prepare the headers
To extract the text we use a list comprehension and remove the trailing colons via slicing to get clean results.
keys = [x.text[:-1] for x in soup2.find_all('div', {'class', 'field-label'})]
Note: In BeautifulSoup the method findAll() was renamed to find_all(), so it is better to use the current syntax in new code.
Prepare the content and glue it together
Extract the contents in the same way as the headers and combine them into a dict via zip().
profile = dict(tuple(zip(keys,[x.get_text(strip=True) for x in soup2.find_all('div', {'class', 'field-content'})])))
Note: This is the crucial point for mapping the contents to the correct columns.
Adjustments to the website value
Since the URL of the website is not in the visible text (you won't get it with the text or get_text() method) but sits in the href of the <a>, we have to do a separate check and take the URL if it exists.
if (website := soup2.select_one('a:-soup-contains("Access Staff Website")')):
    profile['Website'] = base_url + website['href']
else:
    profile['Website'] = ''
Store profiles and create the table
Last but not least, we add the dict to our result list and can transfer it via pandas into a data frame. Using fillna() we can determine what should be in all empty cells and to_csv() saves the data frame as a csv file.
fieldContent.append(profile)
pd.DataFrame(fieldContent).fillna('')#.to_csv('profile.csv', index=False)
Note: In pandas you can determine which columns should appear in the output, do some sorting, manipulate data, and so on.
Example
The complete example uses CSS selectors instead of the find() / find_all() methods because, in my opinion, you can select in a more focused way, but the result is the same.
from bs4 import BeautifulSoup
import pandas as pd
import urllib3

http = urllib3.PoolManager()

base_url = 'http://www.warrencountyschools.org'
url = 'https://www.warrencountyschools.org/district_staff.aspx?action=search&location=29&department=0'
response = http.request('GET', url)
soup = BeautifulSoup(response.data)

fieldContent = []

for a in soup.select('a:-soup-contains("[Profile]")'):
    profilePagResp = http.request('GET', base_url + a['href'])
    soup = BeautifulSoup(profilePagResp.data)
    keys = [x.text[:-1] for x in soup.select('.field-label')]
    profile = dict(tuple(zip(keys, [x.get_text(strip=True) for x in soup.select('.field-content')])))

    if (website := soup.select_one('a:-soup-contains("Access Staff Website")')):
        profile['Website'] = base_url + website['href']
    else:
        profile['Website'] = ''

    fieldContent.append(profile)

pd.DataFrame(fieldContent).fillna('')[['Name','Email','Website','Phone','Buildings']]#.to_csv('profile.csv', index=False)
Output
Name           | Email                              | Website                                      | Phone        | Buildings
Greg Blewett   | Greg.blewett@warren.kyschools.us   | http://www.warrencountyschools.org/olc/13650 | 270-746-7205 | Greg Blewett - Construction-Carpentry - Warren County Area Technology Center
Adrian Boggess | adrian.boggess@warren.kyschools.us |                                              |              | Tike Barton - Computerized Manufacturing and Machining - Warren County Area Technology Center
Kim Coomer     | kim.coomer@warren.kyschools.us     |                                              | 270-746-7205 | Kim Coomer - Career Specialist - Warren County Area Technology Center
Rex Cundiff    | Rex.cundiff@warren.kyschools.us    | http://www.warrencountyschools.org/olc/13651 | 270-746-7205 | Rex Cundiff - Welding - Warren County Area Technology Center
You could use an async library like trio, as this task is more I/O bound: you will be awaiting responses for requests to individual staff pages. I have added a custom sort, based on last name, in an attempt to recreate the original result order. For larger result sets this might not match perfectly in case of ties; you might then extend it by adding a first-name sort. The additional sort column can be dropped.
There does seem to be a FIFO processing instruction within trio, but I haven't explored that.
import pandas as pd
import httpx
import trio
from bs4 import BeautifulSoup

LINK = 'https://www.warrencountyschools.org/district_staff.aspx?action=search&location=29&department=0'
ALL_INFO = []


async def get_soup(content):
    return BeautifulSoup(content, 'lxml')


async def get_staff_info(link, nurse):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(link)
        soup = await get_soup(r.text)
        info_items = ['Name', 'Email', 'Website', 'Phone', 'Buildings']
        staff_info = {}
        for key in info_items:
            try:
                if key == 'Website':
                    value = 'https://www.warrencountyschools.org' + soup.select_one(
                        f'.field-label:-soup-contains("{key}:") + .field-content > a')['href']
                else:
                    value = soup.select_one(
                        f'.field-label:-soup-contains("{key}:") + .field-content').text.strip()
            except:
                value = 'N/A'
            finally:
                staff_info[key.lower()] = value
        ALL_INFO.append(staff_info)


async def get_links(LINK, nurse):
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(LINK)
        soup = await get_soup(r.text)
        for x in soup.select('#ctl00_ctl00_MasterContent_ContentColumnRight_ctl01_dg_staff .staff-profile-button > a'):
            nurse.start_soon(
                get_staff_info, 'https://www.warrencountyschools.org' + x['href'], nurse)


async def main():
    async with trio.open_nursery() as nurse:
        nurse.start_soon(get_links, LINK, nurse)


if __name__ == "__main__":
    trio.run(main)

    df = pd.DataFrame(ALL_INFO)
    df['sort_value'] = [i.strip().split(' ')[-1] for i in df['name'].tolist()]
    df.sort_values(by=['sort_value'], ascending=True, inplace=True)
    # print(df)
    df.to_csv('staff.csv', encoding='utf-8-sig', index=False)
Output csv:
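As mentioned above, the last-name sort could be extended with a first-name tiebreak; a minimal sketch, assuming the same ALL_INFO list and lowercase 'name' column produced by the script:

df = pd.DataFrame(ALL_INFO)
# split the display name once, then sort by (last name, first name)
parts = df['name'].str.strip().str.split()
df['last_name'] = parts.str[-1]
df['first_name'] = parts.str[0]
df.sort_values(by=['last_name', 'first_name'], inplace=True)
# the helper columns can be dropped before writing the csv
df.drop(columns=['last_name', 'first_name']).to_csv('staff.csv', encoding='utf-8-sig', index=False)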
I want to get all the products on this page:
nike.com.br/snkrs#estoque
My python code is this:
produtos = []

def aviso():
    print("Started!")
    request = requests.get("https://www.nike.com.br/snkrs#estoque")
    soup = bs4(request.text, "html.parser")
    links = soup.find_all("a", class_="btn", text="Comprar")
    links_filtred = list(set(links))
    for link in links_filtred:
        if(produto not in produtos):
            request = requests.get(f"{link['href']}")
            soup = bs4(request.text, "html.parser")
            produto = soup.find("div", class_="nome-preco-produto").get_text()
            if(code_formated == ""):
                code_formated = "\u200b"
            print(f"Nome: {produto} Link: {link['href']}\n")
            produtos.append(link["href"])

aviso()
This code gets products from the page, but not all of them. I suspect the content is dynamic. How can I get them all with requests and BeautifulSoup? I don't want to use Selenium or an automation library, and I don't want to change my code much because it's almost done. How do I do that?
Do not call requests.get repeatedly if you are dealing with the same host: use a requests.Session instead, so the underlying connection is kept alive and reused instead of being re-established for every request.
import requests
from bs4 import BeautifulSoup
import pandas as pd


def main(url):
    allin = []
    with requests.Session() as req:
        for page in range(1, 6):
            params = {
                'p': page,
                'demanda': 'true'
            }
            r = req.get(url, params=params)
            soup = BeautifulSoup(r.text, 'lxml')
            goal = [(x.find_next('h2').get_text(strip=True, separator=" "), x['href'])
                    for x in soup.select('.aspect-radio-box')]
            allin.extend(goal)
    df = pd.DataFrame(allin, columns=['Title', 'Url'])
    print(df)


main('https://www.nike.com.br/Snkrs/Feed')
Output:
Title Url
0 Dunk High x Fragment design Black https://www.nike.com.br/dunk-high-x-fragment-d...
1 Dunk Low Infantil (16-26) City Market https://www.nike.com.br/dunk-low-infantil-16-2...
2 ISPA Flow 2020 Desert Sand https://www.nike.com.br/ispa-flow-2020-153-169...
3 ISPA Flow 2020 Pure Platinum https://www.nike.com.br/ispa-flow-2020-153-169...
4 Nike iSPA Men's Lightweight Packable Jacket https://www.nike.com.br/nike-ispa-153-169-211-...
.. ... ...
115 Air Jordan 1 Mid Hyper Royal https://www.nike.com.br/air-jordan-1-mid-153-1...
116 Dunk High Orange Blaze https://www.nike.com.br/dunk-high-153-169-211-...
117 Air Jordan 5 Stealth https://www.nike.com.br/air-jordan-5-153-169-2...
118 Air Jordan 3 Midnight Navy https://www.nike.com.br/air-jordan-3-153-169-2...
119 Air Max 90 Bacon https://www.nike.com.br/air-max-90-153-169-211...
[120 rows x 2 columns]
To get the data you can send a request to:
https://www.nike.com.br/Snkrs/Estoque?p=<PAGE>&demanda=true
providing a page number between 1 and 5 as the p= parameter in the URL.
For example, to print the links, you can try:
import requests
from bs4 import BeautifulSoup

url = "https://www.nike.com.br/Snkrs/Estoque?p={page}&demanda=true"

for page in range(1, 6):
    response = requests.get(url.format(page=page))
    soup = BeautifulSoup(response.content, "html.parser")
    print(soup.find_all("a", class_="btn", text="Comprar"))
I'm trying to scrape some real estate articles from the following website:
https://lifebridgecapital.com/podcast/
I manage to get the links I need, but I am struggling with the pagination on the site. I'm trying to scrape every link under each category ('building relationships', 'building your team', 'capital rising', etc.). Some of these category pages have pagination and some do not. I tried the following code, but it only gives me the links from page 2.
from requests_html import HTMLSession


def tag_words_links(url):
    global _session
    _request = _session.get(url)
    tags = _request.html.find('a.tag-cloud-link')
    links = []
    for link in tags:
        links.append({
            'Tags': link.find('a', first=True).text,
            'Links': link.find('a', first=True).attrs['href']
        })
    return links


def parse_tag_links(link):
    global _session
    _request = _session.get(link)
    articles = []
    try:
        next_page = _request.html.find('link[rel="next"]', first=True).attrs['href']
        _request = _session.get(next_page)
        article_links = _request.html.find('h3 a')
        for article in article_links:
            articles.append(article.find('a', first=True).attrs['href'])
    except:
        _request = _session.get(link)
        article_links = _request.html.find('h3 a')
        for article in article_links:
            articles.append(article.find('a', first=True).attrs['href'])
    return articles


if __name__ == '__main__':
    _session = HTMLSession()
    url = 'https://lifebridgecapital.com/podcast/'
    links = tag_words_links(url)
    print(parse_tag_links('https://lifebridgecapital.com/tag/multifamily/'))
To print the title of every article under each tag, and under each page of that tag, you can use this example:
import requests
from bs4 import BeautifulSoup

url = "https://lifebridgecapital.com/podcast/"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

tag_links = [a["href"] for a in soup.select(".tagcloud a")]

for link in tag_links:
    while True:
        print(link)
        print("-" * 80)

        soup = BeautifulSoup(requests.get(link).content, "html.parser")
        for title in soup.select("h3 a"):
            print(title.text)
        print()

        next_link = soup.select_one("a.next")
        if not next_link:
            break

        link = next_link["href"]
Prints:
...
https://lifebridgecapital.com/tag/multifamily/
--------------------------------------------------------------------------------
WS890: Successful Asset Classes In The Current Market with Jerome Maldonado
WS889: How To Avoid A $1,000,000 Mistake with Hugh Odom
WS888: Value-Based On BRRRR VS Cap Rate with John Stoeber
WS887: Slow And Steady Still Wins The Race with Nicole Pendergrass
WS287: Increase Your NOI by Converting Units to Short Term Rentals with Michael Sjogren
WS271: Investment Strategies To Survive An Economic Downturn with Vinney Chopra
WS270: Owning a Construction Company Creates More Value with Abraham Ng’hwani
WS269: The Impacts of Your First Deal with Kyle Mitchell
WS260: Structuring Deals To Get The Best Return On Investment with Jeff Greenberg
WS259: Capital Raising For Newbies with Bryan Taylor
https://lifebridgecapital.com/tag/multifamily/page/2/
--------------------------------------------------------------------------------
WS257: Why Ground Up Development is the Best Investment with Sam Bates
WS256: Mobile Home Park Investing: The Real Deal with Jefferson Lilly
WS249: Managing Real Estate Paperwork Successfully with Krista Testani
WS245: Multifamily Syndication with Venkat Avasarala
WS244: Passive Investing In Real Estate with Kay Kay Singh
WS243: Getting Started In Real Estate Brokerage with Tyler Chesser
WS213: Data Analytics In Real Estate with Raj Tekchandani
WS202: Ben Leybovich and Sam Grooms on The Advantages Of A Partnership In Real Estate Business
WS199: Financial Freedom Through Real Estate Investing with Rodney Miller
WS197: Loan Qualifications: How The Whole Process Works with Vinney Chopra
https://lifebridgecapital.com/tag/multifamily/page/3/
--------------------------------------------------------------------------------
WS172: Real Estate Syndication with Kyle Jones
...
So far my code can scrape the number of items on sale in the category Charms, but I cannot make it print out the name of the category.
The site uses an infinite scroller, but I managed to identify where the pages are, so the site URL contains {}, which is filled in by the while loop.
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"

def fetch_items(link,page):
    Total_items = 0
    while page<=1000:
        #print("current page no: ",page)
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        list_total = soup.select('.grid-tile .price-standard')
        Total_items += len(list_total)
        #print(Total_items)
        page+=30
    category_tags = soup.select('span.breadcrumb-element')
    return Total_items
    return category_tags

if __name__ == '__main__':
    page = 0
    product_list = []
    total_items = fetch_items(url,page)
    #print number of items on sale
    print(total_items)
    print(category_tags)
Here's what I need:
I need to print out the category of the scraped items, which can be found using this line:
category_tags = soup.select('span.breadcrumb-element')
But I cannot make it print somehow.
While we're at it, how can I make the code print out ALL the items and not just the items on sale?
Thank you.
EDIT:
So, building on one of the answers' code, I ended up with this.
import requests
from bs4 import BeautifulSoup
import re
url1 = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
url2 = "https://us.pandora.net/en/bracelets/?sz=30&start={}&format=page-element"
url3 = "https://us.pandora.net/en/rings/?sz=30&start={}&format=page-element"
url4 = "https://us.pandora.net/en/necklaces/?sz=30&start={}&format=page-element"
url5 = "https://us.pandora.net/en/earrings/?sz=30&start={}&format=page-element"
#res = requests.get(link.format(url1),headers={"User-Agent":"Mozilla/5.0"})
soup1 = BeautifulSoup(requests.get(url1.format(0)).text, 'lxml')
soup2 = BeautifulSoup(requests.get(url2.format(0)).text, 'lxml')
soup3 = BeautifulSoup(requests.get(url3.format(0)).text, 'lxml')
soup4 = BeautifulSoup(requests.get(url4.format(0)).text, 'lxml')
soup5 = BeautifulSoup(requests.get(url5.format(0)).text, 'lxml')
total_items1 = ''.join(re.findall(r'\d', soup1.select_one('span.products-count').text))
total_items2 = ''.join(re.findall(r'\d', soup2.select_one('span.products-count').text))
total_items3 = ''.join(re.findall(r'\d', soup3.select_one('span.products-count').text))
total_items4 = ''.join(re.findall(r'\d', soup4.select_one('span.products-count').text))
total_items5 = ''.join(re.findall(r'\d', soup5.select_one('span.products-count').text))
#categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')
#total_items_sale1 = ''.join(re.findall(r'\d', soup1.select_one('.grid-tile .price-standard')))
#total_items_sale1
#total_items_sale1
#total_items_sale1
#total_items_sale1
#print('Categories:')
#for category in categories:
#print('\t{}'.format(category))
print('\nTotal Charms: {}'.format(total_items1))
print('\nTotal Bracelets: {}'.format(total_items2))
print('\nTotal Rings: {}'.format(total_items3))
print('\nTotal Necklaces: {}'.format(total_items4))
print('\nTotal Earrings: {}'.format(total_items5))
I know it looks horrible. How can we shorten it?
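One way to shorten those five near-identical blocks is to loop over the category slugs; a minimal sketch reusing the URL pattern and the products-count selector from above:

import re
import requests
from bs4 import BeautifulSoup

categories = ["charms", "bracelets", "rings", "necklaces", "earrings"]
base_url = "https://us.pandora.net/en/{}/?sz=30&start=0&format=page-element"

for category in categories:
    soup = BeautifulSoup(requests.get(base_url.format(category)).text, "lxml")
    count = "".join(re.findall(r"\d", soup.select_one("span.products-count").text))
    print("Total {}: {}".format(category.capitalize(), count))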
Looking at the result from the server, you don't have to loop through all the pages. All the info you need is on one page:
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"
sale_url = "https://us.pandora.net/en/sale/sale-charms/?sz=30&start={}&format=page-element"

soup = BeautifulSoup(requests.get(url.format(0)).text, 'lxml')
sale_soup = BeautifulSoup(requests.get(sale_url.format(0)).text, 'lxml')

total_items = soup.select_one('#products_count')['value']
total_sale_items = sale_soup.select_one('#products_count')['value']
categories = [tag['title'].strip() for tag in soup.select('.refinement-link[title]')]

print('Categories:')
for category in categories:
    print('\t{}'.format(category))

print('\nTotal items: {}'.format(total_items))
print('Total sale items: {}'.format(total_sale_items))
Prints:
Categories:
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
$0 - $50
$50 - $100
$100 - $150
$150 & Over
Charms
New Arrivals
Best Sellers
Clips
Spacers
Dangles
Safety Chains
Alphabet & Symbols
Animals & Pets
Birthday
Touch of Color
Disney
Family
Holidays
Christmas
Inspirational
Symbols of Love
Nature
Passions
Vacation & Travel
Wedding & Anniversary
Last Chance
Pandora Reflexions™
Total items: 959
Total sale items: 376
You can't have 2 returns there. The function stops at the first return, so if you want to return multiple objects, you can put them in one return statement. You also need to do the appending inside the loop; you have it outside of your loop. Note, I changed the limit from 1000 to 300 just to test it.
Secondly, I think what you want is the text.
To print all the items, you'll need to get each item, not just the ones with 'price-standard'.
import requests
from bs4 import BeautifulSoup

url = "https://us.pandora.net/en/charms/?sz=30&start={}&format=page-element"

def fetch_items(link,page):
    Total_items = 0
    categories = []
    while page<=300:
        #print("current page no: ",page)
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        list_total = soup.select('.grid-tile .price-standard')
        Total_items += len(list_total)
        #print(Total_items)
        page+=30
        print(page)
        category_tags = soup.select('span.breadcrumb-element')[0]
        try:
            categories.append(category_tags.text)
        except:
            categories.append('N/A')
    return Total_items, categories

page = 0
total_items = fetch_items(url,page)

#print number of items on sale
print(total_items[0])
print(total_items[1])
Here's how you can go about getting all the products:
def fetch_items(link,page):
    Total_items = 0
    names = []
    categories = []
    prices = []
    sales = []
    while page<=300:
        res = requests.get(link.format(page),headers={"User-Agent":"Mozilla/5.0"})
        soup = BeautifulSoup(res.text,"lxml")
        products = soup.find_all("li", class_=lambda value: value and value.startswith("grid-tile"))
        for each in products:
            Total_items += 1
            category = each.find('div', {'class':'product-tile'})['data-cgid']
            name = each.find('div', {'class':'product-name'}).text.strip()
            price = each.find('div', {'class':'product-pricing'}).text.strip()
            sale_price = each.find('span', {'class':'price-sales'}).text.strip()
            names.append(name)
            categories.append(category)
            prices.append(price)
            sales.append(sale_price)
        print(page)
        page+=30
    return Total_items, names, categories, prices, sales

results = fetch_items(url,page)
Not sure how you want those results, though. But you can dump them into a table if you'd like:
import pandas as pd

df = pd.DataFrame(
    {'name': results[1],
     'category': results[2],
     'price': results[3],
     'sale': results[4]})
Output:
print (df.head(10).to_string())
name category price sale
0 American Icons Dangle Charm charms $60.00 $60.00
1 Disney Pixar, Toy Story, Buzz Lightyear Dangle... charms $70.00 $70.00
2 Disney Pixar, Toy Story, Woody Dangle Charm charms $60.00 $60.00
3 Spinning Globe Dangle Charm charms $60.00 $60.00
4 Elephant Charm charms $45.00 $45.00
5 Canada Dangle Charm, Pandora Rose™ charms $65.00 $65.00
6 Sparkling Monkey Charm charms $70.00 $70.00
7 Propeller Plane Dangle Charm charms $55.00 $55.00
8 Spotted Heart Charm charms $50.00 $50.00
9 Pink Travel Bag Charm charms $50.00 $50.00