How to scrape multiple pages with the page number in the URL - Python

How do I scrape multiple pages when the page number is in the URL?
For example:
https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page=1&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209
My code:
import requests
from bs4 import BeautifulSoup
from csv import writer
response = requests.get('https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page=1&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209')
soup = BeautifulSoup(response.text, 'html.parser')
posts = soup.find_all(class_='shop-srp-listings__inner')
with open('posts.csv', 'w') as csv_file:
    csv_writer = writer(csv_file)
    headers = ['title', 'color', 'price']
    csv_writer.writerow(headers)
    for post in posts:
        title = post.find(class_="listing-row__title").get_text().replace('\n', '').strip()
        # color = post.find("li").get_text().replace('\n', '')
        price = post.find("span", attrs={"class": "listing-row__price"}).get_text().replace('\n', '').strip()
        print(title, price)
        # csv_writer.writerow([title, color, price])
Thanks for your help

for page in range(1, 26):
    url = ('https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page=' + str(page) +
           '&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209')
    print(url)
    # requests.get(url)
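Putting that loop together with the parsing code from the question, the whole flow might look like the following sketch (assuming 25 result pages, as above, and the same CSS classes):

import requests
from bs4 import BeautifulSoup
from csv import writer

BASE_URL = ('https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page={}'
            '&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209')

with open('posts.csv', 'w', newline='') as csv_file:
    csv_writer = writer(csv_file)
    csv_writer.writerow(['title', 'price'])
    for page in range(1, 26):  # assuming 25 pages, as in the snippet above
        response = requests.get(BASE_URL.format(page))
        soup = BeautifulSoup(response.text, 'html.parser')
        for post in soup.find_all(class_='shop-srp-listings__inner'):
            title = post.find(class_='listing-row__title').get_text(strip=True)
            price = post.find('span', class_='listing-row__price').get_text(strip=True)
            csv_writer.writerow([title, price])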

Get the total number of pages, then iterate the request over every page.
import requests
from bs4 import BeautifulSoup
from csv import writer
with open('posts.csv', 'w') as csv_file:
    csv_writer = writer(csv_file)
    headers = ['title', 'color', 'price']
    csv_writer.writerow(headers)
    response = requests.get('https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page=1&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209')
    soup = BeautifulSoup(response.text, 'html.parser')
    # assumes the 'js-last-page' element's text holds the number of the last page
    number_of_pages = int(soup.find(class_='js-last-page').get_text())
    for page in range(1, number_of_pages + 1):
        response = requests.get('https://www.cars.com/for-sale/searchresults.action/?mdId=21811&mkId=20024&page=' + str(page) + '&perPage=100&rd=99999&searchSource=PAGINATION&showMore=false&sort=relevance&stkTypId=28880&zc=11209')
        soup = BeautifulSoup(response.text, 'html.parser')
        posts = soup.find_all(class_='shop-srp-listings__inner')
        for post in posts:
            title = post.find(class_="listing-row__title").get_text().replace('\n', '').strip()
            # color = post.find("li").get_text().replace('\n', '')
            price = post.find("span", attrs={"class": "listing-row__price"}).get_text().replace('\n', '').strip()
            print(title, price)
            # csv_writer.writerow([title, color, price])
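As an aside, rather than concatenating long query strings by hand, requests can build the URL from a params dict, which is easier to read and avoids dropped characters. A minimal sketch, reusing number_of_pages from above:

params = {
    'mdId': 21811, 'mkId': 20024, 'perPage': 100, 'rd': 99999,
    'searchSource': 'PAGINATION', 'showMore': 'false',
    'sort': 'relevance', 'stkTypId': 28880, 'zc': 11209,
}
for page in range(1, number_of_pages + 1):
    params['page'] = page
    # requests URL-encodes the dict and appends it as the query string
    response = requests.get('https://www.cars.com/for-sale/searchresults.action/', params=params)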

Related

Want to scrape each category individually, but the data comes out either one letter at a time or as a single paragraph

I want to extract Name & Position, Education, Contact number, and Email into separate CSV columns, but when I extract them I get either one column per letter or one column per paragraph (if I list it). Here is the code:
import requests
from bs4 import BeautifulSoup
from csv import writer
url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')
page = soup.find_all('p')
for i in page:
    i = i.text
    with open('page.csv', 'a', encoding='utf8', newline='') as f:
        thewriter = writer(f)
        thewriter.writerow(i)
You can use regex to pull out what you need:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
content = soup.find('div', {'id':'divContent'})
p_list = content.find_all('p')
rows = []
for p in p_list:
    string = p.text
    # raw string; '[a-zA-Z0-9]' and '@' fix what look like typos ('[a-zA-z1-9]', '#') in the email group
    text = re.search(r'(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-Z0-9].*@[\w].*\.[\w].*)', string).groups()
    name = text[0]
    edu = text[2]
    phone = text[4]
    email = text[5]
    row = {
        'name': name,
        'education': edu,
        'phone': phone,
        'email': email}
    rows.append(row)
df = pd.DataFrame(rows)
df.to_csv('page.csv', index=False)
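Note that re.search returns None for any paragraph that doesn't match, which would raise an AttributeError on .groups(). A defensive variant (with the regex above bound to a name such as pattern) might skip those paragraphs:

match = re.search(pattern, string)
if match is None:
    continue  # paragraph doesn't follow the Name/Education/Contact layout
text = match.groups()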

I can't get information with bs4

I'm trying to scrape the title, contact information (phone), and webpage from this URL:
https://partnerportal.fortinet.com/directory/search?l=Spain&p=1
&p=1 is the page number; there are 92 pages in total.
This is my code. I cannot get anything in the print output.
import datetime
import requests
from bs4 import BeautifulSoup
import csv
filename = "fichero" + datetime.datetime.now().strftime("%d-%m-%Y")+".csv"
with open(filename, "w+") as f:
    writer = csv.writer(f)
    writer.writerow(["Nombre Empresa", "Direccion Empresa", "Telefono Empresa"])
    for i in range(1, 3):
        r = requests.get('https://partnerportal.fortinet.com/directory/search?l=Spain&p=' + format(i))
        soup = BeautifulSoup(r.text, "html.parser")
        array_title = soup.select('div.panel panel-default div.col-sm-10 h3')
        array_address = soup.select('div.panel panel-default p.locator-partner-info')
        array_webpage = soup.find_all('a', class_='locator-parter-site', text=True)
        for iterator in range(0, len(array_title)):
            title = array_title[iterator].text.strip()
        for iterator2 in range(0, len(array_address)):
            address = array_address[iterator2].text.strip()
            print(title)
            print(address)
Instead of this:
array_title = soup.select('div.panel panel-default div.col-sm-10 h3')
array_address = soup.select('div.panel panel-default p.locator-partner-info')
try this (in a CSS selector, multiple classes on the same element are joined with dots, not spaces):
array_title = soup.select('div.panel-default div.col-sm-10 h3')
array_address = soup.select('div.panel-default p.locator-partner-info')
Also, you are printing title inside the address loop, which prints only the last value assigned to it (i.e. Tiws). Print it inside the title loop to see the correct result.
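Assuming the titles and addresses appear in matching order on the page, zip is a simple way to pair and print them together:

for title_tag, address_tag in zip(array_title, array_address):
    print(title_tag.text.strip())
    print(address_tag.text.strip())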
To get the data from the website:
import bs4
import requests

url = "https://partnerportal.fortinet.com/directory/search?l=Spain&p=1"
html = bs4.BeautifulSoup(requests.get(url).text, 'lxml')
results = [{
    'id': row.find('div', {'class': 'col-sm-10'}).find('h3').getText(),
    'phone': row.find('div', {'class': 'partner-info-box'})
                .find('p').getText().split('Phone: ')[1].split('\n')[0],
    'url': row.find('div', {'class': 'partner-info-box'}).find('a').get('href'),
} for row in html.find('div', {'class': 'row row-results'})
                 .find_all('div', {'class': 'col-sm-12'})]
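The snippet above only fetches page 1; since the question notes there are 92 pages, the same extraction could run in a loop over the p parameter. A sketch, assuming every page uses the same markup:

results = []
for p in range(1, 93):  # the question says there are 92 pages
    url = f"https://partnerportal.fortinet.com/directory/search?l=Spain&p={p}"
    html = bs4.BeautifulSoup(requests.get(url).text, 'lxml')
    container = html.find('div', {'class': 'row row-results'})
    if container is None:
        break  # stop early if a page comes back empty
    for row in container.find_all('div', {'class': 'col-sm-12'}):
        results.append({
            'id': row.find('div', {'class': 'col-sm-10'}).find('h3').getText(),
            'phone': row.find('div', {'class': 'partner-info-box'})
                        .find('p').getText().split('Phone: ')[1].split('\n')[0],
            'url': row.find('div', {'class': 'partner-info-box'}).find('a').get('href'),
        })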
To save the results to a .csv:
import pandas as pd
import datetime
filename = "fichero" + datetime.datetime.now().strftime("%d-%m-%Y")+".csv"
pd.DataFrame.from_dict(results).to_csv(filename, index=False)

Extracting names in custom <h2> tags, but each name is extracted many times - BeautifulSoup

I am trying to extract names in custom <h2> tags, but the names I want are extracted many times. How do I fix this problem and extract each name only once?
The page I am pulling data from is linked here.
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []

while True:
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if page_num > page_limit // 25:
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        for i in range(len(lawy_names)):
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class": "profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class": "photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class": "profile-website-header", "id": "firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name", "phone", "website", "logo"])
    wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries (and your links list is re-crawled in full on every page, which compounds them). You could probably assume that all entries have unique names, so a dictionary can hold all of your data: simply skip any entry whose name you have already seen. For example:
from bs4 import BeautifulSoup
import requests
import csv
lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False
    for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=id)
        if div_results:
            for result in div_results.find_all('div', class_='lawyer'):
                name = result.h2.get_text(strip=True)
                if name not in lawyers:
                    print(' ', name)
                    link = result.h2.a['href']
                    req_details = requests.get(link)
                    soup_details = BeautifulSoup(req_details.content, "lxml")
                    a_phone = soup_details.find("a", {"class": "profile-phone-header profile-contact-btn"}, href=True)
                    if a_phone:
                        phone = a_phone['href']
                    else:
                        phone = None
                    div_logo = soup_details.find("div", {"class": "photo-container"})
                    if div_logo.img:
                        logo = div_logo.img['src']
                    else:
                        logo = None
                    a_website = soup_details.find("a", {"class": "profile-website-header", "id": "firm_website"})
                    if a_website:
                        website = a_website.get_text(strip=True)
                    else:
                        website = None
                    lawyers[name] = [phone, logo, website]
                    found = True
    # Keep going until no new names are found
    if found:
        page_num += 1
    else:
        break

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
    for name, details in lawyers.items():
        csv_output.writerow([name, *details])

Unable to print the names and links in Python

I got stuck extracting the names and links: they return nothing, but the prices print fine.
The link I am scraping from is: https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
time.sleep(6)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_ = 'col-md-12 pr-0 pl-0')
for property in content:
    links = property.find('div', {'class': 'col-md-12 d-table-cell align-middle'})['href']
    name = property.find('img', class_='img-fluid').text.strip()
    price = property.find('div', class_='ProductPriceRating d-table-cell text-center pl-1 pr-1 align-middle').text.strip()
    print(name, links, price)
You can try it like this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import csv
url = 'https://sehat.com.pk/categories/Over-The-Counter-Drugs/Diarrhea-and-Vomiting-/'
r = requests.get(url)
# time.sleep(6)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_ = 'col-md-12 pr-0 pl-0')
# print(content)
header = ['url', 'item', 'price']
data = []
for property in content:
    link = [i['href'] for i in property.find_all("a")][0]
    title = [i.getText(strip=True) for i in property.find_all("a")][1]
    price = [i.getText(strip=True) for i in property.find_all('div', {'class': "ProductPriceRating"})][0]
    data.append([link, title, price])
print(data)
df = pd.DataFrame(data, columns=header)
df.to_csv("products.csv")
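If you would rather skip the pandas dependency, the same header and data lists can be written with the standard-library csv module:

import csv

with open('products.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(header)  # ['url', 'item', 'price'] from above
    w.writerows(data)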

How can I loop through more than one page and add each page to a CSV file?

How can I get data from more than one page into my CSV file?
from bs4 import BeautifulSoup
import requests
import csv
source = requests.get('https://software-overzicht.nl/amersfoort?page=1','https://software-overzicht.nl/amersfoort?page=2' ).text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('cms_scrape.csv','w')
csv_writter = csv.writer(csv_file)
csv_writter.writerow(['naambedrijf', 'adress'])
for search in soup.find_all('div', class_='company-info-top'):
    title = search.a.text
    adress = search.p.text
    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        print(title)
        csv_writter.writerow([title, adress])
csv_file.close()
You just need to move your requests.get() and that whole process into your loop of the page range:
from bs4 import BeautifulSoup
import requests
import csv
with open('C:/cms_scrape.csv', 'w', newline='') as f:
    csv_writter = csv.writer(f)
    csv_writter.writerow(['naambedrijf', 'adress'])
    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')
        for search in soup.find_all('div', class_='company-info-top'):
            title = search.a.text.strip()
            adress = search.p.text.strip()
            print(title)
            csv_writter.writerow([title, adress])
