import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.asta.org/membership/directory-search-details?memId=900312276"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

data = []
for item in soup.find_all("div", {"class": "asta-member"}):
    name = item.find("h4", {"class": "asta-member__name"}).text
    data.append([name])

for item in soup.find_all("div", {"class": "asta-member_title"}):
    email = item.find("div")[2].text
    data.append([email])

with open("contacts.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "email"])
    writer.writerows(data)
The script scrapes the name data fine, since the name sits in its own class, but I cannot figure out what class or other selector to put in the script to scrape the email. I have tried everything I can think of.
Suggestion:
Since the div for the email has no class or id, you can use the .contents attribute in BeautifulSoup, which lists all of a tag's children. In this case we take all the content of the "asta-member" div, since the email div sits under it, and extract the email text from it.
Code:
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.asta.org/membership/directory-search-details?memId=900312276"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

data = []
for item in soup.find_all("div", {"class": "asta-member"}):
    name = item.find("h4", {"class": "asta-member__name"}).text
    data.append(name)

# Code changes here
for item in soup.find_all("div", {"class": "asta-member"}):
    email = item.contents[7].text.strip()
    data.append(email)

with open("contacts.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "email"])
    writer.writerow(data)

# Reading the CSV file back to check the result
df = pd.read_csv('/content/contacts.csv')
df
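One caveat: positional indexing like item.contents[7] is brittle, because any added whitespace node or markup change shifts the index. A more defensive variant (a sketch, assuming the email text appears somewhere inside the asta-member div) pulls the address out by pattern instead of by position:

import re

import requests
from bs4 import BeautifulSoup

url = "https://www.asta.org/membership/directory-search-details?memId=900312276"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# Match anything that looks like an email address in the member block.
EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")

for item in soup.find_all("div", {"class": "asta-member"}):
    match = EMAIL_RE.search(item.get_text(" ", strip=True))
    email = match.group(0) if match else ""
    print(email)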
Reference:
https://beautiful-soup-4.readthedocs.io/en/latest/#contents-and-children
I want to get info from a website by web scraping with Python (which I am learning now), but it prints the classes (which I got the info from) first in the CSV and then prints the information I actually want. I watched the YouTube video many times and wrote the same code, but I don't get the same result as in the video. Can anyone help me?
This is an image link for the CSV to show you how it looks when I click Run.
Code:
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest
Job_titles = []
Company_names = []
Location_names = []
Job_skills = []
Links = []
result = requests.get("https://wuzzuf.net/search/jobs/?q=python&a=hpb")
src = result.content
soup = BeautifulSoup(src, "lxml")
Job_titles = soup.find_all('h2', {"class":"css-m604qf"})
Company_names = soup.find_all('a', {"class":"css-17s97q8"})
Location_names = soup.find_all('span', {"class":"css-5wys0k"})
Job_skills = soup.find_all("div", {'class':"css-y4udm8"})
for i in range(len(Company_names)):
Job_titles.append(Job_titles[i].text)
Company_names.append(Company_names[i].text)
Location_names.append(Location_names[i].text)
Job_skills.append(Job_skills[i].text)
file_list = [Job_titles, Company_names, Location_names, Job_skills,]
exported = zip_longest(*file_list)
with open("C:/Users/Saleh saleh/Documents/jobtest.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["Job titles", "Company names", "Location", "Skills", "Links"])
wr.writerows(exported)
To get the information from the site, you can use the following example:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://wuzzuf.net/search/jobs/?q=python&a=hpb"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

with open("data.csv", "w") as f_in:
    writer = csv.writer(f_in)
    writer.writerow(
        ["Job titles", "Company names", "Location", "Skills", "Links"]
    )
    for title in soup.select("h2 > a"):
        company_name = title.find_next("a")
        location = company_name.find_next("span")
        info = location.find_next("div", {"class": None})
        writer.writerow(
            [
                title.text,
                company_name.text,
                location.text,
                ",".join(
                    a.text.replace("·", "").strip() for a in info.select("a")
                ),
                title["href"],
            ]
        )
Creates data.csv (screenshot from LibreOffice).
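The key difference from the question's code is that each row is anchored on the job-title link and the neighbouring fields are reached with find_next, which searches forward through the document from that element, so each row is written out as it is assembled. The bug in the question's version is that it appends each tag's .text back onto the same list that holds the tags, so the CSV shows the tag objects first and the text after them. A minimal fix in the original style (a sketch keeping the question's class names, which may have changed since) is to collect the text into separate lists:

import requests
from bs4 import BeautifulSoup

url = "https://wuzzuf.net/search/jobs/?q=python&a=hpb"
soup = BeautifulSoup(requests.get(url).content, "lxml")

job_title_tags = soup.find_all('h2', {"class": "css-m604qf"})
company_name_tags = soup.find_all('a', {"class": "css-17s97q8"})

# Separate output lists: never append text back onto the lists of tags.
job_titles = [tag.text for tag in job_title_tags]
company_names = [tag.text for tag in company_name_tags]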
I'm trying to scrape reviews from TrustPilot, but the code always returns blank sheets containing only the headers/categories I specified. Could someone help me with this?
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
from selenium import webdriver

driver = webdriver.Chrome()

names = []    # list to store the reviewer's name
headers = []  # list to store the review's title
bodies = []   # list to store the review's text
ratings = []  # list to store the review's rating
dates = []    # list to store the review's date

# driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = driver.page_source
soup = BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a'))

for a in soup.findAll('a', href=True, attrs={'class': 'reviews-container'}):
    name = a.find('div', attrs={'class': 'consumer-information_name'})
    header = a.find('div', attrs={'class': 'review-content_title'})
    body = a.find('div', attrs={'class': 'review-content_text'})
    rating = a.find('div', attrs={'class': 'star-rating star-rating--medium'})
    date = a.find('div', attrs={'class': 'review-date--tooltip-target'})
    names.append(name.text)
    headers.append(header.text)
    bodies.append(body.text)
    ratings.append(rating.text)
    dates.append(date.text)
print('webpage, no errors')

df = pd.DataFrame({'User Name': names, 'Header': headers, 'Body': bodies, 'Rating': ratings, 'Date': dates})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')
print('csv made')
The issue is that soup.findAll('a', href=True, attrs={'class':'reviews-container'}) does not find any results, so the loop runs zero times. Make sure you are using the correct tags and class names. You also don't need the loop, because BeautifulSoup's find_all can collect each field directly. I used the requests module to open the web page, though it shouldn't make a difference.
from bs4 import BeautifulSoup
import requests
req = requests.get("https://www.trustpilot.com/review/birchbox.com?page=2")
content = req.content
soup = BeautifulSoup(content, "lxml")
names = soup.find_all('div', attrs={'class': 'consumer-information__name'})
headers = soup.find_all('h2', attrs={'class':'review-content__title'})
bodies = soup.find_all('p', attrs={'class':'review-content__text'})
ratings = soup.find_all('div', attrs={'class':'star-rating star-rating--medium'})
dates = soup.find_all('div', attrs={'class':'review-content-header__dates'})
And now each list has 20 entries.
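From there the lists can be zipped back into the DataFrame and CSV the question was building. Continuing from the lists above (a sketch; the star-rating div may not expose its rating as plain text, so that column could come out empty):

import pandas as pd

df = pd.DataFrame({
    'User Name': [n.get_text(strip=True) for n in names],
    'Header': [h.get_text(strip=True) for h in headers],
    'Body': [b.get_text(strip=True) for b in bodies],
    'Rating': [r.get_text(strip=True) for r in ratings],
    'Date': [d.get_text(strip=True) for d in dates],
})
df.to_csv('reviews02.csv', index=False, encoding='utf-8')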
I am new to scraping/BS4 and am having a problem getting this CSV file to list all of the members: the CSV repeats one member's information over multiple lines. If anyone has any ideas on how to fix this, it would be greatly appreciated.
import requests
import csv
from bs4 import BeautifulSoup

r = requests.get('https://vermontmaple.org/basic-member-list')
soup = BeautifulSoup(r.text, 'html.parser')

with open('list.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address', 'phone'])
    for company in soup.findAll('div', class_='directory_item selected'):
        maple_name = soup.find('div', class_='name').get_text(strip=True)
        maple_address = soup.find('div', class_='address').get_text(strip=True)
        maple_phone = soup.find('div', class_='phone').get_text(strip=True)
        writer.writerow([maple_name, maple_address, maple_phone])

f.close()
Change soup.find to company.find inside the for loop:
for company in soup.findAll('div', class_='directory_item selected'):
    maple_name = company.find('div', class_='name').get_text(strip=True)
    maple_address = company.find('div', class_='address').get_text(strip=True)
    maple_phone = company.find('div', class_='phone').get_text(strip=True)
There is also no need for f.close(): the with statement closes the file automatically.
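The reason the original repeats one member is that soup.find always searches the whole document from the top, so it returns the first member on every iteration, while company.find searches only inside the current directory_item div. A minimal standalone sketch of the difference, using made-up markup just for illustration:

from bs4 import BeautifulSoup

html = """
<div class="item"><div class="name">Alice</div></div>
<div class="item"><div class="name">Bob</div></div>
"""
soup = BeautifulSoup(html, "html.parser")

for item in soup.find_all("div", class_="item"):
    print(soup.find("div", class_="name").get_text(),  # always "Alice"
          item.find("div", class_="name").get_text())  # "Alice", then "Bob"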
How can I get data from more than one page into my CSV file?
from bs4 import BeautifulSoup
import requests
import csv

source = requests.get('https://software-overzicht.nl/amersfoort?page=1', 'https://software-overzicht.nl/amersfoort?page=2').text
soup = BeautifulSoup(source, 'lxml')

csv_file = open('cms_scrape.csv', 'w')
csv_writter = csv.writer(csv_file)
csv_writter.writerow(['naambedrijf', 'adress'])

for search in soup.find_all('div', class_='company-info-top'):
    title = search.a.text
    adress = search.p.text
    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        print(title)
        csv_writter.writerow([title, adress])

csv_file.close()
You just need to move your requests.get() call, and the parsing that follows it, into your loop over the page range:
from bs4 import BeautifulSoup
import requests
import csv

with open('C:/cms_scrape.csv', 'w', newline='') as f:
    csv_writter = csv.writer(f)
    csv_writter.writerow(['naambedrijf', 'adress'])

    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        source = requests.get(url).text
        soup = BeautifulSoup(source, 'lxml')

        for search in soup.find_all('div', class_='company-info-top'):
            title = search.a.text.strip()
            adress = search.p.text.strip()
            print(title)
            csv_writter.writerow([title, adress])
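Since this fetches 21 pages in a row, it can also help to reuse one HTTP connection and pause between requests. A small variant of the same loop (a sketch; the Session and the pause are additions, not part of the original answer):

import time
import requests
from bs4 import BeautifulSoup

with requests.Session() as session:  # reuses the connection across pages
    for page in range(1, 22):
        url = 'https://software-overzicht.nl/amersfoort?page={}'.format(page)
        soup = BeautifulSoup(session.get(url).text, 'lxml')
        for search in soup.find_all('div', class_='company-info-top'):
            print(search.a.text.strip())
        time.sleep(1)  # small pause so we don't hammer the site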
from bs4 import BeautifulSoup
import requests
import os

url = "http://nos.nl/artikel/2093082-steeds-meer-nekklachten-bij-kinderen-door-gebruik-tablets.html"
r = requests.get(url)
soup = BeautifulSoup(r.content.decode('utf-8', 'ignore'))
data = soup.find_all("article", {"class": "article"})

with open("data1.txt", "wb") as file:
    content = 'utf-8'
    for item in data:
        content += '''{}\n{}\n\n{}\n{}'''.format(
            item.contents[0].find_all("time", {"datetime": "2016-03-16T09:50:30+0100"})[0].text,
            item.contents[0].find_all("a", {"class": "link-grey"})[0].text,
            item.contents[0].find_all("img", {"class": "media-full"})[0],
            item.contents[1].find_all("div", {"class": "article_textwrap"})[0].text,
        )

with open("data1.txt".format(file_name), "wb") as file:
    file.write(content)
I recently solved a UTF/Unicode problem, but now it isn't saving the data as a .txt file, or saving it at all. What do I need to do?
If you want to write the data as UTF-8 to the file, try codecs.open, like:
from bs4 import BeautifulSoup
import requests
import os
import codecs

url = "http://nos.nl/artikel/2093082-steeds-meer-nekklachten-bij-kinderen-door-gebruik-tablets.html"
r = requests.get(url)
soup = BeautifulSoup(r.content)
data = soup.find_all("article", {"class": "article"})

with codecs.open("data1.txt", "wb", "utf-8") as filen:
    for item in data:
        filen.write(item.contents[0].find_all("time", {"datetime": "2016-03-16T09:50:30+0100"})[0].get_text())
        filen.write('\n')
        filen.write(item.contents[0].find_all("a", {"class": "link-grey"})[0].get_text())
        filen.write('\n\n')
        filen.write(item.contents[0].find_all("img", {"class": "media-full"})[0].get_text())
        filen.write('\n')
        filen.write(item.contents[1].find_all("div", {"class": "article_textwrap"})[0].get_text())
I'm unsure about filen.write(item.contents[0].find_all("img", {"class": "media-full"})[0]) because that returned a Tag instance for me.
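If the goal is to capture the image rather than its text (a Tag has no useful text to extract there), one option is to write out the tag's src attribute instead. A sketch of a replacement for that line inside the loop, assuming the img tag on this page carries a src attribute:

# Inside the loop, instead of the img .get_text() line:
imgs = item.contents[0].find_all("img", {"class": "media-full"})
if imgs:
    filen.write(imgs[0].get("src", ""))  # the image URL, not the tag itself
    filen.write('\n')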