Beautiful Soup script not delivering desired CSV output - python

I am new to scraping/BS4 and am having a problem getting this CSV file to list all of the members. The CSV lists one member's information repeated over multiple lines. If anyone has any ideas to fix this, it would be greatly appreciated.
import requests
import csv
from bs4 import BeautifulSoup

r = requests.get('https://vermontmaple.org/basic-member-list')
soup = BeautifulSoup(r.text, 'html.parser')

with open('list.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address', 'phone'])
    for company in soup.findAll('div', class_='directory_item selected'):
        maple_name = soup.find('div', class_='name').get_text(strip=True)
        maple_address = soup.find('div', class_='address').get_text(strip=True)
        maple_phone = soup.find('div', class_='phone').get_text(strip=True)
        writer.writerow([maple_name, maple_address, maple_phone])
f.close()

Change soup.find to company.find inside the for loop:

for company in soup.findAll('div', class_='directory_item selected'):
    maple_name = company.find('div', class_='name').get_text(strip=True)
    maple_address = company.find('div', class_='address').get_text(strip=True)
    maple_phone = company.find('div', class_='phone').get_text(strip=True)

There is no need for f.close(); the with block closes the file automatically.
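Putting it together, a minimal corrected sketch (assuming the page keeps these class names; newline='' is added so the csv module does not emit blank rows on Windows):

import csv

import requests
from bs4 import BeautifulSoup

r = requests.get('https://vermontmaple.org/basic-member-list')
soup = BeautifulSoup(r.text, 'html.parser')

with open('list.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address', 'phone'])
    for company in soup.find_all('div', class_='directory_item selected'):
        # search inside the current company block, not the whole page
        maple_name = company.find('div', class_='name').get_text(strip=True)
        maple_address = company.find('div', class_='address').get_text(strip=True)
        maple_phone = company.find('div', class_='phone').get_text(strip=True)
        writer.writerow([maple_name, maple_address, maple_phone])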


Cannot scrape email data but can scrape name using BeautifulSoup

import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.asta.org/membership/directory-search-details?memId=900312276"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

data = []
for item in soup.find_all("div", {"class": "asta-member"}):
    name = item.find("h4", {"class": "asta-member__name"}).text
    data.append([name])

for item in soup.find_all("div", {"class": "asta-member_title"}):
    email = item.find("div")[2].text
    data.append([email])

with open("contacts.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "email"])
    writer.writerows(data)
The script scrapes the name because it sits in its own class, but I cannot work out which class or selector to use to scrape the email, since the email div has no class of its own. I have tried everything I can think of.
Suggestion:
Since the div for the email has no class or id, you can use the .contents attribute in Beautiful Soup, which lists all of a tag's children.
In this case, we get all the content of the "class": "asta-member" div, since the email div sits under it, and extract the email text from the right position.
Code:

import csv

import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://www.asta.org/membership/directory-search-details?memId=900312276"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

data = []
for item in soup.find_all("div", {"class": "asta-member"}):
    name = item.find("h4", {"class": "asta-member__name"}).text
    data.append(name)

# Code changes here: the email div has no class, so take it by position
for item in soup.find_all("div", {"class": "asta-member"}):
    email = item.contents[7].text.strip()
    data.append(email)

with open("contacts.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "email"])
    writer.writerow(data)

# reading the CSV file back to check the result
df = pd.read_csv('/content/contacts.csv')
df
Reference:
https://beautiful-soup-4.readthedocs.io/en/latest/#contents-and-children
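If the hard-coded index 7 ever stops matching, one way to rediscover the right position is to enumerate the children of the asta-member div and look for the one holding the email text. A quick diagnostic sketch (the variable names follow the code above; the index itself is an assumption about the current markup):

# Print each child of the first asta-member div with its index,
# so the position holding the email text is easy to spot.
member = soup.find("div", {"class": "asta-member"})
for i, child in enumerate(member.contents):
    text = child.get_text(strip=True) if hasattr(child, "get_text") else str(child).strip()
    print(i, repr(text[:60]))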

Write list values to CSV as they pertain to current iteration

I'm trying to write a list to a CSV file so that each row comes out as a search term followed by its URL.
I'm sure I'm not using the csv library correctly, since it writes each character of just the first link to the file. Here's my code:
for t in terms:
    fields = ["Search Term", "URL"]
    url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, "lxml")
    for item in soup.find_all("item"):
        link = str(item)
        i = link.find("<link/>")
        j = link.find("<guid")
        links = link[i+7:j]

with open("urls.csv", "w") as f:
    write = csv.writer(f)
    write.writerow(fields)
    write.writerows(links)
Any help would be so appreciated. Thanks!!
Use the xml parser when creating the soup; HTML parsers treat <link> as an empty element, which is why the link text goes missing:
import csv

import requests
from bs4 import BeautifulSoup

terms = ["refrigerator", "kitchen sink"]

with open("urls.csv", "w") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["Search Term", "URL"])
    for t in terms:
        url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
        print(f"Getting {url}")
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, "xml")
        for item in soup.find_all("link"):
            writer.writerow([t, item.get_text(strip=True)])
Creates urls.csv (screenshot from LibreOffice).
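One caveat worth knowing: an RSS channel usually carries its own top-level <link> as well, so find_all("link") can pick up one extra feed-level row per term. A variant that walks the <item> elements instead (same feed layout assumed) might look like this:

import csv

import requests
from bs4 import BeautifulSoup

terms = ["refrigerator", "kitchen sink"]

with open("urls.csv", "w", newline="") as f_out:
    writer = csv.writer(f_out)
    writer.writerow(["Search Term", "URL"])
    for t in terms:
        url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, "xml")
        # Only <link> tags inside <item> elements are article links.
        for item in soup.find_all("item"):
            writer.writerow([t, item.find("link").get_text(strip=True)])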

Problem in web scraping with Python / Python prints classes first in CSV, then prints the information

I want to get info from a website by web scraping with Python (I am learning it now), but it writes the classes (which I got the info from) to the CSV first and then the information I want. I watched the YouTube video many times and wrote the same code, but this problem doesn't happen there. Can anyone help me?
This is an image link for the CSV to show you how it looks when I click RUN.
Code:
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest

Job_titles = []
Company_names = []
Location_names = []
Job_skills = []
Links = []

result = requests.get("https://wuzzuf.net/search/jobs/?q=python&a=hpb")
src = result.content
soup = BeautifulSoup(src, "lxml")

Job_titles = soup.find_all('h2', {"class": "css-m604qf"})
Company_names = soup.find_all('a', {"class": "css-17s97q8"})
Location_names = soup.find_all('span', {"class": "css-5wys0k"})
Job_skills = soup.find_all("div", {'class': "css-y4udm8"})

for i in range(len(Company_names)):
    Job_titles.append(Job_titles[i].text)
    Company_names.append(Company_names[i].text)
    Location_names.append(Location_names[i].text)
    Job_skills.append(Job_skills[i].text)

file_list = [Job_titles, Company_names, Location_names, Job_skills]
exported = zip_longest(*file_list)

with open("C:/Users/Saleh saleh/Documents/jobtest.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["Job titles", "Company names", "Location", "Skills", "Links"])
    wr.writerows(exported)
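Worth pinning down the symptom first: find_all returns Tag objects, and the loop then appends the extracted .text strings onto those same lists, so each column holds the raw tags first and the text after them. A minimal fix, keeping the question's variables and class names, is to collect the text into fresh lists:

# Collect each tag's .text into new lists instead of appending
# onto the lists of tags returned by find_all.
job_titles_text = [tag.text for tag in Job_titles]
company_names_text = [tag.text for tag in Company_names]
locations_text = [tag.text for tag in Location_names]
job_skills_text = [tag.text for tag in Job_skills]

exported = zip_longest(job_titles_text, company_names_text,
                       locations_text, job_skills_text)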
To get the information from the site, you can use the following example:
import csv

import requests
from bs4 import BeautifulSoup

url = "https://wuzzuf.net/search/jobs/?q=python&a=hpb"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

with open("data.csv", "w") as f_in:
    writer = csv.writer(f_in)
    writer.writerow(
        ["Job titles", "Company names", "Location", "Skills", "Links"]
    )
    for title in soup.select("h2 > a"):
        company_name = title.find_next("a")
        location = company_name.find_next("span")
        info = location.find_next("div", {"class": None})
        writer.writerow(
            [
                title.text,
                company_name.text,
                location.text,
                ",".join(
                    a.text.replace("·", "").strip() for a in info.select("a")
                ),
                title["href"],
            ]
        )
Creates data.csv (screenshot from LibreOffice).

Multiple for loops and csv files

I am new here and a newbie with Python, currently learning some basic stuff, mostly scraping, and I encountered a problem that I hope you can help me solve.
I'm trying to scrape a few details from a website and write them into a CSV file, but I'm only able to write the last results into my CSV; apparently my script just overwrites the data.
Also, if you find any mistakes in my code, or any room for improvement (which I'm sure there is), I'd be glad if you would point them out as well.
Also, any recommendations for videos/tutorials that can help me improve my Python and scraping skills would be appreciated.
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

csv_file = open('contacts.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["department", "name", "position", "phone"])

for department in soup.find_all("div", class_="view-content"):
    department_name = department.h3
    print(department_name.text)

for contacts in soup.find_all("div", class_="col-md-7 col-xs-10"):
    contact_name = contacts.strong
    print(contact_name.text)

for position in soup.find_all("div", class_="field-content"):
    print(position.text)

for phone in soup.find_all("div", class_="modal-content"):
    first_phone = phone.h3
    first_phones = first_phone
    print(first_phones)

csv_writer.writerow([department_name, contact_name, position, first_phones])
csv_file.close()
Thanks Thomas,
Actually I tweaked my code a little bit, thinking about how I could make it simpler (four for loops are too much, no?), and with the following code I solved my problem (I dropped the 'department' and 'phones' because of some other issues):
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

f = open("contactslot.csv", "w+")
csv_writer = csv.writer(f)
csv_writer.writerow(["Name", "Position"])

information = soup.find_all("div", class_="well profile")
info = information[0]
for info in information:
    contact_name = info.find_all("div", class_="col-md-7 col-xs-10")
    names = contact_name[0].strong
    name = names.text
    print(name)

    position_name = info.find_all("div", class_="field-content")
    position = position_name[0].text
    print(position)
    print("")

    csv_writer.writerow([name, position])

f.close()
Hi Babr, welcome to Python. Your answer is good, and here is one more little thing you can do better:
Use find instead of find_all if you just want one element.
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')

f = open("/Users/mingjunliu/Downloads/contacts.csv", "w+")
csv_writer = csv.writer(f)
csv_writer.writerow(["Name", "Position"])

for info in soup.find_all("div", class_="well profile"):
    contact_name = info.find("div", class_="col-md-7 col-xs-10")
    names = contact_name.strong
    name = names.text
    print(name)

    position_name = info.find("div", class_="field-content")
    position = position_name.text
    print(position)
    print("")

    csv_writer.writerow([name, position])

f.close()
And the reason you had to drop phone and department is the bad website structure. It's not your fault.
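One more refinement, echoing the first answer in this collection: the manual open/close pair can be replaced with a with block, which closes the file even if the scrape raises an exception part-way. A sketch under the same class-name assumptions as the thread:

import csv

import requests
from bs4 import BeautifulSoup

url = 'https://www.tamarackgc.com/club-contacts'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

# The with block closes the file automatically; newline='' keeps the
# csv module from writing blank rows on Windows.
with open('contacts.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(['Name', 'Position'])
    for info in soup.find_all('div', class_='well profile'):
        name = info.find('div', class_='col-md-7 col-xs-10').strong.text
        position = info.find('div', class_='field-content').text
        csv_writer.writerow([name, position])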

writing beautiful soup output to CSV

I want to write prices and corresponding addresses to a CSV file in Excel. I have this code so far, which gives the output shown in the photo below.
What I want is a column for the price first and a column for the address second.
from bs4 import BeautifulSoup
import requests
import csv

number = "1"
url = "http://www.trademe.co.nz/browse/categoryattributesearchresults.aspx?cid=5748&search=1&v=list&134=1&nofilters=1&originalsidebar=1&key=1654466070&page=" + number + "&sort_order=prop_default&rptpath=350-5748-3399-"
r = requests.get(url)
soup = BeautifulSoup(r.content)

output_file = open("output.csv", "w")
price = soup.find_all("div", {"class": "property-card-price-container"})
address = soup.find_all("div", {"class": "property-card-subtitle"})

n = 1
while n != 150:
    b = (price[n].text)
    b = str(b)
    n = n + 1
    output_file.write(b)

output_file.close()
Maybe something like this?
from bs4 import BeautifulSoup
import requests
import csv

# ... url as defined in the question ...

r = requests.get(url)
soup = BeautifulSoup(r.content)

price = soup.find_all("div", {"class": "property-card-price-container"})
address = soup.find_all("div", {"class": "property-card-subtitle"})

dataset = [(x.text, y.text) for x, y in zip(price, address)]

with open("output.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    for data in dataset[:150]:  # truncate to 150 rows
        writer.writerow(data)
There are a few problems with your code. Getting the prices and addresses into separate lists risks them getting mixed up if the site switches the order of the items. When scraping entries like this, it is important to first find the larger enclosing container, then narrow down from there.
Unfortunately the URL you provided is no longer valid, so I browsed to another set of listings for this example:
from bs4 import BeautifulSoup
import requests
import csv

url = 'http://www.trademe.co.nz/property/residential-property-for-sale'
url += '/waikato/view-list'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

with open('output.csv', 'w', newline='') as csvfile:
    propertyWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    for listing in soup.find_all('div',
                                 {'class': 'property-list-view-card'}):
        price = listing.find_all('div',
                                 {'class': 'property-card-price-container'})
        address = listing.find_all('div',
                                   {'class': 'property-card-subtitle'})
        propertyWriter.writerow([price[0].text.strip(),
                                 address[0].text.strip()])
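If the spreadsheet should also carry column labels, a header row can be written once, right after creating the writer and before the listing loop (the column names here are just a suggestion):

# Write the header row once, before iterating the listings.
propertyWriter.writerow(['price', 'address'])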
