Saving data from website to csv with find_all from BeautifulSoup - python

I am trying to learn how to scrape a website with Python and BeautifulSoup. I have been able to collect all the names and job titles, and now I'm trying to save them to a CSV file. I think I need some kind of loop or append to get them all in, but as it stands only the final name and job title end up in the CSV file.
#import libraries
import csv
import urllib2
from bs4 import BeautifulSoup
#specify the url
buzzly_page = 'http://buzzlymedia.com/ourwork/'
#query the website and return the html to the variable 'page'
page = urllib2.urlopen(buzzly_page)
#parse the html
soup = BeautifulSoup(page, 'html.parser')
#query to get value of name
for name_box in soup.find_all('strong', attrs={'itemprop': 'name'}):
    name = name_box.text.strip()  # remove starting and trailing
    print name

#query to get value of job-title
for job_box in soup.find_all('span', attrs={'itemprop': 'jobTitle'}):
    job = job_box.text.strip()  # remove starting and trailing
    print job

#write into csv-file
with open('buzzly_clients.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([name, job])

Find the divs that contain the elements you want and iterate over them like this.
# import libraries
import csv
import urllib2
from bs4 import BeautifulSoup
# specify the url
buzzly_page = 'http://buzzlymedia.com/ourwork/'
# query the website and return the html to the variable 'page'
page = urllib2.urlopen(buzzly_page)
# parse the html
soup = BeautifulSoup(page, 'html.parser')
# write into csv-file
with open('buzzly_clients.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    for div in soup.find_all('div', attrs={'class': 'avia-testimonial-meta-mini'}):
        # query to get value of name
        name_box = div.find('strong', attrs={'itemprop': 'name'})
        name = name_box.text.strip()  # remove starting and trailing
        print(name)
        # query to get value of job-title
        job_box = div.find('span', attrs={'itemprop': 'jobTitle'})
        job = job_box.text.strip()  # remove starting and trailing
        print(job)
        writer.writerow([name, job])
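As a side note, if you are on Python 3, urllib2 no longer exists and print is a function. A minimal sketch of the same approach (assuming the page keeps the same structure) swaps in urllib.request and opens the file with newline='' so the csv module doesn't add blank lines on Windows:

# Python 3 sketch of the same loop (urllib2 was folded into urllib.request)
import csv
import urllib.request
from bs4 import BeautifulSoup

page = urllib.request.urlopen('http://buzzlymedia.com/ourwork/')
soup = BeautifulSoup(page, 'html.parser')

with open('buzzly_clients.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for div in soup.find_all('div', attrs={'class': 'avia-testimonial-meta-mini'}):
        name = div.find('strong', attrs={'itemprop': 'name'}).text.strip()
        job = div.find('span', attrs={'itemprop': 'jobTitle'}).text.strip()
        writer.writerow([name, job])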

Related

Issue using BeautifulSoup and reading target URLs from a CSV

Everything works as expected when I use a single URL for the URL variable to scrape, but I'm not getting any results when attempting to read the links from a CSV. Any help is appreciated.
Info about the CSV:
One column with a header called "Links"
300 rows of links, with no spaces, commas, semicolons or other characters before/after the links
One link in each row
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv
with open("urls.csv") as infile:
    reader = csv.DictReader(infile)
    for link in reader:
        res = requests.get(link['Links'])
        #print(res.url)

url = res
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
email_elm0 = soup.find_all(class_="app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_="app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_="app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_="app-support-list__item")[3].text.strip()
final_email_elm = (email_elm0, email_elm1, email_elm2, email_elm3)
print(final_email_elm)
df = pd.DataFrame(final_email_elm)
#getting an output in csv format for the dataframe we created
#df.to_csv('draft_part2_scrape.csv')
The problem lies in this part of the code:
with open("urls.csv") as infile:
    reader = csv.DictReader(infile)
    for link in reader:
        res = requests.get(link['Links'])
        ...
After the loop finishes, res holds only the response for the last link, so the program ends up scraping just that one page.
To solve this, store all the links in a list and iterate over that list, scraping each link in turn. You can collect each scraped result in a separate dataframe and concatenate them at the end to write a single file:
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv
links = []
with open("urls.csv") as infile:
    reader = csv.DictReader(infile)
    for link in reader:
        links.append(link['Links'])

dfs = []
for url in links:
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    email_elm0 = soup.find_all(class_="app-support-list__item")[0].text.strip()
    email_elm1 = soup.find_all(class_="app-support-list__item")[1].text.strip()
    email_elm2 = soup.find_all(class_="app-support-list__item")[2].text.strip()
    email_elm3 = soup.find_all(class_="app-support-list__item")[3].text.strip()
    final_email_elm = (email_elm0, email_elm1, email_elm2, email_elm3)
    print(final_email_elm)
    dfs.append(pd.DataFrame(final_email_elm))

# getting an output in csv format for the dataframes we created
df = pd.concat(dfs)
df.to_csv('draft_part2_scrape.csv')
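Since pandas is already imported, the link column could also be read in a single call with pd.read_csv instead of csv.DictReader; a minimal sketch, assuming the same 'Links' header:

# Sketch: read the 'Links' column with pandas (assumes urls.csv has the same single 'Links' header)
import pandas as pd

links = pd.read_csv('urls.csv')['Links'].tolist()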

How to write a CSV and insert scraped data

I am designing a scraping project for my research, but I am stuck on writing the scraped data to a CSV file. Please help me with that.
I have successfully scraped the data, but I want to store it in a CSV. Below is my code.
I need to write code to pull all of the html from the website and then save it to a CSV file.
I believe I somehow need to turn the links into a list and then write the list, but I'm unsure how to do that.
This is what I have so far:
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
print("Wait Scraper is working on ")
time.sleep(10)
if page.status_code != 200:
    print("Error in Scraping check the url")
else:
    print("Successfully scrape the data")
    time.sleep(10)
    print("Loading data in csv")
    file = csv.writer(open('dataminer.csv', 'w'))
    file.writerow(['ProfileName', 'CompanyName', 'Salary', 'Job', 'Location'])
    for pname in soup.find_all(class_="profile-name"):
        #print(pname.text)
        profname = pname.text
        file.writerow([profname, ])
    for cname in soup.find_all(class_="company_name"):
        print(cname.text)
    for salary in soup.find_all(class_="salary"):
        print(salary.text)
    for lpa in soup.find_all(class_="jobText"):
        print(lpa.text)
    for loc in soup.find_all(class_="location"):
        print(loc.text)
Make a dict for each item, save the data into it, and then write it all to CSV; check the code below!
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
data = []
print("Wait, scraper is working on it")
if page.status_code != 200:
    print("Error in scraping, check the url")
else:
    print("Successfully scraped the data")
    for x in soup.find_all('div', attrs={'class': 'job-page'}):
        data.append({
            'pname': x.find(class_="profile-name").text.encode('utf-8'),
            'cname': x.find(class_="company_name").text.encode('utf-8'),
            'salary': x.find(class_="salary").text.encode('utf-8'),
            'lpa': x.find(class_="jobText").text.encode('utf-8'),
            'loc': x.find(class_="location").text.encode('utf-8')})

print("Loading data in csv")
with open('dataminer.csv', 'w') as f:
    fields = ['salary', 'loc', 'cname', 'pname', 'lpa']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)
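One caveat in case you run this on Python 3: .text.encode('utf-8') produces bytes objects, so the CSV ends up with values that look like b'...', and open('dataminer.csv', 'w') without newline='' adds blank lines between rows on Windows. A sketch of the adjustment (not part of the original answer) is to drop the .encode() calls, for example 'pname': x.find(class_="profile-name").text.strip(), and open the file like this:

# Python 3 sketch: let open() handle the encoding and suppress the extra blank lines
with open('dataminer.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['salary', 'loc', 'cname', 'pname', 'lpa'])
    writer.writeheader()
    writer.writerows(data)  # data built as above, but with plain str values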
Apart from what you got in the other answer, you can scrape and write the content at the same time as well. I used .select() instead of .find_all() to achieve the same result.
import csv
import requests
from bs4 import BeautifulSoup
URL = "https://www.myamcat.com/jobs"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'lxml')
with open('myamcat_doc.csv', 'w', newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(['pname', 'cname', 'salary', 'loc'])

    for item in soup.select(".job-listing .content"):
        pname = item.select_one(".profile-name h3").get_text(strip=True)
        cname = item.select_one(".company_name").get_text(strip=True)
        salary = item.select_one(".salary .jobText").get_text(strip=True)
        loc = item.select_one(".location .jobText").get_text(strip=True)
        writer.writerow([pname, cname, salary, loc])

How do I pull multiple values from an html page using python?

I'm performing some data analysis, for my own knowledge, on NHL spread/betting odds information. I'm able to pull some information, but not the entire data set. I want to pull the list of games and the associated information into a pandas dataframe, but I haven't been able to get the proper loop around the html tags working. I've tried the findAll option and the xpath route, and I'm not successful with either.
from bs4 import BeautifulSoup
import requests
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
page_response = requests.get(page_link, timeout=5)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
# Take out the <div> of name and get its value
name_box = page_content.find('div', attrs={'class': 'datarow'})
name = name_box.text.strip()
print (name)
This script goes through each datarow, pulls out each item individually, and then collects the rows into a pandas DataFrame.
from bs4 import BeautifulSoup
import requests
import pandas as pd
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
page_response = requests.get(page_link, timeout=5)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
# Take out the <div> of name and get its value
tables = page_content.find_all('div', class_='datarow')
# Iterate through each datarow and pull out each home/away separately
rows = []
for table in tables:
    # Get time and date
    time_and_date_tag = table.find_all('div', attrs={"class": "time"})[0].contents
    date = time_and_date_tag[1]
    time = time_and_date_tag[-1]
    # Get teams
    teams_tag = table.find_all('div', attrs={"class": "datacell teams"})[0].contents[-1].contents
    home_team = teams_tag[1].text
    away_team = teams_tag[-1].text
    # Get opening
    opening_tag = table.find_all('div', attrs={"class": "child-open"})[0].contents
    home_open_value = opening_tag[1]
    away_open_value = opening_tag[-1]
    # Get current
    current_tag = table.find_all('div', attrs={"class": "child-current"})[0].contents
    home_current_value = current_tag[1]
    away_current_value = current_tag[-1]
    # Create list
    rows.append([time, date, home_team, away_team,
                 home_open_value, away_open_value,
                 home_current_value, away_current_value])

columns = ['time', 'date', 'home_team', 'away_team',
           'home_open', 'away_open',
           'home_current', 'away_current']
print(pd.DataFrame(rows, columns=columns))
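Since the rest of this thread revolves around CSV output, it may be worth adding (this is not part of the original answer) that the same DataFrame can be written to disk with to_csv; the filename here is just an example:

# Sketch: persist the assembled DataFrame ('nhl_public_betting.csv' is an arbitrary name)
df = pd.DataFrame(rows, columns=columns)
df.to_csv('nhl_public_betting.csv', index=False)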
Here is my solution to your question.
from bs4 import BeautifulSoup
import requests
page_link = 'https://www.thespread.com/nhl-hockey-public-betting-chart'
page_response = requests.get(page_link, timeout=5)
# here, we fetch the content from the url, using the requests library
page_content = BeautifulSoup(page_response.content, "html.parser")
for cell in page_content.find_all('div', attrs={'class': 'datarow'}):
    name = cell.text.strip()
    print(name)

Scraping multiple pages with python

I'm trying to scrape a multi-page website with Beautiful Soup. The code works partially: it returns only the last page instead of all pages. How can I fix the problem?
# import libraries
import urllib.request
from bs4 import BeautifulSoup
# specify the url
aziende = [
'35-azienda-4_planets', '36-azienda-archivio_23', '24-azienda-bm', '16-azienda-brolese_virginio', '39-azienda-castellani', '19-azienda-centro_ottico_bisa', '25-azienda-comel_optik', '37-azienda-de_lorenzo_occhiali', '15-azienda-delta_laser', '34-azienda-dem', '21-azienda-erizzo', '3-azienda-evo', '27-azienda-farben_occhialeria', '32-azienda-gio__eyewear', '7-azienda-gipizeta', '42-azienda-h8', '20-azienda-idea_91', '5-azienda-lem', '41-azienda-lasertec', '22-azienda-le_thi_thu_thu', '28-azienda-m1', '1-azienda-mati_', '38-azienda-metal_dream', '30-azienda-mictu', '23-azienda-nete', '10-azienda-new_italian_design_eyewear', '31-azienda-okki_lux', '9-azienda-ottica_pra_floriani', '12-azienda-pao', '40-azienda-palladio_occhiali', '29-azienda-plastil_due', '17-azienda-punti_di_vista', '14-azienda-quemme', '4-azienda-red_line', '43-azienda-revert', '33-azienda-sm', '6-azienda-scussel', '8-azienda-sistem', '18-azienda-stile_italiano', '26-azienda-tecnodanta', '11-azienda-toffoli_costantino', '13-azienda-tri_color', '2-azienda-zago'
]
for azienda in aziende:
    page_link = 'http://www.occhialeriabellunotreviso.it/' + azienda
    page = urllib.request.urlopen(page_link)  # query the website and return the html to the variable 'page'
    soup = BeautifulSoup(page, 'html.parser')  # parse the html using beautiful soup and store in variable `soup`

# Take out the <div> of name and get its value
name_box = soup.find('h2')
name = name_box.text.strip()  # strip() is used to remove starting and trailing
print(name)
Just put the final lines of code that are outside the for-loop inside the for-loop so they are run for every page.
# import libraries
import urllib.request
from bs4 import BeautifulSoup
# specify the url
aziende = [
'35-azienda-4_planets', '36-azienda-archivio_23', '24-azienda-bm', '16-azienda-brolese_virginio', '39-azienda-castellani', '19-azienda-centro_ottico_bisa', '25-azienda-comel_optik', '37-azienda-de_lorenzo_occhiali', '15-azienda-delta_laser', '34-azienda-dem', '21-azienda-erizzo', '3-azienda-evo', '27-azienda-farben_occhialeria', '32-azienda-gio__eyewear', '7-azienda-gipizeta', '42-azienda-h8', '20-azienda-idea_91', '5-azienda-lem', '41-azienda-lasertec', '22-azienda-le_thi_thu_thu', '28-azienda-m1', '1-azienda-mati_', '38-azienda-metal_dream', '30-azienda-mictu', '23-azienda-nete', '10-azienda-new_italian_design_eyewear', '31-azienda-okki_lux', '9-azienda-ottica_pra_floriani', '12-azienda-pao', '40-azienda-palladio_occhiali', '29-azienda-plastil_due', '17-azienda-punti_di_vista', '14-azienda-quemme', '4-azienda-red_line', '43-azienda-revert', '33-azienda-sm', '6-azienda-scussel', '8-azienda-sistem', '18-azienda-stile_italiano', '26-azienda-tecnodanta', '11-azienda-toffoli_costantino', '13-azienda-tri_color', '2-azienda-zago'
]
for azienda in aziende:
    page_link = 'http://www.occhialeriabellunotreviso.it/' + azienda
    page = urllib.request.urlopen(page_link)  # query the website and return the html to the variable 'page'
    soup = BeautifulSoup(page, 'html.parser')  # parse the html using beautiful soup and store in variable `soup`

    # Take out the <div> of name and get its value
    name_box = soup.find('h2')
    name = name_box.text.strip()  # strip() is used to remove starting and trailing
    print(name)
There is nothing wrong with the way you have already tried except for the indentation. However, an alternative approach could be something like the below:
import urllib.request
from bs4 import BeautifulSoup
link = 'http://www.occhialeriabellunotreviso.it/{}'
aziende = (
'35-azienda-4_planets', '36-azienda-archivio_23', '24-azienda-bm', '16-azienda-brolese_virginio', '39-azienda-castellani', '19-azienda-centro_ottico_bisa', '25-azienda-comel_optik', '37-azienda-de_lorenzo_occhiali', '15-azienda-delta_laser', '34-azienda-dem', '21-azienda-erizzo', '3-azienda-evo', '27-azienda-farben_occhialeria', '32-azienda-gio__eyewear', '7-azienda-gipizeta', '42-azienda-h8', '20-azienda-idea_91', '5-azienda-lem', '41-azienda-lasertec', '22-azienda-le_thi_thu_thu', '28-azienda-m1', '1-azienda-mati_', '38-azienda-metal_dream', '30-azienda-mictu', '23-azienda-nete', '10-azienda-new_italian_design_eyewear', '31-azienda-okki_lux', '9-azienda-ottica_pra_floriani', '12-azienda-pao', '40-azienda-palladio_occhiali', '29-azienda-plastil_due', '17-azienda-punti_di_vista', '14-azienda-quemme', '4-azienda-red_line', '43-azienda-revert', '33-azienda-sm', '6-azienda-scussel', '8-azienda-sistem', '18-azienda-stile_italiano', '26-azienda-tecnodanta', '11-azienda-toffoli_costantino', '13-azienda-tri_color', '2-azienda-zago'
)
def get_item(url):
    for azienda in aziende:
        page = urllib.request.urlopen(url.format(azienda))
        soup = BeautifulSoup(page, 'html.parser')
        name_box = soup.find('h2').get_text(strip=True)
        yield name_box

if __name__ == '__main__':
    for item in get_item(link):
        print(item)

Python 2.7: How to check if row already exists?

I am trying to check if a row already exists. If it doesn't, something has to be written in the row. My CSV file is always empty.
# import libraries
import csv
import urllib2
from bs4 import BeautifulSoup
# integer for first article id
articleid = 4449
articles = 4459
while articleid < articles:
    # specify the url and article id
    url = 'http://www.bkfrem.dk/default.asp?vis=nyheder&id=' + str(articleid)
    articleid += 1
    # query the website and return the html to the variable
    page = urllib2.urlopen(url)
    # parse the html using beautiful soup and store in variable soup
    soup = BeautifulSoup(page, 'html.parser')
    # create CSV file
    csvfile = csv.writer(open('news.csv', 'a'))
    # take out the <div> of name and get its value and text
    title_box = soup.find('h1', attrs={'style': 'margin-bottom:0px'})
    title = title_box.text.encode('utf-8').strip()
    date_box = soup.find('div', attrs={'style': 'font-style:italic; padding-bottom:10px'})
    date = date_box.text.encode('utf-8').strip()
    articleText_box = soup.find('div', attrs={'class': 'news'})
    articleText = articleText_box.text.encode('utf-8').strip()
    # print the data (encoded) to the CSV file
    with open('news.csv', 'rb') as csvfileO:
        f_reader = csv.reader(csvfileO, delimiter=',')
        for row in f_reader:
            if articleText not in row:
                csvfile.writerow(["Title", "Date", "Text"])
                csvfile.writerow((title, date, articleText))
What am I doing wrong since it's empty?
for row in f_reader:
    if articleText not in
        csvfile.writerow(["Title", "Date", "Text"])
        csvfile.writerow((title, date, articleText))
You have written if articleText not in. Not in what? You need it to point at something to validate against:
if articleText not in "Something":
    csvfile.writerow(["Title", "Date", "Text"])
    csvfile.writerow((title, date, articleText))
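For the underlying goal of writing each article only once, one workable pattern, shown here as a sketch rather than a drop-in fix, is to load whatever is already in news.csv into a set before the while loop and then append only rows whose text is not in that set. The snippet reuses the title, date and articleText variables from the question's loop:

# Sketch (Python 2.7): load already-saved article texts once, before the while loop
import csv
import os

existing_texts = set()
if os.path.exists('news.csv'):
    with open('news.csv', 'rb') as f:
        for row in csv.reader(f):
            if row:
                existing_texts.add(row[-1])  # the "Text" column

# ...then, inside the while loop, after title/date/articleText have been scraped:
if articleText not in existing_texts:
    with open('news.csv', 'ab') as f:
        csv.writer(f).writerow((title, date, articleText))
    existing_texts.add(articleText)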
