How would I proceed in this web scraping project using bs4 and requests? I am trying to extract user info from a forum site (myfitnesspal exactly: https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1), specifically the username, message, and date posted, and load them into columns on a csv. I have this code so far but am unsure about how to proceed:
from bs4 import BeautifulSoup
import csv
import requests
# get page source and create a BS object
print('Reading page...')
page= requests.get('https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1')
src = page.content
soup = BeautifulSoup(src, 'html.parser')
#container = soup.select('#vanilla_discussion_index > div.container')
container = soup.select('#vanilla_discussion_index > div.container > div.row > div.content.column > div.CommentsWrap > div.DataBox.DataBox-Comments > ul')
postdata = soup.select('div.Message')
user = []
date = []
text = []
for post in postdata:
text.append(BeautifulSoup(str(post), 'html.parser').get_text().encode('utf-8').strip())
print(text) # this stores the text of each comment/post in a list,
# so next I'd want to store this in a csv with columns
# user, date posted, post with this under the post column
# and do the same for user and date
This script will get all messages from the page and saves them in data.csv:
import csv
import requests
from bs4 import BeautifulSoup
url = 'https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
all_data = []
for u, d, m in zip(soup.select('.Username'), soup.select('.DateCreated'), soup.select('.Message')):
all_data.append([u.text, d.get_text(strip=True),m.get_text(strip=True, separator='\n')])
with open('data.csv', 'w', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in all_data:
writer.writerow(row)
Screenshot from LibreOffice:
One rule of thumb I like to follow with web scraping is being specific as possible without picking up unnecessary information. So for example, if I want to select a username I inspect the element containing the information I need:
<a class="Username" href="...">Username</a>
Since I am trying to collect usernames it makes the most sense to select by the class "Username":
soup.select("a.Username")
This gives me a list of all the usernames that are found on the page, this is great, however, if we want to select the data in "packages" (by post in your example we need to collect each post individually.
To accomplish this you could do something like the following:
comments = soup.select("div.comment")
This will make it easier to then do the following:
with open('file.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(['user', 'date', 'text']
for comment in comments:
username = comment.select_one("div.Username")
date = comment.select_one("span.BodyDate")
message = comment.select_one("div.Message")
writer.writerow([username, date, message])
Doing it this way also makes sure your data stays in order even if an element is missing.
Here you go:
from bs4 import BeautifulSoup
import csv
import requests
page= requests.get('https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1')
soup = BeautifulSoup(page.content, 'html.parser')
container = soup.select('#vanilla_discussion_index > div.container > div.row > div.content.column > div.CommentsWrap > div.DataBox.DataBox-Comments > ul > li')
with open('data.csv', 'w') as f:
writer = csv.DictWriter(f, fieldnames=['user', 'date', 'text'])
writer.writeheader()
for comment in container:
writer.writerow({
'user': comment.find('a', {'class': 'Username'}).get_text(),
'date': comment.find('span', {'class': 'BodyDate DateCreated'}).get_text().strip(),
'text': comment.find('div', {'class': 'Message'}).get_text().strip()
})
Related
When i run the code and i get my CSV file, its actually empty.
'''
import requests
from bs4 import BeautifulSoup
from csv import writer
url = 'https://www.fotocasa.es/es/alquiler/todas-las-casas/girona-provincia/todas-las-zonas/l'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('section', class_='re-CardPackAdvance')
with open('casas.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Titulo', 'Precio', 'Metros', 'Telefono']
thewriter.writerow(header)
for list in lists:
titulo = list.find('a', class_='re-CardPackAdvance-info-container').text.replace('\n', '')
precio = list.find('span', class_='re-CardPrice').text.replace('\n', '')
metros = list.find('span', class_='re-CardFeaturesWithIcons-feature-icon--surface').text.replace('\n', '')
telefono = list.find('a', class_='re-CardContact-phone').text.replace('\n', '')
info = [titulo, precio, metros, telefono]
thewriter.writerow(info)
'''
I expected to have all the info scrapped from this website, but seems like i did something wrong at some point
You are parsing the resulting soup not appropriately. There is no section with the re-CardPackAdvance class. I adapted the code accordingly (find all articles with class that starts with re-CardPack). Please also note that you need to shift the for-loop by one indention. However, due to the structure of the page, only the first two entries are loaded directly when fetching the page. All other entries are fetched after the page has loaded in the browser (via javascript). I think you might consider using the API of the page instead.
import requests
from bs4 import BeautifulSoup
from csv import writer
import re
url = 'https://www.fotocasa.es/es/alquiler/todas-las-casas/girona-provincia/todas-las-zonas/l'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all("article", class_=re.compile("^re-CardPack"))
print(len(lists))
with open('casas.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Titulo', 'Precio', 'Metros', 'Telefono']
thewriter.writerow(header)
for list in lists:
titulo = list.find('a').get('title')
precio = list.find('span', class_='re-CardPrice').text.replace('\n', '')
metros = list.find('span', class_='re-CardFeaturesWithIcons-feature-icon--surface').text.replace('\n', '')
telefono = list.find('a', class_='re-CardContact-phone').text.replace('\n', '')
info = [titulo, precio, metros, telefono]
thewriter.writerow(info)
I am new to python. is anyone know {sum(int(td.text) for td in soup.select('td:last-child')[1:])} what is use of [1:] in this or [0] or [1]. i saw it in many scraping examples below for in loop. As i was practicing i build this code and don't able to scrape all data in csv file. thanks in advance, sorry for two question at one time.
import requests
from bs4 import BeautifulSoup
import csv
url= "https://iplt20.com/stats/2020/most-runs"
r= requests.get (url)
soup= BeautifulSoup (r.content, 'html5lib')
lst= []
table=soup.find ('div', attrs = {'class':'js-table'})
#for row in table.findAll ('div', attrs= {'class':'top-players__player-name'}):
# score = {}
# score['Player'] = row.a.text.strip()
# lst.append(score)
for row in table.findAll (class_='top-players__m top-players__padded '):
score = {}
score['Matches'] = int(row.td.text)
lst.append(score)
filename= 'iplStat.csv'
with open (filename, 'w', newline='') as f:
w= csv.DictWriter(f,['Player', 'Matches'])
w.writeheader()
for score in lst:
w.writerow(score)
print (lst)
All of this is not even needed. Just use pandas:
import requests
import pandas as pd
url = "https://iplt20.com/stats/2020/most-runs"
r = requests.get (url)
df = pd.read_html(r.content)[0]
df.to_csv("iplStats.csv", index = False)
Screenshot of csv file:
Everything works as expected when I'm using a single URL for the URL variable to scrape, but not getting any results when attempting to read links from a csv. Any help is appreciated.
Info about the CSV:
One column with a header called "Links"
300 rows of links with no space, commoa, ; or other charters before/after the links
One link in each row
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
res = requests.get(link['Links'])
#print(res.url)
url = res
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
email_elm0 = soup.find_all(class_= "app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_= "app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_= "app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_= "app-support-list__item")[3].text.strip()
final_email_elm = (email_elm0,email_elm1,email_elm2,email_elm3)
print(final_email_elm)
df = pd.DataFrame(final_email_elm)
#getting an output in csv format for the dataframe we created
#df.to_csv('draft_part2_scrape.csv')
The problem lies in this part of the code:
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
res = requests.get(link['Links'])
...
After the loop is executed, res will have the last link. So, this program will only scrape the last link.
To solve this problem, store all the links in a list and iterate that list to scrape each of the link. You can store the scraped result in a seperate dataframe and concatenate them at the end to store in a single file:
import requests # required to make request
from bs4 import BeautifulSoup # required to parse html
import pandas as pd
import csv
links = []
with open("urls.csv") as infile:
reader = csv.DictReader(infile)
for link in reader:
links.append(link['Links'])
dfs = []
for url in links:
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
email_elm0 = soup.find_all(class_="app-support-list__item")[0].text.strip()
email_elm1 = soup.find_all(class_="app-support-list__item")[1].text.strip()
email_elm2 = soup.find_all(class_="app-support-list__item")[2].text.strip()
email_elm3 = soup.find_all(class_="app-support-list__item")[3].text.strip()
final_email_elm = (email_elm0, email_elm1, email_elm2, email_elm3)
print(final_email_elm)
dfs.append(pd.DataFrame(final_email_elm))
#getting an output in csv format for the dataframe we created
df = pd.concat(dfs)
df.to_csv('draft_part2_scrape.csv')
I am designing scraping project for my research but i am stuck in to write scrape data in csv. Please help me for that?
i have successfully scrape data but i want to store it in csv here below is my code
need to write code to pull all of the html from a website then save it to a csv file.
I believe I somehow need to turn the links into a list and then write the list, but I'm unsure how to do that.
This is what I have so far:
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
print("Wait Scraper is working on ")
time.sleep(10)
if(page.status_code != 200):
print("Error in Scraping check the url")
else:
print("Successfully scrape the data")
time.sleep(10)
print("Loading data in csv")
file = csv.writer(open('dataminer.csv', 'w'))
file.writerow(['ProfileName', 'CompanyName', 'Salary', 'Job', 'Location'])
for pname in soup.find_all(class_="profile-name"):
#print(pname.text)
profname = pname.text
file.writerow([profname, ])
for cname in soup.find_all(class_="company_name"):
print(cname.text)
for salary in soup.find_all(class_="salary"):
print(salary.text)
for lpa in soup.find_all(class_="jobText"):
print(lpa.text)
for loc in soup.find_all(class_="location"):
print(loc.text)
Make a dict and save the data into it then save to csv, check below code!
import requests
import time
from bs4 import BeautifulSoup
import csv
# Collect and parse first page
page = requests.get('https://www.myamcat.com/jobs')
soup = BeautifulSoup(page.content, 'lxml')
data = []
print("Wait Scrapper is working on ")
if(page.status_code != 200):
print("Error in Srapping check the url")
else:
print("Successfully scrape the data")
for x in soup.find_all('div',attrs={'class':'job-page'}):
data.append({
'pname':x.find(class_="profile-name").text.encode('utf-8'),
'cname':x.find(class_="company_name").text.encode('utf-8'),
'salary':x.find(class_="salary").text.encode('utf-8'),
'lpa':x.find(class_="jobText").text.encode('utf-8'),
'loc':x.find(class_="location").text.encode('utf-8')})
print("Loading data in csv")
with open('dataminer.csv', 'w') as f:
fields = ['salary', 'loc', 'cname', 'pname', 'lpa']
writer = csv.DictWriter(f, fieldnames=fields)
writer.writeheader()
writer.writerows(data)
Apart from what you have got in other answer, you can scrape and write the content at the same time as well. I used .select() instead of .find_all() to achieve the same.
import csv
import requests
from bs4 import BeautifulSoup
URL = "https://www.myamcat.com/jobs"
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'lxml')
with open('myamcat_doc.csv','w',newline="",encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(['pname','cname','salary','loc'])
for item in soup.select(".job-listing .content"):
pname = item.select_one(".profile-name h3").get_text(strip=True)
cname = item.select_one(".company_name").get_text(strip=True)
salary = item.select_one(".salary .jobText").get_text(strip=True)
loc = item.select_one(".location .jobText").get_text(strip=True)
writer.writerow([pname,cname,salary,loc])
I want to write prices and corresponding addresses to a CSV file in Excel. I have this code so far which gives the output shown below in the photo.
What I want is a column for price first and a column for the address second.
[![from bs4 import BeautifulSoup
import requests
import csv
number = "1"
url = "http://www.trademe.co.nz/browse/categoryattributesearchresults.aspx?cid=5748&search=1&v=list&134=1&nofilters=1&originalsidebar=1&key=1654466070&page=" + number + "&sort_order=prop_default&rptpath=350-5748-3399-"
r= requests.get(url)
soup = BeautifulSoup(r.content)
output_file= open("output.csv","w")
price = soup.find_all("div",{"class":"property-card-price-container"})
address = soup.find_all("div",{"class":"property-card-subtitle"})
n = 1
while n != 150:
b = (price\[n\].text)
b = str(b)
n = n + 1
output_file.write(b)
output_file.close()][1]][1]
Maybe something like this?
from bs4 import BeautifulSoup
import requests
import csv
....
r = requests.get(url)
soup = BeautifulSoup(r.content)
price = soup.find_all("div",{"class":"property-card-price-container"})
address = soup.find_all("div",{"class":"property-card-subtitle"})
dataset = [(x.text, y.text) for x,y in zip(price, address)]
with open("output.csv", "w", newline='') as csvfile:
writer = csv.writer(csvfile)
for data in dataset[:150]: #truncate to 150 rows
writer.writerow(data)
There are a few problems with your code. Getting the prices and addresses into separate lists risks the site switching the order of the items, etc. and getting them mixed up. When scraping entries like this it is important to first find the larger enclosing container, then narrow down from there.
Unfortunately the URL you provided is no longer valid. As such I just browsed to another set of listings for this example:
from bs4 import BeautifulSoup
import requests
import csv
url = 'http://www.trademe.co.nz/property/residential-property-for-sale'
url += '/waikato/view-list'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
with open('output.csv', 'w', newline='') as csvfile:
propertyWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
for listing in soup.find_all('div',
{'class': 'property-list-view-card'}):
price = listing.find_all('div',
{'class': 'property-card-price-container'})
address = listing.find_all('div',
{'class': 'property-card-subtitle'})
propertyWriter.writerow([price[0].text.strip(),
address[0].text.strip()])