How to STOP getting an empty CSV file with Scraping - python

When i run the code and i get my CSV file, its actually empty.
'''
import requests
from bs4 import BeautifulSoup
from csv import writer
url = 'https://www.fotocasa.es/es/alquiler/todas-las-casas/girona-provincia/todas-las-zonas/l'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('section', class_='re-CardPackAdvance')
with open('casas.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Titulo', 'Precio', 'Metros', 'Telefono']
thewriter.writerow(header)
for list in lists:
titulo = list.find('a', class_='re-CardPackAdvance-info-container').text.replace('\n', '')
precio = list.find('span', class_='re-CardPrice').text.replace('\n', '')
metros = list.find('span', class_='re-CardFeaturesWithIcons-feature-icon--surface').text.replace('\n', '')
telefono = list.find('a', class_='re-CardContact-phone').text.replace('\n', '')
info = [titulo, precio, metros, telefono]
thewriter.writerow(info)
'''
I expected to have all the info scrapped from this website, but seems like i did something wrong at some point

You are parsing the resulting soup not appropriately. There is no section with the re-CardPackAdvance class. I adapted the code accordingly (find all articles with class that starts with re-CardPack). Please also note that you need to shift the for-loop by one indention. However, due to the structure of the page, only the first two entries are loaded directly when fetching the page. All other entries are fetched after the page has loaded in the browser (via javascript). I think you might consider using the API of the page instead.
import requests
from bs4 import BeautifulSoup
from csv import writer
import re
url = 'https://www.fotocasa.es/es/alquiler/todas-las-casas/girona-provincia/todas-las-zonas/l'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all("article", class_=re.compile("^re-CardPack"))
print(len(lists))
with open('casas.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Titulo', 'Precio', 'Metros', 'Telefono']
thewriter.writerow(header)
for list in lists:
titulo = list.find('a').get('title')
precio = list.find('span', class_='re-CardPrice').text.replace('\n', '')
metros = list.find('span', class_='re-CardFeaturesWithIcons-feature-icon--surface').text.replace('\n', '')
telefono = list.find('a', class_='re-CardContact-phone').text.replace('\n', '')
info = [titulo, precio, metros, telefono]
thewriter.writerow(info)

Related

Rewriting Rows instead of adding to new one

Hello Everyone I am doing a web scraping of a website which has multiple pages(doing for 9 pages) and writing data in a csv file. every page has 24 rows of data which comes in total of 216 rows data for 9 pages but I am getting only 24 rows of data which I think is page no 9 data and python just re-writing the data again & again for every page in same rows instead of appending it.so please help me to figure out how I can make python to append each page data in ex. Here is my code:
import requests
from bs4 import BeautifulSoup
from csv import writer
for page in range(1,10):
url = 'https://www.flipkart.com/searchq=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page}'.format(page =page)
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
links = soup.find_all('div', class_= '_2kHMtA')
with open('Flipkart.csv', 'w', encoding = 'utf8', newline= '') as f:
thewriter = writer(f)
header = ('Title', 'Specification', 'price', 'Rating Out of 5')
thewriter.writerow(header)
for link in links:
title = link.find('div', class_= '_4rR01T').text
Specification = link.find('ul', class_='_1xgFaf').text
price = link.find('div', class_ = '_30jeq3 _1_WHN1').text
Rating = link.find('span', class_='_1lRcqv')
if Rating:
Rating = Rating.text
else:
Rating = 'N/A'
info = [title, Specification, price,Rating]
thewriter.writerow(info)
First, your url is missing a question mark after search:
url = 'https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page}'.format(page =page)
Next, change:
with open('Flipkart.csv', 'w', encoding = 'utf8', newline= '') as f:
into:
with open('Flipkart.csv', 'a', encoding = 'utf8', newline= '') as f:
as we want to use the mode a to append to the file. With w for write, you keep overwriting the file, which is the reason why you end up with only the information from the last page. See open.
Finally, put the header info inside an if-statement:
if page == 1:
header = ('Title', 'Specification', 'price', 'Rating Out of 5')
thewriter.writerow(header)
Otherwise, you will be repeating the header for each new page.

Want to Scrap each category individual but either it scraping data in single alphabet form or in a paragraph form

I want to extract Name & Position, Education, Contact number and email all in different column of csv but when I extract it either it is a single block per alphabet or a single column per paragraph(if I list it).Here is the code:
import requests
from bs4 import BeautifulSoup
from csv import writer
url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'lxml')
page = soup.find_all('p')
for i in page:
i = i.text
with open('page.csv', 'a', encoding = 'utf8', newline='') as f:
thewriter = writer(f)
thewriter.writerow(i)
You can use regex to pull out what you need:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
url = 'https://governors.pwcs.edu/about_us/staff_bios_and_contact_information'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
content = soup.find('div', {'id':'divContent'})
p_list = content.find_all('p')
rows = []
for p in p_list:
string = p.text
text = re.search('(^.*) (Education: )(.*)( Contact).*(\d{3}-\d{3}-\d{4})\s*([a-zA-z1-9].*#[\w].*\.[\w].*)', string).groups()
name = text[0]
edu = text[2]
phone = text[4]
email = text[5]
row = {
'name':name,
'education':edu,
'phone':phone,
'email':email}
rows.append(row)
df = pd.DataFrame(rows)
df.to_csv('page.csv', index=False)

Write list values to CSV as they pertain to current iteration

I'm trying to write a list to a csv file such that the it comes out looking like this
I'm sure I'm not using the CSV library correctly since it prints each character of just the first link to the file Here's my code:
for t in terms:
fields = ["Search Term", "URL"]
url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
html_page = requests.get(url)
soup = BeautifulSoup(html_page.text, "lxml")
for item in soup.find_all("item"):
link= str(item)
i = link.find("<link/>")
j = link.find("<guid")
links = link[i+7:j]
with open("urls.csv", "w") as f:
write = csv.writer(f)
write.writerow(fields)
write.writerows(links)
Any help would be so appreciated. Thanks!!
Use xml parser when creating the soup:
import csv
import requests
from bs4 import BeautifulSoup
terms = ["refrigerator", "kitchen sink"]
with open("urls.csv", "w") as f_out:
writer = csv.writer(f_out)
writer.writerow(["Search Term", "URL"])
for t in terms:
url = f"https://news.google.com/rss/search?q={t}&hl=en-US&gl=US&ceid=US%3Aen"
print(f"Getting {url}")
html_page = requests.get(url)
soup = BeautifulSoup(html_page.content, "xml")
for item in soup.find_all("link"):
writer.writerow([t, item.get_text(strip=True)])
Creates urls.csv (screenshot from LibreOffice):

problem in Webscraping by Python/ Python prints classes first in CSV then prints the information

I want to get info from the website by Web scraping with python(I learn it now), but it prints classes (which I got the info from) first in CSV then prints the information which I want. I saw the Youtube video many times, and I wrote the same code but it doesn't happen like the problem which I got. Is there anyone kan HELP me?
This is an image link for CSV to show you how It looks when I click on RUN
Code:
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest
Job_titles = []
Company_names = []
Location_names = []
Job_skills = []
Links = []
result = requests.get("https://wuzzuf.net/search/jobs/?q=python&a=hpb")
src = result.content
soup = BeautifulSoup(src, "lxml")
Job_titles = soup.find_all('h2', {"class":"css-m604qf"})
Company_names = soup.find_all('a', {"class":"css-17s97q8"})
Location_names = soup.find_all('span', {"class":"css-5wys0k"})
Job_skills = soup.find_all("div", {'class':"css-y4udm8"})
for i in range(len(Company_names)):
Job_titles.append(Job_titles[i].text)
Company_names.append(Company_names[i].text)
Location_names.append(Location_names[i].text)
Job_skills.append(Job_skills[i].text)
file_list = [Job_titles, Company_names, Location_names, Job_skills,]
exported = zip_longest(*file_list)
with open("C:/Users/Saleh saleh/Documents/jobtest.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["Job titles", "Company names", "Location", "Skills", "Links"])
wr.writerows(exported)
To get information from the site, you can use following example:
import csv
import requests
from bs4 import BeautifulSoup
url = "https://wuzzuf.net/search/jobs/?q=python&a=hpb"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
with open("data.csv", "w") as f_in:
writer = csv.writer(f_in)
writer.writerow(
["Job titles", "Company names", "Location", "Skills", "Links"]
)
for title in soup.select("h2 > a"):
company_name = title.find_next("a")
location = company_name.find_next("span")
info = location.find_next("div", {"class": None})
writer.writerow(
[
title.text,
company_name.text,
location.text,
",".join(
a.text.replace("ยท", "").strip() for a in info.select("a")
),
title["href"],
]
)
Creates data.csv (screenshot from LibreOffice):

How would I extract username, post, and date posted from discussion board?

How would I proceed in this web scraping project using bs4 and requests? I am trying to extract user info from a forum site (myfitnesspal exactly: https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1), specifically the username, message, and date posted, and load them into columns on a csv. I have this code so far but am unsure about how to proceed:
from bs4 import BeautifulSoup
import csv
import requests
# get page source and create a BS object
print('Reading page...')
page= requests.get('https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1')
src = page.content
soup = BeautifulSoup(src, 'html.parser')
#container = soup.select('#vanilla_discussion_index > div.container')
container = soup.select('#vanilla_discussion_index > div.container > div.row > div.content.column > div.CommentsWrap > div.DataBox.DataBox-Comments > ul')
postdata = soup.select('div.Message')
user = []
date = []
text = []
for post in postdata:
text.append(BeautifulSoup(str(post), 'html.parser').get_text().encode('utf-8').strip())
print(text) # this stores the text of each comment/post in a list,
# so next I'd want to store this in a csv with columns
# user, date posted, post with this under the post column
# and do the same for user and date
This script will get all messages from the page and saves them in data.csv:
import csv
import requests
from bs4 import BeautifulSoup
url = 'https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
all_data = []
for u, d, m in zip(soup.select('.Username'), soup.select('.DateCreated'), soup.select('.Message')):
all_data.append([u.text, d.get_text(strip=True),m.get_text(strip=True, separator='\n')])
with open('data.csv', 'w', newline='') as csvfile:
writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in all_data:
writer.writerow(row)
Screenshot from LibreOffice:
One rule of thumb I like to follow with web scraping is being specific as possible without picking up unnecessary information. So for example, if I want to select a username I inspect the element containing the information I need:
<a class="Username" href="...">Username</a>
Since I am trying to collect usernames it makes the most sense to select by the class "Username":
soup.select("a.Username")
This gives me a list of all the usernames that are found on the page, this is great, however, if we want to select the data in "packages" (by post in your example we need to collect each post individually.
To accomplish this you could do something like the following:
comments = soup.select("div.comment")
This will make it easier to then do the following:
with open('file.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(['user', 'date', 'text']
for comment in comments:
username = comment.select_one("div.Username")
date = comment.select_one("span.BodyDate")
message = comment.select_one("div.Message")
writer.writerow([username, date, message])
Doing it this way also makes sure your data stays in order even if an element is missing.
Here you go:
from bs4 import BeautifulSoup
import csv
import requests
page= requests.get('https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1')
soup = BeautifulSoup(page.content, 'html.parser')
container = soup.select('#vanilla_discussion_index > div.container > div.row > div.content.column > div.CommentsWrap > div.DataBox.DataBox-Comments > ul > li')
with open('data.csv', 'w') as f:
writer = csv.DictWriter(f, fieldnames=['user', 'date', 'text'])
writer.writeheader()
for comment in container:
writer.writerow({
'user': comment.find('a', {'class': 'Username'}).get_text(),
'date': comment.find('span', {'class': 'BodyDate DateCreated'}).get_text().strip(),
'text': comment.find('div', {'class': 'Message'}).get_text().strip()
})

Categories