Save a web scraping result into a CSV or Excel file - Python

I'm doing a web scraping test on a website that sells cars, so I need to grab each car's info and store it in a CSV or Excel file. I want each piece of information to go into its own column, for example: car name, car price, mileage...
my final code:
import csv
from bs4 import BeautifulSoup

# adress is the response returned by requests.get() for the car's page
soup = BeautifulSoup(adress.content, 'html.parser')
title = soup.h1.text
Price = soup.find("div", class_="value details-price-value").get_text()
vin = soup.find("div", class_="value details-vin-value").get_text()

car_info = []
car_info.append({"price": Price})
car_info.append({"title": title})
car_info.append({"item vin": vin})

with open('cars.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(car_info)

I've found a solution that works so far, but there is still a problem: data is being repeated in the CSV file. The title and price get repeated for each car.
code:
import csv
import requests
from bs4 import BeautifulSoup

url = input("enter site:")
car_info = []

# def requisting():
adress = requests.get(url)
soup = BeautifulSoup(adress.content, 'html.parser')

title = soup.h1.text
Price = soup.find("div", class_="value details-price-value").get_text()
vin = soup.find("div", class_="value details-vin-value").get_text()

car_info.append(title)
car_info.append(Price)
car_info.append(vin)

info = ['title', 'price', 'vin']
print(car_info)

with open("newcars.csv", 'a', newline="") as nc:
    wr = csv.writer(nc)
    wr.writerow(info)
    wr.writerow(car_info)
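The repetition most likely comes from writing the header row on every run while the file is open in append mode, and from reusing the same car_info list for every car. A minimal sketch of one way around it, assuming a hypothetical list of car detail-page URLs and the same selectors as above: write the header only when the file does not exist yet, then write one row per car with csv.DictWriter so each field lands in its own column.

import csv
import os
import requests
from bs4 import BeautifulSoup

# Hypothetical list of car detail pages; replace with your own URLs.
urls = [
    "https://example.com/car/1",
    "https://example.com/car/2",
]

fieldnames = ["title", "price", "vin"]
write_header = not os.path.exists("newcars.csv")  # header only for a fresh file

with open("newcars.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    if write_header:
        writer.writeheader()
    for url in urls:
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        writer.writerow({
            "title": soup.h1.text.strip(),
            "price": soup.find("div", class_="value details-price-value").get_text(strip=True),
            "vin": soup.find("div", class_="value details-vin-value").get_text(strip=True),
        })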

Related

How to make sure the data is matching while web-scraping to CSV?

I'm extracting data from the DESWATER website and saving it to CSV. To make a small example of the issue: of these two authors, one has a full text file and the other doesn't, so the file gets saved to the wrong author.
So the CSV output looks like this:
Authors | File
First Author | Second File
Second Author | Third File
But I want the output like this:
Authors | File
First Author | 'No File'
Second Author | Second File
Third Author | Third File
Here is a small test code:
from bs4 import BeautifulSoup
import requests
import time
import csv

list_of_authors = []
list_of_full_file = []

r = requests.get('https://www.deswater.com/vol.php?vol=1&oth=1|1-3|January|2009')
# Parsing the HTML
soup = BeautifulSoup(r.content, 'html.parser')

# 'Author'
s = soup.find('td', class_='testo_normale')
authors = s.find_all('i')
for author in authors:
    list_of_authors.append(author.text.strip())
    time.sleep(1)

# 'FULL TEXT'
# find all the anchor tags with "href"
n = 1
for link in soup.find_all('a', class_='testo_normale_rosso'):
    if "fulltext.php?abst=" in link.get('href'):
        # TO ADD
        baseurl = 'https://www.deswater.com/'
        Full_links = baseurl + link.attrs['href'].replace('\n', '')
        list_of_full_file.append(f'file {n}')
        n += 1
        time.sleep(1)

def Save_csv():
    row_head = ['Author', 'File Name']
    Data = []
    for author, file in zip(list_of_authors, list_of_full_file):
        Data.append(author)
        Data.append(file)
    rows = [Data[i:i + 2] for i in range(0, len(Data), 2)]
    with open('data.csv', 'w', encoding='utf_8_sig', newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(row_head)
        csvwriter.writerows(rows)

Save_csv()
This code will ultimately extract data from 279 pages, so I need it to automatically detect when there is no full text for a given author, so I can append 'No File' instead.
See the reference of the correct matching in the website here.
The first author doesn't have a full text file.
Any Ideas?
Try changing your element-selection strategy, and avoid multiple lists if you cannot ensure they have the same length.
Use CSS selectors here to select all <hr> elements; they are the base for all other selections via find_previous():
for e in soup.select('.testo_normale hr'):
    data.append({
        'author': e.find_previous('i').text,
        'file': 'https://www.deswater.com/' + e.find_previous('a').get('href') if 'fulltext' in e.find_previous('a').get('href') else 'no url'
    })
Example
from bs4 import BeautifulSoup
import requests
import csv

soup = BeautifulSoup(requests.get('https://www.deswater.com/vol.php?vol=1&oth=1|1-3|January|2009').content)

with open('data.csv', 'w', encoding='utf-8', newline='') as f:
    data = []
    for e in soup.select('.testo_normale hr'):
        data.append({
            'author': e.find_previous('i').text,
            'file': 'https://www.deswater.com/' + e.find_previous('a').get('href') if 'fulltext' in e.find_previous('a').get('href') else 'no url'
        })
    dict_writer = csv.DictWriter(f, data[0].keys())
    dict_writer.writeheader()
    dict_writer.writerows(data)
Output
author,file
Miriam Balaban,no url
W. Richard Bowen,https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzEucGRm&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kW.k#13#kRichardk#13#kBowenk#1#kk#4#kik#2#kk#1#kbrk#2#kWaterk#13#kengineeringk#13#kfork#13#kthek#13#kpromotionk#13#kofk#13#kpeacek#1#kbrk#2#k1k#15#k2009k#16#k1k#35#k6k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k1.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2NC9URFdUX0FfMTA1MTI4NjRfTy5wZGY=&type=1
Steven J. Duranceau,https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzcucGRm&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kStevenk#13#kJ.k#13#kDuranceauk#1#kk#4#kik#2#kk#1#kbrk#2#kModelingk#13#kthek#13#kpermeatek#13#ktransientk#13#kresponsek#13#ktok#13#kperturbationsk#13#kfromk#13#ksteadyk#13#kstatek#13#kink#13#kak#13#knanofiltrationk#13#kprocessk#1#kbrk#2#k1k#15#k2009k#16#k7k#35#k16k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k7.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2NS9URFdUX0FfMTA1MTI4NjVfTy5wZGY=&type=1
"Dmitry Lisitsin, David Hasson, Raphael Semiat",https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzE3LnBkZg==&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kDmitryk#13#kLisitsink#6#kk#13#kDavidk#13#kHassonk#6#kk#13#kRaphaelk#13#kSemiatk#1#kk#4#kik#2#kk#1#kbrk#2#kModelingk#13#kthek#13#keffectk#13#kofk#13#kantik#35#kscalantk#13#konk#13#kCaCO3k#13#kprecipitationk#13#kink#13#kcontinuousk#13#kflowk#1#kbrk#2#k1k#15#k2009k#16#k17k#35#k24k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k17.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2Ni9URFdUX0FfMTA1MTI4NjZfTy5wZGY=&type=1
"M.A. Darwish, Fatima M. Al-Awadhi, A. Akbar, A. Darwish",https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzI1LnBkZg==&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kM.A.k#13#kDarwishk#6#kk#13#kFatimak#13#kM.k#13#kAlk#35#kAwadhik#6#kk#13#kA.k#13#kAkbark#6#kk#13#kA.k#13#kDarwishk#1#kk#4#kik#2#kk#1#kbrk#2#kAlternativek#13#kprimaryk#13#kenergyk#13#kfork#13#kpowerk#13#kdesaltingk#13#kplantsk#13#kink#13#kKuwaitk#32#kk#13#kthek#13#knucleark#13#koptionk#13#kIk#1#kbrk#2#k1k#15#k2009k#16#k25k#35#k41k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k25.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2Ny9URFdUX0FfMTA1MTI4NjdfTy5wZGY=&type=1
...

Rewriting Rows instead of adding to new one

Hello everyone. I am scraping a website that has multiple pages (doing 9 pages) and writing the data to a CSV file. Every page has 24 rows of data, which should come to 216 rows for 9 pages, but I am getting only 24 rows, which I think is page 9's data: Python keeps rewriting the data for every page into the same rows instead of appending it. Please help me figure out how to make Python append each page's data. Here is my code:
import requests
from bs4 import BeautifulSoup
from csv import writer

for page in range(1, 10):
    url = 'https://www.flipkart.com/searchq=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page}'.format(page=page)
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')
    links = soup.find_all('div', class_='_2kHMtA')
    with open('Flipkart.csv', 'w', encoding='utf8', newline='') as f:
        thewriter = writer(f)
        header = ('Title', 'Specification', 'price', 'Rating Out of 5')
        thewriter.writerow(header)
        for link in links:
            title = link.find('div', class_='_4rR01T').text
            Specification = link.find('ul', class_='_1xgFaf').text
            price = link.find('div', class_='_30jeq3 _1_WHN1').text
            Rating = link.find('span', class_='_1lRcqv')
            if Rating:
                Rating = Rating.text
            else:
                Rating = 'N/A'
            info = [title, Specification, price, Rating]
            thewriter.writerow(info)
First, your url is missing a question mark after search:
url = 'https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page}'.format(page =page)
Next, change:
with open('Flipkart.csv', 'w', encoding = 'utf8', newline= '') as f:
into:
with open('Flipkart.csv', 'a', encoding = 'utf8', newline= '') as f:
since we want to use mode a to append to the file. With w for write, you keep overwriting the file, which is why you end up with only the information from the last page. See the documentation for open.
Finally, put the header info inside an if-statement:
if page == 1:
    header = ('Title', 'Specification', 'price', 'Rating Out of 5')
    thewriter.writerow(header)
Otherwise, you will be repeating the header for each new page.
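Putting the three changes together: an alternative to append mode is to open the file once, before the page loop, so the header is written exactly once and every page's rows go to the same writer. A rough sketch under the assumption that the question's Flipkart class names are still valid:

import requests
from bs4 import BeautifulSoup
from csv import writer

with open('Flipkart.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    thewriter.writerow(('Title', 'Specification', 'price', 'Rating Out of 5'))  # header once
    for page in range(1, 10):
        url = ('https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search'
               '&marketplace=FLIPKART&as-show=on&as=off&page={page}').format(page=page)
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        for link in soup.find_all('div', class_='_2kHMtA'):
            title = link.find('div', class_='_4rR01T').text
            specification = link.find('ul', class_='_1xgFaf').text
            price = link.find('div', class_='_30jeq3 _1_WHN1').text
            rating = link.find('span', class_='_1lRcqv')
            thewriter.writerow([title, specification, price, rating.text if rating else 'N/A'])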

Beautiful Soup script not delivering desired CSV output

I am new to scraping/BS4 and am having a problem getting this CSV file to list all of the members. The CSV repeats one member's information over multiple lines. Any ideas to fix this would be greatly appreciated.
import requests
import csv
from bs4 import BeautifulSoup

r = requests.get('https://vermontmaple.org/basic-member-list')
soup = BeautifulSoup(r.text, 'html.parser')

with open('list.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address', 'phone'])
    for company in soup.findAll('div', class_='directory_item selected'):
        maple_name = soup.find('div', class_='name').get_text(strip=True)
        maple_address = soup.find('div', class_='address').get_text(strip=True)
        maple_phone = soup.find('div', class_='phone').get_text(strip=True)
        writer.writerow([maple_name, maple_address, maple_phone])

f.close()
Change soup.find to company.find inside the for loop:
for company in soup.findAll('div', class_='directory_item selected'):
    maple_name = company.find('div', class_='name').get_text(strip=True)
    maple_address = company.find('div', class_='address').get_text(strip=True)
    maple_phone = company.find('div', class_='phone').get_text(strip=True)
There is no need for f.close() when the file is opened with a with statement.
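For completeness, a rough sketch of the full corrected script, assuming the page still uses the same class names:

import csv
import requests
from bs4 import BeautifulSoup

r = requests.get('https://vermontmaple.org/basic-member-list')
soup = BeautifulSoup(r.text, 'html.parser')

with open('list.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'address', 'phone'])
    # Search within each directory entry, not the whole soup
    for company in soup.find_all('div', class_='directory_item selected'):
        writer.writerow([
            company.find('div', class_='name').get_text(strip=True),
            company.find('div', class_='address').get_text(strip=True),
            company.find('div', class_='phone').get_text(strip=True),
        ])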

CSV | Text stored as elements of a list

I am creating a csv file which gathers several articles scraped from a website. The articles are obtained by scraping the text from URLs contained in another file.
I would like the CSV file to be structured like a list in which each article corresponds to one element.
The code I'm using now is this:
import csv
import requests
from bs4 import BeautifulSoup

with open('Training_news.csv', newline='') as file:
    reader = csv.reader(file, delimiter=' ')
    for row in reader:
        for url in row:
            r = requests.get(url)
            r.encoding = "ISO-8859-1"
            soup = BeautifulSoup(r.content, 'lxml')
            text = soup.find_all(("p", {"class": "story-body-text story-content"}))
            with open('Training_News_5.csv', 'w', newline='') as csvfile:
                spamwriter = csv.writer(csvfile, delimiter=' ')
                spamwriter.writerow(text)
However, the CSV file created gives me this:
<p>Advertisement</p>, <p class="byline-dateline"><span class="byline" itemprop.......
<p class="feedback-message">We’re interested in your feedback on this page. <strong>Tell us what you think.</strong></p>, <p class="user-action">Go to Home Page »</p>
Only three out of 50 articles are stored, and they do not let me select each article individually.
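A minimal sketch of one way to get one article per row: open the output file once, pass find_all a tag name and an attrs dict (rather than a single tuple), join the paragraph texts, and write one row per URL. This assumes Training_news.csv holds one URL per line and that the article paragraphs carry the class "story-body-text story-content", as in the question.

import csv
import requests
from bs4 import BeautifulSoup

with open('Training_news.csv', newline='') as infile, \
     open('Training_News_5.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for row in reader:
        for url in row:
            r = requests.get(url)
            r.encoding = "ISO-8859-1"
            soup = BeautifulSoup(r.content, 'lxml')
            # name and attrs are separate arguments, not a single tuple
            paragraphs = soup.find_all("p", {"class": "story-body-text story-content"})
            article = " ".join(p.get_text(strip=True) for p in paragraphs)
            writer.writerow([article])  # one article per row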

writing beautiful soup output to CSV

I want to write prices and corresponding addresses to a CSV file that opens in Excel. Here is my code so far.
What I want is a column for the price first and a column for the address second.
from bs4 import BeautifulSoup
import requests
import csv

number = "1"
url = "http://www.trademe.co.nz/browse/categoryattributesearchresults.aspx?cid=5748&search=1&v=list&134=1&nofilters=1&originalsidebar=1&key=1654466070&page=" + number + "&sort_order=prop_default&rptpath=350-5748-3399-"
r = requests.get(url)
soup = BeautifulSoup(r.content)

output_file = open("output.csv", "w")
price = soup.find_all("div", {"class": "property-card-price-container"})
address = soup.find_all("div", {"class": "property-card-subtitle"})

n = 1
while n != 150:
    b = (price[n].text)
    b = str(b)
    n = n + 1
    output_file.write(b)
output_file.close()
Maybe something like this?
from bs4 import BeautifulSoup
import requests
import csv

....

r = requests.get(url)
soup = BeautifulSoup(r.content)

price = soup.find_all("div", {"class": "property-card-price-container"})
address = soup.find_all("div", {"class": "property-card-subtitle"})

dataset = [(x.text, y.text) for x, y in zip(price, address)]

with open("output.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    for data in dataset[:150]:  # truncate to 150 rows
        writer.writerow(data)
There are a few problems with your code. Collecting the prices and addresses into separate lists risks mixing them up if the site reorders or omits items. When scraping entries like this, it is important to first find the larger enclosing container and then narrow down from there.
Unfortunately the URL you provided is no longer valid. As such I just browsed to another set of listings for this example:
from bs4 import BeautifulSoup
import requests
import csv

url = 'http://www.trademe.co.nz/property/residential-property-for-sale'
url += '/waikato/view-list'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')

with open('output.csv', 'w', newline='') as csvfile:
    propertyWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    for listing in soup.find_all('div', {'class': 'property-list-view-card'}):
        price = listing.find_all('div', {'class': 'property-card-price-container'})
        address = listing.find_all('div', {'class': 'property-card-subtitle'})
        propertyWriter.writerow([price[0].text.strip(),
                                 address[0].text.strip()])
