Hello everyone. I am scraping a website that has multiple pages (I am doing 9 pages) and writing the data to a CSV file. Every page has 24 rows of data, which should come to a total of 216 rows for 9 pages, but I am getting only 24 rows, which I think is the data from page 9: Python just rewrites the data again and again for every page in the same rows instead of appending it. Please help me figure out how to make Python append each page's data to the file. Here is my code:
import requests
from bs4 import BeautifulSoup
from csv import writer
# Scrape Flipkart laptop search results for pages 1-9 and write one CSV row
# per product card (title, specification, price, rating).
# NOTE(review): the indentation of this snippet was lost in the paste; any
# statement about loop/with nesting below is inferred from the reported
# symptom and should be confirmed against the real script.
for page in range(1,10):
# NOTE(review): the path reads "searchq=laptops" with no "?" before the query string.
url = 'https://www.flipkart.com/searchq=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page}'.format(page =page)
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
# Every div with this class is handled as one product card.
links = soup.find_all('div', class_= '_2kHMtA')
# Mode 'w' truncates Flipkart.csv each time it is opened, so if this open()
# runs once per page only the rows of the last page remain in the file.
with open('Flipkart.csv', 'w', encoding = 'utf8', newline= '') as f:
thewriter = writer(f)
header = ('Title', 'Specification', 'price', 'Rating Out of 5')
thewriter.writerow(header)
for link in links:
title = link.find('div', class_= '_4rR01T').text
Specification = link.find('ul', class_='_1xgFaf').text
price = link.find('div', class_ = '_30jeq3 _1_WHN1').text
# The rating span is optional: fall back to 'N/A' when find() matched nothing.
Rating = link.find('span', class_='_1lRcqv')
if Rating:
Rating = Rating.text
else:
Rating = 'N/A'
info = [title, Specification, price,Rating]
thewriter.writerow(info)
First, your url is missing a question mark after search:
url = 'https://www.flipkart.com/search?q=laptops&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page}'.format(page =page)
Next, change:
with open('Flipkart.csv', 'w', encoding = 'utf8', newline= '') as f:
into:
with open('Flipkart.csv', 'a', encoding = 'utf8', newline= '') as f:
We want mode a so that new rows are appended to the file. With w (write), the file is truncated every time it is opened, which is why you end up with only the information from the last page. See the documentation of open.
Finally, put the header info inside an if-statement:
if page == 1:
header = ('Title', 'Specification', 'price', 'Rating Out of 5')
thewriter.writerow(header)
Otherwise, you will be repeating the header for each new page.
Related
I'm extracting data from the DESWATER website and saving it to a CSV file. As a small example of the issue: I have two authors, one of whom has a full-text file while the other doesn't. As a result, the file is saved against the wrong author.
So the CSV output looks like this:
Authors | File
First Author | Second File
Second Author | Third File
But I want the output like this:
Authors | File
First Author | 'No File'
Second Author | Second File
Third Author | Third File
Here is a small test code:
from bs4 import BeautifulSoup
import requests
import time
import csv
# Collect author names and full-text markers from one DESWATER volume page
# into two separate lists that are later paired up positionally.
# NOTE(review): indentation was lost in this paste; loop bodies are not
# visually marked.
list_of_authors = []
list_of_full_file = []
r = requests.get('https://www.deswater.com/vol.php?vol=1&oth=1|1-3|January|2009')
# Parsing the HTML
soup = BeautifulSoup(r.content, 'html.parser')
#'Author'
# Every <i> inside the first td.testo_normale is taken as one author entry.
s = soup.find('td', class_='testo_normale')
authors = s.find_all('i')
for author in authors:
list_of_authors.append(author.text.strip())
time.sleep(1)
#'FULL TEXT'
# find all the anchor tags with "href"
# Only links whose href contains "fulltext.php?abst=" add an entry, so an
# author without such a link leaves list_of_full_file one element shorter
# than list_of_authors and the positions of later entries no longer match.
n=1
for link in soup.find_all('a', class_='testo_normale_rosso'):
if "fulltext.php?abst=" in link.get('href'):
# TO ADD
baseurl = 'https://www.deswater.com/'
# Full_links is built but not stored; only the label 'file {n}' is appended.
Full_links=baseurl+link.attrs['href'].replace('\n','')
list_of_full_file.append(f'file {n}')
n+=1
time.sleep(1)
def Save_csv():
    """Write the collected author/file pairs to ``data.csv``.

    The module-level ``list_of_authors`` and ``list_of_full_file`` are paired
    positionally (pairing stops at the end of the shorter list) and written
    below an ``Author, File Name`` header row.
    """
    header_row = ['Author', 'File Name']
    # Build the two-column rows directly from the paired lists.
    records = [
        [author_name, file_label]
        for author_name, file_label in zip(list_of_authors, list_of_full_file)
    ]
    with open('data.csv', 'w', encoding='utf_8_sig', newline="") as out_file:
        sheet = csv.writer(out_file)
        sheet.writerow(header_row)
        sheet.writerows(records)


Save_csv()
This code will ultimately extract data from 279 pages, so I need the code to automatically detect that there is no Full Text for this author, so I can append it as 'No File'
See the reference of the correct matching in the website here.
The first author doesn't have a full text file.
Any Ideas?
Try changing your strategy for selecting the elements, and avoid using multiple lists if you cannot ensure that they have the same length.
Use css selectors here to select all <hr> that are the base for all other selections with find_previous():
# Each <hr> inside .testo_normale is used as the anchor for one record; the
# author and the link are looked up backwards from that <hr>.
for rule in soup.select('.testo_normale hr'):
    href = rule.find_previous('a').get('href')
    if 'fulltext' in href:
        file_url = 'https://www.deswater.com/' + href
    else:
        file_url = 'no url'
    data.append({
        'author': rule.find_previous('i').text,
        'file': file_url,
    })
Example
from bs4 import BeautifulSoup
import requests
import csv
# Fetch the volume page and parse it with an explicitly named parser, so the
# resulting tree does not depend on which optional parsers are installed.
response = requests.get('https://www.deswater.com/vol.php?vol=1&oth=1|1-3|January|2009')
soup = BeautifulSoup(response.content, 'html.parser')

# Build one record per <hr> inside .testo_normale; the author (<i>) and the
# link (<a>) are looked up backwards from that <hr>.
data = []
for e in soup.select('.testo_normale hr'):
    author = e.find_previous('i')
    link = e.find_previous('a')
    # find_previous() returns None when nothing precedes the <hr>, and an
    # <a> may lack an href; both cases are recorded as "no url".
    href = link.get('href') if link is not None else None
    data.append({
        'author': author.text if author is not None else '',
        'file': 'https://www.deswater.com/' + href if href and 'fulltext' in href else 'no url',
    })

# The file is opened only once the records exist. Fixed field names keep the
# header row even when nothing was scraped (data[0] would raise IndexError
# on an empty list).
with open('data.csv', 'w', encoding='utf-8', newline='') as f:
    dict_writer = csv.DictWriter(f, fieldnames=['author', 'file'])
    dict_writer.writeheader()
    dict_writer.writerows(data)
Output
author,file
Miriam Balaban,no url
W. Richard Bowen,https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzEucGRm&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kW.k#13#kRichardk#13#kBowenk#1#kk#4#kik#2#kk#1#kbrk#2#kWaterk#13#kengineeringk#13#kfork#13#kthek#13#kpromotionk#13#kofk#13#kpeacek#1#kbrk#2#k1k#15#k2009k#16#k1k#35#k6k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k1.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2NC9URFdUX0FfMTA1MTI4NjRfTy5wZGY=&type=1
Steven J. Duranceau,https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzcucGRm&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kStevenk#13#kJ.k#13#kDuranceauk#1#kk#4#kik#2#kk#1#kbrk#2#kModelingk#13#kthek#13#kpermeatek#13#ktransientk#13#kresponsek#13#ktok#13#kperturbationsk#13#kfromk#13#ksteadyk#13#kstatek#13#kink#13#kak#13#knanofiltrationk#13#kprocessk#1#kbrk#2#k1k#15#k2009k#16#k7k#35#k16k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k7.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2NS9URFdUX0FfMTA1MTI4NjVfTy5wZGY=&type=1
"Dmitry Lisitsin, David Hasson, Raphael Semiat",https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzE3LnBkZg==&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kDmitryk#13#kLisitsink#6#kk#13#kDavidk#13#kHassonk#6#kk#13#kRaphaelk#13#kSemiatk#1#kk#4#kik#2#kk#1#kbrk#2#kModelingk#13#kthek#13#keffectk#13#kofk#13#kantik#35#kscalantk#13#konk#13#kCaCO3k#13#kprecipitationk#13#kink#13#kcontinuousk#13#kflowk#1#kbrk#2#k1k#15#k2009k#16#k17k#35#k24k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k17.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2Ni9URFdUX0FfMTA1MTI4NjZfTy5wZGY=&type=1
"M.A. Darwish, Fatima M. Al-Awadhi, A. Akbar, A. Darwish",https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzI1LnBkZg==&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kM.A.k#13#kDarwishk#6#kk#13#kFatimak#13#kM.k#13#kAlk#35#kAwadhik#6#kk#13#kA.k#13#kAkbark#6#kk#13#kA.k#13#kDarwishk#1#kk#4#kik#2#kk#1#kbrk#2#kAlternativek#13#kprimaryk#13#kenergyk#13#kfork#13#kpowerk#13#kdesaltingk#13#kplantsk#13#kink#13#kKuwaitk#32#kk#13#kthek#13#knucleark#13#koptionk#13#kIk#1#kbrk#2#k1k#15#k2009k#16#k25k#35#k41k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k25.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2Ny9URFdUX0FfMTA1MTI4NjdfTy5wZGY=&type=1
...
When I run the code and open the resulting CSV file, it is actually empty.
'''
import requests
from bs4 import BeautifulSoup
from csv import writer
# Fetch one fotocasa.es rental listing page and write title, price, surface
# and phone of every matched card to casas.csv.
# NOTE(review): indentation was lost in this paste; whether the for-loop sits
# inside the with-block cannot be seen here.
url = 'https://www.fotocasa.es/es/alquiler/todas-las-casas/girona-provincia/todas-las-zonas/l'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
# When no <section class="re-CardPackAdvance"> exists in the fetched HTML,
# find_all() returns an empty list and the loop below writes no data rows.
lists = soup.find_all('section', class_='re-CardPackAdvance')
with open('casas.csv', 'w', encoding='utf8', newline='') as f:
thewriter = writer(f)
header = ['Titulo', 'Precio', 'Metros', 'Telefono']
thewriter.writerow(header)
# The loop variable shadows the builtin name `list`.
for list in lists:
# Newlines are removed from each extracted text before it is written.
titulo = list.find('a', class_='re-CardPackAdvance-info-container').text.replace('\n', '')
precio = list.find('span', class_='re-CardPrice').text.replace('\n', '')
metros = list.find('span', class_='re-CardFeaturesWithIcons-feature-icon--surface').text.replace('\n', '')
telefono = list.find('a', class_='re-CardContact-phone').text.replace('\n', '')
info = [titulo, precio, metros, telefono]
thewriter.writerow(info)
'''
I expected to have all the info scrapped from this website, but seems like i did something wrong at some point
You are not parsing the resulting soup correctly: there is no section with the re-CardPackAdvance class. I adapted the code accordingly (it finds all articles whose class starts with re-CardPack). Please also note that you need to indent the for-loop by one level so that it runs inside the with-block. However, due to the structure of the page, only the first two entries are present in the HTML returned when fetching the page. All other entries are fetched after the page has loaded in the browser (via JavaScript). You might consider using the API of the page instead.
import requests
from bs4 import BeautifulSoup
from csv import writer
import re
url = 'https://www.fotocasa.es/es/alquiler/todas-las-casas/girona-provincia/todas-las-zonas/l'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# Listing cards are <article> elements with a class starting with "re-CardPack".
lists = soup.find_all("article", class_=re.compile("^re-CardPack"))
print(len(lists))


def _text_or_blank(node):
    """Return the node's text with newlines removed, or '' if the node is missing."""
    return node.text.replace('\n', '') if node is not None else ''


with open('casas.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Titulo', 'Precio', 'Metros', 'Telefono']
    thewriter.writerow(header)
    # The loop must run while the file is still open, i.e. inside the with-block.
    # The loop variable is named `card` so the builtin `list` is not shadowed.
    for card in lists:
        # find() returns None for a card that lacks an element; such fields
        # are written as empty cells instead of aborting the whole export.
        anchor = card.find('a')
        titulo = anchor.get('title') if anchor is not None else ''
        precio = _text_or_blank(card.find('span', class_='re-CardPrice'))
        metros = _text_or_blank(card.find('span', class_='re-CardFeaturesWithIcons-feature-icon--surface'))
        telefono = _text_or_blank(card.find('a', class_='re-CardContact-phone'))
        info = [titulo, precio, metros, telefono]
        thewriter.writerow(info)
I am scraping links from multiple pages under multiple searches and want to output scraped results into multiple .csv files. The table shows the .csv file which lists both my source urls and desired output file names:
url
outputfile
https://www.marketresearch.com/search/results.asp?categoryid=230&qtype=2&publisher=IDCs&datepub=0&submit2=Search
outputPS1xIDC.csv
https://www.marketresearch.com/search/results.asp?categoryid=90&qtype=2&publisher=IDC&datepub=0&submit2=Search
outputPS2xIDC.csv
https://www.marketresearch.com/search/results.asp?categoryid=233&qtype=2&publisher=IDC&datepub=0&submit2=Search
outputPS3xIDC.csv
https://www.marketresearch.com/search/results.asp?categoryid=169&qtype=2&publisher=IDC&datepub=0&submit2=Search
outputPS4xIDC.csv
Now, with the code below, I managed to read the urls in sequence and the rest of the code also works well (when I specify the output filename directly). However, it only outputs the last of the 4 pages in the list, so it overwrites the result each time. What I actually want for it is to output the results from the first url to the first outputfile, second to second, etc.
(Of course my actual list of source URLs is much longer than these 4).
Please help, especially with the last line, as clearly just writing [outputs] there doesn't work.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
# Read (url, outputfile) pairs from inputs.csv, scrape every result page of
# each search and write the collected rows with pandas.
# NOTE(review): indentation was lost in this paste; nesting of the statements
# below is not visible.
with open('inputs.csv', newline='') as csvfile:
reader = csv.DictReader(csvfile)
urls = [row["url"] for row in reader]
# `reader` has already been fully consumed by the comprehension above, so
# this second pass over it yields no rows and `outputs` ends up empty.
outputs = [row["outputfile"] for row in reader]
# A single list shared by all URLs; nothing below clears it between URLs.
data = []
for url in urls:
def scrape_it(url):
page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
soup = BeautifulSoup(page.text, 'html.parser')
# The last element with class "standardLinkDkBlue" is used as the pager
# link: its href is the next page and its text is checked for 'next'.
nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
for report in reports:
data.append({
'title': report.find('a', class_='linkTitle').text,
'price': report.find('div', class_='resultPrice').text,
'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
'detail_link': report.a['href']
})
if 'next' not in stri:
print("All pages completed")
else:
# Recurse into the following result page.
scrape_it(nexturl)
scrape_it(url)
myOutput = pd.DataFrame(data)
# to_csv() receives a list wrapping `outputs` here rather than one file path.
myOutput.to_csv([outputs], header=False) #works (but only for the last url) if instead of [outputs] I have f'filename.csv'
I don't have Pandas, and I don't really want to run your input, but a couple of things jump out at me when I look at your code:
It looks like you are not looping over url and output together. It looks like you loop over all the URLs, and then after all those loops you write once.
Likewise, data is just having the HTML table data appended and appended, it's never reset for each individual URL.
Without being able to run this, I recommend something like this. The scraping is fully encapsulated and separate from the loop, and as such you can now more clearly see the flow of inputs and outputs:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
def scrape_it(url, data):
    """Collect report rows from a search-results page and all following pages.

    Args:
        url: Address of the first results page to fetch.
        data: List to which one dict per report row is appended.

    Returns:
        The same ``data`` list, extended with dicts holding the keys
        ``title``, ``price``, ``date_author`` and ``detail_link``.
    """
    # Follow the pager link in a loop instead of recursing, so a search with
    # many result pages cannot run into Python's recursion limit.
    while url:
        page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
        soup = BeautifulSoup(page.text, 'html.parser')

        reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
        for report in reports:
            data.append({
                'title': report.find('a', class_='linkTitle').text,
                'price': report.find('div', class_='resultPrice').text,
                'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
                'detail_link': report.a['href']
            })

        # The last "standardLinkDkBlue" element is used as the pager link.
        # A page without any such element simply ends the crawl instead of
        # raising IndexError.
        pager_links = soup.find_all(class_="standardLinkDkBlue")
        if not pager_links:
            break
        last_link = pager_links[-1]
        # .string is None when the link has nested markup; treat that as
        # "no next page" rather than failing on `'next' in None`.
        label = last_link.string or ''
        url = last_link.get('href') if 'next' in label else None
    return data
# Read all input rows once. A DictReader can only be iterated a single time,
# so building `urls` and `outputs` with two separate passes over the reader
# would leave `outputs` empty and the zip() below would never run.
with open('inputs.csv', newline='') as csvfile:
    rows = list(csv.DictReader(csvfile))
urls = [row["url"] for row in rows]
outputs = [row["outputfile"] for row in rows]

for url, output in zip(urls, outputs):  # work on url and output together
    # A fresh list per URL keeps the results of different searches apart.
    data = scrape_it(url, [])
    myOutput = pd.DataFrame(data)
    myOutput.to_csv(output, header=False)
I'm trying to do a web scraping test on a website that sells cars, so I need to grab each car's info and store it in a CSV or an Excel file. I want each piece of info to be stored in its own column, for example: car name, car price, mileage...
my final code:
# Extract title, price and VIN from one car page and write them to cars.csv.
# NOTE(review): `adress` is not defined in this excerpt — presumably the
# requests response for the car page; confirm against the full script.
soup = BeautifulSoup(adress.content, 'html.parser')
title=soup.h1.text
Price=soup.find("div",class_="value details-price-value").get_text()
vin=soup.find("div",class_= "value details-vin-value").get_text()
car_info=[
]
# Each value is wrapped in its own single-key dict, so car_info becomes a
# list of three dicts.
car_info.append({"price":Price})
car_info.append({"title":title})
car_info.append({"item vin":vin})
# Mode 'w' replaces cars.csv on every run.
with open('cars.csv', 'w', newline='') as myfile:
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
# writerow() writes the string form of each dict as one quoted cell; no
# header row is written.
wr.writerow(car_info)
I've found a solution. So far it is working, but there is still another problem: data is being repeated in the CSV file.
The title and price data are repeated for each car.
code:
# Ask for one car page URL, extract title, price and VIN, and append them as
# a row to newcars.csv.
url =input("enter site:")
car_info=[]
#def requisting():
adress = requests.get(url)
soup = BeautifulSoup(adress.content, 'html.parser')
title=soup.h1.text
Price = soup.find("div", class_="value details-price-value").get_text()
vin = soup.find("div", class_="value details-vin-value").get_text()
# Plain values in a flat list: one CSV cell per value.
car_info.append(title)
car_info.append(Price)
car_info.append(vin)
info=['title','price','vin']
print(car_info)
# Mode 'a' appends to the existing file, and the header row `info` is written
# on every run, so each run adds another header line followed by one data row.
with open("newcars.csv", 'a', newline="")as nc:
wr = csv.writer(nc)
wr.writerow(info)
wr.writerow(car_info)
How would I proceed in this web scraping project using bs4 and requests? I am trying to extract user info from a forum site (myfitnesspal exactly: https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1), specifically the username, message, and date posted, and load them into columns on a csv. I have this code so far but am unsure about how to proceed:
from bs4 import BeautifulSoup
import csv
import requests
# get page source and create a BS object
print('Reading page...')
page= requests.get('https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1')
src = page.content
soup = BeautifulSoup(src, 'html.parser')
#container = soup.select('#vanilla_discussion_index > div.container')
# `container` is selected here but not used again in this excerpt.
container = soup.select('#vanilla_discussion_index > div.container > div.row > div.content.column > div.CommentsWrap > div.DataBox.DataBox-Comments > ul')
postdata = soup.select('div.Message')
# `user` and `date` are created but never filled in this excerpt.
user = []
date = []
text = []
for post in postdata:
# Each message is re-parsed from its own markup and reduced to plain text;
# .encode('utf-8') makes every stored entry a bytes object, not a str.
text.append(BeautifulSoup(str(post), 'html.parser').get_text().encode('utf-8').strip())
print(text) # this stores the text of each comment/post in a list,
# so next I'd want to store this in a csv with columns
# user, date posted, post with this under the post column
# and do the same for user and date
This script will get all messages from the page and saves them in data.csv:
import csv
import requests
from bs4 import BeautifulSoup
url = 'https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Three independent selections over the whole page.
usernames = soup.select('.Username')
dates = soup.select('.DateCreated')
messages = soup.select('.Message')

# Combine the selections positionally into [user, date, message] rows;
# zip() stops at the shortest of the three.
all_data = [
    [user.text, posted.get_text(strip=True), message.get_text(strip=True, separator='\n')]
    for user, posted, message in zip(usernames, dates, messages)
]

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerows(all_data)
Screenshot from LibreOffice:
One rule of thumb I like to follow with web scraping is being specific as possible without picking up unnecessary information. So for example, if I want to select a username I inspect the element containing the information I need:
<a class="Username" href="...">Username</a>
Since I am trying to collect usernames it makes the most sense to select by the class "Username":
soup.select("a.Username")
This gives me a list of all the usernames found on the page, which is great. However, if we want to select the data in "packages" (by post, in your example), we need to collect each post individually.
To accomplish this you could do something like the following:
comments = soup.select("div.comment")
This will make it easier to then do the following:
# Write one CSV row per comment element, taking user, date and message from
# inside that same comment so the three columns always belong together.
with open('file.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['user', 'date', 'text'])
    for comment in comments:
        cells = []
        # The username is an <a class="Username"> element, hence "a.Username".
        for selector in ("a.Username", "span.BodyDate", "div.Message"):
            node = comment.select_one(selector)
            # select_one() returns None when the element is missing; an empty
            # cell keeps the remaining columns in place. The text is written,
            # not the Tag object itself (which would end up as raw markup).
            cells.append(node.get_text(strip=True) if node is not None else '')
        writer.writerow(cells)
Doing it this way also makes sure your data stays in order even if an element is missing.
Here you go:
from bs4 import BeautifulSoup
import csv
import requests
page = requests.get('https://community.myfitnesspal.com/en/discussion/10703170/what-were-eating/p1')
soup = BeautifulSoup(page.content, 'html.parser')
# One <li> per entry of the comment list.
container = soup.select('#vanilla_discussion_index > div.container > div.row > div.content.column > div.CommentsWrap > div.DataBox.DataBox-Comments > ul > li')


def _text_of(node):
    """Return the text of ``node``, or '' when the element was not found."""
    return node.get_text() if node is not None else ''


# newline='' leaves line endings to the csv module (without it, extra blank
# rows appear on Windows); the explicit encoding keeps non-ASCII post text
# writable regardless of the platform default.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['user', 'date', 'text'])
    writer.writeheader()
    for comment in container:
        # find() returns None for a list item lacking one of the elements;
        # such fields become empty cells instead of raising AttributeError.
        writer.writerow({
            'user': _text_of(comment.find('a', {'class': 'Username'})),
            'date': _text_of(comment.find('span', {'class': 'BodyDate DateCreated'})).strip(),
            'text': _text_of(comment.find('div', {'class': 'Message'})).strip()
        })