I'm extracting data from the DESWATER website and saving it to a CSV file. To give a small example of the issue: I have these 2 authors, one of whom has a full-text file while the other doesn't. As a result, the file gets saved against the wrong author.
So the CSV output looks like this:
Authors | File
First Author | Second File
Second Author | Third File
But I want the output like this:
Authors | File
First Author | 'No File'
Second Author | Second File
Third Author | Third File
Here is a small test code:
from bs4 import BeautifulSoup
import requests
import time
import csv
# Results are gathered into two independent lists. Nothing ties an author to
# a link, so the lists can end up with different lengths when an article has
# no full-text link.
list_of_authors = []
list_of_full_file = []

response = requests.get('https://www.deswater.com/vol.php?vol=1&oth=1|1-3|January|2009')

# Parse the HTML of the volume page.
soup = BeautifulSoup(response.content, 'html.parser')

# Authors: every <i> element inside the first td.testo_normale cell.
author_cell = soup.find('td', class_='testo_normale')
for italic in author_cell.find_all('i'):
    list_of_authors.append(italic.text.strip())
    time.sleep(1)

# Full text: walk the red anchor tags and keep those pointing at fulltext.php.
file_number = 1
for anchor in soup.find_all('a', class_='testo_normale_rosso'):
    href = anchor.get('href')
    if "fulltext.php?abst=" in href:
        # TO ADD
        baseurl = 'https://www.deswater.com/'
        Full_links = baseurl + href.replace('\n', '')
        # Only a placeholder name is stored; it is numbered by match order.
        list_of_full_file.append(f'file {file_number}')
        file_number += 1
    time.sleep(1)
def Save_csv():
    """Write the collected author/file pairs to ``data.csv``.

    Reads the module-level ``list_of_authors`` and ``list_of_full_file``
    lists and pairs them positionally with ``zip``, so the output stops at
    the shorter of the two lists. The file is written with a BOM
    (``utf_8_sig``) and starts with an ``Author`` / ``File Name`` header row.
    """
    row_head = ['Author', 'File Name']
    rows = [
        [author, file]
        for author, file in zip(list_of_authors, list_of_full_file)
    ]
    with open('data.csv', 'w', encoding='utf_8_sig', newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(row_head)
        csvwriter.writerows(rows)


Save_csv()
This code will ultimately extract data from 279 pages, so I need the code to automatically detect that there is no Full Text for this author, so I can append it as 'No File'
See the reference of the correct matching in the website here.
The first author doesn't have a full text file.
Any Ideas?
Try changing your strategy for selecting the elements, and avoid using multiple lists if you cannot ensure they all have the same length.
Use css selectors here to select all <hr> that are the base for all other selections with find_previous():
# Each <hr> inside .testo_normale is used as the anchor point for one record:
# the author and the link are looked up backwards from it.
for separator in soup.select('.testo_normale hr'):
    author = separator.find_previous('i').text
    href = separator.find_previous('a').get('href')
    if 'fulltext' in href:
        file_url = 'https://www.deswater.com/' + href
    else:
        file_url = 'no url'
    data.append({'author': author, 'file': file_url})
Example
from bs4 import BeautifulSoup
import requests
import csv
# Scrape one DESWATER volume page and write one CSV row per article.
#
# Each <hr> inside the .testo_normale cell is used as the anchor for a
# record: the nearest preceding <i> supplies the author and the nearest
# preceding <a> supplies the link. Because both values come from the same
# anchor, an author without a full-text link gets 'no url' instead of
# shifting every later row.
BASE_URL = 'https://www.deswater.com/'
FIELDNAMES = ['author', 'file']

response = requests.get('https://www.deswater.com/vol.php?vol=1&oth=1|1-3|January|2009')
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(response.content, 'html.parser')

data = []
for e in soup.select('.testo_normale hr'):
    author_tag = e.find_previous('i')
    link_tag = e.find_previous('a')
    # Look the href up once; it may be missing entirely.
    href = link_tag.get('href') if link_tag is not None else None
    data.append({
        'author': author_tag.text if author_tag is not None else '',
        'file': BASE_URL + href if href and 'fulltext' in href else 'no url',
    })

# Open the file only after scraping succeeded, so a failed request does not
# leave a truncated data.csv behind. Fixed field names also keep this working
# when no rows were found.
with open('data.csv', 'w', encoding='utf-8', newline='') as f:
    dict_writer = csv.DictWriter(f, FIELDNAMES)
    dict_writer.writeheader()
    dict_writer.writerows(data)
Output
author,file
Miriam Balaban,no url
W. Richard Bowen,https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzEucGRm&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kW.k#13#kRichardk#13#kBowenk#1#kk#4#kik#2#kk#1#kbrk#2#kWaterk#13#kengineeringk#13#kfork#13#kthek#13#kpromotionk#13#kofk#13#kpeacek#1#kbrk#2#k1k#15#k2009k#16#k1k#35#k6k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k1.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2NC9URFdUX0FfMTA1MTI4NjRfTy5wZGY=&type=1
Steven J. Duranceau,https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzcucGRm&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kStevenk#13#kJ.k#13#kDuranceauk#1#kk#4#kik#2#kk#1#kbrk#2#kModelingk#13#kthek#13#kpermeatek#13#ktransientk#13#kresponsek#13#ktok#13#kperturbationsk#13#kfromk#13#ksteadyk#13#kstatek#13#kink#13#kak#13#knanofiltrationk#13#kprocessk#1#kbrk#2#k1k#15#k2009k#16#k7k#35#k16k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k7.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2NS9URFdUX0FfMTA1MTI4NjVfTy5wZGY=&type=1
"Dmitry Lisitsin, David Hasson, Raphael Semiat",https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzE3LnBkZg==&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kDmitryk#13#kLisitsink#6#kk#13#kDavidk#13#kHassonk#6#kk#13#kRaphaelk#13#kSemiatk#1#kk#4#kik#2#kk#1#kbrk#2#kModelingk#13#kthek#13#keffectk#13#kofk#13#kantik#35#kscalantk#13#konk#13#kCaCO3k#13#kprecipitationk#13#kink#13#kcontinuousk#13#kflowk#1#kbrk#2#k1k#15#k2009k#16#k17k#35#k24k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k17.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2Ni9URFdUX0FfMTA1MTI4NjZfTy5wZGY=&type=1
"M.A. Darwish, Fatima M. Al-Awadhi, A. Akbar, A. Darwish",https://www.deswater.com/fulltext.php?abst=XFxEV1RfYWJzdHJhY3RzXFx2b2xfMVxcMV8yMDA5XzI1LnBkZg==&desc=k#1#kfontk#13#kfacek#7#kk#30#kGenevak#6#kk#13#kArialk#6#kk#13#kHelveticak#6#kk#13#ksank#35#kserifk#30#kk#13#ksizek#7#kk#30#k2k#30#kk#2#kk#1#kik#2#kM.A.k#13#kDarwishk#6#kk#13#kFatimak#13#kM.k#13#kAlk#35#kAwadhik#6#kk#13#kA.k#13#kAkbark#6#kk#13#kA.k#13#kDarwishk#1#kk#4#kik#2#kk#1#kbrk#2#kAlternativek#13#kprimaryk#13#kenergyk#13#kfork#13#kpowerk#13#kdesaltingk#13#kplantsk#13#kink#13#kKuwaitk#32#kk#13#kthek#13#knucleark#13#koptionk#13#kIk#1#kbrk#2#k1k#15#k2009k#16#k25k#35#k41k#1#kbrk#4#kk#2#kk#1#kak#13#khrefk#7#kDWTk#12#kabstractsk#4#kvolk#12#k1k#4#k1k#12#k2009k#12#k25.pdfk#13#kclassk#7#kk#5#kk#30#ktestok#12#knormalek#12#krossok#5#kk#30#kk#13#ktargetk#7#kk#5#kk#30#kk#12#kblankk#5#kk#30#kk#2#kAbstractk#1#kk#4#kak#2#kk#1#kbrk#2#k&id23=RFdUX2FydGljbGVzL1REV1RfSV8wMV8wMS0wM190ZmphL1REV1RfQV8xMDUxMjg2Ny9URFdUX0FfMTA1MTI4NjdfTy5wZGY=&type=1
...
First of all, I have to say that I have very little experience with any sort of coding, so even I don't completely know what I'm after here, but I'm trying my best!
I've been writing this code that takes the HTML of a certain website and then gives me a .CSV file of the elements(?) that are named (you can see these in the inspect panel of the website).
So my question is: how can I add criteria to my current code so that it only returns words containing, for example, the letter g?
I'm happy to elaborate!
Thank you already!
import urllib.request
from bs4 import BeautifulSoup
import csv
# Download the menu page and parse it.
url = 'https://kouluruoka.fi/menu/kouvola_koulujenruokalista'
request = urllib.request.Request(url)
content = urllib.request.urlopen(request)
parse = BeautifulSoup(content, 'html.parser')

# Collect every <h2> and every <span> element on the page.
text1 = parse.find_all('h2')
text2 = parse.find_all('span')

# Append one fully quoted CSV row per positional (h2, span) pair.
with open('index.csv', 'a') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    writer.writerows(
        [heading.get_text().strip(), span.get_text().strip()]
        for heading, span in zip(text1, text2)
    )
You can check if elements contains some string/letter this way:
# Substring that both texts of a pair must contain to be written out.
# Change this to the letter/word you want to filter on (e.g. 'g').
needle = 'a'

h2_elements = parse.find_all('h2')
span_elements = parse.find_all('span')

# Append the matching pairs to the CSV file. newline='' is what the csv
# module expects for files it writes to; without it, extra blank lines can
# appear between rows on platforms that translate '\n'.
with open('index.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
    for h2_element, span_element in zip(h2_elements, span_elements):
        h2_element_str = h2_element.get_text().strip()
        span_element_str = span_element.get_text().strip()
        # Keep the row only when BOTH columns contain the substring.
        if needle in h2_element_str and needle in span_element_str:
            writer.writerow([h2_element_str, span_element_str])
I'm trying to do a web scraping test for a website that sells cars, so I need to grab each car's info and store it in a CSV or an Excel file. I want each piece of info to be stored in the desired column, for example: car name, car price, mileage...
my final code:
# Parse the already-downloaded page (``adress`` is the HTTP response).
soup = BeautifulSoup(adress.content, 'html.parser')

# Pull the three values of interest out of the page.
title = soup.h1.text
Price = soup.find("div", class_="value details-price-value").get_text()
vin = soup.find("div", class_="value details-vin-value").get_text()

# One single-key dict per value, in price / title / vin order.
car_info = [
    {"price": Price},
    {"title": title},
    {"item vin": vin},
]

# The whole list is written as ONE row, so each cell holds the text form of
# a dict rather than a bare value under a column header.
with open('cars.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(car_info)
I've found a solution. So far it is working, but there is still another problem: data is being repeated in the CSV file.
The title and price data are getting repeated for each car.
code:
# Scrape one car page and append its title, price and VIN to newcars.csv.
import os

url = input("enter site:")
adress = requests.get(url)
soup = BeautifulSoup(adress.content, 'html.parser')

title = soup.h1.text
Price = soup.find("div", class_="value details-price-value").get_text()
vin = soup.find("div", class_="value details-vin-value").get_text()

info = ['title', 'price', 'vin']
car_info = [title, Price, vin]
print(car_info)

csv_path = "newcars.csv"
# The file is opened in append mode, so the header must be written only the
# first time (file missing or still empty). Otherwise every run adds
# another 'title,price,vin' line in the middle of the data.
needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
with open(csv_path, 'a', newline="") as nc:
    wr = csv.writer(nc)
    if needs_header:
        wr.writerow(info)
    wr.writerow(car_info)
I extract html data from a mail and parse this data with beautifulsoup. Next, I want to store the parsed data under the right headers in the csv file. However, the text of the input data does not show accordingly in the output csv file.
Parsed input data (fruits_html) for csv file:
Apples 43 0 0 0<br/>
Bananas 2282 0 500 0<br/>
Grapes 2534 0 500 0<br/>
Oranges 274 0 0 0<br/>
--------------------------------------------------------------------------------------------------<br/>
Script:
# Parse raw messages to something readable
soup = BeautifulSoup(raw_email, 'html.parser')
fruits_html = soup.find_all('span')
headers = ["Names", "Quantity", "SpareQty", "MinQty", "MaxQty"]
with open('output.csv', 'w', newline='') as f_output:
csv_output = csv.writer(f_output, delimiter=',')
csv_output.writerow(headers)
for br in soup.find_all('span'):
csv_output.writerow([fruits_html for br in br.find_all('br')])
Desired output:
I want to store all the quantities under the right header in the csv file. Unfortunately, my current output shows the headers in the first row, and in the second row a large number of <br/> in different cells.
import csv
from bs4 import BeautifulSoup
from bs4.element import NavigableString
# Sample input: one <span> whose lines are separated by <br/> tags.
data = '''
<html>
<span>
Apples 43 0 0 0<br/>
Bananas 2282 0 500 0<br/>
Grapes 2534 0 500 0<br/>
Oranges 274 0 0 0<br/>
</span>
</html>'''

soup = BeautifulSoup(data, 'html.parser')

headers = ["Names", "Quantity", "SpareQty", "MinQty", "MaxQty"]

with open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output, delimiter=',')
    csv_output.writerow(headers)
    for span in soup.find_all("span"):
        # .stripped_strings yields only the text pieces (the <br/> tags are
        # skipped), already stripped, and drops whitespace-only pieces. That
        # last part matters: the newline after the final <br/> would
        # otherwise be written as an empty CSV row.
        for line in span.stripped_strings:
            # Split on whitespace: one field per column.
            csv_output.writerow(line.split())
With output.csv
Names,Quantity,SpareQty,MinQty,MaxQty
Apples,43,0,0,0
Bananas,2282,0,500,0
Grapes,2534,0,500,0
Oranges,274,0,0,0
My code writes to a CSV file titled 'output'; here is a link to past help on this code.
When I run my code, the body row of my CSV file is overwritten each time. I want to write a new row every time new information is scraped from the table at the stock table URL.
Here is what my CSV file looks like:
Index,P/E,EPS (ttm),Insider Own,Shs Outstand,Perf Week,Market Cap,Forward P/E,EPS next Y,Insider Trans,Shs Float,Perf Month,Income,PEG,EPS next Q,Inst Own,Short Float,Perf Quarter,Sales,P/S,EPS this Y,Inst Trans,Short Ratio,Perf Half Y,Book/sh,P/B,EPS next Y,ROA,Target Price,Perf Year,Cash/sh,P/C,EPS next 5Y,ROE,52W Range,Perf YTD,Dividend,P/FCF,EPS past 5Y,ROI,52W High,Beta,Dividend %,Quick Ratio,Sales past 5Y,Gross Margin,52W Low,ATR,Employees,Current Ratio,Sales Q/Q,Oper. Margin,RSI (14),Volatility,Optionable,Debt/Eq,EPS Q/Q,Profit Margin,Rel Volume,Prev Close,Shortable,LT Debt/Eq,Earnings,Payout,Avg Volume,Price,Recom,SMA20,SMA50,SMA200,Volume,Change
-,-,-3.00,45.18%,5.19M,30.47%,15.78M,-,-,0.00%,2.84M,-16.48%,-14.00M,-,-,1.00%,9.24%,88.82%,18.30M,0.86,-122.00%,136.99%,0.26,88.82%,27.27,0.11,-,-,4.00,-51.44%,0.87,3.51,15.00%,-,1.30 - 8.00,-27.10%,-,-,-15.40%,0.40%,-62.00%,2.73,-,1.10,-16.40%,25.10%,133.85%,0.52,450,1.20,-58.50%,-,53.21,19.81% 17.08%,No,0.37,-,-,5.40,2.96,Yes,0.13,-,-,991.40K,3.04,3.00,1.72%,-6.24%,29.44%,"5,358,503",2.70%
Here is my code:
import csv
import urllib.request
from bs4 import BeautifulSoup
# Fetch the Twitter profile page and list the cashtag symbols found in it.
twiturl = "https://twitter.com/ACInvestorBlog"
twitpage = urllib.request.urlopen(twiturl)
soup = BeautifulSoup(twitpage, "html.parser")
print(soup.title.text)

tweets = [tag.text for tag in soup.select('a.twitter-cashtag.pretty-link.js-nav b')]
print(tweets)

# Build one finviz quote URL per symbol.
url_base = "https://finviz.com/quote.ashx?t="
url_list = [url_base + tckr for tckr in tweets]

for url in url_list:
    # Scrape a single quote page.
    fpage = urllib.request.urlopen(url)
    fsoup = BeautifulSoup(fpage, 'html.parser')

    label_cells = fsoup.find_all('td', {'class': 'snapshot-td2-cp'})
    value_cells = fsoup.find_all('td', {'class': 'snapshot-td2'})

    # NOTE(review): the source has no indentation. This write is assumed to
    # sit inside the loop, so 'wt' truncates output.csv on every pass and
    # only the last page's rows survive. Confirm against the original.
    with open('output.csv', 'wt') as file:
        writer = csv.writer(file)
        # Header row: the label cells.
        writer.writerow(cell.text for cell in label_cells)
        # Body row: the value cells.
        writer.writerow(cell.text for cell in value_cells)
Append mode
The issue is with your command open('output.csv', 'wt') - 'w' option opens the file for (over)writing. If you want to append data at the end of the existing file, use the 'a' option instead, as shown in the fine manual at https://docs.python.org/3.7/library/functions.html#open .
Also, you might want to check if the file exists beforehand and write the header row only if it does not.