I'm an absolute beginner, but with YouTube and some websites I've written a crawler for the German website Immoscout24.
My problem: the crawler works fine if all attributes exist. But if a page is missing an attribute (e.g. the "pre" element used for "beschreibung_container"), I get "NameError: name 'beschreibung' is not defined". How can I make it write an empty string ("") into my result list (CSV) when the attribute doesn't exist, and continue crawling?
for number in numbers:
    my_url = "https://www.immobilienscout24.de/expose/%s#/" % number
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.find_all("div", {"id": "is24-content"})

    filename = "results_" + current_datetime + ".csv"
    f = open(filename, "a")
    headers = "Objekt-ID##Titel##Adresse##Merkmale##Kosten##Bausubstanz und Energieausweis##Beschreibung##Ausstattung##Lage\n"
    f.write(headers)

    for container in containers:
        try:
            objektid_container = container.find_all("div", {"class": "is24-scoutid__content padding-top-s"})
            objektid = objektid_container[0].get_text().strip()
            titel_container = container.find_all("h1", {"class": "font-semibold font-xl margin-bottom margin-top-m palm-font-l"})
            titel = titel_container[0].get_text().strip()
            adresse_container = container.find_all("div", {"class": "address-block"})
            adresse = adresse_container[0].get_text().strip()
            criteria_container = container.find_all("div", {"class": "criteriagroup criteria-group--two-columns"})
            criteria = criteria_container[0].get_text().strip()
            preis_container = container.find_all("div", {"class": "grid-item lap-one-half desk-one-half padding-right-s"})
            preis = preis_container[0].get_text().strip()
            energie_container = container.find_all("div", {"class": "criteriagroup criteria-group--border criteria-group--two-columns criteria-group--spacing"})
            energie = energie_container[0].get_text().strip()
            beschreibung_container = container.find_all("pre", {"class": "is24qa-objektbeschreibung text-content short-text"})
            beschreibung = beschreibung_container[0].get_text().strip()
            ausstattung_container = container.find_all("pre", {"class": "is24qa-ausstattung text-content short-text"})
            ausstattung = ausstattung_container[0].get_text().strip()
            lage_container = container.find_all("pre", {"class": "is24qa-lage text-content short-text"})
            lage = lage_container[0].get_text().strip()
        except:
            print("some mistake")
            pass
        f.write(objektid + "##" + titel + "##" + adresse + "##" + criteria.replace(" ", ";") + "##" + preis.replace(" ", ";") + "##" + energie.replace(" ", ";") + "##" + beschreibung.replace("\n", " ") + "##" + ausstattung.replace("\n", " ") + "##" + lage.replace("\n", " ") + "\n")
    f.close()
EDIT
First problem is solved. Another problem: my result list repeats the headings in every row, like this:
look here
How can I make "Objekt-ID" and the other headlines appear only in row no. 1?
For each variable, you can simply do the following:
obj = container.find_all("div", {"class": "xxxxx"}) or ""
objid = obj[0].get_text().strip() if obj else ""
The first line defaults the value to an empty string "" if find_all returns an empty list or None. The second line does the same thing, but checks for the existence of the value first and then applies the if/else (conditional) expression.
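Applied to the variables from the question, a minimal sketch of that pattern could look like this (class names are taken from the question's code; everything else stays the same):
objektid_container = container.find_all("div", {"class": "is24-scoutid__content padding-top-s"})
objektid = objektid_container[0].get_text().strip() if objektid_container else ""

beschreibung_container = container.find_all("pre", {"class": "is24qa-objektbeschreibung text-content short-text"})
beschreibung = beschreibung_container[0].get_text().strip() if beschreibung_container else ""
With every variable guarded this way, the f.write() line at the end always has a string to work with, so the NameError disappears and missing attributes simply become empty fields.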
I think you need to wrap each variable in its own try-except block.
E.g.:
try:
    objektid_container = container.find_all("div", {"class": "is24-scoutid__content padding-top-s"})
    objektid = objektid_container[0].get_text().strip()
except:
    objektid = ""
Do this for all variables.
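To avoid repeating that block nine times, a small helper can do the lookup and the fallback in one place. This is only a sketch under the assumption that every field is read the same way (first match, get_text().strip()); the helper name safe_text is made up here:
def safe_text(container, tag, css_class):
    # Return the stripped text of the first match, or "" if nothing matches.
    found = container.find_all(tag, {"class": css_class})
    return found[0].get_text().strip() if found else ""

# usage inside the "for container in containers" loop:
objektid = safe_text(container, "div", "is24-scoutid__content padding-top-s")
beschreibung = safe_text(container, "pre", "is24qa-objektbeschreibung text-content short-text")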
For the second issue, move your header setup outside the loop.
Remove this code from inside the for number loop:
filename = "results_"+current_datetime+".csv"
f = open(filename, "a")
headers = "Objekt-ID##Titel##Adresse##Merkmale##Kosten##Bausubstanz und Energieausweis##Beschreibung##Ausstattung##Lage\n"
f.write(headers)
And add it before:
for number in numbers:
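Put together, the reorganized outer loop might look roughly like this (a sketch; it keeps the question's variable names and assumes current_datetime is defined as before):
filename = "results_" + current_datetime + ".csv"
f = open(filename, "a")
headers = "Objekt-ID##Titel##Adresse##Merkmale##Kosten##Bausubstanz und Energieausweis##Beschreibung##Ausstattung##Lage\n"
f.write(headers)  # written exactly once

for number in numbers:
    my_url = "https://www.immobilienscout24.de/expose/%s#/" % number
    # ... fetch and parse the page, then write one data row per container ...

f.close()
This way the header line appears only in row 1, and each expose only adds data rows.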
So I am using append to extend my list of scraped apartments. In this code I run into a problem because I have created a second for loop to change the page on the website. The first for loop gives a new page to the next for loop to scrape. But when it's done with one page it just overwrites the last list. What am I doing wrong?
for page in range(1, 4):  # Gives new page to scrape
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    base_path = "https://www.etuovi.com"
    x = []
    apartment_list = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # Just to not scrape every item
            break
        relative_path = apartment.a['href']
        full_path = base_path + relative_path
        id_number = apartment.a['id']
        apartment_list.append(get_apartment_data(full_path))  # This works for one page
    x.append(apartment_list)  # Tried to make this work... Think one list should be enough.
And the functions:
def get_content_value(info_list_data):
    if info_list_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", "").replace("€", "").replace("/ kk", "").replace("\n", "")
                for li in info_list_data.find_all("li")]
    else:
        return info_list_data.get_text(" ", strip=True).replace("\xa0", "").replace("€", "").replace("/ kk", "").replace("\n", "")
Last:
def get_apartment_data(url):
    r = requests.get(url)
    soup = bs(r.content)
    all_info_list = soup.find_all(class_="CompactInfoRow__infoRow__2hjs_ flexboxgrid__row__wfmuy")
    for info_list in all_info_list:
        info_list.prettify()
    info = {}
    for index, info_list in enumerate(all_info_list):
        content_key = info_list.find(class_="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-4__3RH7g ItemHeader__itemHeader__32xAv").get_text(" ", strip=True)
        content_value = get_content_value(info_list.find(class_="flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-sm-8__2jfMv CompactInfoRow__content__3jGt4"))
        info[content_key] = content_value
    return info
for page in range(1, 4):  # Gives new page to scrape
    r = requests.get(url + str(page))
    soup = bs(r.content)
    apartments = soup.select(".ListPage__cardContainer__39dKQ")
    base_path = "https://www.etuovi.com"
    x = []
    apartment_list = []
    for index, apartment in enumerate(apartments):
        if index == 2:  # Just to not scrape every item
            break
        relative_path = apartment.a['href']
        full_path = base_path + relative_path
        id_number = apartment.a['id']
        apartment_list.append(get_apartment_data(full_path))  # This works for one page
    x.append(apartment_list.copy())
You need to use the copy() method to make an independent copy. Otherwise, every time you change apartment_list it also changes inside your x list, because both names refer to the same list object ("twin" lists).
More generally:
x = []
lst = [1, 2, 3]
x.append(lst)
print(x)
lst[0] = 0
x.append(lst)
print(x)
Output:
[[1, 2, 3]]
[[0, 2, 3], [0, 2, 3]]
The right way is:
x = []
lst = [1, 2, 3]
x.append(lst.copy())
print(x)
lst[0] = 0
x.append(lst.copy())
print(x)
Output:
[[1, 2, 3]]
[[1, 2, 3], [0, 2, 3]]
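A note on the design choice: copy() makes a shallow copy, which is enough here because the list only holds the dictionaries returned by get_apartment_data and they are not mutated afterwards. An alternative sketch that avoids the aliasing entirely is to collect everything into one flat list that is created before the page loop (this assumes the same requests/bs/get_apartment_data setup as in the question; the listing URL prefix below is only a placeholder):
import requests
from bs4 import BeautifulSoup as bs

url = "https://www.etuovi.com/myytavat-asunnot?page="  # assumed listing URL prefix
all_apartments = []  # one flat list across all pages

for page in range(1, 4):
    soup = bs(requests.get(url + str(page)).content, "html.parser")
    for index, apartment in enumerate(soup.select(".ListPage__cardContainer__39dKQ")):
        if index == 2:  # keep the question's "only two per page" limit
            break
        full_path = "https://www.etuovi.com" + apartment.a['href']
        all_apartments.append(get_apartment_data(full_path))  # get_apartment_data as defined above
Because no list is reused between pages, no copy is needed at all.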
I'm currently working on a project for myself, and that includes scraping this specific website.
My code currently looks like this:
for i in range(0, 4):
    my_url = 'https://www.kickante.com.br/campanhas-crowdfunding?page=' + str(i)
    uclient = ureq(my_url)
    page_html = uclient.read()
    uclient.close()
    page_soup = soup(page_html, 'html.parser')
    containers = page_soup.find_all("div", {"class": "campaign-card-wrapper views-row"})
    for container in containers:
        # Finding the campaign titles
        titleCampaignBruto = container.div.div.a.img["title"].replace('Crowdfunding para: ', '')
        titleCampaignParsed = titleCampaignBruto.strip().replace(",", ";")
        # Finding the amount raised by the campaign
        arrecadadoFind = container.div.find_all("div", {"class": "funding-raised"})
        arrecadado = arrecadadoFind[0].text.strip().replace(",", ".")
        # Number of donors
        doadoresBruto = container.div.find_all('span', {"class": "contributors-value"})
        doadoresParsed = doadoresBruto[0].text.strip().replace(",", ";")
        # Campaign target
        fundingGoal = container.div.find_all('div', {"class": "funding-progress"})
        quantoArrecadado = fundingGoal[0].text.strip().replace(",", ";")
        # Campaign description
        descricaoBruta = container.div.find_all('div', {"class": "field field-name-field-short-description field-type-text-long field-label-hidden"})
        descricaoParsed = descricaoBruta[0].text.strip().replace(",", ";")
        # Campaign link
        linkCampanha = container.div.find_all('href')
        print("Título da campanha: " + titleCampaignParsed)
        print("Valor da campanha: " + arrecadado)
        print("Doadores: " + doadoresParsed)
        print("target: " + quantoArrecadado)
        print("descricao: " + descricaoParsed)
        f.write(titleCampaignParsed + "," + arrecadado + "," + doadoresParsed + "," + quantoArrecadado + "," + descricaoParsed.replace(",", ";") + "\n")
    i = i + 1
f.close()
When I open the csv file it generated, I see that some lines are broken where they shouldn't be (example: See line 31 on the csv file). That line should be a part of the previous line (line 30) as the body of the description.
Does anyone have an idea of what can be causing that? Thanks in advance.
Some of the text you're writing to CSV might contain newlines. You can remove them like so:
csv_line_entries = [
    titleCampaignParsed, arrecadado, doadoresParsed,
    quantoArrecadado, descricaoParsed.replace(",", ";")
]
csv_line = ','.join([
    entry.replace('\n', ' ') for entry in csv_line_entries
])
f.write(csv_line + '\n')
Cause of the bug
The strip() method removes only leading and trailing newlines/whitespace.
import bs4
soup = bs4.BeautifulSoup('<p>Whatever\nelse\n</p>')
soup.find('p').text.strip()
>>> 'Whatever\nelse'
Notice that the inner \n is not removed.
You have newlines in the middle of the text. strip() only removes whitespace at the start and end of a string, so you need to use replace('\n', '') as well. This replaces every newline \n with nothing ''.
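As a side note beyond the original answers: Python's built-in csv module quotes any field that contains a comma or a newline, so CSV readers keep such a field inside one record and the manual replace(",", ";") calls become unnecessary. A rough sketch of what the write could look like (column names and the sample row are placeholders for the variables built inside the scraping loop):
import csv

rows = [
    # one list per campaign: titleCampaignParsed, arrecadado, doadoresParsed, quantoArrecadado, descricaoParsed
    ["Campanha exemplo", "R$ 1.000", "10", "R$ 5.000", "Descricao\ncom quebra de linha"],
]

with open('results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["titulo", "arrecadado", "doadores", "meta", "descricao"])
    writer.writerows(rows)  # fields with commas or newlines are quoted automatically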
I am new to Python; as a matter of fact, this is my first Python project. I am using ebaysdk to search for electronics on eBay, and I want it to return multiple results because my app is for comparing prices, but it returns only one result.
Someone please help me make the code return multiple results.
Here is my code snippet.
@app.route('/ebay_page_post', methods=['GET', 'POST'])
def ebay_page_post():
    if request.method == 'POST':
        # Get json format of the text sent by Ajax
        search = request.json['search']
        try:
            # ebaysdk code starts here
            api = finding(appid='JohnOkek-hybridse-PRD-5c2330105-9bbb62f2', config_file=None)
            api_request = {'keywords': search, 'outputSelector': 'SellerInfo', 'categoryId': '293'}
            response = api.execute('findItemsAdvanced', api_request)
            soup = BeautifulSoup(response.content, 'lxml')
            totalentries = int(soup.find('totalentries').text)
            items = soup.find_all('item')
            for item in items:
                cat = item.categoryname.string.lower()
                title = item.title.string.lower().strip()
                price = int(round(float(item.currentprice.string)))
                url = item.viewitemurl.string.lower()
                seller = item.sellerusername.text.lower()
                listingtype = item.listingtype.string.lower()
                condition = item.conditiondisplayname.string.lower()
                print('____________________________________________________________')
                # return json format of the result for Ajax processing
                return jsonify(cat + '|' + title + '|' + str(price) + '|' + url + '|' + seller + '|' + listingtype + '|' + condition)
        except ConnectionError as e:
            return jsonify(e)
Based on the code you provided, here is an example that adds a key-value pair collection (a dict) you could use:
@app.route('/ebay_page_post', methods=['GET', 'POST'])
def ebay_page_post():
    if request.method == 'POST':
        # Get json format of the text sent by Ajax
        search = request.json['search']
        try:
            # ebaysdk code starts here
            api = finding(appid='JohnOkek-hybridse-PRD-5c2330105-9bbb62f2', config_file=None)
            api_request = {'keywords': search, 'outputSelector': 'SellerInfo', 'categoryId': '293'}
            response = api.execute('findItemsAdvanced', api_request)
            soup = BeautifulSoup(response.content, 'lxml')
            totalentries = int(soup.find('totalentries').text)
            items = soup.find_all('item')
            # This will be returned
            itemsFound = {}
            # This index will be incremented each time an item is added
            index = 0
            for item in items:
                cat = item.categoryname.string.lower()
                title = item.title.string.lower().strip()
                price = int(round(float(item.currentprice.string)))
                url = item.viewitemurl.string.lower()
                seller = item.sellerusername.text.lower()
                listingtype = item.listingtype.string.lower()
                condition = item.conditiondisplayname.string.lower()
                # Add the item to the collection:
                # index is the key and the formatted item string is the value
                itemsFound[index] = cat + '|' + title + '|' + str(price) + '|' + url + '|' + seller + '|' + listingtype + '|' + condition
                # Increment the index for the next item's key
                index += 1
            for key in itemsFound:
                print(key, ':', itemsFound[key])
            return jsonify(itemsFound)
        except ConnectionError as e:
            return jsonify(e)
Once an item is found, add it to the collection. After your for loop finishes, return the whole collection.
Right now you are returning (and thereby breaking the iteration) as soon as you have found the first item.
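If the numeric keys are not needed on the Ajax side, a plain list is a slightly simpler variant. A sketch, assuming the same <item> tags as above (only a few fields shown; the usage line is a comment because it belongs inside the view function's try block):
def format_items(items):
    # items: the <item> tags returned by soup.find_all('item') above
    results = []
    for item in items:
        results.append('|'.join([
            item.categoryname.string.lower(),
            item.title.string.lower().strip(),
            str(int(round(float(item.currentprice.string)))),
            item.viewitemurl.string.lower(),
        ]))
    return results

# in the view function, after parsing the response:
#     return jsonify(format_items(items))  # one JSON array with every item found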
I was able to solve the problem.
Click here to see how I did it.
Thanks to every contributor; I am most grateful to you all.
Maybe this question was asked before, but since I could not find a proper answer, I dare to ask a similar one. My problem is that I have been trying to scrape a Turkish car sale website called 'Sahibinden'. I use Jupyter Notebook and the Sublime editor. Once I try to write the data to a csv file, the Turkish letters change into different characters. I tried UTF-8 encoding, '# -*- coding: utf-8 -*-', ISO 8859-9, etc., but I could not solve the problem. The other issue is that Sublime does not create the csv file at all, although I have no problem in the Jupyter notebook. You will find the csv file output in the image link. If someone can reply I would appreciate it.
Note: the program works with no problem once I run the print commands in the editors.
Thanks a lot.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
import unicodedata

with open('result1.csv', 'w') as f:
    f.write('brand, model, year, oil_type, gear, odometer, body, hp, eng_dim, color, warranty, condition, price, safe, in_fea, outs_fea, mul_fea, pai_fea, rep_fea, acklm \n')

chrome_path = r"C:\Users\Mike\Desktop\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)

def final_page(fn_20):
    for lur in fn_20:
        driver.get(lur)
        brand = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[3]/span''')
        brand = brand.text
        brand = brand.encode("utf-8")
        print(brand)
        model = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[5]/span''')
        model = model.text
        model = model.encode("utf-8")
        print(model)
        year = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[6]/span''')
        year = year.text
        year = year.encode("utf-8")
        print(year)
        oil_type = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[7]/span''')
        oil_type = oil_type.text
        oil_type = oil_type.encode("utf-8")
        print(oil_type)
        gear = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[8]/span''')
        gear = gear.text
        gear = gear.encode("utf-8")
        print(gear)
        odometer = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[9]/span''')
        odometer = odometer.text
        odometer = odometer.encode("utf-8")
        print(odometer)
        body = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[10]/span''')
        body = body.text
        body = body.encode("utf-8")
        print(body)
        hp = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[11]/span''')
        hp = hp.text
        hp = hp.encode("utf-8")
        print(hp)
        eng_dim = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[12]/span''')
        eng_dim = eng_dim.text
        eng_dim = eng_dim.encode("utf-8")
        print(eng_dim)
        color = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[14]/span''')
        color = color.text
        color = color.encode("utf-8")
        print(color)
        warranty = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[15]/span''')
        warranty = warranty.text
        warranty = warranty.encode("utf-8")
        print(warranty)
        condition = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/ul/li[19]/span''')
        condition = condition.text
        condition = condition.encode("utf-8")
        print(condition)
        price = driver.find_element_by_xpath('''//*[@id="classifiedDetail"]/div[1]/div[2]/div[2]/h3''')
        price = price.text
        price = price.encode("utf-8")
        print(price)
        safe = ''
        safety1 = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[1]/li[@class='selected']''')
        for ur in safety1:
            ur1 = ur.text
            ur1 = ur1.encode("utf-8")
            safe += ur1 + ', '
        print(safe)
        in_fea = ''
        in_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[2]/li[@class='selected']''')
        for ins in in_features:
            ins1 = ins.text
            ins1 = ins1.encode("utf-8")
            in_fea += ins1 + ', '
        print(in_fea)
        outs_fea = ''
        out_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[3]/li[@class='selected']''')
        for outs in out_features:
            out1 = outs.text
            out1 = out1.encode("utf-8")
            outs_fea += out1 + ', '
        print(outs_fea)
        mul_fea = ''
        mult_features = driver.find_elements_by_xpath('''//div[@id='classifiedProperties']/ul[4]/li[@class='selected']''')
        for mults in mult_features:
            mul = mults.text
            mul = mul.encode("utf-8")
            mul_fea += mul + ', '
        print(mul_fea)
        pai_fea = ''
        paint = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area ']/ul[1]/li[@class='selected']''')
        for pai in paint:
            pain = pai.text
            pain = pain.encode("utf-8")
            pai_fea += pain + ', '
        print(pai_fea)
        rep_fea = ''
        replcd = driver.find_elements_by_xpath('''//div[@class='classified-pair custom-area']/ul[2]/li[@class='selected']''')
        for rep in replcd:
            repa = rep.text
            repa = repa.encode("utf-8")
            rep_fea += repa + ', '
        print(rep_fea)
        acklm = driver.find_element_by_xpath('''//div[@id='classified-detail']/div[@class='uiBox'][1]/div[@id='classifiedDescription']''')
        acklm = acklm.text
        acklm = acklm.encode("utf-8")
        print(acklm)
        try:
            with open('result1.csv', 'a') as f:
                f.write(brand + ',' + model + ',' + year + ',' + oil_type + ',' + gear + ',' + odometer + ',' + body + ',' + hp + ',' + eng_dim + ',' + color + ',' + warranty + ',' + condition + ',' + price + ',' + safe + ',' + in_fea + ',' + outs_fea + ',' + mul_fea + ',' + pai_fea + ',' + rep_fea + ',' + acklm + '\n')
        except Exception as e:
            print(e)

driver.close()
import codecs
file = codecs.open("utf_test", "w", "utf-8")
file.write(u'\ufeff')
file.write("test with utf-8")
file.write("字符")
file.close()
Or this also works for me:
with codecs.open("utf_test", "w", "utf-8-sig") as temp:
    temp.write("this is a utf-test\n")
    temp.write(u"test")
I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table but the resulting CSV file had missing information.
Full code:
import os
import csv
import time
import urllib2

uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'

def get_data():
    try:
        req = urllib2.Request(uri)
        response = urllib2.urlopen(req, timeout=600)
        content = response.read()
        return content
    except Exception, e:
        print "\n[!] Error: " + str(e)
        print ''
        return False

def extract(data, rk):
    print '\n[+] Extracting data.'
    start = 0
    while True:
        store = [rk]
        if data.find('nba/player/', start) == -1:
            break
        with open("data.csv", "ab") as fcsv:
            main = data.find('nba/player/', start)
            name_start = data.find('>', main) + 1
            name_end = data.find('<', name_start)
            store.append(data[name_start:name_end])
            team_start = data.find('">', name_end) + 2
            team_end = data.find('<', team_start)
            store.append(data[team_start:team_end])
            gp_start = data.find(' >', team_end) + 2
            gp_end = data.find('<', gp_start)
            store.append(data[gp_start:gp_end])
            mpg_start = data.find(' >', gp_end) + 2
            mpg_end = data.find('<', mpg_start)
            store.append(data[mpg_start:mpg_end])
            pts_start = data.find('">', mpg_end) + 2
            pts_end = data.find('<', pts_start)
            store.append(data[pts_start:pts_end])
            mf_start = data.find(' >', pts_end) + 2
            mf_end = data.find('<', mf_start)
            store.append(data[mf_start:mf_end])
            fg_start = data.find(' >', mf_end) + 2
            fg_end = data.find('<', fg_start)
            store.append(data[fg_start:fg_end])
            m3_start = data.find(' >', fg_end) + 2
            m3_end = data.find('<', m3_start)
            store.append(data[m3_start:m3_end])
            p3_start = data.find(' >', m3_end) + 2
            p3_end = data.find('<', p3_start)
            store.append(data[p3_start:p3_end])
            ft_start = data.find(' >', p3_end) + 2
            ft_end = data.find('<', ft_start)
            store.append(data[ft_start:ft_end])
            ftp_start = data.find(' >', ft_end) + 2
            ftp_end = data.find('<', ftp_start)
            store.append(data[ftp_start:ftp_end])
            start = name_end
            rk = rk + 1
            csv.writer(fcsv).writerow(store)
        fcsv.close()

def main():
    print "\n[+] Initializing..."
    if not os.path.exists("data.csv"):
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(["RK", "PLAYER", "TEAM", "GP", "MPG", "PTS", "FGM-FGA", "FG%", "3PM-3PA", "3P%", "FTM-FTA", "FT%"])
        fcsv.close()
    rk = 1
    global uri
    while True:
        time.sleep(1)
        start = 0
        print "\n[+] Getting data, please wait."
        data = get_data()
        if not data:
            break
        extract(data, rk)
        print "\n[+] Preparing for next page."
        time.sleep(1.5)
        rk = rk + 40
        if rk > 300:
            print "\n[+] All Done !\n"
            break
        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)

if __name__ == '__main__':
    main()
I specifically want to know how to grab info based on the headlines, like TEAM, GP, MPG, PTS, FGM-FGA, FG%, 3PM-3PA, 3P%, FTM-FTA, FT%, so that the script doesn't need to be changed beyond things like pts or mpg in pts_start = data.find('">', mpg_end) + 2.
I don't understand why I can't just use the headline name exactly as shown in the table for certain columns. For example, instead of FTM-FTA, the script uses ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example gives you the idea but is not a complete solution to your problem; you can easily extend it.
from bs4 import BeautifulSoup
import urllib2

def get_html_page_dom(url):
    response = urllib2.urlopen(url)
    html_doc = response.read()
    return BeautifulSoup(html_doc, 'html5lib')

def extract_rows(dom):
    table_rows = dom.select('.mod-content tbody tr')
    for tr in table_rows:
        # skip headers
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue
        tds = tr.select('td')
        yield {'RK': tds[0].string,
               'PLAYER': tds[1].select('a')[0].string,
               'TEAM': tds[2].string,
               'GP': tds[3].string
               # you can fetch the rest of the indexes for the corresponding headers
               }

if __name__ == '__main__':
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)
You can simply run it and see the result ;).
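To pick up every column without hard-coding each index, one option is to zip the cells against a fixed header list. This is only a sketch; it assumes the on-page column order matches the CSV header used in the question (RK, PLAYER, TEAM, GP, MPG, PTS, FGM-FGA, FG%, 3PM-3PA, 3P%, FTM-FTA, FT%):
COLUMNS = ['RK', 'PLAYER', 'TEAM', 'GP', 'MPG', 'PTS',
           'FGM-FGA', 'FG%', '3PM-3PA', '3P%', 'FTM-FTA', 'FT%']

def extract_rows_by_header(dom):
    for tr in dom.select('.mod-content tbody tr'):
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue  # skip the repeated header rows
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        yield dict(zip(COLUMNS, cells))
Each yielded dict can then be written out with csv.DictWriter, using COLUMNS as the fieldnames, so the CSV columns always line up with the table headlines.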