How to combine lists in csv output

How to combine lists in csv output - python

I am new to BS4 and python.
For a project i am trying to get some real estate data.
i made my code so that is get two lists.
my challege is to combine te data in the output.
can any one help me please?
ty
ps: any tips on more efficiënt code are welkom.
from selenium import webdriver
from bs4 import BeautifulSoup
#open('output.csv', 'w').close()
import re
import time
import requests
from itertools import chain
from pandas import DataFrame
import csv
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
time.sleep(5)
#input('Press Enter after bypassing Captcha')
#print(url)
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
#print(inside)
for huis in info:
#locatie = huis.find('div')
#locatie = ' '.join(locatie.get_text(separator='\r\n', strip=True).split()[:-1])
#locatie = huis.find('h2')
#locatie = ' '.join(locatie.get_text(separator='\r\n', strip=True).split())
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
#sep by newline, strip whitespace, then split to get the last 3 elements to cut out, then rejoin
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
l1 = ('{},{},{},{}'.format(street, address, price, pricetag))
#print('{},{},{},{}'.format(street, address, price, pricetag))
out = open('output.csv', 'w')
saveFile = open('output.csv', 'a')
saveFile.write(street + "," + address + "," + price + "," + pricetag + '\n')
#print (list1)
for items in inside:
href = items.get('href')
#print (href)
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
size_space = 'Unknown'
#print (tr)
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
#print('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
saveFile = open('output.csv', 'a')
saveFile.write(soort+ "," + bouwjaar+ "," + woonoppervlakte + "," + inhoud + "," + perceel + '\n')
saveFile.close()
#output = list(chain(list1,list2))
#print (output)
page += 1
#output = list(chain(list1,list2))
#print (output)
#kenmerken = inside.find_all ('a', {'class': 'href'})
#print (href)
#print (details)
#print('{},{},{},{}'.format(street, address, price, pricetag))
#saveFile = open('jaap.csv', 'a')
#saveFile.write(street + "," + address + "," + price + "," + pricetag + '\n')
#saveFile.close()
jaap_spider(1)

Right now your code doesn't actually seem to make two lists. But asuming that you would make a list of lists for l1 out of for huis in info: and a list of lists l2 from for items in inside:, what you could do to combine two lists of lists is: outputlist = [a + b for a, b in zip(l1, l2)].
I incorporated that, plus a conversion to a Pandas DataFrame and an export to csv in the adapted code below:
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
#browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser = webdriver.Chrome(r'C:\Users\NLNIEH\.spyder-py3\chromedriver.exe')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
time.sleep(5)
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
# Make empty lists with header lines
outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]
for huis in info:
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
outputlist_l1.append([street, address, price, pricetag])
for items in inside:
href = items.get('href')
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
size_space = 'Unknown'
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])
page += 1
# Merge outputlist_l1 with outputlist_l2
outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]
# transform to Pandas dataframe and export as csv
df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
df.to_csv('output.csv', index=False)
jaap_spider(1)

You can use csv for writing list in csv file.
import csv
def write_list_in_file(filepath, output):
with open(filepath, 'a') as outtsv:
tuple_writer = csv.writer(outtsv, delimiter=',')
tuple_writer.writerow(output)

Related

ValueError: I/O operation on closed file. WebScraping BeatutifulSoup/Python

I am trying to save results in .csv but a receive the follow message and I have no idea how to fix that:
f.write(linha_csv)
ValueError: I/O operation on closed file.
Code Bellow:
import requests
from bs4 import BeautifulSoup
import csv
from csv import reader, writer
url_base = "https://lista.mercadolivre.com.br/"
soup = BeautifulSoup(requests.get(url_base + produto_nome).content,
"html.parser")
produtos = soup.findAll('div', attrs =
{'class': 'andes-card andes-card--flat andes-card--default ui-
search-result ui-search-result--core andes-card--padding-default'}
)
with open
(r'Lista_Precos_MercadoLivre.csv','a',encoding='utf8',newline='')
as f:
fieldnames = ['Produto','Link do Produto','Preco']
dw = csv.DictWriter(f,delimiter=';',fieldnames=fieldnames)
dw.writeheader()
i = 1
while True:
for tag in soup:
titulo = soup.find('h2', attrs={'class': 'ui-search-
item__title'})
print(i, tag.text)
print(i,'Título do Produto:', titulo.text)
print(i,'Link do Produto:', link['href'])
next_link = soup.select_one( "a.andes-pagination__link:-soup-
contains(Seguinte)"
)
if not next_link: break
linha_csv = titulo.text + ';' + link['href'] + ';' + "R$" +
real.text + "," + centavos.text + '\n'
f.write(linha_csv)

Cause indentation in your question is not set correct, it may caused by that fact. Moving the writing part into your for-loop should fix the issue:
for tag in soup.select('li.ui-search-layout__item'):
linha_csv = tag.h2.text + ';' + tag.a['href'] + ';' + tag.select_one('.price-tag-amount').text + '\n'
f.write(linha_csv)
Example
import requests, csv
from bs4 import BeautifulSoup
url_base = "https://lista.mercadolivre.com.br/"
query = "vinho"
soup = BeautifulSoup(requests.get(url_base + query).content, "html.parser")
with open (r'Lista_Precos_MercadoLivre.csv','a',encoding='utf8',newline='') as f:
fieldnames = ['Produto','Link do Produto','Preco']
dw = csv.DictWriter(f,delimiter=';',fieldnames=fieldnames)
dw.writeheader()
while True:
for tag in soup.select('li.ui-search-layout__item'):
linha_csv = tag.h2.text + ';' + tag.a['href'] + ';' + tag.select_one('.price-tag-amount').text + '\n'
f.write(linha_csv)
next_link = soup.select_one( "a.andes-pagination__link:-soup-contains(Seguinte)")
if not next_link:
break
soup = BeautifulSoup(requests.get(next_link["href"]).content, "html.parser")

Handeling Cookie pop-up after page 6/7

I have build a webscraping for real estate data with the help of some fellowsmembers on this website.
It works perfectly, but after is crawls to page 6/7 or furhter, a cookie the typical cookie warning pop up, and seem to disrupt my output in my CSV file.
Is there a way to handle the pop up?
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#open('output.csv', 'w').close()
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
#browser.delete_all_cookies()
browser.get(url)
#session = requests.Session()
#res1 = session.post(url, post_data)
#res2 = session.get(url1)
time.sleep(15)
#input('Press Enter after bypassing Captcha')
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
# Make empty lists with header lines
outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]
for huis in info:
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
outputlist_l1.append([street, address, price, pricetag])
for items in inside:
#browser.delete_all_cookies()
href = items.get('href')
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
size_space = 'Unknown'
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])
page += 1
# Merge outputlist_l1 with outputlist_l2
outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]
# transform to Pandas dataframe and export as csv
#saveFile = open('output.csv', 'a')
df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
df.to_csv('output.csv', index=False)
#saveFile.close()
jaap_spider(15)
THe cookie script in the website:
(function(){function g(a){return{get:function(b){var c=JSON.parse(a.getItem(b));return!c||Date.parse(c.expires)<=(new Date).getTime()?(a.removeItem(b),null):c.value},set:function(b,c,d){c={value:c,expires:d.toUTCString()};a.setItem(b,JSON.stringify(c))},remove:function(b){a.removeItem(b)}}}function d(a,b,c,d){this.parseCommand=function(e,g){function h(){var a=JSON.stringify({messageId:k,value:l||!1});window.parent.postMessage(a,"")}var m=q[a],n=e.action,p=e.key,k=e.messageId,f=e.siteId,f=d?p:p+":"+
f,l=e.value,r=e.expiresMinutes||1440(e.expiresDays||365),s=function(){var a=new Date;a.setTime(a.getTime()+6E4*r);return a}();if(!function(){var a={_hjSet:c,_hjGet:b,_hjRemove:c}[n]||[];return 0<=a.indexOf("")||0<=a.indexOf(g)}())throw Error("Command "+n+" not allowed on key: "+p);switch(n){case "_hjSet":m.set(f,l,s);break;case "_hjGet":l=m.get(f);h();break;case "_hjRemove":m.remove(f)}}}function h(a){try{var b=JSON.parse(a.data);b.key&&k[b.key]&&k[b.key].parseCommand(b,a.origin)}catch(c){return null}}
var q;try{q={cookie:{get:function(a){return(a=RegExp("(?:^|; )"+a+"=([^;])").exec(document.cookie))?a[1]:void 0},set:function(a,b,c){document.cookie=a+"="+b+"; path=/; expires="+c.toUTCString()},remove:function(a){document.cookie=a+"=; expires=Tue, 13 Mar 1979 00:00:00 UTC; path=/;"}},localStorage:g(localStorage),sessionStorage:g(sessionStorage)}}catch(t){return}var k={_hjOptOut:new d("cookie",[""],["https://www.hotjar.com","https://local.hotjar.com","http://local.hotjar.com","https://insights-staging.hotjar.com",
"http://insights-staging.hotjar.com"],!0),grant_consent:new d("cookie",[""],[""],!1),screenshot_retake:new d("localStorage",[""],[""],!1),screenshot_active_retake:new d("sessionStorage",[""],["*"],!1)};window.addEventListener?window.addEventListener("message",h,!1):window.attachEvent("onmessage",h)})();

To overcome the pop up problem just check after loading the page if there any pop up available. If yes,then click on that.Hope this help.
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
time.sleep(10)
#Check here if there popup available
if len(browser.find_elements_by_xpath("//a[#class='CookiesOK']"))>0:
browser.find_element_by_xpath("//a[#class='CookiesOK']").click()
time.sleep(5)
#input('Press Enter after bypassing Captcha')
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})

Skip Over Item if element doesn't exist on page

I have a script that loops through multiple pages of a website and I want to skip over or add a blank space for the item that might not be on certain pages. For example, there are some pages that do not contain a description about the book. When I run into one of those pages I get an attribute error. My script below loops through the first two pages with no problem, but when it hits the third page it stops.
Here is the traceback
item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/beautifulsoup4-4.6.0-py3.6.egg/bs4/element.py", line 737, in __getattr__ AttributeError: 'NavigableString' object has no attribute 'text'
How can I fix this? Here is my script:
from bs4 import BeautifulSoup as soup
import requests
import json
base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30
for i in range(4, n+1):
response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
#html parsing
page_soup = soup(response.content, "html5lib")
#grabs info for each textbook
containers = page_soup.findAll("div",{"class":"LongDescription"})
author = page_soup.select("p")
about = page_soup.find("div",{"id":"AboutBook"})
for container in containers:
item = {}
item['type'] = "Textbook"
item['title'] = container.find("div",{"class":"twothird"}).h1.text
item['author'] = author[3].get_text(separator=', ')
if item['author'] == " ":
item['author'] = "University of Minnesota Libraries Publishing"
item['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
if not container.find(string="Publisher: "):
item['publisher_url'] = item['publisher'] = ""
else:
item['publisher'] = container.find(text="Publisher: ").nextSibling.text
item['publisher_url'] = container.find(text="Publisher: ").nextSibling['href']
item['source'] = "Open Textbook Library"
if not about.h2.nextSibling.nextSibling.nextSibling:
item['description'] = ""
else:
item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text
item['base_url'] = "https://open.umn.edu/opentextbooks/"
if container.find("p",{"class":"Badge-Condition"}).a:
item['license'] = container.find("p",{"class":"Badge-Condition"}).a.text
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license'] = ''
if container.find("p",{"class":"Badge-Condition"}).a:
item['license_url'] = container.find("p",{"class":"Badge-Condition"}).a["href"]
if container.find("img",{"class":"ctl00_maincontent_imgLicence"}):
item['license_url'] = ''
if container.find("div",{"class":"twothird"}).p:
item['review'] = container.find("div",{"class":"twothird"}).p.text
else:
item['review'] = ''
if item['review'].startswith('('):
item['review'] = item['review'].replace('(', '')
if item['review'].endswith(' reviews)'):
item['review'] = item['review'].replace(' reviews)', '')
if item['review'] > str(0):
item['review'] = "Reviewed Resource"
else:
item['review'] = ''
item['image_url'] = "https://open.umn.edu/opentextbooks/" + container.img["src"]
data.append(item) # add the item to the list
with open("./json/otl-1.json", "w") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)

I wouldn't recommend parsing the description with item['description'] = about.h2.nextSibling.nextSibling.nextSibling.text, that's too much specific. I came up with this code:
from bs4 import BeautifulSoup as soup
import requests
import json
from pprint import pprint
base_url = "https://open.umn.edu/opentextbooks/"
data = []
n = 30
for i in range(4, n+1):
response = requests.get(base_url + "BookDetail.aspx?bookId=" + str(i))
page_soup = soup(response.content, "lxml")
data = {}
title, author, description = page_soup.select('h1')[0].text, \
page_soup.select('h1 ~ p')[3].get_text(', '), \
'\n'.join(p.text.strip() for p in page_soup.select('div#AboutBook > p') if p.text.strip())
data['type'] = "Textbook"
data['title'] = title
data['author'] = author if author.strip() else "University of Minnesota Libraries Publishing"
data['link'] = "https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=" + str(i)
data['source'] = "Open Textbook Library"
data['description'] = description
pprint(data)
# with open("./json/otl-1.json", "w") as writeJSON:
# json.dump(data, writeJSON, ensure_ascii=False)
Prints:
{'author': 'University of Minnesota Libraries Publishing',
'description': 'This book is intended for an undergraduate or MBA level '
'Financial Accounting course. It covers the standard topics in '
'a standard sequence, utilizing the Socratic method of asking '
'and answering questions.',
'link': 'https://open.umn.edu/opentextbooks/BookDetail.aspx?bookId=4',
'source': 'Open Textbook Library',
'title': 'Financial Accounting',
'type': 'Textbook'}
...and so on (for each book)

Wherever you are getting the AttributeError you can use the following code:
Try:
your code here
except AttributeError:
pass or other codes

Crawling output into multiple csv files

I would like to know how to export my results from crawling into multiple csv files for each different city that I have crawled. Somehow I´m running into walls, do not get a proper way to sort it out.
That is my code:
import requests
from bs4 import BeautifulSoup
import csv
user_agent = {'User-agent': 'Chrome/43.0.2357.124'}
output_file= open("TA.csv", "w", newline='')
RegionIDArray = [187147,187323,186338]
dict = {187147: 'Paris', 187323: 'Berlin', 186338: 'London'}
already_printed = set()
for reg in RegionIDArray:
for page in range(1,700,30):
r = requests.get("https://www.tripadvisor.de/Attractions-c47-g" + str(reg) + "-oa" + str(page) + ".html")
soup = BeautifulSoup(r.content)
g_data = soup.find_all("div", {"class": "element_wrap"})
for item in g_data:
header = item.find_all("div", {"class": "property_title"})
item = (header[0].text.strip())
if item not in already_printed:
already_printed.add(item)
print("POI: " + str(item) + " | " + "Location: " + str(dict[reg]))
writer = csv.writer(output_file)
csv_fields = ['POI', 'Locaton']
if g_data:
writer.writerow([str(item), str(dict[reg])])
My goal would be that I get three sperate CSV files for Paris, Berlin and London instead of getting all the results in one big csv file.
Could you guys help me out? Thanks for your feedback:)

I did some minor modifications to your code. To make files for each locale, I moved the out_file name inside the loop.
Note, that I don't have time now, the very last line is a hack to ignore unicode errors -- it just skips trying to output a line with a non ascii character. Thas isn't good. Maybe someone can fix that part?
import requests
from bs4 import BeautifulSoup
import csv
user_agent = {'User-agent': 'Chrome/43.0.2357.124'}
RegionIDArray = {187147: 'Paris', 187323: 'Berlin', 186338: 'London'}
already_printed = set()
for reg in RegionIDArray:
output_file= open("TA" + str(reg) + ".csv", "w")
for page in range(1,700,30):
r = requests.get("https://www.tripadvisor.de/Attractions-c47-g" + str(reg) + "-oa" + str(page) + ".html")
soup = BeautifulSoup(r.content)
g_data = soup.find_all("div", {"class": "element_wrap"})
for item in g_data:
header = item.find_all("div", {"class": "property_title"})
item = (header[0].text.strip())
if item not in already_printed:
already_printed.add(item)
# print("POI: " + str(item) + " | " + "Location: " + str(RegionIDArray[reg]))
writer = csv.writer(output_file)
csv_fields = ['POI', 'Locaton']
if g_data:
try:
writer.writerow([str(item), str(RegionIDArray[reg])])
except:
pass

Can't figure out why list index out of range

I'm brand new to Python just trying to make a webscraper. I can't figure out why my index is saying list out of range when I have the variable set to 0 for the first index before I start building up the list.
import requests
from bs4 import BeautifulSoup
def kijiji_spider(max_pages):
page = 1
while page <= max_pages:
url = "http://www.kijiji.ca/b-cars-trucks/alberta/convertible__coupe__hatchback__other+body+type__sedan__wagon/page-" + str(page) + "/c174l9003a138?price=__5000"
sourcecode = requests.get(url)
plain_text = sourcecode.text
soup = BeautifulSoup(plain_text)
a = 0
lista=[]
for link in soup.find_all("a", {"class": "title"}):
if a == 0:
href = "|http://www.kijiji.ca" + link.get("href")
lista.append(href)
elif a != 0:
href = "http://www.kijiji.ca" + link.get("href")
lista.append(href)
a += 1
i = 0
listb = []
for link in soup.find_all("a", {"class": "title"}):
title = link.string
listb[i] = listb[i] + "|" + title.strip()
i += 1
x = 0
listc = []
for other in soup.find_all("td", {"class": "price"}):
price = other.string
listc[x] = listc[x] + "|" + price.strip()
x += 1
page += 1
print(lista)
print(listb)
print(listc)
kijiji_spider(1)

Your listb is empty, and then you are trying to access item 0 in it. Since its empty, there is nothing to access so you are getting the IndexError exception:
i = 0
listb = []
for link in soup.find_all("a", {"class": "title"}):
title = link.string
listb[i] = listb[i] + "|" + title.strip()
i += 1
I think what you want to do here is append to the values from the first list you created (lista), so you probably wanted listb.append(lista[i] + '|' + title.split()).
You don't need counters for lists in Python, you simply append to the list and it will grow automatically.
I am not sure why you are adding | before your URLs, but your entire code can be simplified to the following:
def kijiji_spider(max_pages):
page = 1
collected_urls = [] # store all URLs on each "run"
while page <= max_pages:
url = "http://www.kijiji.ca/b-cars-trucks/alberta/convertible__coupe__hatchback__other+body+type__sedan__wagon/page-" + str(page) + "/c174l9003a138?price=__5000"
sourcecode = requests.get(url)
plain_text = sourcecode.text
soup = BeautifulSoup(plain_text)
links = [i.get('href') for i in soup.find_all('a', {'class': 'title'})]
titles = [i.string.strip() for i in soup.find_all('a', {'class': 'title'})]
prices = [i.string.strip() for i in soup.find_all("td", {"class": "price"})]
results = zip(links, titles, prices)
collected_urls.append(results)
page += 1
data = kijiji_spider(5)
for results in data:
for link, title, price in results:
print('http://www.kijiji.ca{} | {} | {}'.format(link, title, price))

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to combine lists in csv output - python

You can use csv for writing list in csv file. import csv def write_list_in_file(filepath, output): with open(filepath, 'a') as outtsv: tuple_writer = csv.writer(outtsv, delimiter=',') tuple_writer.writerow(output)

Related

ValueError: I/O operation on closed file. WebScraping BeatutifulSoup/Python

Handeling Cookie pop-up after page 6/7

Skip Over Item if element doesn't exist on page

Crawling output into multiple csv files

Can't figure out why list index out of range

Categories

Resources