WebCrawler, only a few items have discounted prices - IndexError - Python

I am new to programming and am trying to build my first little web crawler in python.
Goal: Crawling a product list page - scraping brand name, article name, original price and new price - saving in CSV file
Status: I've managed to get the brand name, article name and original price, and put them in the correct order into a list (e.g. 10 products). As there is a brand name, description and price for every item, my code gets them into the CSV in the correct order.
Code:
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
myUrl = 'https://www.zalando.de/rucksaecke-herren/'
#open connection, grabbing page, saving in page_html and closing connection
uClient = uReq(myUrl)
page_html = uClient.read()
uClient.close()
#Datatype, html parser
page_soup = soup(page_html, "html.parser")
#grabbing information
brand_Names = page_soup.findAll("div",{"class": "z-nvg-cognac_brandName-2XZRz z-nvg-cognac_textFormat-16QFn"})
article_Names = page_soup.findAll("div",{"class": "z-nvg-cognac_articleName--arFp z-nvg-cognac_textFormat-16QFn"})
original_Prices = page_soup.findAll("div",{"class": "z-nvg-cognac_originalPrice-2Oy4G"})
new_Prices = page_soup.findAll("div",{"class": "z-nvg-cognac_promotionalPrice-3GRE7"})
#opening a csv file and printing its header
filename = "XXX.csv"
file = open(filename, "w")
headers = "BRAND, ARTICALE NAME, OLD PRICE, NEW PRICE\n"
file.write(headers)
#How many brands on page?
products_on_page = len(brand_Names)
#Looping through all brands, articles, prices and writing the text into the CSV
for i in range(products_on_page):
    brand = brand_Names[i].text
    article_Name = article_Names[i].text
    price = original_Prices[i].text
    new_Price = new_Prices[i].text
    file.write(brand + "," + article_Name + "," + price.replace(",",".") + "," + new_Price.replace(",",".") + "\n")
#closing CSV
file.close()
Problem: I am struggling with getting the discounted prices into my CSV in the right place. Not every item has a discount, and I currently see two issues with my code:
I use .findAll to look for the information on the website - as there are fewer discounted products than total products, my new_Prices contains fewer prices (e.g. 3 prices for 10 products). If I simply added them to the list, I assume they would show up in the first 3 rows. How can I make sure to match the new_Prices to the right products?
I am getting an "IndexError: list index out of range", which I assume is caused by the fact that I am looping through 10 products, but for new_Prices I reach the end sooner than for my other lists. Does that make sense, and is my assumption correct?
I very much appreciate any help.
Thanks,
Thorsten

Since some items don't have a 'div.z-nvg-cognac_promotionalPrice-3GRE7' tag, you can't use the list index reliably.
However, you can select all the container tags ('div.z-nvg-cognac_infoContainer-MvytX') and use find to select tags within each item.
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import csv
my_url = 'https://www.zalando.de/sporttaschen-reisetaschen-herren/'
client = urlopen(my_url)
page_html = client.read().decode(errors='ignore')
page_soup = soup(page_html, "html.parser")
headers = ["BRAND", "ARTICALE NAME", "OLD PRICE", "NEW PRICE"]
filename = "test.csv"
with open(filename, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(headers)
items = page_soup.find_all(class_='z-nvg-cognac_infoContainer-MvytX')
for item in items:
brand_names = item.find(class_="z-nvg-cognac_brandName-2XZRz z-nvg-cognac_textFormat-16QFn").text
articale_names = item.find(class_="z-nvg-cognac_articleName--arFp z-nvg-cognac_textFormat-16QFn").text
original_prices = item.find(class_="z-nvg-cognac_originalPrice-2Oy4G").text
new_prices = item.find(class_="z-nvg-cognac_promotionalPrice-3GRE7")
if new_prices is not None:
new_prices = new_prices.text
writer.writerow([brand_names, articale_names, original_prices, new_prices])
If you want to get more than 24 items per page, you have to use a client that runs JavaScript, like Selenium.
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import csv
my_url = 'https://www.zalando.de/sporttaschen-reisetaschen-herren/'
driver = webdriver.Firefox()
driver.get(my_url)
page_html = driver.page_source
driver.quit()
page_soup = soup(page_html, "html.parser")
...
Footnotes:
The naming convention for functions and variables is lowercase with underscores.
When reading or writing CSV files, it's best to use the csv module.
When handling files you can use the with statement.

Related

How to stop trailing zeros splitting across cell with Beautiful Soup?

I'm building a web scraper. The top line of this data scrape splits the title across cells because the number "1,000" appears at the end. How do I stop this from happening?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.topcashback.co.uk/easyjet-holidays/'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("tr")[1:]
filename = "topcashbackEasyJetholidays.csv"
f = open(filename,"w")
headers = "title, rate \n"
f.write(headers)
for container in containers:
    title = container.td.div.span.text
    rate = container.find("span",{"class":"cashback-desc"}).text
    print("title: " + title)
    print("rate: " + rate)
    f.write(title + "," + rate + "," "\n")
f.close()
The easy and ugly way - cover the title with quotes so the comma in 1,000 won't be treated as a separator in the CSV.
f.write('"' + title + '",' + rate + "," "\n") # btw. why the last comma?
# or with f-string
f.write(f'"{title}",{rate}\n")
The fancier way - use csv.writer.
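For example, a minimal sketch (assuming the same filename, title and rate variables as in the question):
import csv

# csv.writer quotes a field automatically when it contains a comma,
# so the "1,000" in the title no longer splits across cells
with open(filename, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "rate"])
    writer.writerow([title, rate])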
I would check this out before trying to reinvent the wheel:
import pandas as pd
my_url = 'https://www.topcashback.co.uk/easyjet-holidays/'
tables = pd.read_html(my_url, encoding='utf-8')
df = tables[0]
df.columns = ['title', 'n/a', 'rate']
df = df[['title', 'rate']]
df.to_csv("topcashbackEasyJetholidays.csv", index=False)
print(df)
Output:
                                   title    rate
0  London Gatwick Departures over £1,000  £50.00
1        Holiday Bookings £1000 and Over  £40.00
2        Holiday Bookings £999 and Under  £25.00
CSV:
title,rate
"London Gatwick Departures over £1,000",£50.00
Holiday Bookings £1000 and Over,£40.00
Holiday Bookings £999 and Under,£25.00
You'll also need to have lxml installed, e.g. pip install lxml
Here's the "fancy way", which I think is clearly the better way to go. I find it to actually be an easier and simpler way to code up the problem:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv
my_url = 'https://www.topcashback.co.uk/easyjet-holidays/'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("tr")[1:]
filename = "topcashbackEasyJetholidays.csv"
with open(filename,"w") as f:
writer = csv.writer(f)
writer.writerow(["title", "rate"])
for container in containers:
title = container.td.div.span.text
rate = container.find("span",{"class":"cashback-desc"}).text
print("title: " + title)
print("rate: " + rate)
writer.writerow([title, rate])
There are other advantages to using a CSV writer. The code is more readable and the details of the CSV file format are hidden. There are other characters that could cause you problems and the CSV writer will transparently deal with all of them. The CSV writer will only use quotes when it has to, making your CSV file smaller. If you support multiple output formats, the same code can be used to write all of them by just creating different kinds of writers at the start of the writing code.
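For example, a quick demonstration of the quoting behaviour (using the rows from the pandas output above):
import csv, io

buf = io.StringIO()
writer = csv.writer(buf, lineterminator="\n")
writer.writerow(["Holiday Bookings £999 and Under", "£25.00"])        # no comma, so no quotes
writer.writerow(["London Gatwick Departures over £1,000", "£50.00"])  # comma, so quoted
print(buf.getvalue())
# Holiday Bookings £999 and Under,£25.00
# "London Gatwick Departures over £1,000",£50.00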

Scraping website with BS4 // accessing class

I am trying to extract different information from websites with BeautifulSoup, such as the title of the product and the price.
I do that with different urls, looping through the urls with for...in. Here, I'll just provide a snippet without the loop.
from bs4 import BeautifulSoup
import requests
import csv
url= 'https://www.mediamarkt.ch/fr/product/_lg-oled65gx6la-1991479.html'
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
price = soup.find('meta', property="product:price:amount")
title = soup.find("div", {"class": "flix-model-name"})
title2 = soup.find('div', class_="flix-model-name")
title3 = soup.find("div", attrs={"class": "flix-model-name"})
print(price['content'])
print(title)
print(title2)
print(title3)
So from this URL https://www.mediamarkt.ch/fr/product/_lg-oled65gx6la-1991479.html I want to extract the product number. The only place I find it is in the div class="flix-model-name". However, I am totally unable to reach it. I tried different ways to access it with title, title2 and title3, but the output is always None.
I am a bit of a beginner, so I guess I am probably missing something basic... If so, please pardon me for that.
Any help is welcome! Many thanks in advance!
Just for info: with each url I thought of appending the data and writing it to a CSV file like this:
for url in urls:
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    row = []
    try:
        # title = YOUR VERY WELCOMED ANSWER
        prices = soup.find('meta', property="product:price:amount")
        row = (title.text + ',' + prices['content'] + '\n')
        data.append(row)
    except:
        pass
file = open('database.csv','w')
i = 0
while i < (len(data)):
    file.write(data[i])
    i += 1
file.close()
Many thanks in advance for your help!
David
Try the below approach using the Python requests library - simple, straightforward, reliable, fast, and less code is required. I fetched the API URL from the website itself after inspecting the network section of the Google Chrome browser.
What exactly the script below is doing:
First it takes the API URL, creates the URL based on 2 dynamic parameters (product and category) and then does a GET request to fetch the data.
After getting the data, the script parses the JSON using json.loads.
Finally, it iterates over the list of products one by one and prints the details, which are divided into 2 categories, 'box1_ProductToProduct' and 'box2_KategorieTopseller': Brand, Name, Product number and Unit price. In the same way you can add more details by looking into the API call.
import json
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def scrap_product_details():
    PRODUCT = 'MMCH1991479' #Product number
    CATEGORY = '680942' #Category number
    URL = 'https://www.mediamarkt.ch/rde_server/res/MMCH/recomm/product_detail/sid/WACXyEbIf3khlu6FcHlh1B1?product=' + PRODUCT + '&category=' + CATEGORY # dynamic URL
    response = requests.get(URL, verify=False) #GET request to fetch the data
    result = json.loads(response.text) # Parse JSON data using json.loads
    box1_ProductToProduct = result[0]['box1_ProductToProduct'] # Extracted data from API
    box2_KategorieTopseller = result[1]['box2_KategorieTopseller']
    for item in box1_ProductToProduct: # loop over extracted data
        print('-' * 100)
        print('Brand : ', item['brand'])
        print('Name : ', item['name'])
        print('Net Unit Price : ', item['netUnitPrice'])
        print('Product Number : ', item['product_nr'])
        print('-' * 100)
    for item in box2_KategorieTopseller: # loop over extracted data
        print('-' * 100)
        print('Brand : ', item['brand'])
        print('Name : ', item['name'])
        print('Net Unit Price : ', item['netUnitPrice'])
        print('Product Number : ', item['product_nr'])
        print('-' * 100)

scrap_product_details()

Python Web Scraping with Multiple URLs + merge datas

What I'm trying to do is
Take multiple URLs.
Take h2 text in every URL.
Merge the h2 texts and then write them to a CSV.
In this code, I did:
Take one URL. Take h2 text in URL.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
page_url = "https://example.com/ekonomi/20200108/"
#i am trying to do | urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/', 'https://example.com/ekonomi/20200112/', 'https://example.com/ekonomi/20200111/']
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
# finds each product from the store page
containers = page_soup.findAll("div", {"class": "b-plainlist__info"})
out_filename = "output.csv"
headers = "title \n"
f = open(out_filename, "w")
f.write(headers)
container = containers[0]
for container in containers:
    title = container.h2.get_text()
    f.write(title.replace(",", " ") + "\n")
f.close() # Close the file
Provided your iteration through the containers is correct, this should work:
You want to iterate through the urls. Each url will grab the title and append it into a list. Then just create a DataFrame from that list and write it to CSV with pandas:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd
urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/', 'https://example.com/ekonomi/20200112/', 'https://example.com/ekonomi/20200111/']
titles = []
for page_url in urls:
    uClient = uReq(page_url)
    page_soup = soup(uClient.read(), "html.parser")
    uClient.close()
    # finds each product from the store page
    containers = page_soup.findAll("div", {"class": "b-plainlist__info"})
    for container in containers:
        titles.append(container.h2.get_text())
df = pd.DataFrame(titles, columns=['title'])
df.to_csv("output.csv", index=False)

Only writing first row to csv

I am trying to scrape a page. I can get it to pull all the data and save it to array objects, but I cannot get my for loop to iterate over every index of the arrays and output those to CSV. It will write the headers and the first object. I'm a novice at writing code, so any help is appreciated.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.sports-reference.com/cfb/schools/air-force/'
# Open Connection & Grabbing the Page
uClient = uReq(my_url)
#Creating variable to Save the Page
page_html = uClient.read()
#Closing the connection
uClient.close()
#Parse the data to HTML
page_soup = soup(page_html, "html.parser")
#Grab container info from the DOM
containers = page_soup.findAll("div",{"class":"overthrow table_container"})
filename = "airforce.csv"
f = open(filename, "w")
headers = "year, wins, losses, ties, wl, sos\n"
f.write(headers)
for container in containers:
    #Find all years
    year_container = container.findAll("td",{"data-stat":"year_id"})
    year = year_container[0].text
    #Find number of Wins
    wins_container = container.findAll("td",{"data-stat":"wins"})
    wins = wins_container[0].text
    #Find number of Losses
    losses_container = container.findAll("td",{"data-stat":"losses"})
    losses = losses_container[0].text
    #Number of Ties if any
    ties_container = container.findAll("td",{"data-stat":"ties"})
    ties = ties_container[0].text
    #Win-Loss as a percentage
    wl_container = container.findAll("td",{"data-stat":"win_loss_pct"})
    wl = wl_container[0].text
    #Strength of Schedule. Can be +/- w/0 being average
    sos_container = container.findAll("td",{"data-stat":"sos"})
    sos = sos_container[0].text
    f.write(year + "," + wins + "," + losses + "," + ties + "," + wl + "," + sos + "\n")
f.close()
You want to find the table (body) and then iterate over the table rows that are not header rows, i.e. all rows that don't have a class.
For writing (and reading) CSV files there is a csv module in the standard library.
import csv
from urllib.request import urlopen
import bs4

def iter_rows(html):
    headers = ['year_id', 'wins', 'losses', 'ties', 'win_loss_pct', 'sos']
    yield headers
    soup = bs4.BeautifulSoup(html, 'html.parser')
    table_body_node = soup.find('table', 'stats_table').tbody
    for row_node in table_body_node('tr'):
        if not row_node.get('class'):
            yield [
                row_node.find('td', {'data-stat': header}).text
                for header in headers
            ]

def main():
    url = 'https://www.sports-reference.com/cfb/schools/air-force/'
    with urlopen(url) as response:
        html = response.read()
    with open('airforce.csv', 'w') as csv_file:
        csv.writer(csv_file).writerows(iter_rows(html))

if __name__ == '__main__':
    main()
Pulling up the html source code, there is only one container to be put into your container list, which means that your for loop is trying to access the wrong information.
You should use a range() generator to access the different elements of td that reside inside of the one item in your containers list.
Try this:
#number of records to iterate over
num = len(containers[0].findAll("td",{"data-stat":"year_id"}))
for i in range(num):
    #Find all years
    year_container = containers[0].findAll("td",{"data-stat":"year_id"})
    year = year_container[i].text
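A fuller sketch of the same idea, applying the range() pattern to every column (an untested completion of the snippet above; the *_tds names are just for illustration, and the f file object from the question is assumed to still be open):
#one td list per column, each with one entry per table row
year_tds = containers[0].findAll("td", {"data-stat": "year_id"})
wins_tds = containers[0].findAll("td", {"data-stat": "wins"})
losses_tds = containers[0].findAll("td", {"data-stat": "losses"})
ties_tds = containers[0].findAll("td", {"data-stat": "ties"})
wl_tds = containers[0].findAll("td", {"data-stat": "win_loss_pct"})
sos_tds = containers[0].findAll("td", {"data-stat": "sos"})

#a single index now lines up across all columns
for i in range(len(year_tds)):
    f.write(year_tds[i].text + "," + wins_tds[i].text + "," + losses_tds[i].text + ","
            + ties_tds[i].text + "," + wl_tds[i].text + "," + sos_tds[i].text + "\n")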

Saving data from website to csv with find_all from BeautifulSoup

I am trying to learn how to scrape a website with Python and BeautifulSoup. I have been able to collect all the names/job-titles, and I'm trying to save them into a csv-file. I either need some type of loop or append in order to get them all into a csv-file. As it stands now, only the final name and job-title are saved in the csv-file.
#import libraries
import csv
import urllib2
from bs4 import BeautifulSoup
#specify the url
buzzly_page = 'http://buzzlymedia.com/ourwork/'
#query the website and return the html to the variable 'page'
page = urllib2.urlopen(buzzly_page)
#parse the html
soup = BeautifulSoup(page, 'html.parser')
#query to get value of name
for name_box in soup.find_all('strong', attrs={'itemprop': 'name'}):
name = name_box.text.strip() #remove starting and trailing
print name
#query to get value of job-title
for job_box in soup.find_all('span', attrs={'itemprop': 'jobTitle'}):
job = job_box.text.strip() #remove starting and trailing
print job
#write into csv-file
with open('buzzly_clients.csv', 'a') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([name, job])
Find the divs that contain the elements you want and iterate over them like this.
# import libraries
import csv
import urllib2
from bs4 import BeautifulSoup
# specify the url
buzzly_page = 'http://buzzlymedia.com/ourwork/'
# query the website and return the html to the variable 'page'
page = urllib2.urlopen(buzzly_page)
# parse the html
soup = BeautifulSoup(page, 'html.parser')
# write into csv-file
with open('buzzly_clients.csv', 'a') as csv_file:
    writer = csv.writer(csv_file)
    for div in soup.find_all('div', attrs={'class': 'avia-testimonial-meta-mini'}):
        # query to get value of name
        name_box = div.find('strong', attrs={'itemprop': 'name'})
        name = name_box.text.strip() # remove starting and trailing
        print (name)
        # query to get value of job-title
        job_box = div.find('span', attrs={'itemprop': 'jobTitle'})
        job = job_box.text.strip() # remove starting and trailing
        print (job)
        writer.writerow([name, job])
