I'm having trouble scraping multiple URLs

I'm having trouble scraping multiple URLs - python

I'm having trouble scraping multiple URLs. Essentially I'm able to run this for only one genre, but the second I include other links it stops working.
The goal is to get the data and place it into a csv file with the movie title, url, and genre. Any help would be appreciated!
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']
# NOTE: my_url is a list, but uReq() is called once with the whole list.
# This is the call the question is asking about; it is kept as posted.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("li", {"class": "nm-content-horizontal-row-item"})

# name the output file to write to local disk
out_filename = "netflixaction2.csv"
# header of csv file to be written
headers = "Movie_Name, Movie_ID \n"
# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)

# one CSV line per <li> item: the link text and the link href
for container in containers:
    title_container = container.findAll("a", {"class": "nm-collections-title nm-collections-link"})
    title_container = title_container[0].text
    movieid = container.findAll("a", {"class": "nm-collections-title nm-collections-link"})
    movieid = movieid[0].attrs['href']
    print("Movie Name: " + title_container, "\n")
    print("Movie ID: " , movieid, "\n")
    f.write(title_container + ", " + movieid + "\n")

f.close()  # Close the file

The reason you are getting the error is that you are trying to do a GET request on a list.
my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']
# my_url is a list here, yet uReq() is called a single time with the whole list
uClient = uReq(my_url)
What I suggest doing here is looping through each link, e.g.:
my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']
for link in my_url:
    # request one URL per iteration instead of handing over the whole list
    uClient = uReq(link)
    page_html = uClient.read()
    # ... the rest of the per-page code goes here ...
Also note that if you only wrap your existing code in the loop, each iteration will overwrite what the previous one wrote with f.write. What you need to do is something like:
New edit:
import csv

import requests
from bs4 import BeautifulSoup as soup

# All given URLs
my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']

# Create and open the CSV file once, before the URL loop, so that the rows
# of every URL end up in the same file. newline='' is how the csv module
# documentation says the file should be opened.
with open("netflixaction2.csv", 'w', encoding='utf-8', newline='') as csv_file:
    # Headers for CSV
    headers_for_csv = ['Movie Name', 'Movie Link']

    # DictWriter maps each dict passed to writerow() onto the header columns
    csv_writer = csv.DictWriter(csv_file, delimiter=',', lineterminator='\n', fieldnames=headers_for_csv)
    csv_writer.writeheader()

    # We need to loop through each URL from the list
    for link in my_url:
        # Do a simple GET request with the URL
        response = requests.get(link)
        page_soup = soup(response.text, "html.parser")

        # Find all nm-content-horizontal-row-item
        containers = page_soup.findAll("li", {"class": "nm-content-horizontal-row-item"})

        # Loop through each found "li"
        for container in containers:
            movie_name = container.text.strip()
            movie_link = container.find("a")['href']
            print(f"Movie Name: {movie_name} | Movie link: {movie_link}")

            # Write to CSV
            csv_writer.writerow({
                'Movie Name': movie_name,
                'Movie Link': movie_link,
            })

# No explicit csv_file.close() is needed: leaving the with block closes the file.
That should be your solution :) Feel free to comment if i'm missing something!

Related

How to stop trailing zeros splitting across cell with Beautiful Soup?

I'm building a web scraper. The top line of this data scrape splits the title because there is the number "1,000" at the end. How do I stop this from happening?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.topcashback.co.uk/easyjet-holidays/'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# every <tr> except the first one
containers = page_soup.findAll("tr")[1:]

filename = "topcashbackEasyJetholidays.csv"
f = open(filename,"w")
headers = "title, rate \n"
f.write(headers)

for container in containers:
    title = container.td.div.span.text
    rate = container.find("span",{"class":"cashback-desc"}).text
    print("title: " + title)
    print("rate: " + rate)
    # NOTE: the fields are joined by hand and not quoted, so a comma inside
    # title (e.g. "1,000") is written as one more separator. This is the
    # behaviour the question asks about; the line is kept as posted.
    f.write(title + "," + rate + "," "\n")

f.close()

The easy and ugly way - wrap the title in quotes so the comma in 1,000 won't be treated as a separator in the csv.
f.write('"' + title + '",' + rate + "," "\n") # btw. why the last comma?
# or with f-string (the literal has to open and close with the same quote
# character; a title that itself contains a double quote would still break
# this hand-made quoting)
f.write(f'"{title}",{rate}\n')
The more fancy way - use csv.writer

I would check out this before trying to reinvent the wheel:
import pandas as pd

my_url = 'https://www.topcashback.co.uk/easyjet-holidays/'

# read_html() parses the tables found at the URL; only the first one is used
df = pd.read_html(my_url, encoding='utf-8')[0]

# name the three columns, then keep the two that are wanted in the CSV
df.columns = ['title', 'n/a', 'rate']
df = df[['title', 'rate']]

df.to_csv("topcashbackEasyJetholidays.csv", index=False)
print(df)
Output:
title rate
0 London Gatwick Departures over £1,000 £50.00
1 Holiday Bookings £1000 and Over £40.00
2 Holiday Bookings £999 and Under £25.00
CSV:
title,rate
"London Gatwick Departures over £1,000",£50.00
Holiday Bookings £1000 and Over,£40.00
Holiday Bookings £999 and Under,£25.00
You'll also need to have lxml installed, aka pip install lxml

Here's the "fancy way", which I think is clearly the better way to go. I find it to actually be an easier and simpler way to code up the problem:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv

my_url = 'https://www.topcashback.co.uk/easyjet-holidays/'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# every <tr> except the first one
containers = page_soup.findAll("tr")[1:]

filename = "topcashbackEasyJetholidays.csv"
# newline='' lets the csv writer control the line endings itself; without it
# the output can contain an extra blank line after every row on Windows.
with open(filename, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "rate"])
    for container in containers:
        title = container.td.div.span.text
        rate = container.find("span",{"class":"cashback-desc"}).text
        print("title: " + title)
        print("rate: " + rate)
        # the writer quotes a field when it contains the delimiter
        writer.writerow([title, rate])
There are other advantages to using a CSV writer. The code is more readable and the details of the CSV file format are hidden. There are other characters that could cause you problems and the CSV writer will transparently deal with all of them. The CSV writer will only use quotes when it has to, making your CSV file smaller. If you support multiple output formats, the same code can be used to write all of them by just creating different kinds of writers at the start of the writing code.

Getting a list output to write correctly in rows in a csv file

I am trying to write this output to a csv file but it is simply not working. I have tried many tutorials on writing to csv but none of them work. If you could please direct me to a tutorial explaining why this isn't working, I would like to understand the issue and solve it.
import bs4
from urllib.request import urlopen as ureq
from bs4 import BeautifulSoup as soup
import csv

myurl = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38'
uclient = ureq(myurl)
page_html = uclient.read()
uclient.close()
page_soup = soup(page_html, 'html.parser')
items = page_soup.find_all('div', {'class':'item-container'})
#filename = 'zeus.csv'
#f = open(filename, 'w')
#header = 'Item Details\n'
#f.write(header)
#contain = items[0]
#container = items[0]
for container in items:
    details = container.a.img['title']
    # NOTE: the file is opened in 'w' mode inside the loop, i.e. once per
    # item. This is the behaviour the question asks about; kept as posted.
    with open('zeus.csv', 'w') as f:
        f.write(details + "\n")
    #print(details)

You can run
# open the file once, outside the loop, so every item lands in the same file
with open('zeus.csv', 'w') as f:
    for container in items:
        details = container.a.img['title']
        # exactly one item per line: no padding before or after the newline
        f.write("{}\n".format(details))
The problem in the original code is that with open('zeus.csv', 'w') as f: was inside the loop, so each iteration overwrote what the previous iterations had written.

You can try something like that for writing list to .csv file :
import csv

# open file (replace ... with the path of your output file)
with open(..., 'w', newline='') as your_file:
    # QUOTE_ALL puts quotes around every field
    writer = csv.writer(your_file, quoting=csv.QUOTE_ALL)
    # write your list values (writerow() writes the whole list as one row)
    writer.writerow(your_list)

Python Web Scraping with Multiple URLs + merge datas

What I'm trying to do is
Take multiple URLs.
Take h2 text in every URL.
Merge h2 texts and then write csv.
In this code, I did:
Take one URL. Take h2 text in URL.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

page_url = "https://example.com/ekonomi/20200108/"
#i am trying to do | urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/', 'https://example.com/ekonomi/20200112/', 'https://example.com/ekonomi/20200111/']
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# finds every div with the class b-plainlist__info on the page
containers = page_soup.findAll("div", {"class": "b-plainlist__info"})

out_filename = "output.csv"
headers = "title \n"

# the with block closes the file even if one of the containers has no <h2>
with open(out_filename, "w") as f:
    f.write(headers)
    # no containers[0] lookup before the loop: the loop variable replaced it
    # anyway, and the lookup failed with IndexError on a page without matches
    for container in containers:
        title = container.h2.get_text()
        # commas are replaced by spaces so the title stays a single field
        f.write(title.replace(",", " ") + "\n")

Provided your iteration through the containers is correct, this should work:
You want to iterate through the urls. Each url will grab the title, and append it into a list. Then just create a series with that list and write to csv with Pandas:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd

urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/', 'https://example.com/ekonomi/20200112/', 'https://example.com/ekonomi/20200111/']

# titles collected from all pages, in the order of urls
titles = []
for page_url in urls:
    # the with block closes the connection even if read() or parsing fails
    with uReq(page_url) as uClient:
        page_soup = soup(uClient.read(), "html.parser")

    # finds every div with the class b-plainlist__info on the page
    containers = page_soup.findAll("div", {"class": "b-plainlist__info"})
    for container in containers:
        titles.append(container.h2.get_text())

# written once, after the loop, so the CSV holds the titles of every URL
df = pd.DataFrame(titles, columns=['title'])
df.to_csv("output.csv", index=False)

Only writing first row to csv

I am trying to scrape page. I can get it to pull all the data and save it to array objects but cannot get my for loop to iterate over every index of the arrays and output those to CSV. It will write the headers and the first object. Novice to writing code so any help is appreciated.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.sports-reference.com/cfb/schools/air-force/'
# Open Connection & Grabbing the Page
uClient = uReq(my_url)
#Creating variable to Save the Page
page_html = uClient.read()
#Closing the connection
uClient.close()
#Parse the data to HTML
page_soup = soup(page_html, "html.parser")
#Grab container info from the DOM
containers = page_soup.findAll("div",{"class":"overthrow table_container"})

filename = "airforce.csv"
f = open(filename, "w")
headers = "year, wins, losses, ties, wl, sos\n"
f.write(headers)

# NOTE: one line is written per container, and only element [0] of each
# findAll() result is used. This is the behaviour the question asks about
# (only the first row reaches the CSV); the loop is kept as posted.
for container in containers:
    #Find all years
    year_container = container.findAll("td",{"data-stat":"year_id"})
    year = year_container[0].text
    #Find number of Wins
    wins_container = container.findAll("td",{"data-stat":"wins"})
    wins = wins_container[0].text
    #Find number of Losses
    losses_container = container.findAll("td",{"data-stat":"losses"})
    losses = losses_container[0].text
    #Number of Ties if any
    ties_container = container.findAll("td",{"data-stat":"ties"})
    ties = ties_container[0].text
    #Win-Loss as a percentage
    wl_container = container.findAll("td",{"data-stat":"win_loss_pct"})
    wl = wl_container[0].text
    #Strength of Schedule. Can be +/- w/0 being average
    sos_container = container.findAll("td",{"data-stat":"sos"})
    sos = sos_container[0].text
    f.write(year + "," + wins + "," + losses + "," + ties + "," + wl + "," +
            sos + "\n")

f.close()

You want to find the table (body) and then iterate over the table rows that are not header rows, i.e. all rows that don't have a class.
For writing (and reading) CSV files there is a csv module in the standard library.
import csv
from urllib.request import urlopen

import bs4


def iter_rows(html):
    """Yield the CSV rows for the stats table found in *html*.

    The first item yielded is the header row (the data-stat names); every
    following item is a list with the text of the matching <td> cells of
    one table row, in header order.
    """
    headers = ['year_id', 'wins', 'losses', 'ties', 'win_loss_pct', 'sos']
    yield headers
    soup = bs4.BeautifulSoup(html, 'html.parser')
    table_body_node = soup.find('table', 'stats_table').tbody
    for row_node in table_body_node('tr'):
        # only rows without a class attribute are used
        if not row_node.get('class'):
            yield [
                row_node.find('td', {'data-stat': header}).text
                for header in headers
            ]


def main():
    """Download the page and write its table rows to airforce.csv."""
    url = 'https://www.sports-reference.com/cfb/schools/air-force/'
    with urlopen(url) as response:
        html = response.read()
    # newline='' lets the csv writer control the line endings itself
    with open('airforce.csv', 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(iter_rows(html))


if __name__ == '__main__':
    main()

Pulling up the html source code, there is only one container to be put into your containers list, which means that your for loop runs only once and reads just the first cell of each column.
You should use a range() generator to access the different elements of td that reside inside of the one item in your containers list.
try this
# containers is the list returned by findAll(); the cells have to be looked
# up on the single element inside it, not on the list itself
container = containers[0]
# look the cells up once, before the loop, instead of on every iteration
year_container = container.findAll("td",{"data-stat":"year_id"})
#number of records to iterate over
num = len(year_container)
for i in range(num):
    #Find all years
    year = year_container[i].text

WebCrawler, only few items have discounted prices - index error

I am new to programming and am trying to build my first little web crawler in python.
Goal: Crawling a product list page - scraping brand name, article name, original price and new price - saving in CSV file
Status: I've managed to get the brand name, article name as well as original price and put them into correct order into a list (e.g. 10 products). As there is a brand name, description and price for all items, my code get them in correct order into the csv.
Code:
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

myUrl = 'https://www.zalando.de/rucksaecke-herren/'
#open connection, grabbing page, saving in page_html and closing connection
uClient = uReq(myUrl)
page_html = uClient.read()
uClient.close()
#Datatype, html parser
page_soup = soup(page_html, "html.parser")
#grabbing information
brand_Names = page_soup.findAll("div",{"class": "z-nvg-cognac_brandName-2XZRz z-nvg-cognac_textFormat-16QFn"})
articale_Names = page_soup.findAll ("div",{"class": "z-nvg-cognac_articleName--arFp z-nvg-cognac_textFormat-16QFn"})
original_Prices = page_soup.findAll("div",{"class": "z-nvg-cognac_originalPrice-2Oy4G"})
new_Prices = page_soup.findAll("div",{"class": "z-nvg-cognac_promotionalPrice-3GRE7"})
#opening a csv file and printing its header
filename = "XXX.csv"
file = open(filename, "w")
headers = "BRAND, ARTICALE NAME, OLD PRICE, NEW PRICE\n"
file.write(headers)
#How many brands on page?
products_on_page = len(brand_Names)
#Looping through all brands, articles, prices and writing the text into the CSV
for i in range(products_on_page):
    brand = brand_Names[i].text
    articale_Name = articale_Names[i].text
    price = original_Prices[i].text
    # NOTE: the four lists are searched page-wide and indexed in lockstep, so
    # new_Prices[i] fails as soon as new_Prices is shorter than brand_Names.
    # This is the IndexError the question asks about; kept as posted.
    new_Price = new_Prices[i].text
    # the decimal comma of each price becomes a dot; the "," between the two
    # prices keeps OLD PRICE and NEW PRICE in separate columns
    file.write(brand + "," + articale_Name + "," + price.replace(",",".") + "," + new_Price.replace(",",".") +"\n")
#closing CSV
file.close()
Problem: I am struggling with getting the discounted prices into my csv at the right place. Not every item has a discount and I currently see two issues with my code:
I use .findAll to look for the information on the website - as there are fewer discounted products than total products, my new_Prices contains fewer prices (e.g. 3 prices for 10 products). If I were able to add them to the list, I assume they would show up in the first 3 rows. How can I make sure to add the new_Prices to the right products?
I am getting a "IndexError: list index out of range" error, which I assume is caused by the fact that I am looping through 10 products, while for new_Prices I am reaching the end sooner than for my other lists. Does that make sense, and is my assumption correct?
I am very much appreciating any help.
Thanks,
Thorsten

Since some items don't have a 'div.z-nvg-cognac_promotionalPrice-3GRE7' tag you can't use the list index reliably.
However you can select all the container tags ('div.z-nvg-cognac_infoContainer-MvytX') and use find to select tags on each item.
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import csv

my_url = 'https://www.zalando.de/sporttaschen-reisetaschen-herren/'
# the with block closes the connection once the page has been read
with urlopen(my_url) as client:
    page_html = client.read().decode(errors='ignore')
page_soup = soup(page_html, "html.parser")

headers = ["BRAND", "ARTICALE NAME", "OLD PRICE", "NEW PRICE"]
filename = "test.csv"
with open(filename, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)

    # one info container per product; every field is looked up inside its
    # own container, so the values of one row belong to the same product
    items = page_soup.find_all(class_='z-nvg-cognac_infoContainer-MvytX')
    for item in items:
        brand_names = item.find(class_="z-nvg-cognac_brandName-2XZRz z-nvg-cognac_textFormat-16QFn").text
        articale_names = item.find(class_="z-nvg-cognac_articleName--arFp z-nvg-cognac_textFormat-16QFn").text
        original_prices = item.find(class_="z-nvg-cognac_originalPrice-2Oy4G").text
        # find() returns None when the product has no promotional price tag
        new_prices = item.find(class_="z-nvg-cognac_promotionalPrice-3GRE7")
        if new_prices is not None:
            new_prices = new_prices.text
        # a None value is written by the csv writer as an empty field
        writer.writerow([brand_names, articale_names, original_prices, new_prices])
If you want to get more than 24 items per page you have to use a client that runs js, like selenium.
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import csv

my_url = 'https://www.zalando.de/sporttaschen-reisetaschen-herren/'
driver = webdriver.Firefox()
try:
    driver.get(my_url)
    page_html = driver.page_source
finally:
    # shut the browser down even when loading the page fails
    driver.quit()
page_soup = soup(page_html, "html.parser")
...
Footnotes:
The naming convention for functions and variables is lowercase with underscores.
When reading or writing csv files it's best to use the csv lib.
When handling files you can use the with statement.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

I'm having trouble scraping multiple URLs - python

Related

How to stop trailing zeros splitting across cell with Beautiful Soup?

Getting a list output to write correctly in rows in a csv file

Python Web Scraping with Multiple URLs + merge datas

Only writing first row to csv

WebCrawler, only few items have discounted prices - index error

Categories

Resources