Beautiful Soup webscraper stops after 1st "Scrape" - python

I am trying to build a scraper.
When I run the code, the for loop only runs once: it stops after a single iteration and gives the same result twice.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = "https://opensea.io/collection/morethangamersnftmtg?search[toggles][0]=BUY_NOW"
headers = {'User-Agent': 'Mozilla/5.0'}

request = Request(url, headers=headers)
page_html = urlopen(request).read()
html_soup = BeautifulSoup(page_html, "html.parser")

nft_items = html_soup.find_all("div", class_="Blockreact__Block-sc-1xf18x6-0 dBFmez AssetsSearchView--assets")

filename = "MTG.csv"
f = open(filename, "w")
headers = "title, price \n"
f.write(headers)

for nft in nft_items:
    title = nft.find("div", class_="Overflowreact__OverflowContainer-sc-10mm0lu-0 kohuDY").text
    price = nft.find("div", class_="Overflowreact__OverflowContainer-sc-10mm0lu-0 gjwKJf Price--amount").text
    f.write(title + "," + price)
#f.close()
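No answer to this question appears in the thread, but a likely cause (an assumption based on the class name, not something stated in the post) is that AssetsSearchView--assets matches the single grid container wrapping all listings, so find_all returns a one-element list and the loop body runs only once. A minimal restructuring sketch is below; the per-card "article" selector is hypothetical, and OpenSea renders its listing grid with JavaScript, so the static HTML fetched by urlopen may not contain the cards at all.

import csv
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = "https://opensea.io/collection/morethangamersnftmtg?search[toggles][0]=BUY_NOW"
request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html_soup = BeautifulSoup(urlopen(request).read(), "html.parser")

# This class matches the one container around all listings, so find() is enough;
# the elements that repeat once per NFT have to be selected inside it.
container = html_soup.find("div", class_="Blockreact__Block-sc-1xf18x6-0 dBFmez AssetsSearchView--assets")

with open("MTG.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "price"])
    if container is not None:
        # "article" is a hypothetical per-card selector; inspect the live page
        # to find the element that actually repeats once per item.
        for nft in container.find_all("article"):
            title = nft.find("div", class_="Overflowreact__OverflowContainer-sc-10mm0lu-0 kohuDY")
            price = nft.find("div", class_="Overflowreact__OverflowContainer-sc-10mm0lu-0 gjwKJf Price--amount")
            if title and price:
                writer.writerow([title.text, price.text])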

Related

BeautifulSoup: MissingSchema invalid URL error

I am trying to download images from a web page using BeautifulSoup. I am getting the following error:
MissingSchema: Invalid URL
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import os
from os.path import basename

url = "https://xxxxxx"
#r = requests.get(url)
request_page = urlopen(url)
page_html = request_page.read()
request_page.close()

soup = BeautifulSoup(page_html, 'html.parser')
#print(soup.title.text)

images = soup.find_all('img')
for image in images:
    name = image['alt']
    link = image['src']
    with open(name.replace(' ', '-').replace('/', '') + 'jpg', 'wb') as f:
        im = requests.get(link)
        f.write(im.content)

print(images)
I am unsure why. I know I can read the images fine, because the print worked fine until I added the following code:
with open(name.replace(' ', '-').replace('/', '') + 'jpg', 'wb') as f:
    im = requests.get(link)
    f.write(im.content)
I would be grateful for any help
thanks
EDIT
The url is
url = "https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/September_2018"
I added a print of the link as requested and the output is below:
//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg/300px-Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg
//upload.wikimedia.org/wikipedia/commons/thumb/c/c5/Titian_-_Portrait_of_a_man_with_a_quilted_sleeve.jpg/280px-Titian_-_Portrait_of_a_man_with_a_quilted_sleeve.jpg
//upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Bee_on_Lavender_Blossom_2.jpg/250px-Bee_on_Lavender_Blossom_2.jpg
EDIT
I am just wondering if it is the size of the name in the link? Looking at it, the image seems to be buried in a lot of folders before we get to the jpeg.
As I suspected based on the error, the print statement shows that the links you are trying to access are not valid URLs:
//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg/300px-Portrait_of_Tsaritsa_Natalya_Kirillovna_Naryshkina_-_Google_Cultural_Institute.jpg needs to start with https:.
To fix this, simply prepend it to image['src'].
The second issue is that when you write the file, you are saving it as 'Natalya-Naryshkinajpg'. You need .jpg as the file extension, for example 'Natalya-Naryshkina.jpg'; I fixed that as well.
Code:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/September_2019"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

r = requests.get(url, headers=headers)
page_html = r.text
soup = BeautifulSoup(page_html, 'html.parser')
#print(soup.title.text)

images = soup.find_all('img')
for image in images:
    name = image['alt']
    link = 'https:' + image['src']
    #print(link)
    if 'static' not in link:
        try:
            extension = link.split('.')[-1]
            with open(name.replace(' ', '-').replace('/', '') + '.' + extension, 'wb') as f:
                im = requests.get(link, headers=headers)
                f.write(im.content)
                print(name)
        except Exception as e:
            print(e)

print(images)
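As a side note (not part of the original answer), urllib.parse.urljoin from the standard library can resolve protocol-relative src values like the ones above against the page URL instead of prepending 'https:' by hand. A minimal sketch, with illustrative image paths:

from urllib.parse import urljoin

page_url = "https://en.wikipedia.org/wiki/Wikipedia:Picture_of_the_day/September_2019"

# urljoin resolves protocol-relative ("//host/path"), relative ("/path"),
# and already-absolute URLs against the page they were scraped from.
print(urljoin(page_url, "//upload.wikimedia.org/wikipedia/commons/example.jpg"))
# https://upload.wikimedia.org/wikipedia/commons/example.jpg
print(urljoin(page_url, "/static/images/example.png"))
# https://en.wikipedia.org/static/images/example.png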
This should hopefully work:
import re
import requests
from bs4 import BeautifulSoup

site = 'https://books.toscrape.com/'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')

img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]

for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    if not filename:
        print("Regex didn't match with the url: {}".format(url))
        continue
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
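For illustration (a minimal check added here, not part of the original answer), this is what that regex extracts from a relative src; the sample path is made up for the demo:

import re

pattern = r'/([\w_-]+[.](jpg|gif|png))$'

# group(1) is the bare file name used when saving the image to disk
match = re.search(pattern, 'media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg')
print(match.group(1))
# 2cdad67c44b002e7ead0cc35693c0e8b.jpg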

How to save my links from BeautifulSoup in a text file with python?

I'm learning Python and web scraping. It is very cool, but I am not able to get what I want.
I'm trying to save product links in a text file so I can scrape their data afterwards.
Here is my script, which works correctly (almost) in the PyCharm console:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links) #for getting link
My goal is to save the result of the links variable, line by line, in a text file.
I tried this, but something is wrong and I can't get each URL:
for link in links:
    with open("urls.txt", "a") as f:
        f.write(links+"\n")
Please, can someone help me?
You can try this way.
Just open the file once and write the complete data to it. Opening and closing files inside a loop is not a good thing to do.
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

with open('text.txt', 'w') as f:
    for i in range(15):
        url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        if response.ok:
            print('Page: ' + str(i))
            for data in soup.find_all('div', class_='price'):
                for a in data.find_all('a'):
                    link = 'https://www.topachat.com/' + a.get('href')
                    f.write(link+'\n')
Sample output from text.txt
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in11020650.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10119254.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20005046.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002036.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002591.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20004309.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002592.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10089390.html
.
.
.
Your problem is in the for link in links line:

link = (a.get('href'))
links = ('https://www.topachat.com/' + link)
print(links)

for link in links:
    with open("urls.txt", "a") as f:
        f.write(links+"\n")

The type of links is string, and your for loop iterates over it letter by letter (or character by character). That is why you see a single character on each line of your txt file. You can just remove the for loop and the code will work:
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links) #for getting link
                with open("urls.txt", "a") as f:
                    f.write(links+"\n")
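To make the string-iteration point above concrete, a tiny self-contained demo (the value of links is made up for the example):

# Iterating over a string yields one character per pass,
# which is why urls.txt ended up with one character per line.
links = 'https://www.topachat.com/pages/example.html'  # hypothetical value
for link in links:
    print(link)  # prints 'h', then 't', then 't', then 'p', ...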
You can do it like this:

import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'
url_list = set()

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links) #for getting link
                url_list.add(links)

with open("urls.txt", "a") as f:
    for link in url_list:
        f.write(link+"\n")

I'm having trouble scraping multiple URLs

I'm having trouble scraping multiple URLs. Essentially I'm able to run this for only one genre, but the second I include other links it stops working.
The goal is to get the data and place it into a csv file with the movie title, url, and genre. Any help would be appreciated!
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']

uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("li", {"class":"nm-content-horizontal-row-item"})

# name the output file to write to local disk
out_filename = "netflixaction2.csv"
# header of csv file to be written
headers = "Movie_Name, Movie_ID \n"

# opens file, and writes headers
f = open(out_filename, "w")
f.write(headers)

for container in containers:
    title_container = container.findAll("a", {"class":"nm-collections-title nm-collections-link"})
    title_container = title_container[0].text
    movieid = container.findAll("a", {"class":"nm-collections-title nm-collections-link"})
    movieid = movieid[0].attrs['href']
    print("Movie Name: " + title_container, "\n")
    print("Movie ID: ", movieid, "\n")
    f.write(title_container + ", " + movieid + "\n")

f.close()  # Close the file
The reason you are getting the error is that you are trying to do a GET request on a list.
my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']
uClient = uReq(my_url)
What I suggest doing here is looping through each link, e.g.:

my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']
for link in my_url:
    uClient = uReq(link)
    page_html = uClient.read()
    ....

And to mention: if you just put your existing code inside the loop, each pass will overwrite what f.write wrote before. What you need to do is something like:
New edit:
import csv
import requests
from bs4 import BeautifulSoup as soup

# All given URLs
my_url = ['https://www.netflix.com/browse/genre/1365', 'https://www.netflix.com/browse/genre/7424']

# Create and open CSV file
with open("netflixaction2.csv", 'w', encoding='utf-8') as csv_file:
    # Headers for CSV
    headers_for_csv = ['Movie Name', 'Movie Link']
    # Set up the csv DictWriter
    csv_writer = csv.DictWriter(csv_file, delimiter=',', lineterminator='\n', fieldnames=headers_for_csv)
    csv_writer.writeheader()

    # We need to loop through each URL from the list
    for link in my_url:
        # Do a simple GET request with the URL
        response = requests.get(link)
        page_soup = soup(response.text, "html.parser")

        # Find all nm-content-horizontal-row-item elements
        containers = page_soup.findAll("li", {"class": "nm-content-horizontal-row-item"})

        # Loop through each found "li"
        for container in containers:
            movie_name = container.text.strip()
            movie_link = container.find("a")['href']
            print(f"Movie Name: {movie_name} | Movie link: {movie_link}")

            # Write to CSV
            csv_writer.writerow({
                'Movie Name': movie_name,
                'Movie Link': movie_link,
            })

# Close the file (redundant here: the with block already closed it)
csv_file.close()
That should be your solution :) Feel free to comment if I'm missing something!

How to scrape data until the last page in Python with BeautifulSoup4?

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page=1'

# opening connection, downloading page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parser
page_soup = soup(page_html, "html.parser")

# catch each event
card = page_soup.findAll("div", {"class":"eds-media-card-content__content"})

filename = "Data_Events.csv"
f = open(filename, "w")
headers = "events_name, events_dates, events_location, events_fees\n"
f.write(headers)

for activity in card:
    event_activity = activity.findAll("div", {"class":"eds-event-card__formatted-name--is-clamped"})
    events_name = event_activity[0].text

    event_date = activity.findAll("div", {"class":"eds-text-bs--fixed eds-text-color--grey-600 eds-l-mar-top-1"})
    events_dates = event_date[0].text
    events_location = event_date[1].text
    events_fees = event_date[2].text

    print("events_name: " + events_name)
    print("events_dates: " + events_dates)
    print("events_location: " + events_location)
    print("events_fees: " + events_fees)

    f.write(events_name + "," + events_dates + "," + events_location + "," + events_fees + "\n")

f.close()
Hi, I am still a beginner with the Python language and I would like to know how I can apply a function so that this script is able to get data from the next pages within the website.
I have tried to do a

for pages in page (1, 49)
    my_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page=1'
Any advice would be appreciated
import itertools
import requests
from bs4 import BeautifulSoup

def parse_page(url, page):
    params = dict(page=page)
    resp = requests.get(url, params=params)  # will format `?page=#` to url
    soup = BeautifulSoup(resp.text, 'html.parser')
    ...  # parse data from page

url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events'

for page in itertools.count(start=1):  # don't need to know total pages
    try:
        parse_page(url, page)
    except Exception:
        # `parse_page` was designed for the results page layout and will
        # fail when there are no more pages to scrape, so we break here
        break
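For reference, a minimal sketch (an assumption, not the original answer's code) of what the elided parse_page body could look like, reusing the selectors and CSV layout from the question; those class names may have changed on the live site:

import csv
import itertools
import requests
from bs4 import BeautifulSoup

def parse_page(url, page, writer):
    resp = requests.get(url, params=dict(page=page))
    soup = BeautifulSoup(resp.text, 'html.parser')
    cards = soup.find_all("div", {"class": "eds-media-card-content__content"})
    if not cards:
        # no event cards on this page: treat it as the last page
        raise ValueError("no more events")
    for activity in cards:
        events_name = activity.find("div", {"class": "eds-event-card__formatted-name--is-clamped"}).text
        details = activity.find_all("div", {"class": "eds-text-bs--fixed eds-text-color--grey-600 eds-l-mar-top-1"})
        # date, location and fees appear in the same order as in the question's code
        writer.writerow([events_name, details[0].text, details[1].text, details[2].text])

url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events'
with open("Data_Events.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["events_name", "events_dates", "events_location", "events_fees"])
    for page in itertools.count(start=1):
        try:
            parse_page(url, page, writer)
        except Exception:
            break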

Only repeats pages 1 to 10 when I crawl in Python (blog crawling)

I want to crawl a Naver blog with the following code, but only posts 1 to 10 on the first page are crawled, never 11~20, 21~30, and so on. How do I edit it so that it keeps crawling the following pages?
import sys
from bs4 import BeautifulSoup
import requests
import csv

BASE_URL = "https://search.naver.com/search.naver?where=post&sm=tab_pge&query=%ED%99%94%EC%A0%95%EC%B2%9C&st=sim&date_option=8&date_from=20160101&date_to=20161231&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=p%3Afrom20160101to20161231&ie=utf8&start="

f = open("park01.csv", 'w', newline='')
wr = csv.writer(f)

for i in range(100):
    URL_with_page_num = BASE_URL + str(1 + i*10)
    response = requests.get(BASE_URL)
    response.status_code
    print(response.status_code)
    dom = BeautifulSoup(response.content, "html.parser")

    post_elements = dom.select("li.sh_blog_top")
    for post_element in post_elements:
        title_element = post_element.select_one("a.sh_blog_title")
        passage_element = post_element.select_one("dd.sh_blog_passage")
        title = title_element.text
        url = title_element.get("href")
        passage = passage_element.text
        data = [title, url, passage]
        wr.writerow(data)

f.close()
I guess the problem is in the code below:

for i in range(100):
    URL_with_page_num = BASE_URL + str(1 + i*10)
    response = requests.get(BASE_URL)

Put URL_with_page_num in place of BASE_URL in the last line of the code above:

response = requests.get(URL_with_page_num)
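Putting it together, a minimal sketch of the corrected loop (same selectors and CSV layout as in the question; only the request line changes, plus a with block so the file is always closed):

import csv
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://search.naver.com/search.naver?where=post&sm=tab_pge&query=%ED%99%94%EC%A0%95%EC%B2%9C&st=sim&date_option=8&date_from=20160101&date_to=20161231&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=p%3Afrom20160101to20161231&ie=utf8&start="

with open("park01.csv", 'w', newline='') as f:
    wr = csv.writer(f)
    for i in range(100):
        # start=1, 11, 21, ... so each request fetches the next page of 10 posts
        URL_with_page_num = BASE_URL + str(1 + i*10)
        response = requests.get(URL_with_page_num)
        dom = BeautifulSoup(response.content, "html.parser")
        for post_element in dom.select("li.sh_blog_top"):
            title_element = post_element.select_one("a.sh_blog_title")
            passage_element = post_element.select_one("dd.sh_blog_passage")
            wr.writerow([title_element.text, title_element.get("href"), passage_element.text])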
