How to scrape data until the last page in Python with BeautifulSoup4?

from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page=1'

# opening connection, downloading page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# html parser
page_soup = soup(page_html, "html.parser")

# catch each event card
card = page_soup.findAll("div", {"class": "eds-media-card-content__content"})

filename = "Data_Events.csv"
f = open(filename, "w")
headers = "events_name, events_dates, events_location, events_fees\n"
f.write(headers)

for activity in card:
    event_activity = activity.findAll("div", {"class": "eds-event-card__formatted-name--is-clamped"})
    events_name = event_activity[0].text

    event_date = activity.findAll("div", {"class": "eds-text-bs--fixed eds-text-color--grey-600 eds-l-mar-top-1"})
    events_dates = event_date[0].text
    events_location = event_date[1].text
    events_fees = event_date[2].text

    print("events_name: " + events_name)
    print("events_dates: " + events_dates)
    print("events_location: " + events_location)
    print("events_fees: " + events_fees)

    f.write(events_name + "," + events_dates + "," + events_location + "," + events_fees + "\n")

f.close()
Hi, I am still a beginner with the Python language, and I would like to know how I can make this script follow the next page of the website and keep collecting data.
I have tried something like:

for pages in page (1, 49)
    my_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page=1'

Any advice would be appreciated.

import itertools

import requests
from bs4 import BeautifulSoup

def parse_page(url, page):
    params = dict(page=page)
    resp = requests.get(url, params=params)  # requests appends `?page=<page>` to the url
    soup = BeautifulSoup(resp.text, 'html.parser')
    ...  # parse data from page

url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events'

for page in itertools.count(start=1):  # don't need to know the total number of pages
    try:
        parse_page(url, page)
    except Exception:
        # `parse_page` was written for a specific page layout and will
        # fail when there are no more pages to scrape, so we break here
        break
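To make that concrete, here is a rough sketch (untested, and assuming the same CSS classes as in your original script) of how the parsing code from the question could be moved into parse_page and combined with the paging loop:

import csv
import itertools

import requests
from bs4 import BeautifulSoup

url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events'

def parse_page(writer, page):
    resp = requests.get(url, params={'page': page})
    resp.raise_for_status()
    page_soup = BeautifulSoup(resp.text, 'html.parser')
    cards = page_soup.find_all('div', {'class': 'eds-media-card-content__content'})
    if not cards:
        # no event cards on this page -> assume we ran past the last page
        raise ValueError('no more events')
    for activity in cards:
        name = activity.find('div', {'class': 'eds-event-card__formatted-name--is-clamped'}).text
        details = activity.find_all('div', {'class': 'eds-text-bs--fixed eds-text-color--grey-600 eds-l-mar-top-1'})
        writer.writerow([name, details[0].text, details[1].text, details[2].text])

with open('Data_Events.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['events_name', 'events_dates', 'events_location', 'events_fees'])
    for page in itertools.count(start=1):
        try:
            parse_page(writer, page)
        except Exception:
            # a page with no cards, or a changed layout, ends the loop
            break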

Related

Beautiful Soup webscraper stops after 1st "Scrape"

I am trying to build a scraper.
When I run the code, the for loop only runs once: it stops after a single iteration and gives the same result twice.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = "https://opensea.io/collection/morethangamersnftmtg?search[toggles][0]=BUY_NOW"
headers = {'User-Agent': 'Mozilla/5.0'}
request = Request(url, headers=headers)
page_html = urlopen(request).read()
html_soup = BeautifulSoup(page_html, "html.parser")

nft_items = html_soup.find_all("div", class_="Blockreact__Block-sc-1xf18x6-0 dBFmez AssetsSearchView--assets")

filename = "MTG.csv"
f = open(filename, "w")
headers = "title, price \n"
f.write(headers)

for nft in nft_items:
    title = nft.find("div", class_="Overflowreact__OverflowContainer-sc-10mm0lu-0 kohuDY").text
    price = nft.find("div", class_="Overflowreact__OverflowContainer-sc-10mm0lu-0 gjwKJf Price--amount").text
    f.write(title + "," + price)

#f.close()

How to save my links from BeautifulSoup in a text file with python?

I'm learning Python and web scraping. It is very cool, but I am not able to get what I want.
I'm trying to save product links in a text file so I can scrape their data afterwards.
Here is my script, which works (almost) correctly in the PyCharm console:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
My goal is to save the contents of the links variable, line by line, in a text file.
I tried this, but something is wrong and I don't get each URL:

for link in links:
    with open("urls.txt", "a") as f:
        f.write(links+"\n")

Please, can someone help me?
You can try this way.
Just open the file once and write the complete data to it. Opening and closing files inside a loop is not a good thing to do.
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

with open('text.txt', 'w') as f:
    for i in range(15):
        url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        if response.ok:
            print('Page: ' + str(i))
            for data in soup.find_all('div', class_='price'):
                for a in data.find_all('a'):
                    link = 'https://www.topachat.com/' + a.get('href')
                    f.write(link + '\n')
Sample output from text.txt
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in11020650.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10119254.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20005046.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002036.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002591.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20004309.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002592.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10089390.html
...
Your problem is in the `for link in links` line:

link = (a.get('href'))
links = ('https://www.topachat.com/' + link)
print(links)

for link in links:
    with open("urls.txt", "a") as f:
        f.write(links+"\n")
links is a string, and your for loop iterates over it letter by letter (character by character). That is why you see a single character on each line of your txt file. You can just remove the for loop and the code will work:
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
                with open("urls.txt", "a") as f:
                    f.write(links + "\n")
You can do it like this:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests

suffixeUrl = '_puis_nblignes_est_200.html'
url_list = set()

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = (a.get('href'))
                links = ('https://www.topachat.com/' + link)
                print(links)  # for getting link
                url_list.add(links)

with open("urls.txt", "a") as f:
    for link in url_list:
        f.write(link + "\n")

Web Scraping - ResultSet object has no attribute 'findAll'

I'm having an issue with bs4 when reading the second value in an array within a for loop. The code is below.
When I use the single-URL list (the commented-out `URL_Array = [IEL_IDS]`), I receive no errors. When I swap it for the full array (`URL_Array = [SmartLiving_IDS, IEL_IDS, TD_IDS]`), it errors out when it attempts to gather the second value. Note that the second value in the full array is the same URL as the single-URL list.
import requests
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

SmartLiving_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy="
IEL_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=IEL&selectedFacets=Brand%7CIts%20Exciting%20Lighting&sortBy="
TD_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=two%20dogs&selectedFacets=Brand%7CTwo%20Dogs%20Designs&sortBy="

Headers = "Description, URL, Price \n"
text_file = open("HayneedlePrices.csv", "w")
text_file.write(Headers)
text_file.close()

URL_Array = [SmartLiving_IDS, IEL_IDS, TD_IDS]
#URL_Array = [IEL_IDS]

for URL in URL_Array:
    print("\n" + "Loading New URL:" "\n" + URL + "\n" + "\n")
    uClient = uReq(URL)
    page_html = uClient.read()
    uClient.close()
    soup = soup(page_html, "html.parser")
    Containers = soup.findAll("div", {"product-card__container___1U2Sb"})

    for Container in Containers:
        Title = Container.div.img["alt"]
        Product_URL = Container.a["href"]
        Price_Container = Container.findAll("div", {"class": "product-card__productInfo___30YSc body no-underline txt-black"})[0].findAll("span", {"style": "font-size:20px"})
        Price_Dollars = Price_Container[0].get_text()
        Price_Cents = Price_Container[1].get_text()

        print("\n" + "#" * 197 + "\n")
        # print(" Container: " + "\n" + str(Container))
        # print("\n" + "-" * 197 + "\n")
        print(" Description: " + str(Title))
        print(" Product URL: " + str(Product_URL))
        print(" Price: " + str(Price_Dollars) + str(Price_Cents))
        print("\n" + "#" * 197 + "\n")

        text_file = open("HayneedlePrices.csv", "a")
        text_file.write(str(Title) + ", " + str(Product_URL) + ", " + str(Price_Dollars) + str(Price_Cents) + "\n")
        text_file.close()

    print("Information gathered and Saved from URL Successfully.")
    print("Looking for Next URL..")

print("No Additional URLs to Gather. Process Completed.")
The problem is that you import `BeautifulSoup` as `soup` and also define a variable `soup = soup(page_html, "html.parser")` with the same name! After the first URL, `soup` no longer refers to the BeautifulSoup class but to a parsed document; calling it on the second URL returns a ResultSet (bs4 treats a call as find_all), and the next line's `.findAll` on that ResultSet raises the error in your title.
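A minimal fix, keeping the rest of your script as it is, would be to give the parsed document a different name so it no longer shadows the imported `soup`:

# rename the parsed document so the imported `soup` class is still callable on the next URL
page_soup = soup(page_html, "html.parser")
Containers = page_soup.findAll("div", {"product-card__container___1U2Sb"})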
I refactored your code a bit, let me know if it works as expected!
import csv

import requests
from bs4 import BeautifulSoup

smart_living_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=Smart%20Living&selectedFacets=Brand%7CSmart%20Living&sortBy="
IEL_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=IEL&selectedFacets=Brand%7CIts%20Exciting%20Lighting&sortBy="
TD_IDS = "https://www.hayneedle.com/search/index.cfm?categoryID=&page=1&searchQuery=two%20dogs&selectedFacets=Brand%7CTwo%20Dogs%20Designs&sortBy="

site_URLs = [smart_living_IDS, IEL_IDS, TD_IDS]

sess = requests.Session()
prod_data = []

for curr_URL in site_URLs:
    req = sess.get(url=curr_URL)
    soup = BeautifulSoup(req.content, "lxml")
    containers = soup.find_all("div", {"product-card__container___1U2Sb"})
    for curr_container in containers:
        prod_title = curr_container.div.img["alt"]
        prod_URL = curr_container.a["href"]
        price_container = curr_container.find(
            "div",
            {"class": "product-card__productInfo___30YSc body no-underline txt-black"},
        )
        dollars_elem = price_container.find("span", {"class": "main-price-dollars"})
        cents_elem = dollars_elem.find_next("span")
        prod_price = dollars_elem.get_text() + cents_elem.get_text()
        prod_price = float(prod_price[1:])
        prod_data.append((prod_title, prod_URL, prod_price))

CSV_headers = ("title", "URL", "price")
with open("../out/hayneedle_prices.csv", "w", newline="") as file_out:
    writer = csv.writer(file_out)
    writer.writerow(CSV_headers)
    writer.writerows(prod_data)
I tested it by repeating the current URL list 10 times, and it took longer than I was anticipating. There are certainly improvements to be made; I might rewrite it to use lxml in the next few days, and multiprocessing might also be a good option. It all depends on how you're using this, of course :)
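As a rough illustration of that parallelism idea (not part of the original answer), the three search pages could be fetched concurrently with a thread pool; `parse_listing` below is a hypothetical placeholder for the per-container parsing shown above:

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def fetch(url):
    # download one search page and return its raw HTML
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.content

def parse_listing(html):
    soup = BeautifulSoup(html, "lxml")
    # ... extract (title, URL, price) tuples exactly as in the loop above ...
    return []

site_URLs = [smart_living_IDS, IEL_IDS, TD_IDS]  # the same URLs defined in the answer above

# download the pages in parallel, then parse them sequentially
with ThreadPoolExecutor(max_workers=3) as pool:
    pages = list(pool.map(fetch, site_URLs))

prod_data = []
for html in pages:
    prod_data.extend(parse_listing(html))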

Python Web Scraping with Multiple URLs + merge datas

What I'm trying to do is:
1. Take multiple URLs.
2. Take the h2 text in every URL.
3. Merge the h2 texts and then write them to a CSV.

In this code, I only managed to take one URL and get the h2 text from that URL:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

page_url = "https://example.com/ekonomi/20200108/"
# i am trying to do | urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/', 'https://example.com/ekonomi/20200112/', 'https://example.com/ekonomi/20200111/']

uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()

# finds each product from the store page
containers = page_soup.findAll("div", {"class": "b-plainlist__info"})

out_filename = "output.csv"
headers = "title \n"
f = open(out_filename, "w")
f.write(headers)

container = containers[0]
for container in containers:
    title = container.h2.get_text()
    f.write(title.replace(",", " ") + "\n")

f.close()  # Close the file
Provided your iteration through the containers is correct, this should work.
You want to iterate through the URLs: for each URL, grab the titles and append them to a list. Then build a DataFrame from that list and write it to CSV with pandas:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import pandas as pd

urls = ['https://example.com/ekonomi/20200114/', 'https://example.com/ekonomi/20200113/', 'https://example.com/ekonomi/20200112/', 'https://example.com/ekonomi/20200111/']

titles = []
for page_url in urls:
    uClient = uReq(page_url)
    page_soup = soup(uClient.read(), "html.parser")
    uClient.close()

    # finds each product from the store page
    containers = page_soup.findAll("div", {"class": "b-plainlist__info"})
    for container in containers:
        titles.append(container.h2.get_text())

df = pd.DataFrame(titles, columns=['title'])
df.to_csv("output.csv", index=False)

Scraping multiple pages with beautifulsoup4 using python 3.6.3

I am trying to loop through multiple pages and my code doesn't extract anything. I am kind of new to scraping so bear with me. I made a container so I can target each listing. I also made a variable to target the anchor tag that you would press to go to the next page. I would really appreciate any help I could get. Thanks.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

for page in range(0, 25):
    file = "breakfeast_chicago.csv"
    f = open(file, "w")
    Headers = "Nambusiness_name, business_address, business_city, business_region, business_phone_number\n"
    f.write(Headers)

    my_url = 'https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}'.format(page)
    uClient = uReq(my_url)
    page_html = uClient.read()
    uClient.close()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # grabs each listing
    containers = page_soup.findAll("div", {"class": "result"})
    new = page_soup.findAll("a", {"class": "next ajax-page"})

    for i in new:
        try:
            for container in containers:
                b_name = i.find("container.h2.span.text").get_text()
                b_addr = i.find("container.p.span.text").get_text()
                city_container = container.findAll("span", {"class": "locality"})
                b_city = i.find("city_container[0].text ").get_text()
                region_container = container.findAll("span", {"itemprop": "postalCode"})
                b_reg = i.find("region_container[0].text").get_text()
                phone_container = container.findAll("div", {"itemprop": "telephone"})
                b_phone = i.find("phone_container[0].text").get_text()
                print(b_name, b_addr, b_city, b_reg, b_phone)
                f.write(b_name + "," + b_addr + "," + b_city.replace(",", "|") + "," + b_reg + "," + b_phone + "\n")
        except: AttributeError
    f.close()
If using BS4, try `find_all`.
Try dropping into a trace using `import pdb; pdb.set_trace()` and debug what is actually being selected in the for loop.
Also, some content may be hidden if it is loaded via JavaScript.
Each anchor tag or href you "click" is just another network request, and if you plan to follow the links, consider slowing down the rate of requests so you don't get blocked.
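For example, a simple way to slow the crawl down (a sketch, not from the original answers) is to sleep between page requests:

import time

import requests

base_url = "https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}"

for page in range(1, 26):
    res = requests.get(base_url.format(page))
    # ... parse the listings on this page here ...
    time.sleep(2)  # pause a couple of seconds between requests to avoid getting blocked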
You can try something like the script below. It will traverse the different pages through pagination and collect the name and phone number from each container.
import requests
from bs4 import BeautifulSoup

my_url = "https://www.yellowpages.com/search?search_terms=Stores&geo_location_terms=Chicago%2C%20IL&page={}"

for link in [my_url.format(page) for page in range(1, 5)]:
    res = requests.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".info"):
        try:
            name = item.select(".business-name [itemprop='name']")[0].text
        except Exception:
            name = ""
        try:
            phone = item.select("[itemprop='telephone']")[0].text
        except Exception:
            phone = ""
        print(name, phone)
