Scraping multiple paginated links with BeautifulSoup and Requests - python

Python Beginner here. I'm trying to scrape all products from one category on dabs.com. I've managed to scrape all products on a given page, but I'm having trouble iterating over all the paginated links.
Right now, I've tried to isolate all the pagination buttons with the span class="page-list", but even that isn't working. Ideally, I would like to make the crawler keep clicking next until it has scraped all products on all pages. How can I do this?
I'd really appreciate any input.
from bs4 import BeautifulSoup
import requests

base_url = "http://www.dabs.com"
page_array = []

def get_pages():
    html = requests.get(base_url)
    soup = BeautifulSoup(html.content, "html.parser")
    page_list = soup.findAll('span', class="page-list")
    pages = page_list[0].findAll('a')
    for page in pages:
        page_array.append(page.get('href'))

def scrape_page(page):
    html = requests.get(base_url)
    soup = BeautifulSoup(html.content, "html.parser")
    Product_table = soup.findAll("table")
    Products = Product_table[0].findAll("tr")
    if len(soup.findAll('tr')) > 0:
        Products = Products[1:]
    for row in Products:
        cells = row.find_all('td')
        data = {
            'description' : cells[0].get_text(),
            'price' : cells[1].get_text()
        }
        print data

get_pages()
[scrape_page(base_url + page) for page in page_array]

Their next-page button has a title of "Next", so you could do something like:
import requests
from bs4 import BeautifulSoup as bs

url = 'http://www.dabs.com/category/computing/11001/'
base_url = 'http://www.dabs.com'

r = requests.get(url)
soup = bs(r.text, 'html.parser')

# the next-page button is an anchor whose title attribute is "Next"
elm = soup.find('a', {'title': 'Next'})
next_page_link = base_url + elm['href']
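To walk every page rather than just find the next one, you could keep following that link until it disappears. A minimal sketch, assuming the category URL above and a hypothetical scrape_products helper that does the per-page table scraping from the question:

import requests
from bs4 import BeautifulSoup as bs

base_url = 'http://www.dabs.com'
url = 'http://www.dabs.com/category/computing/11001/'  # starting category page

while url:
    soup = bs(requests.get(url).text, 'html.parser')
    scrape_products(soup)                       # hypothetical helper that reads the product table on this page
    elm = soup.find('a', {'title': 'Next'})     # the next-page button
    url = base_url + elm['href'] if elm else None   # stop when there is no "Next" link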
Hope that helps.

Related

Pulling p tags from multiple URLs

I've struggled with this for days and am not sure what the issue could be. Basically, I'm trying to extract the profile-box data of each link; going through the inspector, I thought I could pull the p tags to do so.
I'm new to this and trying to understand, but here's what I have thus far:
First, code that (somewhat) successfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I also have code that pulls all of the href attributes from multiple links:
from bs4 import BeautifulSoup
import requests

def get_links(url):
    links = []
    website = requests.get(url)
    website_text = website.text
    soup = BeautifulSoup(website_text)
    for link in soup.find_all('a'):
        links.append(link.get('href'))
    for link in links:
        print(link)
    print(len(links))

get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two and get one script that pulls all of the p tags from multiple URLs. I've been trying to do it, and I'm really not sure why this isn't working:
from bs4 import BeautifulSoup
import requests

def get_profile(url):
    profiles = []
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    container = soup.find('div', attrs={'class', 'main-container'})
    for profile in container.find_all('a'):
        profiles.append(profile.get('p'))
    for profile in profiles:
        print(profile)

get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python, but any advice would be greatly appreciated. Ultimately, my end goal is a tool that can scrape this data in a clean way all at once (Player name, Current Team, Born, Birthplace, etc.). Maybe I'm doing it entirely wrong, but any guidance is welcome!
You need to combine your two scripts and make a request for each player. Try the following approach, which searches for <td> tags that have the data-th="Player" attribute:
import requests
from bs4 import BeautifulSoup

def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    for td in soup.find_all('td', {'data-th' : 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")
        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name" : name, "URL" : player_url}
        for p in div_profile_box.find_all("p"):
            try:
                key, value = p.get_text(strip=True).split(':', 1)
                row[key.strip()] = value.strip()
            except:  # not all entries have values
                pass
        data.append(row)
    return data

urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
]

for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    for entry in data:
        print(entry)
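If the end goal is one clean table (Player name, Current Team, Born, Birthplace, etc.), the list of dicts returned by get_links above can be gathered and written out. A minimal sketch using pandas, which is an extra dependency not used in the original answer:

import pandas as pd

all_rows = []
for url in urls:
    all_rows.extend(get_links(url))

df = pd.DataFrame(all_rows)            # one row per player, one column per profile field
df.to_csv("players.csv", index=False)  # the file name here is just an example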

Pulling all yelp reviews via beautifulsoup

I need some help pulling all the reviews for a hotel using Beautiful Soup. This is what I have so far, but I need some inspiration for pulling all of the reviews, either via the API or regular scraping.
import time
import random
from bs4 import BeautifulSoup as bs
import urllib.request

html = urllib.request.urlopen('https://www.yelp.com/biz/shore-cliff-hotel-pismo-beach-2').read().decode('utf-8')
soup = bs(html, 'html.parser')
relevant = soup.find_all('p', class_='comment__09f24__gu0rG css-qgunke')
reviews = []
for div in relevant:
    for html_class in div.find_all('span', class_="raw__09f24__T4Ezm"):
        text = html_class.find('span')
        review = html_class.getText()
        reviews.append(review)
This does the job:
import requests
from bs4 import BeautifulSoup

base_url = "https://www.yelp.com/biz/capri-laguna-laguna-beach"
new_page = "?start={}"

reviews = []
for i in range(0, 501, 10):
    new_page_url = base_url + new_page.format(i)           # ?start=0, ?start=10, ... ?start=500
    new_content = requests.get(new_page_url).content
    new_soup = BeautifulSoup(new_content, "html.parser")
    relevant = new_soup.find_all('p', class_='comment__09f24__gu0rG css-qgunke')
    for div in relevant:
        for html_class in div.find_all('span', class_="raw__09f24__T4Ezm"):
            review = html_class.getText()
            reviews.append(review)
Code explanation:
If you click through to the 2nd page you'll see that ?start=10 gets added to the base URL https://www.yelp.com/biz/capri-laguna-laguna-beach. If you go to the 3rd page you'll see ?start=20, and so on. The number is the index of the first review on that page, and each page holds 10 reviews. There are 51 pages in total, so the first review on the 51st page has index 500, which means the part added to the URL is ?start=500.
So for each page on the website, the code builds a new URL, fetches the HTML content of that URL, creates a soup for it, and pulls the reviews out of that newly created soup.
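Since the question already imports time and random, a politer variant of the same loop sleeps briefly between page requests. A sketch (the 1-3 second pause is just an example value):

import time
import random
import requests
from bs4 import BeautifulSoup

base_url = "https://www.yelp.com/biz/capri-laguna-laguna-beach"
new_page = "?start={}"

for i in range(0, 501, 10):
    new_page_url = base_url + new_page.format(i)
    new_soup = BeautifulSoup(requests.get(new_page_url).content, "html.parser")
    # ... pull the review text out of new_soup exactly as in the snippet above ...
    time.sleep(random.uniform(1, 3))   # pause 1-3 seconds between page requests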

how to scrape multiple pages from one site

I want to scrape multiple pages from one site. The pattern looks like this:
https://www.example.com/S1-3-1.html https://www.example.com/S1-3-2.html https://www.example.com/S1-3-3.html https://www.example.com/S1-3-4.html https://www.example.com/S1-3-5.html
I tried three methods to scrape all of these pages at once, but every method only scrapes the first page. I show the code below; anyone who can check it and tell me what the problem is would be highly appreciated.
===============method 1====================
import requests

for i in range(5): # Number of pages plus one
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)

from bs4 import BeautifulSoup
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
===============method 2=============
import urllib2,sys
from bs4 import BeautifulSoup

for numb in ('1', '5'):
    address = ('https://www.example.com/S1-3-' + numb + '.html')
    html = urllib2.urlopen(address).read()

soup = BeautifulSoup(html,'html.parser')
results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
=============method 3==============
import requests
from bs4 import BeautifulSoup

url = 'https://www.example.com/S1-3-1.html'
for round in range(5):
    res = requests.get(url)
    soup = BeautifulSoup(res.text,'html.parser')
    results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
    paging = soup.select('div.paging a')
    next_url = 'https://www.example.com/' + paging[-1]['href'] # paging[-1]['href'] is the next-page button on the page
    url = next_url
I checked some answers already, but it is not a loop problem; the output I get is only the first page's results. This has been annoying me for several days.
Your indentation is out of order.
Try this (Method 1):
from bs4 import BeautifulSoup
import requests

for i in range(1, 6):  # Number of pages plus one
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
Your page analysis should be inside the loop, like this; otherwise, it will only use one page:
.......
for i in range(5): # Number of pages plus one
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
........
First, you have to put all of the statements inside the loop; otherwise, it will only work with the last iteration.
Second, you could try closing the requests session at the end of each iteration:
import requests
from bs4 import BeautifulSoup

for i in range(5): # Number of pages plus one
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all('div', attrs={'class':'product-item item-template-0 alternative'})
    r.close()
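If you want to keep what every page returns rather than only the last value of results, one option (a sketch reusing the example URLs above) is to collect everything into a single list:

import requests
from bs4 import BeautifulSoup

all_results = []
for i in range(1, 6):  # pages S1-3-1.html .. S1-3-5.html
    url = "https://www.example.com/S1-3-{}.html".format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # add this page's product divs to one running list instead of overwriting results
    all_results.extend(soup.find_all('div', attrs={'class': 'product-item item-template-0 alternative'}))

print(len(all_results))  # total products found across all five pages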

Get links from a site's homepage using python

I want to write a script to get a home page's links to social media (twitter / facebook mostly), and I'm completely stuck since I am fairly new to Python.
The task I want to accomplish is to parse the website, find the social media links, and save it in a new data frame where each column would contain the original URL, the twitter link, and the facebook link. Here's what I have so far of this code for the new york times website:
from bs4 import BeautifulSoup
import requests

url = "http://www.nytimes.com"
r = requests.get(url)
sm_sites = ['twitter.com','facebook.com']

soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)

for site in sm_sites:
    if all(site in sm_sites for link in all_links):
        print(site)
    else:
        print('no link')
I'm having some problems understanding what the loop is doing and how to make it work for what I need. I also tried storing the site instead of doing print(site), but that was not working, so I figured I'd ask for help. Before asking, I went through a bunch of responses here, but none could get me to do what I needed to do.
The way this code works, you already have your links. Your homepage link is the starting URL, so http://www.nytimes.com.
And you have the social media URLs in sm_sites = ['twitter.com','facebook.com']; all you're doing is confirming they exist on the main page. If you want to save the list of confirmed social media URLs, append them to a list.
Here is one way to get the social media links off a page
import requests
from bs4 import BeautifulSoup

url = "https://stackoverflow.com/questions/tagged/python"
r = requests.get(url)
sm_sites = ['twitter.com','facebook.com']
sm_sites_present = []

soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)

for sm_site in sm_sites:
    for link in all_links:
        if sm_site in link.attrs['href']:
            sm_sites_present.append(link.attrs['href'])

print(sm_sites_present)
output:
['https://twitter.com/stackoverflow', 'https://www.facebook.com/officialstackoverflow/']
Update
For a df of urls:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display

urls = [
    "https://stackoverflow.com/questions/tagged/python",
    "https://www.nytimes.com/",
    "https://en.wikipedia.org/"
]

sm_sites = ['twitter.com','facebook.com']
sm_sites_present = []
columns = ['url'] + sm_sites

df = pd.DataFrame(data={'url' : urls}, columns=columns)

def get_sm(row):
    r = requests.get(row['url'])
    output = pd.Series()
    soup = BeautifulSoup(r.content, 'html5lib')
    all_links = soup.find_all('a', href = True)
    for sm_site in sm_sites:
        for link in all_links:
            if sm_site in link.attrs['href']:
                output[sm_site] = link.attrs['href']
    return output

sm_columns = df.apply(get_sm, axis=1)
df.update(sm_columns)
df.fillna(value='no link')
output: a dataframe with one row per url and the twitter.com / facebook.com columns filled in ('no link' where none was found)
This will do what you want with regards to adding it to a DataFrame. You can iterate through a list of websites (urlsToSearch), adding a row to the dataframe for each one containing the base website, all facebook links, and all twitter links.
from bs4 import BeautifulSoup
import requests
import pandas as pd

df = pd.DataFrame(columns=["Website", "Facebook", "Twitter"])

urlsToSearch = ["http://www.nytimes.com","http://www.businessinsider.com/"]

for url in urlsToSearch:
    r = requests.get(url)

    tw_links = []
    fb_links = []

    soup = BeautifulSoup(r.text, 'html.parser')
    all_links = [link['href'] for link in soup.find_all('a', href = True)] # only get href

    for link in all_links:
        if "twitter.com" in link:
            tw_links.append(link)
        elif "facebook.com" in link:
            fb_links.append(link)

    df.loc[df.shape[0]] = [url, fb_links, tw_links] # add row to end of df
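To inspect or keep the result, the dataframe built above can be printed or written out, for example (the file name is just an example):

print(df)
df.to_csv("social_media_links.csv", index=False)  # one row per website, list of links per column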

Get the lists of things to do from tripadvisor

How do I get the 'things to do' list? I am new to web scraping and I don't know how to loop through each page to get the href of every 'thing to do'. Can you tell me where I am going wrong? Any help would be highly appreciated. Thanks in advance.
import requests
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen

offset = 0
url = 'https://www.tripadvisor.com/Attractions-g255057-Activities-oa' + str(offset) + '-Canberra_Australian_Capital_Territory-Hotels.html#ATTRACTION_LIST_CONTENTS'

urls = []
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

for link in soup.find_all('a', {'last'}):
    page_number = link.get('data-page-number')
    last_offset = int(page_number) * 30
    print('last offset:', last_offset)

for offset in range(0, last_offset, 30):
    print('--- page offset:', offset, '---')
    url = 'https://www.tripadvisor.com/Attractions-g255057-oa' + str(offset) + '-Canberra_Australian_Capital_Territory-Hotels.html#ATTRACTION_LIST_CONTENTS'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'property_title'}):
        iurl = 'https://www.tripadvisor.com/Attraction_Review-g255057' + link.get('href')
        print(iurl)
Basically, I want the href of each 'thing to do'. My desired output for 'things to do' is:
https://www.tripadvisor.com/Attraction_Review-g255057-d3377852-Reviews-Weston_Park-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Attraction_Review-g255057-d591972-Reviews-Canberra_Museum_and_Gallery-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Attraction_Review-g255057-d312426-Reviews-Lanyon_Homestead-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Attraction_Review-g255057-d296666-Reviews-Australian_National_University-Canberra_Australian_Capital_Territory.html
In the example below, I used this code to get the href of each restaurant in Canberra city.
My restaurant code, which works perfectly, is:
import requests
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen

with requests.Session() as session:
    for offset in range(0, 1050, 30):
        url = 'https://www.tripadvisor.com/Restaurants-g255057-oa{0}-Canberra_Australian_Capital_Territory.html#EATERY_LIST_CONTENTS'.format(offset)
        soup = BeautifulSoup(session.get(url).content, "html.parser")
        for link in soup.select('a.property_title'):
            iurl = 'https://www.tripadvisor.com/' + link.get('href')
            print(iurl)
The output of the restaurant code is:
https://www.tripadvisor.com/Restaurant_Review-g255057-d1054676-Reviews-Lanterne_Rooms-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Restaurant_Review-g255057-d755055-Reviews-Courgette_Restaurant-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Restaurant_Review-g255057-d6893178-Reviews-Pomegranate-Canberra_Australian_Capital_Territory.html
https://www.tripadvisor.com/Restaurant_Review-g255057-d7262443-Reviews-Les_Bistronomes-Canberra_Australian_Capital_Territory.html
...
OK, it's not that hard; you just have to know which tags to use.
Let me explain with this example:
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.tripadvisor.com/'  ## we need this to join the links later ##
main_page = 'https://www.tripadvisor.com/Attractions-g255057-Activities-oa{}-Canberra_Australian_Capital_Territory-Hotels.html#ATTRACTION_LIST_CONTENTS'
links = []

## get the initial page to find the number of pages ##
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html.parser")

## select the last page from the list of pages ('a', {'class':'pageNum taLnk'}) ##
last_page = max([ int(page.get('data-offset')) for page in soup.find_all('a', {'class':'pageNum taLnk'}) ])

## now iterate over that range (first page, last page, number of links), and extract the links from each page ##
for i in range(0, last_page + 30, 30):
    page = main_page.format(i)
    soup = BeautifulSoup(requests.get(page).text, "html.parser")  ## get the next page and parse it with BeautifulSoup ##
    ## get the hrefs from ('div', {'class':'listing_title'}), and join them with base_url to make the links ##
    links += [ base_url + link.find('a').get('href') for link in soup.find_all('div', {'class':'listing_title'}) ]

for link in links:
    print(link)
That gives us 8 pages and 212 links in total (30 on each page, 2 on the last).
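If you want to keep the collected links rather than just print them, a small follow-up could write them to a file (the file name here is just an example):

with open('things_to_do.txt', 'w') as f:
    f.write('\n'.join(links))   # one attraction URL per line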
I hope this clears things up a bit
