Scrape web untill the "next" page is disabled - python

url = 'https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-Global_Village-Dubai_Emirate_of_Dubai.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
def get_links():
review_links = []
for review_link in soup.find_all('a', {'class':'title'},href=True):
review_link = review_link['href']
review_links.append(review_link)
return review_links
link = 'https://www.tripadvisor.ie'
review_urls = []
for i in get_links():
review_url = link + i
print (review_url)
review_urls.append(review_url)
Here this code to save all the hyperlinks present on this webpage - but I want to scrape all the hyperlinks on the pages till 319. Not able to implement when pagination is disabled

There is a param you can change in the url to loop and get all the reviews.
So I just added a loop and requests all the urls
def get_page(index):
url = "https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-or{}-Global_Village-Dubai_Emirate_of_Dubai.html".format(str(index))
html = requests.get(url)
page = soup(html.text, 'html.parser')
return page
nb_review = 3187
for i in range(0, nb_review, 10):
page = get_page(i)
The full code using your snippet is:
from bs4 import BeautifulSoup as soup
import requests
def get_page(index):
url = "https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-or{}-Global_Village-Dubai_Emirate_of_Dubai.html".format(str(index))
html = requests.get(url)
page = soup(html.text, 'html.parser')
return page
def get_links(page):
review_links = []
for review_link in page.find_all('a', {'class':'title'},href=True):
review_link = review_link['href']
review_links.append(review_link)
return review_links
link = 'https://www.tripadvisor.ie'
review_urls = []
nb_review = 3187
for i in range(0, nb_review, 10):
page = get_page(i)
for i in get_links(page):
review_url = link + i
review_urls.append(review_url)
print(len(review_urls))
OUTPUT:
3187
EDIT:
You can obviously scrape the first page and get the review number to upgrade the code to make it more customizable

Related

My while loop to scrape all pages in the website is not working

Here is the website I am trying to scrape: https://books.toscrape.com/
Below are my functions. The scrape_all_pages() is not working. Is there a better way to get the page number from the website directly so I can use the range function instead?
I did checkout Finding number of pages using Python BeautifulSoup
import requests
from bs4 import BeautifulSoup
def get_soup(url):
"""Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
return soup
def scrape_page(num):
"""Takes a page and returns a list of links to the book that are on the page."""
BASE_URL = 'http://books.toscrape.com/catalogue/'
PAGE_URL = BASE_URL + str('page-')
book_url = []
soup = get_soup(PAGE_URL + str(num)+ '.html')
for x in soup.findAll("article", class_ = "product_pod"):
url = x.div.a.get('href')
link = BASE_URL + url
if x not in book_url:
book_url.append(link)
return book_url
def scrape_all_pages():
"""Scrapes all pages, returning a list of book links."""
page_num = 0
all_urls = []
while True:
url = scrape_page(page_num)
if not url:
break
all_urls += url
page_num += 1
return all_urls
It do not need range() in most cases. Would recommend to change strategy and take a look if there is a link to next page available or not:
if soup.select_one('li.next a[href]'):
nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
else:
nextPage = None
or from python3.8 and later:
nextPage = BASE_URL + a['href'] if(a := soup.select_one('li.next a[href]')) else None
Example
Note Starts from https://books.toscrape.com/catalogue/page-45.html to limit for demo. You could simply change it to get https://books.toscrape.com/ getting all pages scraped
import requests
from bs4 import BeautifulSoup
def get_soup(url):
"""Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
return soup
def scrape_page(url):
"""Takes a page and append link of the books that are on the page to global list"""
BASE_URL = 'http://books.toscrape.com/catalogue/'
soup = get_soup(url)
for x in soup.find_all("article", class_ = "product_pod"):
url = x.div.a.get('href')
link = BASE_URL + url
if x not in book_url:
book_url.append(link)
if soup.select_one('li.next a[href]'):
nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
else:
nextPage = None
return nextPage
def scrape_all_pages(url):
"""Scrapes all pages, returning a list of book links."""
while True:
if url:
print(url)
url = scrape_page(url)
else:
break
return book_url
book_url = []
scrape_all_pages('https://books.toscrape.com/catalogue/page-45.html')
Output
['http://books.toscrape.com/catalogue/annie-on-my-mind_120/index.html',
'http://books.toscrape.com/catalogue/and-then-there-were-none_119/index.html',
'http://books.toscrape.com/catalogue/a-walk-in-the-woods-rediscovering-america-on-the-appalachian-trail_118/index.html',
'http://books.toscrape.com/catalogue/a-visit-from-the-goon-squad_117/index.html',
'http://books.toscrape.com/catalogue/a-storm-of-swords-a-song-of-ice-and-fire-3_116/index.html',
'http://books.toscrape.com/catalogue/a-heartbreaking-work-of-staggering-genius_115/index.html',
'http://books.toscrape.com/catalogue/8-keys-to-mental-health-through-exercise_114/index.html',
'http://books.toscrape.com/catalogue/girlboss_113/index.html',
'http://books.toscrape.com/catalogue/the-suffragettes-little-black-classics-96_112/index.html',
'http://books.toscrape.com/catalogue/the-sense-of-an-ending_111/index.html',
'http://books.toscrape.com/catalogue/the-sandman-vol-2-the-dolls-house-the-sandman-volumes-2_110/index.html',
'http://books.toscrape.com/catalogue/the-course-of-love_109/index.html',
'http://books.toscrape.com/catalogue/sugar-rush-offensive-line-2_108/index.html',
'http://books.toscrape.com/catalogue/saga-volume-2-saga-collected-editions-2_107/index.html',
'http://books.toscrape.com/catalogue/run-spot-run-the-ethics-of-keeping-pets_106/index.html',...]

Beautiful soup doesn't scrape the data from the "next" pages

I am trying to scrape airbnb data using BeautifulSoup and Pandas. I checked a lot of tutorials and found the one I followed. The step in which the soup should scrape the data from the next page is not working, out of 15 pages, it scrapes only the first 2 or 3 pages or sometimes even none (even if the URLs of the pages are correct).
I cannot seem to understand why this happens and how to solve it. Can someone help out?
import requests
import bs4
import pandas as pd
import numpy as np
import csv
import time
url = 'https://www.airbnb.it/s/Italy/homes?checkin=2021-08-01&checkout=2021-08-02'
def get_page(url):
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, "html.parser")
return soup
def get_listings(soup):
result = []
result.extend(soup.find_all("div", {"class": "_8ssblpx"}))
return result
def get_listing_title(listing):
for l in listing:
try:
return str(l.find('div', {'class': '_1tanv1h'}).text)
except:
return None
def get_listing_subtitle(listing):
for l in listing:
try:
return str(l.find('span', {'class': '_1whrsux9'}).text)
except:
return None
def get_listing_info(listing):
for l in listing:
try:
return str(l.find_all('div', {'class': '_3c0zz1'})[0].text.lower())
except:
return None
def find_next_page(page):
base_url = "https://www.airbnb.it"
try:
nextpage = base_url + get_page(url).find_all("div", attrs={"class": "_jro6t0"})[0].find("a", attrs={'class':'_za9j7e'})['href']
except:
nextpage = None
return nextpage
title = []
subtitle = []
info = []
while url is not None:
soup = get_page(url)
listings = get_listings(soup)
for l in listings:
title.append(get_listing_title(l))
subtitle.append(get_listing_subtitle(l))
info.append(get_listing_info(l))
time.sleep(5)
url = find_next_page(soup)
print(url)
airbnb_data = pd.DataFrame(data = {'title': title,
'subtitle': subtitle,
'info': info})
airbnb_data

Script crawls only the first page instead of multiple pages

I am trying to crawl multiple pages of a website. But the program can only crawl the first page.
import requests
from bs4 import BeautifulSoup
import re
import json
import time
def make_soup(url):
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
script = soup.find("script", text=pattern)
jsonData = pattern.search(script.text).group(1)
pattern_number = re.compile(r'\"[0-9]{9,12}\":(\{\"data\":\{\"cachedFilters\":(.*?)\}\}),\"[0-9]{9,11}\"')
jsonData2 = pattern_number.search(jsonData).group(1)
dictData = json.loads(jsonData2)
return dictData
def get_reviews(dictData):
""" Return a list of five dicts with reviews.
"""
all_dictionaries = []
for data in dictData['data']['locations']:
for reviews in data['reviewListPage']['reviews']:
review_dict = {}
review_dict["reviewid"] = reviews['id']
review_dict["reviewurl"] = reviews['absoluteUrl']
review_dict["reviewlang"] = reviews['language']
review_dict["reviewdate"] = reviews['createdDate']
userProfile = reviews['userProfile']
review_dict["author"] = userProfile['displayName']
all_dictionaries.append(review_dict)
return all_dictionaries
def main():
url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
review_list = get_reviews(dictData) # list with five dicts
#print(review_list)
page_number = 5
while page_number <= 260: # number in the URL
next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
review_list2 = get_reviews(dictData)
print(review_list2)
page_number += 5
time.sleep(0.5)
if __name__ == "__main__":
main()
And I'm not sure if I can crawl multiple pages with this URL. On the website there are 54 pages, but in the URL I always have to add the number 5, like this:
Page 1
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS
Page2
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or5-Coronado_Hotel-Zurich.html#REVIEWS
Page3
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or10-Coronado_Hotel-Zurich.html#REVIEWS
I don't know if this is a good idea.
Do you have any suggestions? Thank you in advance!
You assing new url to next_url but you use url to read page.
next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
You have to rename variable
url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)

How do I get hrefs from hrefs?

How do I get hrefs from hrefs using Python in class and method format?
I have tried:
root_url = 'https://www.iea.org'
class IEAData:
def __init__(self):
try:--
except:
def get_links(self, url):
all_links = []
page = requests.get(root_url)
soup = BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href'))
return all_links
#print(all_links)
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
links =iea_obj.get_links(yearLinks)
print(links)
Recommended: links variable must have all month hrefs but not getting, so please tell me how I should do it.
There were a couple of issues with your code. Your get_links() function was not using the url that was passed to it. When looping over the returned links, you were passing yearLinks rather than the url.
The following should get you going:
from bs4 import BeautifulSoup
import requests
root_url = 'https://www.iea.org'
class IEAData:
def get_links(self, url):
all_links = []
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
for li in soup.find_all(class_='omrlist'):
all_links.append(root_url + li.find('a').get('href'))
return all_links
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
for url in yearLinks:
links = iea_obj.get_links(url)
print(url, links)
This would give you output starting:
https://www.iea.org/oilmarketreport/reports/2018/ ['https://www.iea.org/oilmarketreport/reports/2018/0118/', 'https://www.iea.org/oilmarketreport/reports/2018/0218/', 'https://www.iea.org/oilmarketreport/reports/2018/0318/', 'https://www.iea.org/oilmarketreport/reports/2018/0418/', 'https://www.iea.org/oilmarketreport/reports/2018/0518/', 'https://www.iea.org/oilmarketreport/reports/2018/0618/', 'https://www.iea.org/oilmarketreport/reports/2018/0718/', 'https://www.iea.org/oilmarketreport/reports/2018/0818/', 'https://www.iea.org/oilmarketreport/reports/2018/1018/']
https://www.iea.org/oilmarketreport/reports/2017/ ['https://www.iea.org/oilmarketreport/reports/2017/0117/', 'https://www.iea.org/oilmarketreport/reports/2017/0217/', 'https://www.iea.org/oilmarketreport/reports/2017/0317/', 'https://www.iea.org/oilmarketreport/reports/2017/0417/', 'https://www.iea.org/oilmarketreport/reports/2017/0517/', 'https://www.iea.org/oilmarketreport/reports/2017/0617/', 'https://www.iea.org/oilmarketreport/reports/2017/0717/', 'https://www.iea.org/oilmarketreport/reports/2017/0817/', 'https://www.iea.org/oilmarketreport/reports/2017/0917/', 'https://www.iea.org/oilmarketreport/reports/2017/1017/', 'https://www.iea.org/oilmarketreport/reports/2017/1117/', 'https://www.iea.org/oilmarketreport/reports/2017/1217/']
I'm fairly new to programming, and I'm still learning and trying to understand how classes and whatnot all work together. But gave it a shot (that's how we learn, right?)
Not sure if this is what you're looking for as your output. I changed 2 things and was able to put all the links from within the yearLinks into a list. Note that it'll also include the PDF links as well as the months links that I think you wanted. If you don't want those PDF links, and exclusively the months, then just don't include the pdf.
So here's the code I did it with, and maybe you can use that to fit into how you have it structured.
root_url = 'https://www.iea.org'
class IEAData:
def get_links(self, url):
all_links = []
page = requests.get(url)
soup = bs4.BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href'))
return all_links
#print(all_links)
iea_obj = IEAData()
yearLinks = iea_obj.get_links(root_url + '/oilmarketreport/reports/')
reportLinks = []
for url in yearLinks:
links = iea_obj.get_links(url)
# uncomment line below if you do not want the .pdf links
#links = [ x for x in links if ".pdf" not in x ]
reportLinks += links

How to scrape whole website using beautifulsoup

I am trying to get all the unique urls of the website by calling the all_pages function recursively but this function is not giving all the urls of the website.
All I want to do is get all the unique urls of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content
Also, you need to return at some point. Additionally, instead of making unique_urls a list, make it a set and they will always be unique.
Additionally, your method is recursive and python has a max recursion depth, so maybe you should instead do this:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
response = requests.get(base_url)
unique_urls = {base_url}
visited_urls = set()
while len(unique_urls) > len(visited_urls)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
try:
url = link["href"]
except:
continue
absolute_url = base_url + url
unique_urls.add(absolute_url)
unvisited_url = (unique_urls - visited_urls).pop()
visited_urls.add(unvisited_url)
response = requests.get(unvisited_url)
return unique_urls
all_pages(base_url)

Categories