How to extracting content from pagination next button? - python

This is the website I am trying to scrape:
(https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage)
Below is the code that I have tried,but it repetitively return me first page and third page.
from bs4 import BeautifulSoup
from urllib.request import urlopen
def parse():
base_url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
url="https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3"
while True:
html = urlopen(url)
soup = BeautifulSoup(html ,"html.parser")
for link in soup.find_all('div',class_='entry-content'):
try:
shops=soup.find_all('div',class_="col-9")
names=soup.find_all('tr',class_="clickable")
for n, k in zip(names, shops):
name = n.find_all('td')[1].text.replace(' ','')
desc = k.text.replace(' ','')
print(name + "\n")
print(desc)
except AttributeError as e:
print(e)
next_button = soup.find('a', href=True)
if next_button:
url = base_url + next_button['href']
else:
break
parse()

Select your elements more specific, used css selectors here to get the <a> that is child of an element with class="PagedList-skipToNext" :
next_button = soup.select_one('.PagedList-skipToNext a')
Also check the results of your selection, base_url is not needed here:
url = next_button.get('href')
Example
from bs4 import BeautifulSoup
import requests
def parse():
url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
while True:
soup = BeautifulSoup(requests.get(url).text)
print(url) ## to see what you are working on or enter code that should be performed
next_button = soup.select_one('.PagedList-skipToNext a')
if next_button:
url = next_button.get('href')
else:
break
parse()
Output
https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=2
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=4
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=5
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=6
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=7
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=8
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=9
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=10
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=11
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=12
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=13
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=14

Related

My while loop to scrape all pages in the website is not working

Here is the website I am trying to scrape: https://books.toscrape.com/
Below are my functions. The scrape_all_pages() is not working. Is there a better way to get the page number from the website directly so I can use the range function instead?
I did checkout Finding number of pages using Python BeautifulSoup
import requests
from bs4 import BeautifulSoup
def get_soup(url):
"""Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
return soup
def scrape_page(num):
"""Takes a page and returns a list of links to the book that are on the page."""
BASE_URL = 'http://books.toscrape.com/catalogue/'
PAGE_URL = BASE_URL + str('page-')
book_url = []
soup = get_soup(PAGE_URL + str(num)+ '.html')
for x in soup.findAll("article", class_ = "product_pod"):
url = x.div.a.get('href')
link = BASE_URL + url
if x not in book_url:
book_url.append(link)
return book_url
def scrape_all_pages():
"""Scrapes all pages, returning a list of book links."""
page_num = 0
all_urls = []
while True:
url = scrape_page(page_num)
if not url:
break
all_urls += url
page_num += 1
return all_urls
It do not need range() in most cases. Would recommend to change strategy and take a look if there is a link to next page available or not:
if soup.select_one('li.next a[href]'):
nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
else:
nextPage = None
or from python3.8 and later:
nextPage = BASE_URL + a['href'] if(a := soup.select_one('li.next a[href]')) else None
Example
Note Starts from https://books.toscrape.com/catalogue/page-45.html to limit for demo. You could simply change it to get https://books.toscrape.com/ getting all pages scraped
import requests
from bs4 import BeautifulSoup
def get_soup(url):
"""Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
return soup
def scrape_page(url):
"""Takes a page and append link of the books that are on the page to global list"""
BASE_URL = 'http://books.toscrape.com/catalogue/'
soup = get_soup(url)
for x in soup.find_all("article", class_ = "product_pod"):
url = x.div.a.get('href')
link = BASE_URL + url
if x not in book_url:
book_url.append(link)
if soup.select_one('li.next a[href]'):
nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
else:
nextPage = None
return nextPage
def scrape_all_pages(url):
"""Scrapes all pages, returning a list of book links."""
while True:
if url:
print(url)
url = scrape_page(url)
else:
break
return book_url
book_url = []
scrape_all_pages('https://books.toscrape.com/catalogue/page-45.html')
Output
['http://books.toscrape.com/catalogue/annie-on-my-mind_120/index.html',
'http://books.toscrape.com/catalogue/and-then-there-were-none_119/index.html',
'http://books.toscrape.com/catalogue/a-walk-in-the-woods-rediscovering-america-on-the-appalachian-trail_118/index.html',
'http://books.toscrape.com/catalogue/a-visit-from-the-goon-squad_117/index.html',
'http://books.toscrape.com/catalogue/a-storm-of-swords-a-song-of-ice-and-fire-3_116/index.html',
'http://books.toscrape.com/catalogue/a-heartbreaking-work-of-staggering-genius_115/index.html',
'http://books.toscrape.com/catalogue/8-keys-to-mental-health-through-exercise_114/index.html',
'http://books.toscrape.com/catalogue/girlboss_113/index.html',
'http://books.toscrape.com/catalogue/the-suffragettes-little-black-classics-96_112/index.html',
'http://books.toscrape.com/catalogue/the-sense-of-an-ending_111/index.html',
'http://books.toscrape.com/catalogue/the-sandman-vol-2-the-dolls-house-the-sandman-volumes-2_110/index.html',
'http://books.toscrape.com/catalogue/the-course-of-love_109/index.html',
'http://books.toscrape.com/catalogue/sugar-rush-offensive-line-2_108/index.html',
'http://books.toscrape.com/catalogue/saga-volume-2-saga-collected-editions-2_107/index.html',
'http://books.toscrape.com/catalogue/run-spot-run-the-ethics-of-keeping-pets_106/index.html',...]

How to get multiple class to BeautifulSoup?

Trying to get torrent links from skidrowreloaded.
On the post detail page we have a div like this, I tried get this div by id but i think id is dynamic so I tried get this div by class but did not work,
<div id="tabs-105235-0-0" aria-labelledby="ui-id-1" class="ui-tabs-panel ui-widget-content ui-corner-bottom" role="tabpanel" aria-hidden="false">
the following code is returning none
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
err:
AttributeError: 'NoneType' object has no attribute 'find_all'
full code:
import os
from bs4 import BeautifulSoup
import requests
import webbrowser
clear = lambda: os.system('cls')
clear()
r = requests.get('https://www.skidrowreloaded.com/')
source = BeautifulSoup(r.content,"lxml")
source2 = source.find_all("h2")
games = []
for i in source2:
games.append(i.a.get("href"))
lastgame = games[0]
r = requests.get(lastgame)
source = BeautifulSoup(r.content,"lxml")
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
source3 = source2.find_all("a")
k = 0;
for i in source3:
if k == 0: #hide steam link.
k = k + 1
else:
if i.get("href") == "https://www.skidrowreloaded.com": #hide null links
pass
else: #throwing links to the browser
print(i.get("href"))
webbrowser.open(i.get("href"))
k = k + 1
To get all the links try this:
import requests
from bs4 import BeautifulSoup
url = "https://www.skidrowreloaded.com/projection-first-light-goldberg/"
soup = BeautifulSoup(requests.get(url).text, "html.parser").find_all("a", {"target": "_blank"})
skip = 'https://www.skidrowreloaded.com'
print([a['href'] for a in soup if a['href'].startswith('https') and a['href'] != skip])
Output:
['https://store.steampowered.com/app/726490/Projection_First_Light/', 'https://mega.nz/file/geogAATS#-0U0PklF-Q5i5l_SELzYx3klh5FZob9HaD4QKcFH_8M', 'https://uptobox.com/rqnlpcp7yb3v', 'https://1fichier.com/?0syphwpyndpo38af04ky', 'https://yadi.sk/d/KAmlsBmGaI1f2A', 'https://pixeldra.in/u/wmcsjuhv', 'https://dropapk.to/v6r7mjfgxjq6', 'https://gofile.io/?c=FRWL1o', 'https://racaty.net/dkvdyjqvg02e', 'https://bayfiles.com/L0k7Qea2pb', 'https://tusfiles.com/2q00y4huuv15', 'https://megaup.net/2f0pv/Projection.First.Light-GoldBerg.zip', 'https://letsupload.org/88t5', 'https://filesupload.org/0d7771dfef54d055', 'https://dl.bdupload.in/17ykjrifizrb', 'https://clicknupload.co/o0k9dnd3iwoy', 'https://dailyuploads.net/n1jihwjwdmjp', 'https://userscloud.com/nircdd4q1t5w', 'https://rapidgator.net/file/b6b8f5782c7c2bdb534214342b58ef18', 'https://turbobit.net/m308zh1hdpba.html', 'https://hitfile.net/5OhkcqZ', 'https://filerio.in/0wbvn4md4i91', 'https://mirrorace.org/m/1Fiic', 'https://go4up.com/dl/0ee9f4866312b5/Projection.First.Light-GoldBerg.zip', 'https://katfile.com/w74l823vuyw5/Projection.First.Light-GoldBerg.zip.html', 'https://multiup.org/download/3d355ba18d58234c792da7a872ab4998/Projection.First.Light-GoldBerg.zip', 'https://dl1.indishare.in/hs55pkx4ex82']
You can use find_all as noted in the BeautifulSoup documentation
import requests
from bs4 import BeautifulSoup
response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
raw_data = soup.find_all("div", class_="ui-tabs-panel ui-widget-content ui-corner-bottom")
# do something with the data
edit -
looking at the response.text, the div exists, but does not have the class you're looking for, hence it returns empty. You can search by using regex like so
import requests, re
from bs4 import BeautifulSoup
response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
raw_data = soup.find_all("div", id=re.compile("^tabs"))
for ele in raw_data:
a_tag = ele.find("a")
# do something with the a_tag

Using BeautifulSoup to find links related to specific keyword

I have to modify this code so the scraping keeps only the links that contain a specific keyword. In my case I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the method parse_links so it only keeps the links (or 'a' tags), that contain 'Brexit' in them, but it doesn't seem to work.
Where should i place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse
class MultiThreadScraper:
def __init__(self, base_url):
self.base_url = base_url
self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
self.pool = ThreadPoolExecutor(max_workers=20)
self.scraped_pages = set([])
self.to_crawl = Queue(10)
self.to_crawl.put(self.base_url)
def parse_links(self, html):
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a', href=True)
for link in links:
url = link['href']
if url.startswith('/') or url.startswith(self.root_url):
url = urljoin(self.root_url, url)
if url not in self.scraped_pages:
self.to_crawl.put(url)
def scrape_info(self, html):
return
def post_scrape_callback(self, res):
result = res.result()
if result and result.status_code == 200:
self.parse_links(result.text)
self.scrape_info(result.text)
def scrape_page(self, url):
try:
res = requests.get(url, timeout=(3, 30))
return res
except requests.RequestException:
return
def run_scraper(self):
while True:
try:
target_url = self.to_crawl.get(timeout=60)
if target_url not in self.scraped_pages:
print("Scraping URL: {}".format(target_url))
self.scraped_pages.add(target_url)
job = self.pool.submit(self.scrape_page, target_url)
job.add_done_callback(self.post_scrape_callback)
except Empty:
return
except Exception as e:
print(e)
continue
if __name__ == '__main__':
s = MultiThreadScraper("https://elpais.com/")
s.run_scraper()
You need to import re module to get the specific text value.Try the below code.
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return links which contains only Brexit.
You can get text of the element by using method getText() and check, if string actually contain "Brexit":
if "Brexit" in link.getText().split():
url = link["href"]
I added a check in this function. See if that does the rick for you:
def parse_links(self, html):
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a', href=True)
for link in links:
if 'BREXIT' in link.text.upper(): #<------ new if statement
url = link['href']
if url.startswith('/') or url.startswith(self.root_url):
url = urljoin(self.root_url, url)
if url not in self.scraped_pages:
self.to_crawl.put(url)

How to scrape a website using following pagination in bs4?

I have a script that scrapes a specific website, where the number of a page is defined with ?start={}. This site.
This is my script:
from bs4 import BeautifulSoup
from urllib.request import urlopen
def parse():
for i in range(0, 480, 5):
html = urlopen('http://rl.odessa.ua/index.php/ru/poslednie-novosti?start={}'.format(i))
soup = BeautifulSoup(html, 'lxml')
for article in soup.findAll('article', class_ = 'item'):
try:
print('\t' + article.find('h1').find('a').get_text())
print(article.find('p').get_text() + '\n' + '*'*80)
except AttributeError as e:
print(e)
parse()
At the bottom of the page is located div.pagination with a.next. Here's a screenshot.
Is it a bad practise using range() instead of pagination? Anyway, please help me to rewrite the code above using pagination.
Whichever method works for you is fine, but locating the next button would make things easier. It could be done as follows:
from bs4 import BeautifulSoup
from urllib.request import urlopen
def parse():
base_url = 'http://rl.odessa.ua/index.php'
url = 'http://rl.odessa.ua/index.php/ru/poslednie-novosti?start=0'
while True:
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
for article in soup.findAll('article', class_ = 'item'):
try:
print('\t' + article.find('h1').find('a').get_text())
print(article.find('p').get_text() + '\n' + '*'*80)
except AttributeError as e:
print(e)
next_button = soup.find('a', class_='next', href=True)
if next_button:
url = base_url + next_button['href']
else:
break
parse()

Pagination with BeautifulSoup

I am trying to get some data from the following website. https://www.drugbank.ca/drugs
For every drug in the table, I will need to go deeply and have the name and some other specific features like categories, structured indication (please click on drug name to see the features I will use).
I wrote the following code but the issue that I can't make my code handle pagination (as you see there more than 2000 pages!).
import requests
from bs4 import BeautifulSoup
def drug_data():
url = 'https://www.drugbank.ca/drugs/'
r = requests.get(url)
soup = BeautifulSoup(r.text ,"lxml")
for link in soup.select('name-head a'):
href = 'https://www.drugbank.ca/drugs/' + link.get('href')
pages_data(href)
def pages_data(item_url):
r = requests.get(item_url)
soup = BeautifulSoup(r.text, "lxml")
g_data = soup.select('div.content-container')
for item in g_data:
print item.contents[1].text
print item.contents[3].findAll('td')[1].text
try:
print item.contents[5].findAll('td',{'class':'col-md-2 col-sm-4'})
[0].text
except:
pass
print item_url
drug_data()
How can I scrape all of the data and handle pagination properly?
This page uses almost the same url for all pages so you can use for loop to generate them
def drug_data(page_number):
url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
#... rest ...
# --- later ---
for x in range(1, 2001):
drug_data(x)
Or using while and try/except to get more then 2000 pages
def drug_data(page_number):
url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
#... rest ...
# --- later ---
page = 0
while True:
try:
page += 1
drug_data(page)
except Exception as ex:
print(ex)
print("probably last page:", page)
break # exit `while` loop
You can also find url to next page in HTML
<a rel="next" class="page-link" href="/drugs?approved=1&c=name&d=up&page=2">›</a>
so you can use BeautifulSoup to get this link and use it.
It displays current url, finds link to next page (using class="page-link" rel="next") and loads it
import requests
from bs4 import BeautifulSoup
def drug_data():
url = 'https://www.drugbank.ca/drugs/'
while url:
print(url)
r = requests.get(url)
soup = BeautifulSoup(r.text ,"lxml")
#data = soup.select('name-head a')
#for link in data:
# href = 'https://www.drugbank.ca/drugs/' + link.get('href')
# pages_data(href)
# next page url
url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
print(url)
if url:
url = 'https://www.drugbank.ca' + url[0].get('href')
else:
break
drug_data()
BTW: never use except:pass because you can have error which you didn't expect and you will not know why it doesn't work. Better display error
except Exception as ex:
print('Error:', ex)

Categories