I am trying to get some data from the following website: https://www.drugbank.ca/drugs
For every drug in the table, I need to follow the link and collect the name and some other specific features, such as categories and structured indications (please click on a drug name to see the features I will use).
I wrote the following code, but the issue is that I can't make it handle pagination (as you can see, there are more than 2,000 pages!).
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    for link in soup.select('name-head a'):
        href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        pages_data(href)

def pages_data(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup(r.text, "lxml")
    g_data = soup.select('div.content-container')

    for item in g_data:
        print item.contents[1].text
        print item.contents[3].findAll('td')[1].text
        try:
            print item.contents[5].findAll('td', {'class': 'col-md-2 col-sm-4'})[0].text
        except:
            pass
        print item_url

drug_data()
How can I scrape all of the data and handle pagination properly?
This page uses almost the same URL for every page, so you can use a for loop to generate them:
def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    # ... rest ...

# --- later ---

for x in range(1, 2001):
    drug_data(x)
Or use a while loop with try/except in case there are more than 2,000 pages:
def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    # ... rest ...

# --- later ---

page = 0
while True:
    try:
        page += 1
        drug_data(page)
    except Exception as ex:
        print(ex)
        print("probably last page:", page)
        break  # exit `while` loop
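Note that requests.get() usually won't raise an exception just because a page number is past the last one; the site will most likely return a page with an empty table, so the try/except above may never break out of the loop. A sketch of an alternative stop condition, assuming the drug rows on each listing page can be selected with soup.select('strong a') (an assumption about the page markup):

import requests
from bs4 import BeautifulSoup

def drug_count(page_number):
    # assumption: each drug row links to its subpage via <strong><a href="...">
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    return len(soup.select('strong a'))

page = 0
while True:
    page += 1
    if drug_count(page) == 0:   # empty page -> we went past the last one
        print("probably last page:", page)
        break
    # ... scrape the page here ...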
You can also find the URL of the next page in the HTML:
<a rel="next" class="page-link" href="/drugs?approved=1&c=name&d=up&page=2">›</a>
so you can use BeautifulSoup to get this link and follow it.
The code below displays the current URL, finds the link to the next page (using class="page-link" and rel="next"), and loads it:
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)

        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        #data = soup.select('name-head a')
        #for link in data:
        #    href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #    pages_data(href)

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)

        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
BTW: never use except: pass, because you can get an error you didn't expect and you will never know why it doesn't work. Better to display the error:
except Exception as ex:
    print('Error:', ex)
Related
This is the website I am trying to scrape:
https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage
Below is the code that I have tried, but it repeatedly returns only the first page and the third page.
from bs4 import BeautifulSoup
from urllib.request import urlopen

def parse():
    base_url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
    url = "https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3"
    while True:
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all('div', class_='entry-content'):
            try:
                shops = soup.find_all('div', class_="col-9")
                names = soup.find_all('tr', class_="clickable")
                for n, k in zip(names, shops):
                    name = n.find_all('td')[1].text.replace(' ', '')
                    desc = k.text.replace(' ', '')
                    print(name + "\n")
                    print(desc)
            except AttributeError as e:
                print(e)
        next_button = soup.find('a', href=True)
        if next_button:
            url = base_url + next_button['href']
        else:
            break

parse()
Select your elements more specifically; the CSS selector used here gets the <a> that is a child of an element with class="PagedList-skipToNext":
next_button = soup.select_one('.PagedList-skipToNext a')
Also check the results of your selection; base_url is not needed here:
url = next_button.get('href')
Example
from bs4 import BeautifulSoup
import requests

def parse():
    url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
    while True:
        soup = BeautifulSoup(requests.get(url).text)
        print(url)  # to see what you are working on, or put the code that should be performed here
        next_button = soup.select_one('.PagedList-skipToNext a')
        if next_button:
            url = next_button.get('href')
        else:
            break

parse()
Output
https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=2
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=4
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=5
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=6
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=7
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=8
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=9
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=10
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=11
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=12
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=13
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=14
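If you also want to process the shop data while paginating, here is a sketch that plugs the selectors from your original code (tr.clickable and div.col-9; it is an assumption that they still match the current markup) into the same loop:

from bs4 import BeautifulSoup
import requests

def parse():
    url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
    while True:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        # assumption: shop names sit in tr.clickable rows, descriptions in div.col-9
        names = soup.find_all('tr', class_='clickable')
        shops = soup.find_all('div', class_='col-9')
        for n, k in zip(names, shops):
            print(n.find_all('td')[1].get_text(strip=True))  # assumes the name is in the second cell
            print(k.get_text(strip=True))
        next_button = soup.select_one('.PagedList-skipToNext a')
        if next_button:
            url = next_button.get('href')
        else:
            break

parse()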
I have a script that scrapes a specific website, where the page number is defined with ?start={} (the site's URL appears in the script below).
This is my script:
from bs4 import BeautifulSoup
from urllib.request import urlopen

def parse():
    for i in range(0, 480, 5):
        html = urlopen('http://rl.odessa.ua/index.php/ru/poslednie-novosti?start={}'.format(i))
        soup = BeautifulSoup(html, 'lxml')
        for article in soup.findAll('article', class_='item'):
            try:
                print('\t' + article.find('h1').find('a').get_text())
                print(article.find('p').get_text() + '\n' + '*' * 80)
            except AttributeError as e:
                print(e)

parse()
At the bottom of the page there is a div.pagination containing an a.next link.
Is it bad practice to use range() instead of following the pagination? Either way, please help me rewrite the code above to use the pagination links.
Whichever method works for you is fine, but locating the next button would make things easier. It could be done as follows:
from bs4 import BeautifulSoup
from urllib.request import urlopen

def parse():
    base_url = 'http://rl.odessa.ua/index.php'
    url = 'http://rl.odessa.ua/index.php/ru/poslednie-novosti?start=0'
    while True:
        html = urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        for article in soup.findAll('article', class_='item'):
            try:
                print('\t' + article.find('h1').find('a').get_text())
                print(article.find('p').get_text() + '\n' + '*' * 80)
            except AttributeError as e:
                print(e)
        next_button = soup.find('a', class_='next', href=True)
        if next_button:
            url = base_url + next_button['href']
        else:
            break

parse()
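A slightly more defensive variant (just a sketch) resolves the next link with urllib.parse.urljoin instead of concatenating base_url + href, so it keeps working whether the href is relative or absolute:

from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin

def parse():
    url = 'http://rl.odessa.ua/index.php/ru/poslednie-novosti?start=0'
    while True:
        html = urlopen(url)
        soup = BeautifulSoup(html, 'lxml')
        for article in soup.findAll('article', class_='item'):
            try:
                print('\t' + article.find('h1').find('a').get_text())
                print(article.find('p').get_text() + '\n' + '*' * 80)
            except AttributeError as e:
                print(e)
        next_button = soup.find('a', class_='next', href=True)
        if next_button:
            # urljoin resolves the href against the current page,
            # so both relative and absolute links work
            url = urljoin(url, next_button['href'])
        else:
            break

parse()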
I am trying to parse https://www.drugbank.ca/drugs. The idea is to extract all the drug names and some additional information for each drug. As you can see, each page shows a table of drug names, and when we click a drug name we can access that drug's information.
Let's say I will keep the following code to handle the pagination:
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)

        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        #data = soup.select('name-head a')
        #for link in data:
        #    href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #    pages_data(href)

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)

        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
The issue is that on each page, and for each drug in that page's table, I need to capture:
Name
Accession Number
Structured Indications
Generic Prescription Products
I used the classical requests/BeautifulSoup approach but can't go deeper into the drug pages.
Some help, please.
Create a function with requests and BeautifulSoup to get the data from each subpage:
import requests
from bs4 import BeautifulSoup

def get_details(url):
    print('details:', url)

    # get subpage
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    # get data on subpage
    dts = soup.findAll('dt')
    dds = soup.findAll('dd')

    # display details
    for dt, dd in zip(dts, dds):
        print(dt.text)
        print(dd.text)
        print('---')

    print('---------------------------')

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)

        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        # get links to subpages
        links = soup.select('strong a')
        for link in links:
            # execute function to get subpage
            get_details('https://www.drugbank.ca' + link['href'])

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)

        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
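If you want the four fields from the question rather than a dump of every dt/dd pair, one possible extension (a sketch; the exact label strings such as 'Accession Number' and 'Structured Indications' are assumptions about how they appear in the <dt> tags) collects the pairs into a dict per drug and writes the chosen fields to a CSV:

import csv
import requests
from bs4 import BeautifulSoup

# labels of interest - assumed to match the <dt> texts on each drug page
WANTED = ['Name', 'Accession Number', 'Structured Indications', 'Generic Prescription Products']

def get_details(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    # build a label -> value mapping from the dt/dd pairs
    details = {dt.text.strip(): dd.text.strip()
               for dt, dd in zip(soup.findAll('dt'), soup.findAll('dd'))}
    return [details.get(label, '') for label in WANTED]

def drug_data(outfile='drugs.csv'):
    url = 'https://www.drugbank.ca/drugs/'
    with open(outfile, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(WANTED)
        while url:
            r = requests.get(url)
            soup = BeautifulSoup(r.text, "lxml")
            for link in soup.select('strong a'):
                writer.writerow(get_details('https://www.drugbank.ca' + link['href']))
            next_link = soup.find('a', {'class': 'page-link', 'rel': 'next'})
            url = 'https://www.drugbank.ca' + next_link['href'] if next_link else None

drug_data()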
To crawl effectively, you'll want to implement a few measures, such as maintaining a queue of URLs to visit and keeping track of which URLs you have already visited.
Keeping in mind that links can be absolute or relative and that redirects are very likely, you also probably want to construct the URLs dynamically rather than by string concatenation.
Here is a generic (we usually only want to use example.com on SO) crawling workflow...
from urllib.parse import urljoin, urlparse  # python
# from urlparse import urljoin, urlparse    # legacy python2

import requests
from bs4 import BeautifulSoup

def process_page(soup):
    '''data extraction process'''
    pass

def is_external(link, base='example.com'):
    '''determine if the link is external to base'''
    site = urlparse(link).netloc
    return base not in site

def resolve_link(current_location, href):
    '''resolves final location of a link including redirects'''
    req_loc = urljoin(current_location, href)
    response = requests.head(req_loc)
    resolved_location = response.url  # location after redirects
    # if you don't want to visit external links...
    if is_external(resolved_location):
        return None
    return resolved_location

url_queue = ['https://example.com']
visited = set()

while url_queue:
    url = url_queue.pop()  # removes a url from the queue and assigns it to `url`
    response = requests.get(url)
    current_location = response.url  # final location after redirects
    visited.add(url)                 # note that we've visited the given url
    visited.add(current_location)    # and the final location
    soup = BeautifulSoup(response.text, 'lxml')
    process_page(soup)               # scrape the page
    link_tags = soup.find_all('a')   # gather additional links
    for anchor in link_tags:
        href = anchor.get('href')
        link_location = resolve_link(current_location, href)
        if link_location and link_location not in visited:
            url_queue.append(link_location)
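A small design note: url_queue.pop() takes from the end of the list, so the crawl above proceeds depth-first. If you prefer breadth-first order, a collections.deque with popleft() is a near drop-in change; here is a sketch of that variation (in practice you would keep the same is_external/resolve_link filtering as above):

from collections import deque
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

url_queue = deque(['https://example.com'])
visited = set()

while url_queue:
    url = url_queue.popleft()            # FIFO: oldest discovered url first -> breadth-first
    if url in visited:
        continue
    visited.add(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # process_page(soup)                 # same extraction hook as above
    for anchor in soup.find_all('a', href=True):
        link = urljoin(response.url, anchor['href'])
        # you would still resolve redirects and skip external links here
        if link not in visited:
            url_queue.append(link)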
I am trying to get all the unique URLs of a website by calling the all_pages function recursively, but this function is not giving me all the URLs of the website.
All I want to do is get all the unique URLs of the website using BeautifulSoup. My code looks like this:
base_url = "http://www.readings.com.pk/"
unique_urls=[]
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content
Also, you need to return at some point. Instead of making unique_urls a list, make it a set, so the entries are always unique.
Your method is also recursive, and Python has a maximum recursion depth, so maybe you should instead do this:
base_url = "http://www.readings.com.pk/"
def all_pages(base_url):
response = requests.get(base_url)
unique_urls = {base_url}
visited_urls = set()
while len(unique_urls) > len(visited_urls)
soup = BeautifulSoup(response.text, "html.parser")
for link in soup.find_all("a"):
try:
url = link["href"]
except:
continue
absolute_url = base_url + url
unique_urls.add(absolute_url)
unvisited_url = (unique_urls - visited_urls).pop()
visited_urls.add(unvisited_url)
response = requests.get(unvisited_url)
return unique_urls
all_pages(base_url)
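One caveat with absolute_url = base_url + url: it mangles hrefs that are already absolute (you end up with http://www.readings.com.pk/http://...). Here is a sketch of the same loop using urllib.parse.urljoin instead, keeping the "stay on the same site" check from the question:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base_url = "http://www.readings.com.pk/"

def all_pages(base_url):
    unique_urls = {base_url}
    visited_urls = set()
    while unique_urls - visited_urls:
        url = (unique_urls - visited_urls).pop()       # pick any unvisited url
        visited_urls.add(url)
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for link in soup.find_all("a", href=True):
            absolute_url = urljoin(url, link["href"])  # handles relative and absolute hrefs
            if base_url in absolute_url:               # stay on the same site
                unique_urls.add(absolute_url)
    return unique_urls

# urls = all_pages(base_url)   # note: this crawls the whole site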
I am learning to build web crawlers and am currently working on getting all the URLs from a site. I have been playing around and don't have the same code as I did before, but I have been able to get all the links. My issue is the recursion: I need to repeat the same steps over and over, and I think the way the recursion behaves follows from how I wrote the code. My code is below:
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    page = urllib2.urlopen( url ).read()
    urlList = []
    try:
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin('http://bobthemac.com', anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin('http://bobthemac.com', anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])

        length = len(urlList)

        for url in urlList:
            getAllUrl(url)

        return urlList

    except urllib2.HTTPError, e:
        print e

if __name__ == "__main__":
    urls = getAllUrl('http://bobthemac.com')
    for x in urls:
        print x
What I am trying to achieve is to get all the URLs of a site. With the current set-up the program runs until it runs out of memory; all I want is the URLs from one site. Does anyone have any idea how to do this? I think I have the right idea and just need some small changes to the code.
EDIT
For those of you who are interested, below is my working code that gets all the URLs of the site; someone might find it useful. It's not the best code and needs some work, but with a bit of effort it could be quite good.
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    urlList = []
    try:
        page = urllib2.urlopen( url ).read()
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin('http://bobthemac.com', anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin('http://bobthemac.com', anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])

        return urlList

    except urllib2.HTTPError, e:
        urlList.append( e )

if __name__ == "__main__":
    urls = getAllUrl('http://bobthemac.com')
    fullList = []

    for x in urls:
        listUrls = list
        listUrls = getAllUrl(x)
        try:
            for i in listUrls:
                if not i in fullList:
                    fullList.append(i)
        except TypeError, e:
            print 'Woops wrong content passed'

    for i in fullList:
        print i
I think this works:
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def getAllUrl(url):
    try:
        page = urllib2.urlopen( url ).read()
    except:
        return []
    urlList = []
    try:
        soup = BeautifulSoup(page)
        soup.prettify()
        for anchor in soup.findAll('a', href=True):
            if not 'http://' in anchor['href']:
                if urlparse.urljoin(url, anchor['href']) not in urlList:
                    urlList.append(urlparse.urljoin(url, anchor['href']))
            else:
                if anchor['href'] not in urlList:
                    urlList.append(anchor['href'])

        return urlList

    except urllib2.HTTPError, e:
        print e

def listAllUrl(urls):
    for x in urls:
        print x
        urls.remove(x)
        urls_tmp = getAllUrl(x)
        for y in urls_tmp:
            urls.append(y)

if __name__ == "__main__":
    urls = getAllUrl('http://bobthemac.com')
    while len(urls) > 0:
        listAllUrl(urls)
In your function getAllUrl, you call getAllUrl again inside a for loop, which makes it recursive.
Elements are never removed from urlList once they are added, so urlList never becomes empty, and therefore the recursion never terminates.
That's why your program never ends until it runs out of memory.
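A common way around this (just a sketch, in the same Python 2 style as the question) is to drop the recursion and keep a set of visited URLs, so each URL is fetched at most once:

#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def crawl(start_url):
    to_visit = [start_url]
    visited = set()
    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)
        try:
            page = urllib2.urlopen(url).read()
        except urllib2.HTTPError, e:
            print e
            continue
        soup = BeautifulSoup(page)
        for anchor in soup.findAll('a', href=True):
            link = urlparse.urljoin(url, anchor['href'])
            # stay on the same site and skip urls we have already visited
            if 'bobthemac.com' in link and link not in visited:
                to_visit.append(link)
    return visited

if __name__ == "__main__":
    for u in crawl('http://bobthemac.com'):
        print u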