Currently the code finds the URLs for all gyms and writes them to a CSV like so:
https://www.lifetime.life/life-time-locations/al-vestavia-hills.html
https://www.lifetime.life/life-time-locations/az-biltmore.html
What I want it to do: extract the address from each URL, which I am having trouble with. My attempt at the address part is in the 4th and 5th lines from the bottom of "The code" below. The exact error is:
gymrow.append(address_line1[0].text)
IndexError: list index out of range
The code:
import csv
import time
import urllib2
import urlparse
import requests
import BeautifulSoup
initial_url = "https://www.lifetime.life"
request = urllib2.Request("https://www.lifetime.life/view-all-locations.html")
response = urllib2.urlopen(request)
soup = BeautifulSoup.BeautifulSoup(response)
with open('gyms2.csv', 'w') as gf:
gymwriter = csv.writer(gf)
for a in soup.findAll('a'):
if '/life-time-locations/' in a['href']:
gymurl1 = (urlparse.urljoin(initial_url, a.get('href')))
sitemap_content = requests.get(gymurl1).content
gymrow = [gymurl1]
address_line1 = soup.select('p[class~=small m-b-sm p-t-1] > span[class~=btn-icon-text]')
gymrow.append(address_line1[0].text)
print(gymrow)
gymwriter.writerow(gymrow)
time.sleep(3)
Image of the inspected element: the p class, span class, and the address I want to scrape
Thank you very much!
You fetch the HTML from each subpage, but you don't convert it to a soup object, so you keep searching the main page:
response = requests.get(gymurl)
sub_soup = BeautifulSoup(response.text)
I also had a problem with the CSS selector:
address_line = sub_soup.select('p.small.m-b-sm.p-t-1 span.btn-icon-text')
Some pages don't have elements in this place, which raises an error, so I use try/except to catch it.
Tested on Python 3, because .select() didn't work for me on Python 2.
import requests
from bs4 import BeautifulSoup
import urllib.parse
import csv
import time
initial_url = "https://www.lifetime.life"
response = requests.get("https://www.lifetime.life/view-all-locations.html")
soup = BeautifulSoup(response.text)
with open('gyms2.csv', 'w') as gf:
gymwriter = csv.writer(gf)
for a in soup.findAll('a'):
if '/life-time-locations/' in a['href']:
gymurl = urllib.parse.urljoin(initial_url, a.get('href'))
print(gymurl)
response = requests.get(gymurl)
sub_soup = BeautifulSoup(response.text)
try:
address_line = sub_soup.select('p.small.m-b-sm.p-t-1 span.btn-icon-text')
gymrow = [gymurl, address_line[0].text.strip()]
print(gymrow)
gymwriter.writerow(gymrow)
time.sleep(3)
except Exception as ex:
print(ex)
EDIT: a Python 2 version using find() instead of select():
import requests
import BeautifulSoup
import csv
import urllib2
import time
initial_url = "https://www.lifetime.life"
response = requests.get("https://www.lifetime.life/view-all-locations.html")
soup = BeautifulSoup.BeautifulSoup(response.text)
with open('gyms2.csv', 'w') as gf:
gymwriter = csv.writer(gf)
for a in soup.findAll('a'):
if '/life-time-locations/' in a['href']:
gymurl = urllib2.urlparse.urljoin(initial_url, a.get('href'))
print(gymurl)
response = requests.get(gymurl)
sub_soup = BeautifulSoup.BeautifulSoup(response.text)
try:
address_line = sub_soup.find('p', {'class': 'small m-b-sm p-t-1'}).find('span', {'class': 'btn-icon-text'})
gymrow = [gymurl, address_line.text]
print(gymrow)
gymwriter.writerow(gymrow)
time.sleep(3)
except Exception as ex:
print(ex)
EDIT: It seems there are several versions of the pages, and each version may need its own try/except. Instead of nesting the second try/except inside the first except block, I use continue to skip the remaining try/except blocks once one of them succeeds.
import requests
from bs4 import BeautifulSoup
import urllib.parse
import csv
import time
initial_url = "https://www.lifetime.life"
response = requests.get("https://www.lifetime.life/view-all-locations.html")
soup = BeautifulSoup(response.text)
with open('gyms2.csv', 'w') as gf:
gymwriter = csv.writer(gf)
for a in soup.findAll('a'):
if '/life-time-locations/' in a['href']:
gymurl = urllib.parse.urljoin(initial_url, a.get('href'))
print(gymurl)
response = requests.get(gymurl)
sub_soup = BeautifulSoup(response.text)
try:
address_line = sub_soup.select('p.small.m-b-sm.p-t-1 span.btn-icon-text')
gymrow = [gymurl, address_line[0].text.strip()]
print('type 1:', gymrow)
gymwriter.writerow(gymrow)
time.sleep(3)
continue # go back to `for`
except Exception as ex:
print('ex:', ex)
try:
address_line = sub_soup.find('div', {'class': 'btn-resp-md'}).find('p')
gymrow = [gymurl, address_line.text.strip()]
print('type 2:', gymrow)
gymwriter.writerow(gymrow)
time.sleep(3)
continue # go back to `for`
except Exception as ex:
print('ex:', ex)
try:
address_line = sub_soup.find('p', {'class': 'm-b-grid'})
gymrow = [gymurl, address_line.text.strip()]
print('type 3:', gymrow)
gymwriter.writerow(gymrow)
time.sleep(3)
continue # go back to `for`
except Exception as ex:
print('ex:', ex)
Related
This is the website I am trying to scrape:
(https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage)
Below is the code I have tried, but it repeatedly returns only the first and third pages.
from bs4 import BeautifulSoup
from urllib.request import urlopen
def parse():
base_url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
url="https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3"
while True:
html = urlopen(url)
soup = BeautifulSoup(html ,"html.parser")
for link in soup.find_all('div',class_='entry-content'):
try:
shops=soup.find_all('div',class_="col-9")
names=soup.find_all('tr',class_="clickable")
for n, k in zip(names, shops):
name = n.find_all('td')[1].text.replace(' ','')
desc = k.text.replace(' ','')
print(name + "\n")
print(desc)
except AttributeError as e:
print(e)
next_button = soup.find('a', href=True)
if next_button:
url = base_url + next_button['href']
else:
break
parse()
Select your elements more specifically. The CSS selector below gets the <a> that is a child of an element with class="PagedList-skipToNext":
next_button = soup.select_one('.PagedList-skipToNext a')
Also check the result of your selection; base_url is not needed here:
url = next_button.get('href')
Example
from bs4 import BeautifulSoup
import requests
def parse():
url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
while True:
soup = BeautifulSoup(requests.get(url).text)
        print(url)  # show the page you are working on; add the per-page scraping code here
next_button = soup.select_one('.PagedList-skipToNext a')
if next_button:
url = next_button.get('href')
else:
break
parse()
Output
https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=2
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=4
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=5
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=6
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=7
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=8
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=9
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=10
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=11
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=12
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=13
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=14
I want to scrape the company info for all the companies from the URL given below and view their job details.
URL : http://desiopt.com/search-results-jobs/
from selenium import webdriver
import bs4
import pandas as pd
from bs4 import BeautifulSoup
import re
driver = webdriver.Chrome(executable_path=r"C:/Users/Chandra Sekhar/Desktop/chrome-driver/chromedriver.exe")
titles=[]
driver.get("http://desiopt.com/search-results-jobs/")
content = driver.page_source
soup = BeautifulSoup(content)
for a in soup.findAll('div',attrs={'class':'listing-links'}):
info=a.find('div', attrs={'class':'userInfo'})
print(info.text)
titles.append(info.text)
df = pd.DataFrame({'Company info':titles})
df['Price'] = df['Price'].map(lambda x: re.sub(r'\W+', '', x))
df.to_csv('products1.csv', index=False)
Using the following url:
https://desiopt.com/search-results-jobs/?action=search&page=&listings_per_page=&view=list
There are two parameters you will edit: page= and listings_per_page=.
Currently the website has 37091 jobs.
From my testing, listings_per_page is limited to 1000 per page.
Example: https://desiopt.com/search-results-jobs/?action=search&page=1&listings_per_page=1000&view=list
So you will need to loop from page=1 to page=38 with listings_per_page=1000, which gives 1000 results per page * 38 pages = 38000.
After that:
You can collect all the links into a list, checking for duplicates before appending if you care about keeping the order. Otherwise just add them to a set, which rejects duplicates but does not preserve order; a short sketch of that variant follows. Then you parse each URL in the list or set to collect the information.
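For illustration, a minimal sketch of the set variant, assuming the same span.captions-field links and the 38 pages of 1000 listings described above (untested; the full code below uses the order-preserving list approach with 371 pages of 100 items instead):
import requests
from bs4 import BeautifulSoup

# Sketch: collect the links into a set, which drops duplicates automatically
# but does not preserve insertion order.
links = set()
for page in range(1, 39):  # page=1 .. page=38, listings_per_page=1000 as described above
    url = (f"https://desiopt.com/search-results-jobs/"
           f"?action=search&page={page}&listings_per_page=1000&view=list")
    r = requests.get(url)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        for span in soup.findAll('span', attrs={'class': 'captions-field'}):
            for a in span.findAll('a'):
                links.add(a.get('href'))  # set.add() silently ignores duplicates
print(len(links))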
By the way, I loop over 371 pages with 100 items each, so I get 37100 URLs (or fewer if the last page has fewer than 100), remove the duplicates, and then parse:
import requests
from bs4 import BeautifulSoup
import csv
links = []
try:
for item in range(1, 372):
print(f"Extraction Page# {item}")
r = requests.get(
f"https://desiopt.com/search-results-jobs/?action=search&page={item}&listings_per_page=100&view=list")
if r.status_code == 200:
soup = BeautifulSoup(r.text, 'html.parser')
for item in soup.findAll('span', attrs={'class': 'captions-field'}):
for a in item.findAll('a'):
a = a.get('href')
if a not in links:
links.append(a)
except KeyboardInterrupt:
print("Good Bye!")
exit()
data = []
try:
for link in links:
r = requests.get(link)
if r.status_code == 200:
soup = BeautifulSoup(r.text, 'html.parser')
for item in soup.findAll('div', attrs={'class': 'compProfileInfo'}):
a = [a.text.strip() for a in item.findAll('span')]
if a[6] == '':
a[6] = 'N/A'
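                # a[0:7:2] keeps spans 0, 2, 4 and 6, which are written to the CSV below as Name, Phone, Email, Website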
data.append(a[0:7:2])
except KeyboardInterrupt:
print("Good Bye!")
exit()
while True:
try:
with open('output.csv', 'w+', newline='') as file:
writer = csv.writer(file)
writer.writerow(['Name', 'Phone', 'Email', 'Website'])
writer.writerows(data)
print("Operation Completed")
except PermissionError:
print("Please Close The File")
continue
except KeyboardInterrupt:
print("Good Bye")
exit()
break
The output is 1885 rows, because I let the script remove duplicated company links before parsing.
I have a script that scrapes a specific website (the one in the code below), where the page number is set with ?start={}.
This is my script:
from bs4 import BeautifulSoup
from urllib.request import urlopen
def parse():
for i in range(0, 480, 5):
html = urlopen('http://rl.odessa.ua/index.php/ru/poslednie-novosti?start={}'.format(i))
soup = BeautifulSoup(html, 'lxml')
for article in soup.findAll('article', class_ = 'item'):
try:
print('\t' + article.find('h1').find('a').get_text())
print(article.find('p').get_text() + '\n' + '*'*80)
except AttributeError as e:
print(e)
parse()
At the bottom of the page there is a div.pagination containing an a.next link (see the screenshot).
Is it bad practice to use range() instead of following the pagination? Either way, please help me rewrite the code above to use the pagination.
Whichever method works for you is fine, but locating the next button would make things easier. It could be done as follows:
from bs4 import BeautifulSoup
from urllib.request import urlopen
def parse():
base_url = 'http://rl.odessa.ua/index.php'
url = 'http://rl.odessa.ua/index.php/ru/poslednie-novosti?start=0'
while True:
html = urlopen(url)
soup = BeautifulSoup(html, 'lxml')
for article in soup.findAll('article', class_ = 'item'):
try:
print('\t' + article.find('h1').find('a').get_text())
print(article.find('p').get_text() + '\n' + '*'*80)
except AttributeError as e:
print(e)
next_button = soup.find('a', class_='next', href=True)
if next_button:
url = base_url + next_button['href']
else:
break
parse()
I am trying to get some data from the following website. https://www.drugbank.ca/drugs
For every drug in the table, I need to go one level deeper and get the name and some other specific features, such as categories and structured indication (please click on a drug name to see the features I will use).
I wrote the following code, but the issue is that I can't make it handle pagination (as you can see, there are more than 2000 pages!).
import requests
from bs4 import BeautifulSoup
def drug_data():
url = 'https://www.drugbank.ca/drugs/'
r = requests.get(url)
soup = BeautifulSoup(r.text ,"lxml")
for link in soup.select('name-head a'):
href = 'https://www.drugbank.ca/drugs/' + link.get('href')
pages_data(href)
def pages_data(item_url):
r = requests.get(item_url)
soup = BeautifulSoup(r.text, "lxml")
g_data = soup.select('div.content-container')
for item in g_data:
print item.contents[1].text
print item.contents[3].findAll('td')[1].text
try:
            print item.contents[5].findAll('td', {'class': 'col-md-2 col-sm-4'})[0].text
except:
pass
print item_url
drug_data()
How can I scrape all of the data and handle pagination properly?
This page uses almost the same URL for all pages, so you can use a for loop to generate them:
def drug_data(page_number):
url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
#... rest ...
# --- later ---
for x in range(1, 2001):
drug_data(x)
Or use while with try/except in case there are more than 2000 pages:
def drug_data(page_number):
url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
#... rest ...
# --- later ---
page = 0
while True:
try:
page += 1
drug_data(page)
except Exception as ex:
print(ex)
print("probably last page:", page)
break # exit `while` loop
You can also find the URL of the next page in the HTML:
<a rel="next" class="page-link" href="/drugs?approved=1&c=name&d=up&page=2">›</a>
so you can use BeautifulSoup to get this link and use it.
The code below displays the current URL, finds the link to the next page (using class="page-link" rel="next"), and loads it:
import requests
from bs4 import BeautifulSoup
def drug_data():
url = 'https://www.drugbank.ca/drugs/'
while url:
print(url)
r = requests.get(url)
soup = BeautifulSoup(r.text ,"lxml")
#data = soup.select('name-head a')
#for link in data:
# href = 'https://www.drugbank.ca/drugs/' + link.get('href')
# pages_data(href)
# next page url
url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
print(url)
if url:
url = 'https://www.drugbank.ca' + url[0].get('href')
else:
break
drug_data()
BTW: never use except: pass, because you can get an error you didn't expect and you will never know why the code doesn't work. It is better to display the error:
except Exception as ex:
print('Error:', ex)
I am learning to build web crawlers and am currently working on getting all the URLs from a site. I have been playing around and no longer have the same code as before, but I am able to get all the links. My issue is the recursion: I need to repeat the same steps over and over, and I think the recursion is doing exactly what the code I have written tells it to, which is the problem. My code is below.
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup
def getAllUrl(url):
page = urllib2.urlopen( url ).read()
urlList = []
try:
soup = BeautifulSoup(page)
soup.prettify()
for anchor in soup.findAll('a', href=True):
if not 'http://' in anchor['href']:
if urlparse.urljoin('http://bobthemac.com', anchor['href']) not in urlList:
urlList.append(urlparse.urljoin('http://bobthemac.com', anchor['href']))
else:
if anchor['href'] not in urlList:
urlList.append(anchor['href'])
length = len(urlList)
for url in urlList:
getAllUrl(url)
return urlList
except urllib2.HTTPError, e:
print e
if __name__ == "__main__":
urls = getAllUrl('http://bobthemac.com')
for x in urls:
print x
What I am trying to achieve is to get all the URLs for a site. With the current set-up the program runs until it runs out of memory, and all I want is the URLs. Does anyone have any idea how to do this? I think I have the right idea and just need some small changes to the code.
EDIT
For those of you who are interested, below is my working code that gets all the URLs for the site; someone might find it useful. It's not the best code and needs some work, but with a little effort it could be quite good.
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup
def getAllUrl(url):
urlList = []
try:
page = urllib2.urlopen( url ).read()
soup = BeautifulSoup(page)
soup.prettify()
for anchor in soup.findAll('a', href=True):
if not 'http://' in anchor['href']:
if urlparse.urljoin('http://bobthemac.com', anchor['href']) not in urlList:
urlList.append(urlparse.urljoin('http://bobthemac.com', anchor['href']))
else:
if anchor['href'] not in urlList:
urlList.append(anchor['href'])
return urlList
except urllib2.HTTPError, e:
urlList.append( e )
if __name__ == "__main__":
urls = getAllUrl('http://bobthemac.com')
fullList = []
for x in urls:
listUrls = list
listUrls = getAllUrl(x)
try:
for i in listUrls:
if not i in fullList:
fullList.append(i)
except TypeError, e:
print 'Woops wrong content passed'
for i in fullList:
print i
I think this works:
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup
def getAllUrl(url):
try:
page = urllib2.urlopen( url ).read()
except:
return []
urlList = []
try:
soup = BeautifulSoup(page)
soup.prettify()
for anchor in soup.findAll('a', href=True):
if not 'http://' in anchor['href']:
if urlparse.urljoin(url, anchor['href']) not in urlList:
urlList.append(urlparse.urljoin(url, anchor['href']))
else:
if anchor['href'] not in urlList:
urlList.append(anchor['href'])
length = len(urlList)
return urlList
except urllib2.HTTPError, e:
print e
def listAllUrl(urls):
for x in urls:
print x
urls.remove(x)
urls_tmp = getAllUrl(x)
for y in urls_tmp:
urls.append(y)
if __name__ == "__main__":
urls = ['http://bobthemac.com']
    while len(urls) > 0:
        listAllUrl(urls)
In your function getAllUrl, you call getAllUrl again inside a for loop, which makes the calls recursive.
Elements are never removed from urlList once they are added, so urlList never becomes empty, and the recursion never terminates.
That's why your program never finishes until it runs out of memory.
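For reference, here is a minimal sketch of an iterative crawl that keeps a visited set, so each URL is fetched at most once and the loop terminates. It assumes Python 2 with urllib2/BeautifulSoup as in the question, only follows links on http://bobthemac.com, and uses a hypothetical crawlAllUrl() helper:
#!/usr/bin/python
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

def crawlAllUrl(start_url):
    # Iterative crawl: 'visited' remembers every URL already fetched,
    # 'to_visit' holds URLs still waiting to be fetched.
    visited = set()
    to_visit = [start_url]
    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)
        try:
            page = urllib2.urlopen(url).read()
        except urllib2.URLError, e:
            print e
            continue
        soup = BeautifulSoup(page)
        for anchor in soup.findAll('a', href=True):
            full_url = urlparse.urljoin(start_url, anchor['href'])
            # stay on the same site, as in the original code
            if full_url.startswith(start_url) and full_url not in visited:
                to_visit.append(full_url)
    return visited

if __name__ == "__main__":
    for url in crawlAllUrl('http://bobthemac.com'):
        print url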