Webscraping when some attributes aren't available for all pages - python

I am trying to scrape a website called knowyourcity.info, which has many settlements with information about each one. This is my current loop:
for u in urllist:
    response = get(u)
    html_soup = BeautifulSoup(response.text, "html.parser")

    headers_containers = html_soup.find('div', class_ = 'settlement-base-status section text-center')
    names = headers_containers.h2.text
    name.append(names)

    year_established = headers_containers.h3.text
    year.append(year_established)

    headers1_containers = html_soup.find('div', class_ = 'col-xs-12 text-center')
    countries = headers1_containers.h4.a.text
    country.append(countries)

    headers2_containers = html_soup.find('div', class_ = 'bold-it', id = "population")
    populations = headers2_containers.text
    population.append(populations)

    headers3_containers = html_soup.find('div', class_ ='bold-it', id='sharedTaps')
    tap = headers3_containers.text
    taps.append(tap)

    headers4_containers = html_soup.find_all('div', class_ = 'bold-it')
    toiletSeat_toPerson = headers4_containers[7].text
    toiletsToPerson.append(toiletSeat_toPerson)
However, for some settlements some attributes are not available. How do I add to this loop an "if true" statement?

If you want to skip a loop cycle based on a condition, you can use the continue keyword:

for url in urllist:
    if condition:
        continue

This will end the current loop cycle if condition is True and move on to the next url in urllist.
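Applied to your loop, a minimal sketch (assuming urllist and the result lists are defined as in your question) could skip a page entirely when the main container is missing and fall back to an empty string for any optional field:

from requests import get
from bs4 import BeautifulSoup

name, year = [], []   # the other result lists would be handled the same way

for u in urllist:   # urllist assumed to be defined as in the question
    response = get(u)
    html_soup = BeautifulSoup(response.text, "html.parser")

    headers_containers = html_soup.find('div', class_='settlement-base-status section text-center')
    if headers_containers is None:
        continue   # this settlement has no header block at all, skip to the next url

    # guard each optional element and fall back to an empty string
    name.append(headers_containers.h2.text if headers_containers.h2 else '')
    year.append(headers_containers.h3.text if headers_containers.h3 else '')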

Related

Extracting URL From Span Element Without href

I am attempting to extract links from a website that does not use href attributes. I have tried multiple iterations of trying to find the tag associated with the URL, which from what I can gather sits between <span> elements.
import requests
from bs4 import BeautifulSoup

url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
f = open("test12.csv", "w")
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_ = 'listing-item-inner')

for list in lists:
    title = list.find('span', class_ = '$0')
    webs = list.find('#text', class_ = 'fa-fa.link')
    address = list.find('ul', class_ = 'post-meta')
    temp = list.find('span', class_ = 'text')
    temp2 = list.find('i', class_ = '(text)')
    info = [title, webs, address, temp, temp2]
    f.write(str(info))
    f.write("\n")
    print(info)
The desired output is to extract the data from the <span></span> elements (for example the address 345 40th Ave N), plus the URL that follows i class='fa fa-link' and the phone number that follows i class='fa fa-phone', with the three elements placed into a CSV file.
You could call the next element with e.find(class_ = 'fa-link').next after selecting the <i> with class fa-link:
for e in lists:
    print(e.find(class_ = 'fa-link').next.strip() if e.find(class_ = 'fa-link') else '')
Note: Do not shadow built-in names like list, and always check whether the element you are searching for is available.
Example
import requests
from bs4 import BeautifulSoup

url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

with open('somefile.csv', 'a', encoding='utf-8') as f:
    for e in soup.find_all('div', class_ = 'listing-item-inner'):
        title = e.h3.text
        webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
        address = e.span.text
        phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
        f.write(','.join([title, webs, address, phone])+'\n')
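One design note: joining the fields with ',' by hand will produce a broken row whenever a title or address itself contains a comma. A small variant using Python's csv module (same URL, selectors and file name as above) quotes such fields automatically:

import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

with open('somefile.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)   # handles quoting of commas inside fields
    for e in soup.find_all('div', class_='listing-item-inner'):
        title = e.h3.text if e.h3 else ''
        webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
        address = e.span.text if e.span else ''
        phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
        writer.writerow([title, webs, address, phone])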

Dynamic Web Scraping with Selenium

I was trying to scrape data from Amazon using Selenium and Beautiful Soup.
I have scraped and obtained data from the first page, defined a function for it, and managed to get the second page opened with the click() method.
The soup objects used on the first page are similar to the objects on the second page. I am planning to scrape data up to page 6.
I was wondering if I could apply the function defined for the first page to the next 5 pages and append the data, which can later be exported as CSV.
Any suggestions regarding this would be appreciated.
def data_collection():
    title = soup.find_all(name = "span", class_ = "a-size-base-plus a-color-base a-text-normal")
    all_specs = [specs.getText() for specs in title]
    brands = [items.split(' ', 1)[0] for items in all_specs]   #Brand
    phones = [text.split(')')[0].split('(') for text in all_specs]
    spec = []
    for i in phones:
        for j in i:
            spec.append(j)
    model = spec[::2]              #Model
    specifications = spec[1::2]    #Specs
    s_price_obj = soup.find_all(name = "span", class_ = "a-price-whole")
    selling_price = [price.getText() for price in s_price_obj]   #Price
    review_obj = soup.find_all(name = "span", class_ = "a-icon-alt")
    review = [ratings.getText() for ratings in review_obj]
    review = review[:24]           #Ratings
    quantity_obj = soup.find_all(name = "span", class_ = "a-size-base")
    quantity_sold = [items.getText() for items in quantity_obj]
    quantity_sold = quantity_sold[:24]   #Quantity Sold
    page_number = ['1']*24         #Page Number
    Date = date.today()
    Date = [str(Date)]*24          #Date
    data = [brands, model, specifications, selling_price, review,
            quantity_sold, page_number, Date]
    return data
The above is the function I defined. Open to suggestions.
You can try the following:
Redefine your data_collection method to accept the page source parsed by BeautifulSoup:
from datetime import date   # needed for date.today()

def data_collection(soup):
    title = soup.find_all(name = "span", class_ = "a-size-base-plus a-color-base a-text-normal")
    all_specs = [specs.getText() for specs in title]
    brands = [items.split(' ', 1)[0] for items in all_specs]   #Brand
    phones = [text.split(')')[0].split('(') for text in all_specs]
    spec = []
    for i in phones:
        for j in i:
            spec.append(j)
    model = spec[::2]              #Model
    specifications = spec[1::2]    #Specs
    s_price_obj = soup.find_all(name = "span", class_ = "a-price-whole")
    selling_price = [price.getText() for price in s_price_obj]   #Price
    review_obj = soup.find_all(name = "span", class_ = "a-icon-alt")
    review = [ratings.getText() for ratings in review_obj]
    review = review[:24]           #Ratings
    quantity_obj = soup.find_all(name = "span", class_ = "a-size-base")
    quantity_sold = [items.getText() for items in quantity_obj]
    quantity_sold = quantity_sold[:24]   #Quantity Sold
    page_number = ['1']*24         #Page Number
    Date = date.today()
    Date = [str(Date)]*24          #Date
    data = [brands, model, specifications, selling_price, review,
            quantity_sold, page_number, Date]
    return data
Then loop through each page, get the page source, parse it with BeautifulSoup, and pass it to the data_collection function. Example:
#from page(1..6)
for i in range(1, 7):
    #change page=i in the url to iterate through the pages
    url = f'https://www.amazon.in/s?k=mobile+phones&page={i}&qid=1632394501&ref=sr_pg_2'
    driver.get(url)
    #get current page source
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    #call data_collection function
    data = data_collection(soup)
    #code to append data to csv
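For that last comment, one possible sketch of the CSV step (assuming pandas is available, that driver and data_collection are defined as above, and that the eight lists returned per page have equal length; the column names below are just placeholders) is to zip the parallel lists into per-product rows and write a single file at the end:

import pandas as pd
from bs4 import BeautifulSoup

columns = ['brand', 'model', 'specifications', 'price',
           'rating', 'quantity_sold', 'page', 'date']   # hypothetical column names

all_rows = []
for i in range(1, 7):
    driver.get(f'https://www.amazon.in/s?k=mobile+phones&page={i}&qid=1632394501&ref=sr_pg_2')
    soup = BeautifulSoup(driver.page_source, 'lxml')
    data = data_collection(soup)
    all_rows.extend(zip(*data))   # zip the eight parallel lists into per-product rows

pd.DataFrame(all_rows, columns=columns).to_csv('amazon_phones.csv', index=False)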

Scrape multiple pages with Beautiful soup

I am trying to scrape multiple pages of a URL, but I am only able to scrape the first page. Is there a way to get all the pages?
Here is my code.
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, pandas as pd

pd.set_option('max_colwidth',500)    # to remove column limit (Otherwise, we'll lose some info)
df = pd.DataFrame()
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed','https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6','https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']

for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_ =' row result')

for elem in targetElements:
    comp_name = elem.find('span', attrs={'class':'company'}).getText().strip()
    job_title = elem.find('a', attrs={'class':'turnstileLink'}).attrs['title']
    home_url = "http://www.indeed.com"
    job_link = "%s%s" % (home_url, elem.find('a').get('href'))
    job_addr = elem.find('span', attrs={'class':'location'}).getText()
    date_posted = elem.find('span', attrs={'class': 'date'}).getText()
    description = elem.find('span', attrs={'class': 'summary'}).getText().strip()
    comp_link_overall = elem.find('span', attrs={'class':'company'}).find('a')
    if comp_link_overall != None:
        comp_link_overall = "%s%s" % (home_url, comp_link_overall.attrs['href'])
    else:
        comp_link_overall = None
    df = df.append({'comp_name': comp_name, 'job_title': job_title,
                    'job_link': job_link, 'date_posted': date_posted,
                    'overall_link': comp_link_overall, 'job_location': job_addr, 'description': description
                    }, ignore_index=True)

df
df.to_csv('path\\web_scrape_Indeed.csv', sep=',', encoding='utf-8')
Please suggest if there is any way.
Case 1: The code presented here is exactly what you have
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed','https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6','https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']

for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_ =' row result')

for elem in targetElements:
The problem here is targetElements changes with every iteration in the first for loop.
To avoid this, indent the second for loop inside the first like so:
for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_ =' row result')
    for elem in targetElements:
Case 2: The bug is not a result of improper indentation (i.e. not like what is in your original post)
If your code is properly indented, then it may be that targetElements is an empty list. This means target.findAll('div', class_ =' row result') does not return anything. In that case, visit the sites, inspect the DOM, then modify your scraping program.
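For completeness, a minimal sketch of the properly nested version (same Comp_urls and selector as above; collecting rows in a list and building the DataFrame once is also cheaper than calling df.append inside the loop) might look like this:

import urllib.request
import pandas as pd
from bs4 import BeautifulSoup as Soup

rows = []
for url in Comp_urls:   # Comp_urls as defined in the question
    target = Soup(urllib.request.urlopen(url), "lxml")
    for elem in target.findAll('div', class_=' row result'):   # note: both loops nested
        company = elem.find('span', attrs={'class': 'company'})
        title = elem.find('a', attrs={'class': 'turnstileLink'})
        rows.append({
            'comp_name': company.getText().strip() if company else '',
            'job_title': title.attrs.get('title', '') if title else '',
        })

df = pd.DataFrame(rows)
df.to_csv('web_scrape_Indeed.csv', index=False, encoding='utf-8')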

Execute loop based on a list - obtain result for each page (subpage)

I am trying to obtain the number of pages for each URL from a list of URLs. My code works as long as I have only one URL, however as soon as I try it with a list of URLs I only get the result from one URL. I guess the problem is related to my loop. Given that I am new to Python and Beautiful Soup, I don't manage to spot the mistake myself.
base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

urls=[]
##Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")

#get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class':'link'})
res = []
for i in data:
    res.append(i.text) #writing each value to res list
res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")
last_page = max(res_int)
#print(last_page)

for i in range(1, last_page):
    page = main_page.format(i)
    for link in soup.find_all('div', {'class':'hotel-reviews-bar'}):
        urls = base_url + link.find('a').get('href')+"/-/p/{}"
        print(urls)
So far everything works: I obtain the max page number and get all the URLs from each page. The problem lies in the code below (I believe):
for url in urls: #to loop through the list of urls
    r = requests.get(url.format(0))
    soup = BeautifulSoup(r.text, 'lxml')
    daten = soup.find_all('a', {'class':'link'})
    tes = []
    for z in daten:
        tes.append(z.text) #writing each value to res list
    print(tes)
    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")
    anzahl = max(tes_int)
    print(anzahl)
I am trying to apply the same concept as in the code above to each URL from the list urls, but instead of obtaining the max page number for each URL I obtain 241 every time, as if I were caught in a loop.
Any thoughts on that? Help is highly appreciated.
You're assigning urls to the last link generated by the loop.
To build a valid list of urls you need to replace = with append():
urls = []
for i in range(1, last_page):
    page = main_page.format(i)
    r = requests.get(page) #these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml') #these 2 rows added
    for link in soup.find_all('div', {'class':'hotel-reviews-bar'}):
        try:
            urls.append(base_url + link.find('a').get('href')+"/-/p/{}")
        except:
            print('no link available', i)
print(urls)
EDIT: Okay, as far as I can see you have several issues in your code. Along with my initial fix, I'm outlining my understanding of how your code is intended to work:
import requests
from bs4 import BeautifulSoup

base_url = 'https://www.holidaycheck.de'
main_page = 'https://www.holidaycheck.de/dh/hotels-tunesien/e10cef63-45d4-3511-92f1-43df5cbd9fe1?p={}'

##Change URL into object (soup)
r = requests.get(main_page.format(0))
soup = BeautifulSoup(r.text, "html5lib")

#get max page number
soup = BeautifulSoup(r.text, 'lxml')
data = soup.find_all('a', {'class':'link'})
res = []
for i in data:
    res.append(i.text) #writing each value to res list
res_int = []
for i in res:
    try:
        res_int.append(int(i))
    except:
        print("current value is not a number")
last_page = max(res_int)
#print(last_page)

urls = []
for i in range(1, last_page):
    page = main_page.format(i)
    r = requests.get(page) #these 2 rows added
    soup = BeautifulSoup(r.text, 'lxml') #these 2 rows added
    for link in soup.find_all('div', {'class':'hotel-reviews-bar'}):
        try: #also adding try-except for escaping broken/unavailable links
            urls.append(base_url + link.find('a').get('href')+"/-/p/{}")
        except:
            print('no link available', i)
urls = list(set(urls)) #check and drop duplicates in links list

for url in urls: #to loop through the list of urls
    try:
        r = requests.get(url.format(0))
        print(url.format(0))
        soup = BeautifulSoup(r.text, 'lxml')
        daten = soup.find_all('a', {'class':'link'})
    except:
        print('broken link')
    tes = []
    for z in daten:
        tes.append(z.text) #writing each value to res list
    # print(tes)
    tes_int = []
    for z in tes:
        try:
            tes_int.append(int(z))
        except:
            print("current value is not a number")
    try:
        anzahl = max(tes_int)
        print(anzahl)
    except:
        print('maximum cannot be calculated')

Scrape through website and iterate over seach results to get specific data

I'm trying to work on a project to scrape www.boattrader.com to push 800 listings with the Make, Price, and Phone Number of each boat to a CSV file.
I'm looking for guidance on the best way to scrape the links to each boat listing from the search results and then parse through each individual page to grab the Make, Price and Phone number.
Any guidance would be much appreciated!
Thanks again!
from bs4 import BeautifulSoup, SoupStrainer
import requests

def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    for link in possible_links:
        if link.has_attr('href'):
            boat_links = link.attrs['href']
    return boat_links

search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)
print boat_links #why does this only print one link? What would be the best way to iterate over the search results, so I can put those links into the boat_listing variable to grab the information I'm looking for?
def extract_from_listing(boat_listing):
    r = requests.get(boat_listing)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    table_heads = soup.find_all('th')
    for th in table_heads:
        if th.text == "Make":
            make = th.find_next_sibling("td").text
    price = soup.find('span', {'class': 'bd-price'})
    formatted_price = price.string.strip()
    contact_info = soup.find('div', {'class': 'phone'})
    reversed_phone = contact_info.string[::-1]
    temp_phone = reversed_phone.replace(')', '}')
    temp_phone2 = temp_phone.replace('(', ')')
    correct_phone = temp_phone2.replace("}", "(")
    return make, formatted_price, correct_phone

boat_listing = 'http://www.boattrader.com/listing/2009-Briggs-BR9134-Sportfish-102290211'
make, price, phone = extract_from_listing(boat_listing)
print make
print price
print phone
You are only returning the last link, you need to append:
def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    boat_links = []  # create list to append all links to
    for link in possible_links:
        if link.has_attr('href'):
            boat_links.append(link.attrs['href'])  # append each link
    return boat_links
Or use a list comp:
def extract_from_search(search_results):
    # make this into a function
    r = requests.get(search_results)
    ad_page_html = r.content  # use content to let requests handle the decoding
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    return [link.attrs['href'] for link in possible_links if link.has_attr('href')]
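To tie this back to the CSV goal, a rough follow-up sketch (assuming the fixed extract_from_search plus the extract_from_listing from the question, that the returned hrefs are absolute URLs, and a hypothetical output file name) could loop over the links and write one row per boat:

import csv

search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)   # the fixed version that returns a list

with open('boats.csv', 'w') as f:   # 'boats.csv' is a hypothetical output name
    writer = csv.writer(f)
    writer.writerow(['make', 'price', 'phone'])
    for link in boat_links:
        try:
            make, price, phone = extract_from_listing(link)
            writer.writerow([make, price, phone])
        except Exception as e:
            # a listing missing the Make row, price or phone would raise here; skip it
            print('skipping %s: %s' % (link, e))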
