I'm trying to scrape this page
https://www.vivareal.com.br/venda/pernambuco/recife/#onde=BR-Pernambuco-NULL-Recife
I scraped the first page this website and click with selenium to next page, but I only can get the first page content, when I scrape the second, it came the same content from first page. I dunno how to fix this or if the webpage has some protection to scraping.
Could someone help me?
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from selenium import webdriver
def scrape():
cont = [True,True,True,True,False]
for times in cont:
if times != True:
driver = webdriver.Firefox(executable_path = 'geckodriver')
page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
sleep(15)
titles = []
addresses = []
areas = []
rooms = []
bathes = []
values = []
start_time = time()
request = 0
soup = BeautifulSoup(page,'html.parser')
imov = soup.find_all('div', class_='property-card__main-content')
sleep(randint(8,15))
# Monitor
request += 1
elapsed_time = time() - start_time
print('Request: {}; Frequency: {} requests/s'.format(request, request/elapsed_time))
clear_output(wait = True)
# Throw a warning for non-200 status codes
if page.status_code != 200:
warn('Request: {}; Status code: {}'.format(requests, page.status_code))
# Break the loop if the number of requests is greater than expected
if request > 72:
warn('Number of requests was greater than expected.')
break
for container in imov:
# Título
title = container.h2.a.get_text()
t2 = title.strip()
titles.append(t2)
# Título
# Endereço
address = container.h2.span.get_text()
a2 = address.strip()
addresses.append(a2)
# Endereço
# Área
area = container.li.span.get_text()
ar2 = area.strip()
areas.append(ar2)
# Área
# Quartos
room = container.find(class_= "property-card__detail-item property-card__detail-room js-property-detail-rooms")
room2 = room.find('span', class_="property-card__detail-value js-property-card-value").get_text()
r2 = room2.strip()
rooms.append(r2)
# Quartos
# Banheiros
bath = container.find(class_= "property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom")
bath2 = bath.find('span', class_="property-card__detail-value js-property-card-value").get_text()
b2 = bath2.strip()
bathes.append(b2)
# Banheiros
# Valor
value = container.section.div.get_text()
v2 = value.strip()
values.append(v2)
# Valor
# Dataframe e salvar
vivareal = pd.DataFrame({
"title": titles,
"address": addresses,
"area": areas,
"rooms":rooms,
"baths":bathes,
"value":values
})
vivareal.to_csv(r'output.csv')
prox = driver.find_element_by_xpath('//*[#title="Próxima página"]')
prox.click()
else:
print('Done!')
scrape()```
Although you put the click command at the end, when it goes to the next loop, the first command is to create a new driver and then is called the command to get the main page of Viva Real to Pernambuco. This is unwanted. Instead of this you could do:
def scrape():
cont = [True,True,True,True,False]
# You create the driver and access the main page only once
driver = webdriver.Firefox(executable_path = 'geckodriver')
page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
for times in cont:
if times != True:
# Wait to load every page
sleep(15)
Your code is not working as espected, even with the fixes provided by #MarceloBaliu. Here is my code that (finally!) worked for me. I'm sharing because it can help someone, like I was helped by this website.
from selenium import webdriver
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
class ScraperVivaReal:
wait_time = 5
def __init__(self, url):
# Initializing the webdriver
options = webdriver.FirefoxOptions()
options.add_argument('-headless')
self.driver = webdriver.Firefox(options=options)
self.driver.maximize_window()
self.driver.get(url)
time.sleep(self.wait_time)
# Handling cookies acception
WebDriverWait(self.driver, self.wait_time).until(EC.element_to_be_clickable((By.XPATH,'//*[#id="cookie-notifier-cta"]'))).click()
time.sleep(self.wait_time/2)
def __scrape_page__(self):
result = []
# Extracting data from the page
try:
soup = BeautifulSoup(self.driver.page_source, 'html.parser')
except WebDriverException:
print('Webdriver was manually quit by the user!') # I configure this exception before adding the option -headless to webdriver
return result
# Finding property cards containing search results
div_list = soup.find_all('div', {'class':'property-card__content'})
# Iterating each card
for d in div_list:
# Extracting info from card
title = d.find('span', {'class': 'property-card__title js-cardLink js-card-title'}).get_text().strip()
complete_address = d.find('span', {'class': 'property-card__address'}).get_text().strip()
area = d.find('span', {'class': 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'}).get_text().strip()
rooms = d.find('li', {'class': 'property-card__detail-item property-card__detail-room js-property-detail-rooms'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
baths = d.find('li', {'class': 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
garage = d.find('li', {'class': 'property-card__detail-item property-card__detail-garage js-property-detail-garages'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
# Extracting the price
try:
price = d.find('div', {'class':'property-card__price js-property-card-prices js-property-card__price-small'}).find('p').get_text().strip()
except AttributeError:
price = "N/I"
# Splitting the address
add_list = re.split(',|-', complete_address)
add_list = [ item.strip() for item in add_list ]
if len(add_list) == 2:
city, st = add_list
neibhood = 'N/I'
address = 'N/I'
number = 'N/I'
if len(add_list) == 3:
neibhood, city, st = add_list
address = 'N/I'
number = 'N/I'
if len(add_list) == 4:
address, neibhood, city, st = add_list
number = 'N/I'
elif len(add_list) == 5:
address, number, neibhood, city, st = add_list
# Adding the result into a dicionary and appending the dict to a result list
row = { 'Título': title, 'Endereço': address, 'Número': number, 'Bairro': neibhood, 'Cidade': city, 'Estado': st, 'Área': area, 'Quartos': rooms, 'Banheiros': baths, 'Vagas': garage, 'Preço': price }
result.append(row)
return result
def __next_page__(self):
# Finding the "Next Page" button element
next_element = self.driver.find_element_by_xpath('//*[#title="Próxima página"]')
try:
# Trying to click it
next_element.click()
time.sleep(self.wait_time)
return True
# Treating some exceptions (element not found and element not clickable)
except ElementClickInterceptedException:
print('"Próxima Página" element is not clickable!')
except NoSuchElementException:
print('"Próxima Página" element not found!')
return False
def run(self, output):
has_next = True
final_result = []
# Getting the information!
while has_next:
results = self.__scrape_page__()
final_result.extend(results)
print('Got {} results! Total Found: {}'.format(len(results), len(final_result)))
if len(results) == 0:
break
has_next = self.__next_page__()
# Quitting Firefox
self.driver.quit()
# Exporting results to CSV
df = pd.DataFrame(final_result)
df.to_csv(output, sep=',')
S = ScraperVivaReal('https://www.vivareal.com.br/venda/sp/paulinia/')
S.run('output.csv')
Related
I am studying Naver web crawling in Korea.
In the code below, I am extracting market information in the while statement.
There's a problem. If you check Developer Tools with F12, you have to click the "[More same products]" tab to reveal hidden products.
A total of 24 elements need to be counted, but the code below can only crawl 20.
Look at the variables stored in the market and ask for a lot of answers on how to handle them.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
url = 'https://search.shopping.naver.com/catalog/10555224834?query=c922%20pro&NaPm=ct%3Dl7ebx6qw%7Cci%3D1bf003bb08f09f0f6b4a7c0713e56125c0984e64%7Ctr%3Dslsl%7Csn%3D95694%7Chk%3D5e07d721000d98f35225c07de4fb325fbd2a76c1'
url2 = 'https://search.shopping.naver.com/catalog/21052026769?query=clp-609&NaPm=ct%3Dl7ee8y6g%7Cci%3D030b02f64d9acedb523928f8f2258d16a2883a84%7Ctr%3Dslsl%7Csn%3D95694%7Chk%3D44d698f0e32b84eb243e7a84f624b5d54c5379d2'
driver = webdriver.Chrome(ChromeDriverManager().install())
browser = driver.get(url)
time.sleep(2)
product_section = driver.find_element(By.CSS_SELECTOR,'div.product_section_price__b6yrx') # 상세구입조건 / 상품 db / 버튼을 모두 가지고 있는 div태그
# 상품옵션 선택
option_dic = {}
ul = product_section.find_elements(By.CSS_SELECTOR,'div.filter_condition_group__h8Gss > ul')
for lis in ul:
labels = lis.find_elements(By.CSS_SELECTOR,'label.filter_label__3GLbR')
for label in labels:
option_text = label.find_element(By.CSS_SELECTOR,'span.filter_text__J8EIh').text
option_dic[option_text]=label
if len(option_dic) > 1:
try:
comment = input (f'상세구입조건을 선택하세요! {option_dic.keys()} :')
option_dic[comment].click()
time.sleep(5)
except Exception as e:
print('오류가 발생했습니다. 다시 입력하세요!',str(e))
# 총 상품수
ul = driver.find_element(By.CSS_SELECTOR,'div.floatingTab_detail_tab__akl87 > ul')
lis = ul.find_elements(By.CSS_SELECTOR,'li')
if lis:
item_count = lis[0].find_element(By.CSS_SELECTOR,'a > em').text
time.sleep(3)
# 상품 크롤링
naver_item = []
page = 1
while True:
print('-'*50+f'{page}페이지 진행중'+'-'*50)
items = product_section.find_elements(By.CSS_SELECTOR,'ul.productList_list_seller__XGhCk > li')
for index, item in enumerate(items):
same_item = item.find_elements(By.CSS_SELECTOR,'div.productList_same_area__ULPvk')
if same_item:
for a_button in same_item:
button = a_button.find_element(By.CSS_SELECTOR,'a.productList_same__0QQHk')
button.click()
time.sleep(5)
## 판매처 market ##
imgs = item.find_elements(By.CSS_SELECTOR,'img')
if imgs:
for img in imgs:
market = item.find_element(By.CSS_SELECTOR,'img').get_attribute('alt')
# if imgs:
# for img in imgs:
# market = item.find_element(By.CSS_SELECTOR,'img').get_attribute('alt')
else:
market = item.find_element(By.CSS_SELECTOR,'a.productList_mall_link__TrYxC > span').text
print(market)
# if img:
# market = item.find_element(By.CSS_SELECTOR,'img').get_attribute('alt')
# else:
# market = item.find_element(By.CSS_SELECTOR,'a.productList_mall_link__TrYxC > span').text
# ## 상품명 ##
# name = item.find_element(By.CSS_SELECTOR,'a.productList_title__R1qZP').text
# ## 사이트 ##
# site = item.find_element(By.CSS_SELECTOR,'a.productList_title__R1qZP').get_attribute('href')
# ## 판매가 ##
# price_list = item.find_elements(By.CSS_SELECTOR,'a.productList_value__B_IxM > span')
# for em in price_list:
# price = em.find_element(By.CSS_SELECTOR,'em').text
# ## 배송비 ##
# delivery = item.find_element(By.CSS_SELECTOR,'div.productList_delivery__WwSwL').text
# item_dic = {"순위":index+1,"판매처":market,"상품명":name,"판매가":price,"배송비":delivery,"url":site}
# naver_item.append(item_dic)
print(len(naver_item))
print()
print(int(item_count))
print()
print('-'*50+f'{page}페이지 종료'+'-'*50)
if len(naver_item) >= int(item_count):
print('-'*50+'크롤링이 종료되었습니다.'+'-'*50)
break
# 다음 페이지 클릭
btn_dic = {}
a_tags = product_section.find_elements(By.CSS_SELECTOR,'div.pagination_pagination__JW7zT > a')
for index, a in enumerate(a_tags):
btn_dic[index]=a
btn_dic[page].click()
time.sleep(5)
page += 1
print(len(naver_item))
print(naver_item)
anyone can help with scraping from https://www.whed.net/home.php
the code I'm using is giving me empty df. would love to have universities with websites and maybe field of study. My scraping skills are weak so if you can guide me through this would be great thanks guys.
begin=time.time()
countries=['Emirates','United States of America (all)']
result = [] # List to store all data
univ_links=[] # Links for all universities
fields = ['Street:','City:','Province:','Post Code:','WWW:','Fields of study:','Job title:']
webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe') # To launch chrome and run script
# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)
#all_countries=[]
cntry_el = webD.find_elements_by_xpath('//*[#id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[#id="Chp1"]/optgroup')
grps=webD.find_elements_by_xpath('//*[#id="Chp1"]/optgroup/option[1]')
for c in cntry_el:countries.append(c.text)
for g in grps: countries.append(g.text)
for cntry in countries:
select = Select(webD.find_element_by_id('Chp1'))#select country dropdown
select.select_by_visible_text(cntry)#choosing country
Btn_GO = webD.find_element_by_xpath('//*[#id="fsearch"]/p/input')
Btn_GO.click()
select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))#select results per page drop down
select_rpp.select_by_visible_text('100')#choosing 100 results per page option
university_form = webD.find_element_by_xpath('//*[#id="contenu"]').find_element_by_id('results')
university_list = university_form.find_elements_by_xpath('//*[#id="results"]/li') # list of university elements
for univ in range(len(university_list)):
href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
univ_links.append(href)
while True:
try:
webD.find_element_by_partial_link_text('Next').click()
university_form = webD.find_element_by_xpath('//*[#id="contenu"]').find_element_by_id('results')
university_list = university_form.find_elements_by_xpath('//*[#id="results"]/li')
for univ in range(len(university_list)):
href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href') # University details link
univ_links.append(href)
except NoSuchElementException: break
for l in univ_links:
webD.get(l)
webD.implicitly_wait(2)
title=webD.find_element_by_xpath('//*[#id="page"]/div/div/div[2]/div[1]').text
title_detailed = webD.find_element_by_xpath('//*[#id="page"]/div/div/div[2]/div[2]').text
cntry_name=webD.find_element_by_xpath('//*[#id="contenu"]/p[2]').text
t1=webD.find_elements_by_class_name('dt')
t2=webD.find_elements_by_class_name('dd')
labels=webD.find_elements_by_class_name('libelle')
content=webD.find_elements_by_class_name('contenu')
temp={}
fos=''
fos1=''
temp.update({'Title': title,'Detailed Title':title_detailed,'Country':cntry_name})
for i in range(len(t1)):
if t1[i].text == '' or t1[i].text == 'Address':
continue
else:
value=t2[i].text
temp.update({t1[i].text:value.replace('\n',',')})
for j in range(len(content)):
if labels[j].text in fields:
if labels[j].text == 'Fields of study:':
info=content[j].text
fos=fos+','+info
elif labels[j].text == 'Job title:':
info1=content[j].text
fos1=fos1+','+info1
else:
key=labels[j].text
temp.update({key[:-1]: content[j].text})
temp.update({'Fields of study': fos.lstrip(','),'Job titles':fos1.lstrip(',')})
result.append(temp)
data=pd.DataFrame(result)
data
end=time.time()
print("Time taken : "+ str(end-begin) +"s")
data.to_csv("WHED1.csv",index=False)
this code what i could use taken from github project.
would be great if i can re-create the data and save it, want this to be used as a dropdown in a web application just to make sure no mistakes written in the university studied in.
Update 1/12/22 - Async
Found a much better solution using aiohttp, it also runs the entire list of countries in ~30 seconds instead of 3 hours
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def main():
print("Init")
driver = init_driver()
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Gathering Countries")
countries = get_countries(driver)
driver.quit()
print("Scraping")
start = time.time()
institution_list = asyncio.run(fetch_all(countries))
print("Writing out")
f = open('output.json', 'w')
f.write(json.dumps(institution_list))
f.close()
end = time.time()
print(f"Total time: {end - start}s")
def init_driver():
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
return driver
def get_countries(driver):
select = Select(driver.find_element(By.ID, "Chp1"))
countries = list(map(lambda c: c.get_attribute('value'), select.options))
countries.pop(0)
return countries
def extract_institutions(html, country):
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
print(str(page))
number_of_institutions = str(page).split()[0]
if number_of_institutions == 'No':
print(f"No results for {country}")
return []
results = []
inst_index = 0
raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
for i in raw:
results.append({
'name': str(i.text).strip(),
'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
'country': country
})
inst_index += 1
return {
'country': country,
'count': number_of_institutions,
'records': results
}
async def get_institutions(country, session):
try:
async with session.post(
url='https://www.whed.net/results_institutions.php',
data={"Chp1": country, "nbr_ref_pge": 10000}
) as response:
html = await response.read()
print(f"Successfully got {country}")
return extract_institutions(html, country)
except Exception as e:
print(f"Unable to get {country} due to {e.__class__}.")
async def fetch_all(countries):
async with aiohttp.ClientSession() as session:
return await asyncio.gather(*[get_institutions(country, session) for country in countries])
# Main call
main()
Old answer using synchronous algorithm
Improving on #Mithun's answer since it doesn't really work as it'll be stuck on the same page.
Also added direct access to the name and url to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)
print("Searching")
driver.find_element(By.XPATH, "//input[#value='Go']").click()
time.sleep(1)
print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
results = []
while True:
raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
for i in raw:
results.append({
'name': str(i.text).strip(),
'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
'country': country
})
print(f'{len(results)}/{number_of_pages}')
if counter >= int(number_of_pages):
break
counter += 10
driver.find_element(By.LINK_TEXT, "Next page").click()
time.sleep(0.5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(results)
You can use Selenium to scrape data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape for other countries as well using Loop or entering the name manually. If you need the field of study for every university, you can scrape its href using bs4 and its field of study.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)
select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)
driver.find_element(By.XPATH, "//input[#value='Go']").click()
time.sleep(1)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10
while counter < int(number_of_pages):
raw = soup.find_all('div', {'class': 'details'})
for i in raw:
i = (str(i.text).lstrip())
i = i.replace("\n","")
i = i.replace("\r", "")
i = i.replace("\t", "")
print(i)
next_page = driver.find_element(By.LINK_TEXT, "Next page").click()
counter += 10
driver.quit()
I am working on a project that really blocked me I often asked questions here and you have helped me a lot since I am still a beginner, my project consists in making a competitive watch table for hotel rates for an agency It is a painful action that I wanted to automate it, I succeeded in extracting the tariffs and their prices, but the problem is that I want him to give me only the selected room
I provide you with the code and the output i removed the data that i want to elimnate in my output also i've addede images to better clarify things if any of you can help me and thank you in advance.
NB : thanks to pmadhu's answer problem solved but now it shows me the same rates for all hotels.
#!/usr/bin/env python
# coding: utf-8
import json
import time
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException
# create path and start webdriver
PATH = "C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)
# first get website
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)
# params to select
params = {
'destination': 'Nabeul',
'date_from': '24/08/2021',
'date_to': '25/08/2021',
'bedroom': '1'
}
# select destination
destination_select = Select(driver.find_element_by_id('ville_des'))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(driver.find_element_by_id('select_ch'))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('checkin').value ='{params['date_to']}';"
driver.execute_script(script)
# submit form
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)
# ----------------------------------------------------------------------------
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException,NoSuchElementException
urls = []
hotels = driver.find_elements_by_xpath("//div[starts-with(#id,'produit_affair')]")
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = driver.find_element_by_xpath("//div[#class='bloc_titre_hotels']/h2").text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
achats = {}
marges= {}
selection = Select(driver.find_element_by_id("arrangement"))
for i in range(num):
try:
selection = Select(driver.find_element_by_id("arrangement"))
selection.select_by_index(i)
time.sleep(2)
arr = driver.find_element_by_xpath("//select[#id='arrangement']/option[#selected='selected']").text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr]=prize
btn_passe = driver.find_element_by_xpath('//*[#id="resultat"]/div/form/div/div[2]/div[1]/div[2]/div[2]/div/div ')
btn_passe.click()
sleep(2)
# params to select
params = {
'civilite_acheteur': 'Mlle',
'prenom_acheteur': 'test',
'nom_acheteur': 'test',
'e_mail_acheteur': 'test#gmail.com',
'portable_acheteur': '22222222'
}
# select civilite
civilite_acheteur = Select(driver.find_element_by_id('civilite_acheteur'))
civilite_acheteur.select_by_value(params['civilite_acheteur'])
# saisir prenom
script = f"document.getElementById('prenom_acheteur').value ='{params['prenom_acheteur']}';"
script += f"document.getElementById('nom_acheteur').value ='{params['nom_acheteur']}';"
script += f"document.getElementById('e_mail_acheteur').value ='{params['e_mail_acheteur']}';"
script += f"document.getElementById('portable_acheteur').value ='{params['portable_acheteur']}';"
driver.execute_script(script)
# submit form
btn_rechercher = driver.find_element_by_id('titre_Hammamet')
btn_rechercher.click()
sleep(2)
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(3)
achat = driver.find_element_by_xpath('/html/body/header/div[2]/div[1]/div[1]/div[4]/div[2]/div[2]').text.replace(' TND', '')
achats[arr]=achat
marge =int(((float(prize) - float(achat)) / float(achat)) * 100);
marges[arr]=marge
optiondata[arr]=prize,achat,marge
driver.get(url)
btn_passe = driver.find_element_by_xpath('//*[#id="moteur_rech"]/form/div/div[3]/div')
btn_passe.click()
sleep(2)
except StaleElementReferenceException:
pass
except NoSuchElementException:
pass
print("{} : {} - {}".format(name,opt,optiondata))
Try below code once:
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException,NoSuchElementException
hotels = driver.find_elements_by_xpath("//div[starts-with(#id,'produit_affair')]")
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = driver.find_element_by_xpath("//div[#class='bloc_titre_hotels']/h2").text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
selection = Select(driver.find_element_by_id("arrangement"))
for i in range(num):
try:
selection = Select(driver.find_element_by_id("arrangement"))
selection.select_by_index(i)
time.sleep(2)
arr = driver.find_element_by_xpath("//select[#id='arrangement']/option[#selected='selected']").text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr]=prize
except StaleElementReferenceException:
pass
except NoSuchElementException:
pass
print("{} : {} - {} - {}".format(name,opt,num,optiondata))
And the output:
Tui Blue Scheherazade Sousse : Double Standard Vue Mer - 1 - {'Demi Pension': '114'}
Golf Residence GAS Sousse : Double--Standard - 2 - {'Demi Pension': '51', 'Petit Dejeuner': '42'}
Sindbad Center GAS Sousse : Chambre Double - 2 - {'Petit Dejeuner': '27', 'Logement seul': '22'}
I have return code in selenium. It works fine. It scraps the portal and extracts the data in table. But now I am trying to shift either to scrapy or requests.
I tried learning both and failed misserably. The selenium structure is fit in my mind. It will take me long to understand basics of requests or scrappy and then use them. The shortcut is to get some tips on how to do it directly in connection with present code.
Why am I shifting? -
I posted the code to seek suggestions for refactoring the code (here). Two of the comments have suggested me to shift to requests. That has triggered the effort. Then after some primary search I realized, I can avoid selenium and requests or scrappy can save huge time for me.
I checked here. But that dose not solve my issue.
Can someone help with this? Thanks in advance.
The code (including URL) -
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, \
TimeoutException, StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from FIR_logging import logger
import os
import time
import pandas as pd
# base function
def get_url(some_url):
while True:
try:
driver.get(some_url)
break
except WebDriverException:
time.sleep(60)
continue
driver.refresh()
# Some constants:
URL = r'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'
options = FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--private-window")
driver = webdriver.Firefox(options=options)
get_url(URL)
time.sleep(10)
Download_Directory = r'/some_directory/raw_footage7'
COLUMNS = ['Sr.No.', 'State', 'District', 'Police Station', 'Year', 'FIR No.', 'Registration Date', 'FIR No',
'Sections']
ALL_Districts = ['AKOLA', 'AMRAVATI CITY', 'AMRAVATI RURAL', 'AURANGABAD CITY',
'AURANGABAD RURAL', 'BEED', 'BHANDARA', 'BRIHAN MUMBAI CITY', 'BULDHANA',
'CHANDRAPUR', 'DHULE', 'GADCHIROLI', 'GONDIA', 'HINGOLI', 'JALGAON', 'JALNA',
'KOLHAPUR', 'LATUR', 'NAGPUR CITY', 'NAGPUR RURAL', 'NANDED', 'NANDURBAR',
'NASHIK CITY', 'NASHIK RURAL', 'NAVI MUMBAI', 'OSMANABAD', 'PALGHAR', 'PARBHANI',
'PIMPRI-CHINCHWAD', 'PUNE CITY', 'PUNE RURAL', 'RAIGAD', 'RAILWAY AURANGABAD',
'RAILWAY MUMBAI', 'RAILWAY NAGPUR', 'RAILWAY PUNE', 'RATNAGIRI', 'SANGLI', 'SATARA',
'SINDHUDURG', 'SOLAPUR CITY', 'SOLAPUR RURAL', 'THANE CITY', 'THANE RURAL', 'WARDHA',
'WASHIM', 'YAVATMAL']
# other functions
def district_selection(name):
dist_list = Select(driver.find_element_by_css_selector(
"#ContentPlaceHolder1_ddlDistrict"))
dist_list_options = dist_list.options
names = [o.get_attribute("text")
for o in dist_list.options if o.get_attribute("text") not in (
'Select')]
if name not in names:
logger.info(f"{name} is not in list")
return False
dist_list.select_by_visible_text(name)
time.sleep(8)
def enter_date(date):
# enters start as well as end dates with "action chains."
WebDriverWait(driver, 160).until(
EC.presence_of_element_located((By.CSS_SELECTOR,
'#ContentPlaceHolder1_txtDateOfRegistrationFrom')))
from_date_field = driver.find_element_by_css_selector(
'#ContentPlaceHolder1_txtDateOfRegistrationFrom')
to_date_field = driver.find_element_by_css_selector(
'#ContentPlaceHolder1_txtDateOfRegistrationTo')
ActionChains(driver).click(from_date_field).send_keys(
date).move_to_element(to_date_field).click().send_keys(
date).perform()
logger.info(f'date entered: {date}')
def search():
driver.find_element_by_css_selector('#ContentPlaceHolder1_btnSearch').click()
def number_of_records():
"""captures the text indicating number of records.
converts it to integer. if 0 returns and appends name of district to the list
if page is not loaded. it tries one more time for 15 secs."""
time_counter = 1
while time_counter < 19:
try:
records_number = driver.find_element_by_css_selector(
'#ContentPlaceHolder1_lbltotalrecord').text
if records_number == '':
time.sleep(1)
continue
else:
records_number = int(records_number)
if records_number != 0:
logger.info(f"{district}: {records_number}")
return records_number
else:
logger.info(f"no records # {district}")
return False
except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
logger.info("page is not loaded")
time_counter += 1
continue
def extract_table_current(name, single):
# entire table of record to be taken to the list.
soup = BS(driver.page_source, 'html.parser')
main_table = soup.find("table", {"id": "ContentPlaceHolder1_gdvDeadBody"})
time_counter = 1
while main_table is None:
if time_counter < 16:
logger.info(f"the table did not load # {name}")
time_counter += 1
else:
logger.info(f"the table did not load # {name}."
f"stopped trying")
return
links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
rows = main_table.find_all("tr")
if links_for_pages is None:
for row in rows:
time.sleep(8)
if '...' not in row.text:
cells = row.find_all('td')
cells = cells[0:9] # drop the last column
# store data in list
single.append([cell.text for cell in cells])
else:
for row in rows[0:(len(rows)) - 2]:
time.sleep(8)
cells = row.find_all('td')
cells = cells[0:9] # drop the last column
# store data in list
single.append([cell.text for cell in cells])
def next_page(name, data):
# check if any link to next page is available
# iterate every page.
try:
driver.find_element_by_css_selector('.gridPager a')
except NoSuchElementException:
return False
links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
for page in range(len(links_for_pages)):
# new list, to by pass stale element exception
links_for_pages_new = driver.find_elements_by_css_selector('.gridPager a')
# do not click on link for new page slot
if links_for_pages_new[page].text != '...':
links_for_pages_new[page].click()
# if this can be replaced with some other wait method to save the time
time.sleep(8)
extract_table_current(name, data)
def second_page_slot():
# find specific link for going to page 11 and click.
try:
link_for_page_slot = driver.find_element_by_link_text('...')
link_for_page_slot.click()
except NoSuchElementException:
return False
# main code
page_data = []
time.sleep(5)
view = Select(driver.find_element_by_css_selector(
'#ContentPlaceHolder1_ucRecordView_ddlPageSize'))
view.select_by_value('50')
driver.close()
for district in ALL_Districts:
b = "06"
c = "2020"
district_directory = os.path.join(Download_Directory, f'{district}{b}{c}')
if not os.path.exists(district_directory):
os.mkdir(district_directory)
for i in range(1, 30):
# reoping the page to wipe out the catch.
options = FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--private-window")
driver = webdriver.Firefox(options=options)
get_url(URL)
# entering date and assuring that 01 to 09 is entered correctly
if i < 10:
i = f'{str("0")}{str(i)}'
date_from = str(i) + b + c
enter_date(date_from)
# select district
district_selection(district)
time.sleep(3)
# start the search
search()
time.sleep(7)
if not number_of_records():
continue
extract_table_current(district, page_data)
time.sleep(3)
if not next_page(district, page_data):
district_data = pd.DataFrame(page_data, columns=COLUMNS)
district_data.to_csv(os.path.join(district_directory, f'{district}{i}{b}{c}.csv'))
continue
extract_table_current(district, page_data)
district_data = pd.DataFrame(page_data, columns=COLUMNS)
district_data.to_csv(os.path.join(district_directory, f'{district}{i}{b}{c}.csv'))
driver.close()
Request is a very nice and simple, but powerful package. When you have learned it then you will be grateful :) You can use request to navigate around the page and sometimes even to login or send messages.
I don't know scrappy but I have been using BeautifulSoup alot and that one is fairly simple to learn as well, you just get the "soup" of data from requests and then you use BS to filter your data.
My recommendation for you is to start from scratch, just one step at a time.
Start by getting your page and then get your data little by little :)
page = requests.get('https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx')
soup = BeautifulSoup(page.text, 'lxml')
I'm making a Craigslist scraper to scrape the titles, prices, date, and URL and exported that info to a CSV. Now, I want Selenium to click on the post URL to navigate to the actual page, parse the page to get a span tag "Odometer" (to get mileage), and return that to my CSV file.
Here's my code so far:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#import schedule
from bs4 import BeautifulSoup
import urllib.request
import csv
import pandas as pd
class CraigslistScaper(object):
def __init__(self,query,location,max_price,transmission):
self.query = query
# self.sort=sort
self.location = location
# self.postal = postal
self.max_price = max_price
self.transmission = auto_transmission
#https://sfbay.craigslist.org/search/cta?query=mazda+miata&sort=rel&max_price=6000&auto_transmission=1
self.url = "https://{}.craigslist.org/search/cta?query={}&sort=rel&max_price={}&auto_transmission={}".format(self.location, self.query, self.max_price, self.transmission)
self.driver = webdriver.Chrome('/Users/MyUser/Desktop/chromedriver')
self.delay = 5
def load_craigslist_url(self):
self.driver.get(self.url)
try:
wait = WebDriverWait(self.driver, self.delay)
wait.until(EC.presence_of_element_located((By.ID,"searchform")))
print("page is ready")
except TimeoutError:
print('Loading took too much time')
#extracting the post information such as titles, dates, and prices
def extract_post_information(self):
all_posts = self.driver.find_elements_by_class_name('result-row')
titles = []
dates = []
prices = []
post_info_list = []
for i in range(len(all_posts)):
post = all_posts[i]
title = post.text.split('$')
if title[0] == '':
title = title[1]
else:
title = title[0]
title = title.split("\n")
price = title[0]
title = title[-1]
title = title.split(' ')
month = title[0]
day = title[1]
date = month + " " + day
title = ' '.join(title[2:])
#print('PRICE: ' + (price))
#print('TITLE: ' + (title))
#print('DATE: ' + date)
lst = [price, title, date]
post_info_list.append(lst)
#f=open("miata_prices.csv", "a+")
#f.write(post_info_list)
#print(post_info_list)
#df = pd.DataFrame(post_info_list)
#df.to_csv('miata_prices.csv', index=False, header=False)
print(post_info_list)
return post_info_list
def save_post_info_and_urls_to_csv(self, post_info, post_urls):
for i in range(len(post_info)):
post_info[i].append(post_urls[i])
#print(post_info)
df = pd.DataFrame(post_info)
df.to_csv('miata_prices.csv', index=False, header=False)
return post_info
#extracting post URLs
def extract_post_urls(self):
url_list = []
soup = BeautifulSoup(self.driver.page_source,'html.parser')
aTagsInLi = self.driver.find_elements_by_css_selector('li a')
self.driver.find_elements_by_css_selector('li a')[0].click()
for a in aTagsInLi:
link = a.get_attribute('href')
print(link)
link = self.driver.find_element_by_link_text('Miata')
print(link)
link.click()
for link in soup.findAll('a', {'class': "result-title hdrlnk"}):
#print(link.get('href'))
url_list.append(link.get('href'))
return url_list
#to click on URL Links and parse the HTML
def click_next_page(self):
href = driver.find_element_by_partial_link_text("result-title hdrlink")
extract_post_urls(url_list).click(href)
def quit(self):
self.driver.close()
location = "sfbay"
max_price = "5000"
#radius = "250"
auto_transmission = 1
query = "Mazda Miata"
scraper = CraigslistScaper(query,location,max_price,auto_transmission)
scraper.load_craigslist_url()
post_info = scraper.extract_post_information()
#print(post_info)
post_urls = scraper.extract_post_urls()
#print(post_urls)
scraper.save_post_info_and_urls_to_csv(post_info, post_urls)
#print(post_info)
scraper.quit()
I manage to get everything to the CSV file, but I'm stuck on how I can get Selenium to open every link in a new tab, get the odometer information, then close the tab.
I'm using this to build a dataset and eventually do some analysis with it!
I have an example how to get Selenium to open every link and get the odometer information. I used a wrapper for Selenium (SeElements) for less code. I hope you will found out how it works. So:
I'm opening your link, scrapping all links from the titles to the list. Then open every link and trying to get odometer info.
from elementium.drivers.se import SeElements
from selenium import webdriver
browser = webdriver.Chrome()
url = 'https://sfbay.craigslist.org/search/ctaquery=mazda+miata&sort=rel&max_price=6000&auto_transmission=1'
browser.get(url)
se = SeElements(browser)
titles = se.xpath('//p[#class="result-info"]/a', wait=True, ttl=5)
try:
links = []
for link in titles:
links.append(link.attribute('href'))
for link in links:
print(link)
browser.get(link)
try:
odometer = se.xpath('//span[contains(text(), "odometer")]',wait=True, ttl=2).text()
except Exception:
continue
print(odometer)
except Exception as e:
browser.quit()
raise e