how to extract data from an element list python - python

I am working on a project that has really blocked me. I often ask questions here and you have helped me a lot, since I am still a beginner. My project consists of building a competitive-watch table of hotel rates for an agency. It is a painful manual task that I wanted to automate. I succeeded in extracting the rates and their prices, but the problem is that I want the script to give me only the selected room.
I am providing the code and the output; I removed the data that I want to eliminate from my output. I have also added images to clarify things. If any of you can help me, thank you in advance.
NB : thanks to pmadhu's answer problem solved but now it shows me the same rates for all hotels.
#!/usr/bin/env python
# coding: utf-8
"""Scrape per-arrangement selling prices and purchase prices from tn.tunisiebooking.com.

For every hotel found on the search-results page, the script opens the hotel
page, iterates the "arrangement" dropdown, and records (selling price,
purchase price, margin %) per arrangement.
"""
import json
import time
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException

# create path and start webdriver
# Raw string: "C:\chromedriver.exe" relies on "\c" not being an escape sequence.
PATH = r"C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)

# first get website
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)

# search parameters
params = {
    'destination': 'Nabeul',
    'date_from': '24/08/2021',
    'date_to': '25/08/2021',
    'bedroom': '1'
}

# select destination
destination_select = Select(driver.find_element_by_id('ville_des'))
destination_select.select_by_value(params['destination'])

# select bedroom
bedroom_select = Select(driver.find_element_by_id('select_ch'))
bedroom_select.select_by_value(params['bedroom'])

# select dates (set the date inputs directly via JS to bypass the datepicker)
script = f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('checkin').value ='{params['date_to']}';"
driver.execute_script(script)

# submit form
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)

# ----------------------------------------------------------------------------
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

# Collect every hotel-detail URL from the results page.
# NOTE: the original "#id" / "#class" were scrape artifacts -- XPath attribute
# syntax is "@id" / "@class".
urls = []
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair')]")
for hotel in hotels:
    link = hotel.find_element_by_xpath(".//span[@class='tittre_hotel']/a").get_attribute("href")
    urls.append(link)

for url in urls:
    driver.get(url)
    try:
        name = driver.find_element_by_xpath("//div[@class='bloc_titre_hotels']/h2").text
        arropt = driver.find_element_by_xpath("//div[contains(@class,'line_result')][1]")
        opt = arropt.find_element_by_tag_name("b").text
        num = len(arropt.find_elements_by_tag_name("option"))
        optiondata = {}
        achats = {}
        marges = {}
        selection = Select(driver.find_element_by_id("arrangement"))
        for i in range(num):
            try:
                # Re-locate the dropdown each pass: the DOM is re-rendered after
                # each selection, which would otherwise raise a stale reference.
                selection = Select(driver.find_element_by_id("arrangement"))
                selection.select_by_index(i)
                time.sleep(2)
                arr = driver.find_element_by_xpath("//select[@id='arrangement']/option[@selected='selected']").text
                prize = driver.find_element_by_id("prix_total").text
                optiondata[arr] = prize
                btn_passe = driver.find_element_by_xpath('//*[@id="resultat"]/div/form/div/div[2]/div[1]/div[2]/div[2]/div/div ')
                btn_passe.click()
                sleep(2)
                # fake buyer details for the booking form
                # ('#' in the e-mail was a scrape artifact for '@')
                params = {
                    'civilite_acheteur': 'Mlle',
                    'prenom_acheteur': 'test',
                    'nom_acheteur': 'test',
                    'e_mail_acheteur': 'test@gmail.com',
                    'portable_acheteur': '22222222'
                }
                # select civilite
                civilite_acheteur = Select(driver.find_element_by_id('civilite_acheteur'))
                civilite_acheteur.select_by_value(params['civilite_acheteur'])
                # fill in the buyer fields via JS
                script = f"document.getElementById('prenom_acheteur').value ='{params['prenom_acheteur']}';"
                script += f"document.getElementById('nom_acheteur').value ='{params['nom_acheteur']}';"
                script += f"document.getElementById('e_mail_acheteur').value ='{params['e_mail_acheteur']}';"
                script += f"document.getElementById('portable_acheteur').value ='{params['portable_acheteur']}';"
                driver.execute_script(script)
                # submit form
                btn_rechercher = driver.find_element_by_id('titre_Hammamet')
                btn_rechercher.click()
                sleep(2)
                btn_rechercher = driver.find_element_by_id('boutonr')
                btn_rechercher.click()
                sleep(3)
                # purchase price shown on the confirmation page, " TND" suffix stripped
                achat = driver.find_element_by_xpath('/html/body/header/div[2]/div[1]/div[1]/div[4]/div[2]/div[2]').text.replace(' TND', '')
                achats[arr] = achat
                # margin in percent, truncated to an int
                marge = int(((float(prize) - float(achat)) / float(achat)) * 100)
                marges[arr] = marge
                optiondata[arr] = prize, achat, marge
                # go back to the hotel page for the next arrangement
                driver.get(url)
                btn_passe = driver.find_element_by_xpath('//*[@id="moteur_rech"]/form/div/div[3]/div')
                btn_passe.click()
                sleep(2)
            except StaleElementReferenceException:
                pass
    except NoSuchElementException:
        pass
    print("{} : {} - {}".format(name, opt, optiondata))

Try below code once:
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

# Collect every hotel-detail URL from the results page, then visit each one and
# record the price of every entry in the "arrangement" dropdown.
# NOTE: the original "#id" / "#class" were scrape artifacts -- XPath attribute
# syntax is "@id" / "@class".
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair')]")
for hotel in hotels:
    link = hotel.find_element_by_xpath(".//span[@class='tittre_hotel']/a").get_attribute("href")
    urls.append(link)

for url in urls:
    driver.get(url)
    try:
        name = driver.find_element_by_xpath("//div[@class='bloc_titre_hotels']/h2").text
        arropt = driver.find_element_by_xpath("//div[contains(@class,'line_result')][1]")
        opt = arropt.find_element_by_tag_name("b").text
        num = len(arropt.find_elements_by_tag_name("option"))
        optiondata = {}  # arrangement label -> displayed total price
        selection = Select(driver.find_element_by_id("arrangement"))
        for i in range(num):
            try:
                # Re-locate the dropdown each pass: selecting an option
                # re-renders the DOM and stales the previous reference.
                selection = Select(driver.find_element_by_id("arrangement"))
                selection.select_by_index(i)
                time.sleep(2)
                arr = driver.find_element_by_xpath("//select[@id='arrangement']/option[@selected='selected']").text
                prize = driver.find_element_by_id("prix_total").text
                optiondata[arr] = prize
            except StaleElementReferenceException:
                pass
    except NoSuchElementException:
        pass
    print("{} : {} - {} - {}".format(name, opt, num, optiondata))
And the output:
Tui Blue Scheherazade Sousse : Double Standard Vue Mer - 1 - {'Demi Pension': '114'}
Golf Residence GAS Sousse : Double--Standard - 2 - {'Demi Pension': '51', 'Petit Dejeuner': '42'}
Sindbad Center GAS Sousse : Chambre Double - 2 - {'Petit Dejeuner': '27', 'Logement seul': '22'}

Related

web scraping all universities with websites and description WHED website

anyone can help with scraping from https://www.whed.net/home.php
the code I'm using is giving me empty df. would love to have universities with websites and maybe field of study. My scraping skills are weak so if you can guide me through this would be great thanks guys.
# Scrape whed.net: for every country, collect each university's detail link,
# then visit each link and build one record per university.
begin = time.time()
countries = ['Emirates', 'United States of America (all)']
result = []      # List to store all data
univ_links = []  # Links for all universities
fields = ['Street:', 'City:', 'Province:', 'Post Code:', 'WWW:', 'Fields of study:', 'Job title:']
webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe')  # launch chrome

# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)

# Gather every country name from the dropdown (plain options plus the first
# option of each optgroup).
# NOTE: the original "#id" was a scrape artifact -- XPath attribute syntax is "@id".
cntry_el = webD.find_elements_by_xpath('//*[@id="Chp1"]/option')
grps = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup/option[1]')
for c in cntry_el:
    countries.append(c.text)
for g in grps:
    countries.append(g.text)

for cntry in countries:
    select = Select(webD.find_element_by_id('Chp1'))  # country dropdown
    select.select_by_visible_text(cntry)              # choose country
    Btn_GO = webD.find_element_by_xpath('//*[@id="fsearch"]/p/input')
    Btn_GO.click()
    select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))  # results-per-page dropdown
    select_rpp.select_by_visible_text('100')                       # 100 results per page
    university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
    university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')  # university elements
    for univ in range(len(university_list)):
        # University details link
        href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')
        univ_links.append(href)
    # Page through the remaining result pages until "Next" disappears.
    while True:
        try:
            webD.find_element_by_partial_link_text('Next').click()
            university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
            university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')
            for univ in range(len(university_list)):
                href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')
                univ_links.append(href)
        except NoSuchElementException:
            break

# Visit every collected detail page and extract one record per university.
for l in univ_links:
    webD.get(l)
    webD.implicitly_wait(2)
    title = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[1]').text
    title_detailed = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[2]').text
    cntry_name = webD.find_element_by_xpath('//*[@id="contenu"]/p[2]').text
    t1 = webD.find_elements_by_class_name('dt')       # definition-list terms
    t2 = webD.find_elements_by_class_name('dd')       # definition-list values
    labels = webD.find_elements_by_class_name('libelle')
    content = webD.find_elements_by_class_name('contenu')
    temp = {}
    fos = ''   # accumulated "Fields of study"
    fos1 = ''  # accumulated "Job title"
    temp.update({'Title': title, 'Detailed Title': title_detailed, 'Country': cntry_name})
    for i in range(len(t1)):
        if t1[i].text == '' or t1[i].text == 'Address':
            continue
        else:
            value = t2[i].text
            temp.update({t1[i].text: value.replace('\n', ',')})
    for j in range(len(content)):
        if labels[j].text in fields:
            if labels[j].text == 'Fields of study:':
                info = content[j].text
                fos = fos + ',' + info
            elif labels[j].text == 'Job title:':
                info1 = content[j].text
                fos1 = fos1 + ',' + info1
            else:
                # strip the trailing ':' from the label to form the key
                key = labels[j].text
                temp.update({key[:-1]: content[j].text})
    temp.update({'Fields of study': fos.lstrip(','), 'Job titles': fos1.lstrip(',')})
    result.append(temp)

data = pd.DataFrame(result)
end = time.time()
print("Time taken : " + str(end - begin) + "s")
data.to_csv("WHED1.csv", index=False)
This code is what I could use, taken from a GitHub project.
would be great if i can re-create the data and save it, want this to be used as a dropdown in a web application just to make sure no mistakes written in the university studied in.
Update 1/12/22 - Async
Found a much better solution using aiohttp, it also runs the entire list of countries in ~30 seconds instead of 3 hours
"""Async whed.net scraper.

Uses Selenium once to read the country dropdown, then fetches every country's
institution list concurrently with aiohttp and writes the result to
output.json.
"""
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def main():
    """Drive the whole scrape: gather countries, fetch all, write JSON."""
    print("Init")
    driver = init_driver()
    print("Opening Homepage")
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)
    print("Gathering Countries")
    countries = get_countries(driver)
    driver.quit()
    print("Scraping")
    start = time.time()
    institution_list = asyncio.run(fetch_all(countries))
    print("Writing out")
    # context manager guarantees the file is closed even on a write error
    with open('output.json', 'w') as f:
        f.write(json.dumps(institution_list))
    end = time.time()
    print(f"Total time: {end - start}s")


def init_driver():
    """Create and return a headless Chrome webdriver."""
    chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
    return driver


def get_countries(driver):
    """Return the list of country values from the Chp1 dropdown (placeholder dropped)."""
    select = Select(driver.find_element(By.ID, "Chp1"))
    countries = list(map(lambda c: c.get_attribute('value'), select.options))
    countries.pop(0)  # first option is the "Select" placeholder
    return countries


def extract_institutions(html, country):
    """Parse one results page; return a dict with country, count and records.

    Returns an empty list when the page reports no results (kept as-is for
    backward compatibility with callers that truth-test the return value).
    """
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('p', {'class': 'infos'}).text
    print(str(page))
    # The info line starts either with a number or with the word "No".
    number_of_institutions = str(page).split()[0]
    if number_of_institutions == 'No':
        print(f"No results for {country}")
        return []
    results = []
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
    return {
        'country': country,
        'count': number_of_institutions,
        'records': results
    }


async def get_institutions(country, session):
    """POST the search form for one country and parse the response.

    Returns None on failure (the exception is logged, not re-raised).
    """
    try:
        async with session.post(
            url='https://www.whed.net/results_institutions.php',
            data={"Chp1": country, "nbr_ref_pge": 10000}
        ) as response:
            html = await response.read()
            print(f"Successfully got {country}")
            return extract_institutions(html, country)
    except Exception as e:
        print(f"Unable to get {country} due to {e.__class__}.")


async def fetch_all(countries):
    """Fetch every country's results concurrently over one shared session."""
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*[get_institutions(country, session) for country in countries])


# Main call
main()
Old answer using synchronous algorithm
Improving on #Mithun's answer since it doesn't really work as it'll be stuck on the same page.
Also added direct access to the name and url to make it easier in case you want to access those.
# Synchronous whed.net scraper for a single country: search, then page through
# the results collecting name/url/country for every institution.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)

print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)

print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)

print("Searching")
# NOTE: the original "#value" was a scrape artifact -- XPath attribute syntax is "@value".
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)

print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# info line begins with the total number of results
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10  # results shown so far (10 per page)
results = []
while True:
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
    print(f'{len(results)}/{number_of_pages}')
    if counter >= int(number_of_pages):
        break
    counter += 10
    driver.find_element(By.LINK_TEXT, "Next page").click()
    time.sleep(0.5)
    # re-parse the freshly loaded page before the next iteration
    soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.quit()
print(results)
You can use Selenium to scrape data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape for other countries as well using Loop or entering the name manually. If you need the field of study for every university, you can scrape its href using bs4 and its field of study.
# Scrape university names for "United States of America (all)" from whed.net.
import time  # needed for time.sleep below (missing from the original snippet)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)

select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)
# NOTE: the original "#value" was a scrape artifact -- XPath attribute syntax is "@value".
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]  # total result count from the info line
counter = 10  # results shown so far (10 per page)
while counter < int(number_of_pages):
    # Re-parse the current page each iteration; the original reused the first
    # page's soup and therefore printed the same results on every page.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    raw = soup.find_all('div', {'class': 'details'})
    for i in raw:
        i = (str(i.text).lstrip())
        i = i.replace("\n", "")
        i = i.replace("\r", "")
        i = i.replace("\t", "")
        print(i)
    # .click() returns None, so there is nothing useful to assign
    driver.find_element(By.LINK_TEXT, "Next page").click()
    counter += 10
driver.quit()

scraping data after click on interactive code

I want to scrape the prices of every hotel from a tourist site. I am extracting the names and arrangements, but the problem is that the prices only show up after clicking the arrangements, and I don't know how to deal with that.
the out put i want to get :
{' Julius ': [('Petit Déjeuner', '216'),('Demi pension','264')]}
I put at your disposal my code if any of you can help me and thank you in advance.
#!/usr/bin/env python
# coding: utf-8
"""Search tn.tunisiebooking.com and collect, per hotel: name, arrangement
labels, per-arrangement totals and the displayed headline price."""
import json
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select

# create path and start webdriver
# Raw string: avoids relying on "\c" not being an escape sequence.
PATH = r"C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)

# first get website
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)

# search parameters
params = {
    'destination': 'El Jem',
    'date_from': '08/08/2021',
    'date_to': '09/08/2021',
    'bedroom': '1'
}

# select destination
destination_select = Select(driver.find_element_by_id('ville_des'))
destination_select.select_by_value(params['destination'])

# select bedroom
bedroom_select = Select(driver.find_element_by_id('select_ch'))
bedroom_select.select_by_value(params['bedroom'])

# select dates (set the inputs directly via JS to bypass the datepicker)
script = f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('checkin').value ='{params['date_to']}';"
driver.execute_script(script)

# click search button
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)

# ----------------------------------------------------------------------------
# get list of all hotels
# NOTE: the original "#class" / "#id" were scrape artifacts -- XPath attribute
# syntax is "@class" / "@id".
hotels_list = []
hotels_objects = driver.find_elements_by_xpath(
    '//div[contains(@class, "enveloppe_produit")]'
)
for hotel_obj in hotels_objects:
    # headline price (newline stripped)
    price_object = hotel_obj.find_element_by_xpath(
        './/div[@class="monaieprix"]'
    )
    price_value = price_object.find_element_by_xpath(
        './/div[1]'
    ).text.replace('\n', '')
    # title element
    title_data = hotel_obj.find_element_by_xpath(
        './/span[contains(@class, "tittre_hotel")]'
    )
    # arrangement labels
    arrangements_obj = hotel_obj.find_elements_by_xpath(
        './/div[contains(@class, "angle")]//u'
    )
    arrangements = [ao.text for ao in arrangements_obj]
    # per-arrangement totals
    prixM_obj = hotel_obj.find_elements_by_xpath(
        './/div[contains(@id, "prixtotal")]'
    )
    prixM = [ao.text for ao in prixM_obj]
    # create new record
    hotels_list.append({
        'name': title_data.find_element_by_xpath('.//a//h3').text,
        'arrangements': arrangements,
        'prixM': prixM,
        'price': f'{price_value}'
    })

# ----------------------------------------------------------------
import pandas as pd

df = pd.DataFrame(hotels_list, columns=['name', 'arrangements', 'price'])
df.head()
It seems that the DOM keeps changing. So based on the answers from this question and StaleElementReferenceException, below code might be useful for you.
# For each hotel card: record the active arrangement's price, then click every
# inactive arrangement tab and record its price too.
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
import time

driver = webdriver.Chrome(executable_path="path")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://tn.tunisiebooking.com/")
# Code to choose options.

hoteldata = {}  # hotel name -> list of (arrangement, price) tuples
# NOTE: the original "#id" / "#class" were scrape artifacts -- XPath attribute
# syntax is "@id" / "@class".
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair')]")
for hotel in hotels:
    name = hotel.find_element_by_tag_name("h3").text
    details = []
    # currently selected arrangement and its price
    argmts = hotel.find_element_by_class_name("angle_active").text
    prize = hotel.find_element_by_xpath(".//div[contains(@id,'prixtotal_')]").get_attribute("innerText")
    details.append((argmts, prize))
    # click through the remaining (inactive) arrangements
    inactive = hotel.find_elements_by_xpath(".//div[@class='angle_desactive']")
    for item in inactive:
        try:
            n = item.get_attribute("innerText")
            item.click()
            time.sleep(2)
            pri = hotel.find_element_by_xpath(".//div[contains(@id,'prixtotal_')]").get_attribute("innerText")
            details.append((n, pri))
        except StaleElementReferenceException:
            pass
    hoteldata[name] = details
print(hoteldata)
driver.quit()

The page doesn't scraping

I'm trying to scrape this page
https://www.vivareal.com.br/venda/pernambuco/recife/#onde=BR-Pernambuco-NULL-Recife
I scraped the first page of this website and clicked with Selenium to go to the next page, but I can only get the first page's content; when I scrape the second page, the same content as the first page comes back. I don't know how to fix this, or whether the webpage has some protection against scraping.
Could someone help me?
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from selenium import webdriver


def scrape():
    """Scrape Viva Real listings for Recife and save them to output.csv.

    NOTE(review): with cont = [True]*4 + [False], only the final False entry
    enters the scraping branch, so the body runs exactly once -- the loop
    structure is probably not what was intended and should be revisited.
    """
    cont = [True, True, True, True, False]
    for times in cont:
        if times != True:
            driver = webdriver.Firefox(executable_path='geckodriver')
            driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
            sleep(15)
            titles = []
            addresses = []
            areas = []
            rooms = []
            bathes = []
            values = []
            start_time = time()
            request = 0
            # Bug fix: driver.get() returns None, so the original
            # BeautifulSoup(page, ...) could never work -- parse the rendered
            # page source instead.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            imov = soup.find_all('div', class_='property-card__main-content')
            sleep(randint(8, 15))
            # Monitor
            request += 1
            elapsed_time = time() - start_time
            print('Request: {}; Frequency: {} requests/s'.format(request, request / elapsed_time))
            clear_output(wait=True)
            # NOTE(review): Selenium exposes no HTTP status code, so the
            # original `page.status_code` check could only crash and was removed.
            # Break the loop if the number of requests is greater than expected
            if request > 72:
                warn('Number of requests was greater than expected.')
                break
            for container in imov:
                # Título
                title = container.h2.a.get_text()
                titles.append(title.strip())
                # Endereço
                address = container.h2.span.get_text()
                addresses.append(address.strip())
                # Área
                area = container.li.span.get_text()
                areas.append(area.strip())
                # Quartos
                room = container.find(class_="property-card__detail-item property-card__detail-room js-property-detail-rooms")
                room2 = room.find('span', class_="property-card__detail-value js-property-card-value").get_text()
                rooms.append(room2.strip())
                # Banheiros
                bath = container.find(class_="property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom")
                bath2 = bath.find('span', class_="property-card__detail-value js-property-card-value").get_text()
                bathes.append(bath2.strip())
                # Valor
                value = container.section.div.get_text()
                values.append(value.strip())
            # Dataframe e salvar
            vivareal = pd.DataFrame({
                "title": titles,
                "address": addresses,
                "area": areas,
                "rooms": rooms,
                "baths": bathes,
                "value": values
            })
            vivareal.to_csv(r'output.csv')
            # NOTE: the original "#title" was a scrape artifact -- XPath
            # attribute syntax is "@title".
            prox = driver.find_element_by_xpath('//*[@title="Próxima página"]')
            prox.click()
        else:
            print('Done!')


scrape()
Although you put the click command at the end, when it goes to the next loop, the first command is to create a new driver and then is called the command to get the main page of Viva Real to Pernambuco. This is unwanted. Instead of this you could do:
def scrape():
cont = [True,True,True,True,False]
# You create the driver and access the main page only once
driver = webdriver.Firefox(executable_path = 'geckodriver')
page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
for times in cont:
if times != True:
# Wait to load every page
sleep(15)
Your code is not working as espected, even with the fixes provided by #MarceloBaliu. Here is my code that (finally!) worked for me. I'm sharing because it can help someone, like I was helped by this website.
from selenium import webdriver
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time
import pandas as pd


class ScraperVivaReal:
    """Paginating scraper for Viva Real search results; writes a CSV of listings."""

    wait_time = 5  # base wait (seconds) between page interactions

    def __init__(self, url):
        """Start a headless Firefox, open *url* and accept the cookie banner."""
        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')
        self.driver = webdriver.Firefox(options=options)
        self.driver.maximize_window()
        self.driver.get(url)
        time.sleep(self.wait_time)
        # Handling cookie acceptance
        # NOTE: the original "#id" was a scrape artifact -- XPath attribute syntax is "@id".
        WebDriverWait(self.driver, self.wait_time).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="cookie-notifier-cta"]'))).click()
        time.sleep(self.wait_time / 2)

    def __scrape_page__(self):
        """Extract every property card on the current page; return a list of dicts."""
        result = []
        # Extracting data from the page
        try:
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        except WebDriverException:
            print('Webdriver was manually quit by the user!')
            return result
        # Finding property cards containing search results
        div_list = soup.find_all('div', {'class': 'property-card__content'})
        for d in div_list:
            # Extracting info from card
            title = d.find('span', {'class': 'property-card__title js-cardLink js-card-title'}).get_text().strip()
            complete_address = d.find('span', {'class': 'property-card__address'}).get_text().strip()
            area = d.find('span', {'class': 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'}).get_text().strip()
            rooms = d.find('li', {'class': 'property-card__detail-item property-card__detail-room js-property-detail-rooms'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
            baths = d.find('li', {'class': 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
            garage = d.find('li', {'class': 'property-card__detail-item property-card__detail-garage js-property-detail-garages'}).find('span', {'class': 'property-card__detail-value js-property-card-value'}).get_text().strip()
            # Extracting the price (some cards carry no price element)
            try:
                price = d.find('div', {'class': 'property-card__price js-property-card-prices js-property-card__price-small'}).find('p').get_text().strip()
            except AttributeError:
                price = "N/I"
            # Splitting the address into its components; missing parts become 'N/I'
            add_list = re.split(',|-', complete_address)
            add_list = [item.strip() for item in add_list]
            if len(add_list) == 2:
                city, st = add_list
                neibhood = 'N/I'
                address = 'N/I'
                number = 'N/I'
            if len(add_list) == 3:
                neibhood, city, st = add_list
                address = 'N/I'
                number = 'N/I'
            if len(add_list) == 4:
                address, neibhood, city, st = add_list
                number = 'N/I'
            elif len(add_list) == 5:
                address, number, neibhood, city, st = add_list
            # Adding the result into a dictionary and appending it to the result list
            row = {'Título': title, 'Endereço': address, 'Número': number, 'Bairro': neibhood, 'Cidade': city, 'Estado': st, 'Área': area, 'Quartos': rooms, 'Banheiros': baths, 'Vagas': garage, 'Preço': price}
            result.append(row)
        return result

    def __next_page__(self):
        """Click "Next page"; return True on success, False when paging is done."""
        try:
            # Bug fix: the element lookup must live inside the try block,
            # otherwise the NoSuchElementException handler below is dead code.
            next_element = self.driver.find_element_by_xpath('//*[@title="Próxima página"]')
            next_element.click()
            time.sleep(self.wait_time)
            return True
        # Treating some exceptions (element not found and element not clickable)
        except ElementClickInterceptedException:
            print('"Próxima Página" element is not clickable!')
        except NoSuchElementException:
            print('"Próxima Página" element not found!')
        return False

    def run(self, output):
        """Scrape page after page until exhausted, then write *output* as CSV."""
        has_next = True
        final_result = []
        # Getting the information!
        while has_next:
            results = self.__scrape_page__()
            final_result.extend(results)
            print('Got {} results! Total Found: {}'.format(len(results), len(final_result)))
            if len(results) == 0:
                break
            has_next = self.__next_page__()
        # Quitting Firefox
        self.driver.quit()
        # Exporting results to CSV
        df = pd.DataFrame(final_result)
        df.to_csv(output, sep=',')


S = ScraperVivaReal('https://www.vivareal.com.br/venda/sp/paulinia/')
S.run('output.csv')

Shifting pythone code written in selenium to scrapy or requests

I have return code in selenium. It works fine. It scraps the portal and extracts the data in table. But now I am trying to shift either to scrapy or requests.
I tried learning both and failed miserably. The Selenium structure fits my mind, but it would take me a long time to understand the basics of Requests or Scrapy and then use them. The shortcut is to get some tips on how to do it directly, in connection with the present code.
Why am I shifting? -
I posted the code to seek suggestions for refactoring the code (here). Two of the comments have suggested me to shift to requests. That has triggered the effort. Then after some primary search I realized, I can avoid selenium and requests or scrappy can save huge time for me.
I checked here. But that dose not solve my issue.
Can someone help with this? Thanks in advance.
The code (including URL) -
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, \
TimeoutException, StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from FIR_logging import logger
import os
import time
import pandas as pd
# base function
def get_url(some_url):
while True:
try:
driver.get(some_url)
break
except WebDriverException:
time.sleep(60)
continue
driver.refresh()
# Some constants:
URL = r'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'
options = FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--private-window")
driver = webdriver.Firefox(options=options)
get_url(URL)
time.sleep(10)
Download_Directory = r'/some_directory/raw_footage7'
COLUMNS = ['Sr.No.', 'State', 'District', 'Police Station', 'Year', 'FIR No.', 'Registration Date', 'FIR No',
'Sections']
ALL_Districts = ['AKOLA', 'AMRAVATI CITY', 'AMRAVATI RURAL', 'AURANGABAD CITY',
'AURANGABAD RURAL', 'BEED', 'BHANDARA', 'BRIHAN MUMBAI CITY', 'BULDHANA',
'CHANDRAPUR', 'DHULE', 'GADCHIROLI', 'GONDIA', 'HINGOLI', 'JALGAON', 'JALNA',
'KOLHAPUR', 'LATUR', 'NAGPUR CITY', 'NAGPUR RURAL', 'NANDED', 'NANDURBAR',
'NASHIK CITY', 'NASHIK RURAL', 'NAVI MUMBAI', 'OSMANABAD', 'PALGHAR', 'PARBHANI',
'PIMPRI-CHINCHWAD', 'PUNE CITY', 'PUNE RURAL', 'RAIGAD', 'RAILWAY AURANGABAD',
'RAILWAY MUMBAI', 'RAILWAY NAGPUR', 'RAILWAY PUNE', 'RATNAGIRI', 'SANGLI', 'SATARA',
'SINDHUDURG', 'SOLAPUR CITY', 'SOLAPUR RURAL', 'THANE CITY', 'THANE RURAL', 'WARDHA',
'WASHIM', 'YAVATMAL']
# other functions
def district_selection(name):
dist_list = Select(driver.find_element_by_css_selector(
"#ContentPlaceHolder1_ddlDistrict"))
dist_list_options = dist_list.options
names = [o.get_attribute("text")
for o in dist_list.options if o.get_attribute("text") not in (
'Select')]
if name not in names:
logger.info(f"{name} is not in list")
return False
dist_list.select_by_visible_text(name)
time.sleep(8)
def enter_date(date):
# enters start as well as end dates with "action chains."
WebDriverWait(driver, 160).until(
EC.presence_of_element_located((By.CSS_SELECTOR,
'#ContentPlaceHolder1_txtDateOfRegistrationFrom')))
from_date_field = driver.find_element_by_css_selector(
'#ContentPlaceHolder1_txtDateOfRegistrationFrom')
to_date_field = driver.find_element_by_css_selector(
'#ContentPlaceHolder1_txtDateOfRegistrationTo')
ActionChains(driver).click(from_date_field).send_keys(
date).move_to_element(to_date_field).click().send_keys(
date).perform()
logger.info(f'date entered: {date}')
def search():
driver.find_element_by_css_selector('#ContentPlaceHolder1_btnSearch').click()
def number_of_records():
    """Read the record-count label and convert it to an int.

    Returns the count when it is non-zero, False when it is zero, and
    None (implicitly) when the label never becomes readable within ~18
    attempts.  The caller treats any falsy value as "skip this search".
    """
    time_counter = 1
    while time_counter < 19:
        try:
            records_number = driver.find_element_by_css_selector(
                '#ContentPlaceHolder1_lbltotalrecord').text
            if records_number == '':
                # BUG FIX: the counter was not advanced on this path, so a
                # label that stayed empty spun this loop forever.
                time_counter += 1
                time.sleep(1)
                continue
            records_number = int(records_number)
            if records_number != 0:
                logger.info(f"{district}: {records_number}")
                return records_number
            logger.info(f"no records # {district}")
            return False
        except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
            logger.info("page is not loaded")
            time_counter += 1
            time.sleep(1)  # brief pause instead of hammering the DOM
            continue
def extract_table_current(name, single):
    """Parse the results table on the current page and append each data row
    (first 9 cells, as text) to the list *single*.

    Retries the parse while the table has not rendered yet; gives up after
    15 attempts and returns without touching *single*.
    """
    soup = BS(driver.page_source, 'html.parser')
    main_table = soup.find("table", {"id": "ContentPlaceHolder1_gdvDeadBody"})
    time_counter = 1
    while main_table is None:
        if time_counter >= 16:
            logger.info(f"the table did not load # {name}."
                        f"stopped trying")
            return
        logger.info(f"the table did not load # {name}")
        time_counter += 1
        # BUG FIX: re-fetch and re-parse the page on every retry; the
        # original re-tested the same stale soup, so the loop could never
        # succeed once it had failed.
        time.sleep(1)
        soup = BS(driver.page_source, 'html.parser')
        main_table = soup.find("table", {"id": "ContentPlaceHolder1_gdvDeadBody"})
    links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
    rows = main_table.find_all("tr")
    # BUG FIX: find_elements returns a (possibly empty) list, never None, so
    # the original ``is None`` branch was unreachable and the last two rows
    # were dropped even when there was no pager.  Also removed the accidental
    # time.sleep(8) per row — the soup is already parsed and static, so those
    # sleeps only added hours of dead time.
    if not links_for_pages:
        for row in rows:
            if '...' not in row.text:
                cells = row.find_all('td')[0:9]  # drop the last column
                single.append([cell.text for cell in cells])
    else:
        # The last two rows hold the pager, not data — skip them.
        for row in rows[0:len(rows) - 2]:
            cells = row.find_all('td')[0:9]  # drop the last column
            single.append([cell.text for cell in cells])
def next_page(name, data):
    """Visit every pager link on the current results and harvest each page's
    table into *data*.  Returns False when there is no pager at all
    (otherwise falls through, returning None)."""
    try:
        driver.find_element_by_css_selector('.gridPager a')
    except NoSuchElementException:
        return False
    page_count = len(driver.find_elements_by_css_selector('.gridPager a'))
    for idx in range(page_count):
        # Re-query the links on every pass to dodge stale-element exceptions.
        current_links = driver.find_elements_by_css_selector('.gridPager a')
        link = current_links[idx]
        if link.text == '...':
            continue  # '...' is the next-page-slot link, not a page number
        link.click()
        # if this can be replaced with some other wait method to save the time
        time.sleep(8)
        extract_table_current(name, data)
def second_page_slot():
    """Click the '...' pager link that jumps to the next page slot (e.g.
    page 11).  Returns False when no such link exists."""
    try:
        driver.find_element_by_link_text('...').click()
    except NoSuchElementException:
        return False
# main code
# Accumulator for scraped rows (NOTE(review): never reset between districts
# below, so later CSVs repeat earlier rows — confirm whether intended).
page_data = []
time.sleep(5)
# Show 50 records per page to minimise pagination.
view = Select(driver.find_element_by_css_selector(
'#ContentPlaceHolder1_ucRecordView_ddlPageSize'))
view.select_by_value('50')
# Close this session; a fresh driver is opened per date inside the main loop.
driver.close()
# Scrape June 2020, day by day, for every district; one CSV per day.
for district in ALL_Districts:
    month = "06"
    year = "2020"
    district_directory = os.path.join(Download_Directory, f'{district}{month}{year}')
    if not os.path.exists(district_directory):
        os.mkdir(district_directory)
    for day in range(1, 30):
        # Reopen the browser each day to wipe out the cache.
        options = FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument("--private-window")
        driver = webdriver.Firefox(options=options)
        try:
            get_url(URL)
            # BUG FIX: reset the accumulator for every run; previously
            # page_data was filled once at module level, so every CSV
            # repeated all rows scraped for earlier days and districts.
            page_data = []
            # Zero-pad days 1-9 so the date string is always ddmmyyyy.
            day_str = f'{day:02d}'
            date_from = day_str + month + year
            enter_date(date_from)
            # select district
            district_selection(district)
            time.sleep(3)
            # start the search
            search()
            time.sleep(7)
            if not number_of_records():
                continue
            extract_table_current(district, page_data)
            time.sleep(3)
            # next_page() harvests any remaining pager pages into page_data.
            # It only ever returns falsy values (False = no pager, None =
            # pages processed), so the CSV is always written afterwards —
            # the original's post-"if not" branch was dead code.
            next_page(district, page_data)
            district_data = pd.DataFrame(page_data, columns=COLUMNS)
            district_data.to_csv(os.path.join(
                district_directory, f'{district}{day_str}{month}{year}.csv'))
        finally:
            # BUG FIX: the original leaked a Firefox instance whenever a
            # `continue` skipped the trailing driver.close().
            driver.close()
Requests is a very nice and simple, yet powerful, package. Once you have learned it you will be grateful :) You can use Requests to navigate around the page and sometimes even to log in or send messages.
I don't know Scrapy, but I have been using BeautifulSoup a lot and that one is fairly simple to learn as well: you just get the "soup" of data from Requests and then you use BeautifulSoup to filter your data.
My recommendation for you is to start from scratch, just one step at a time.
Start by getting your page and then get your data little by little :)
# Fetch the page once with requests, then hand the HTML to BeautifulSoup.
page = requests.get('https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx')
soup = BeautifulSoup(page.text, 'lxml')

how to scrape information from a directory with Selenium

scraping contact information from the directory site
I am scraping contact information from the directory site.
this is not a link
I need to scrape with Selenium. It takes 3 steps:
1. get the company URLs from the website.
2. get all company URLs from the next page / all pages.
3. scrape all contact information, such as company name, website, email, etc.
The code is below, but I face two problems.
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
# Collect the company-profile URLs from the listing pages, then visit each
# profile and scrape its contact details into a DataFrame.
# NOTE: the '@' signs in the XPath expressions were garbled to '#' in the
# original paste; '@href' / '@id' is the valid XPath attribute syntax.
results = list()
driver = webdriver.Chrome('D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2
for page_index in range(1, MAX_PAGE_NUM):
    url = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + str(page_index)
    driver.get(url)
    sleep(5)
    sel = Selector(text=driver.page_source)
    companies = sel.xpath('//*[@id="categorypagehtml"]/div[1]/div[7]/ul/li/b//@href').extract()
    for company_path in companies:
        print(company_path)
        results.append(company_path)
print('---')
# BUG FIX: `records` was re-created inside the loop, so the DataFrame only
# ever contained the last company.  Create it once, append per company, and
# build the DataFrame after the loop.
records = []
for result in results:
    url1 = "http://www.arabianbusinesscommunity.com" + result
    print(url1)
    driver.get(url1)
    sleep(5)
    sel = Selector(text=driver.page_source)
    name = sel.css('h2::text').extract_first()
    country = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
    if country:
        country = country.strip()
    web = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/@href').extract_first()
    email = sel.xpath('//a[contains(@href, "mailto:")]/@href').extract_first()
    records.append((web, email, country, name))
df = pd.DataFrame(records, columns=['web', 'email', 'country', 'name'])
I wrote the code as above, but I have two problems:
1. I can only get the last company's information.
2. On each iteration of the loop, the computer always clicks all the URLs that were clicked before.
Can anyone help solve these problems?
Here is code to get all company details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Walk every listing page, gather all company-profile URLs via JS, then open
# each profile and scrape its details.
# NOTE: the '@' signs in the XPath expressions were garbled to '#' in the
# original paste; '@class' is the valid XPath attribute syntax.
driver = webdriver.Chrome()
baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)
wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))
# Get last page number from the skip-to-last pager link (its href ends /N).
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])
# Get all URLs for the first page and save them in companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)
# BUG FIX: range(2, lastPageNum) skipped the final listing page — range's
# stop bound is exclusive, so it must be lastPageNum + 1.
for i in range(2, lastPageNum + 1):
    driver.get(baseUrl + "/" + str(i))
    companyUrls.extend(driver.execute_script(js))
# Open each company page and get all details
companies = []
for url in companyUrls:
    driver.get(url)
    company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
    name = company.find_element_by_css_selector("h2").text
    email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
    website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
    phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
    fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
    country = company.find_element_by_xpath(".//li[@class='location']/span[last()]").text.replace(",", "").strip()
    address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[@class='location']/span[position() != last()]")])
    # BUG FIX: the scraped fields were computed and then discarded; collect
    # them so the `companies` list actually holds the results.
    companies.append({'name': name, 'email': email, 'website': website,
                      'phone': phone, 'fax': fax, 'country': country,
                      'address': address})

Categories