Scraping data after clicking an interactive element - Python

I want to scrape the price of every hotel on a tourist site. I can already extract the names and arrangements, but the prices only show up after clicking on an arrangement, and I don't know how to deal with that.
The output I want to get:
{'Julius': [('Petit Déjeuner', '216'), ('Demi pension', '264')]}
My code is below; thank you in advance to anyone who can help.
#!/usr/bin/env python
# coding: utf-8
import json
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select

# create path and start webdriver
PATH = "C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)

# first get website
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)

# params to select
params = {
    'destination': 'El Jem',
    'date_from': '08/08/2021',
    'date_to': '09/08/2021',
    'bedroom': '1'
}

# select destination
destination_select = Select(driver.find_element_by_id('ville_des'))
destination_select.select_by_value(params['destination'])

# select bedroom
bedroom_select = Select(driver.find_element_by_id('select_ch'))
bedroom_select.select_by_value(params['bedroom'])

# select dates
script = f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('checkin').value ='{params['date_to']}';"
driver.execute_script(script)

# click search button
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)

# click details button
# btn_plus = driver.find_element_by_id('plus_res')
# btn_plus.click()
# sleep(10)

# ----------------------------------------------------------------------------
# get list of all hotels
hotels_list = []
hotels_objects = driver.find_elements_by_xpath(
    '//div[contains(@class, "enveloppe_produit")]'
)
for hotel_obj in hotels_objects:
    # get price object
    price_object = hotel_obj.find_element_by_xpath(
        './/div[@class="monaieprix"]'
    )
    price_value = price_object.find_element_by_xpath(
        './/div[1]'
    ).text.replace('\n', '')
    # get title data
    title_data = hotel_obj.find_element_by_xpath(
        './/span[contains(@class, "tittre_hotel")]'
    )
    # get arrangements
    arrangements_obj = hotel_obj.find_elements_by_xpath(
        './/div[contains(@class, "angle")]//u'
    )
    arrangements = [ao.text for ao in arrangements_obj]
    # get arrangement prices
    prixM_obj = hotel_obj.find_elements_by_xpath(
        './/div[contains(@id, "prixtotal")]'
    )
    prixM = [ao.text for ao in prixM_obj]
    # create new object
    hotels_list.append({
        'name': title_data.find_element_by_xpath('.//a//h3').text,
        'arrangements': arrangements,
        'prixM': prixM,
        'price': f'{price_value}'
    })

# ----------------------------------------------------------------
# for hotel in hotels_list:
#     print(json.dumps(hotel, indent=4))
import pandas as pd

df = pd.DataFrame(hotels_list, columns=['name', 'arrangements', 'price'])
df.head()

It seems that the DOM keeps changing. So, based on the answers to this question and on StaleElementReferenceException, the code below might be useful for you.
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
import time

driver = webdriver.Chrome(executable_path="path")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://tn.tunisiebooking.com/")

# Code to choose options.

hoteldata = {}
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair')]")
for hotel in hotels:
    name = hotel.find_element_by_tag_name("h3").text
    details = []
    argmts = hotel.find_element_by_class_name("angle_active").text
    prize = hotel.find_element_by_xpath(".//div[contains(@id,'prixtotal_')]").get_attribute("innerText")
    details.append((argmts, prize))
    inactive = hotel.find_elements_by_xpath(".//div[@class='angle_desactive']")
    for item in inactive:
        try:
            n = item.get_attribute("innerText")
            item.click()
            time.sleep(2)
            pri = hotel.find_element_by_xpath(".//div[contains(@id,'prixtotal_')]").get_attribute("innerText")
            details.append((n, pri))
        except StaleElementReferenceException:
            pass
    hoteldata[name] = details
print(hoteldata)
driver.quit()
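As a side note, if the fixed sleep(10) after clicking the search button in the question's script ever proves flaky, an explicit wait on the same result containers this answer locates is a common alternative. A minimal sketch of my own, assuming the //div[starts-with(@id,'produit_affair')] XPath used above still matches the result cards:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait (up to 20 s) until at least one hotel result card is present,
# instead of sleeping for a fixed amount of time after the search.
wait = WebDriverWait(driver, 20)
hotels = wait.until(
    EC.presence_of_all_elements_located(
        (By.XPATH, "//div[starts-with(@id,'produit_affair')]")
    )
)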

Related

How to extract data from an element list in Python

I am working on a project that has really blocked me. I often ask questions here and you have helped me a lot, since I am still a beginner. My project consists of building a competitive watch table of hotel rates for an agency; it is a painful manual task that I wanted to automate. I have managed to extract the rates and their prices, but the problem is that I want it to give me only the selected room.
I am providing the code and the output; I removed the data that I want to eliminate from my output, and I have also added images to clarify things. Thanks in advance to anyone who can help.
NB: thanks to pmadhu's answer the problem is solved, but now it shows me the same rates for all hotels.
#!/usr/bin/env python
# coding: utf-8
import json
import time
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException

# create path and start webdriver
PATH = "C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)

# first get website
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)

# params to select
params = {
    'destination': 'Nabeul',
    'date_from': '24/08/2021',
    'date_to': '25/08/2021',
    'bedroom': '1'
}

# select destination
destination_select = Select(driver.find_element_by_id('ville_des'))
destination_select.select_by_value(params['destination'])

# select bedroom
bedroom_select = Select(driver.find_element_by_id('select_ch'))
bedroom_select.select_by_value(params['bedroom'])

# select dates
script = f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('checkin').value ='{params['date_to']}';"
driver.execute_script(script)

# submit form
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)

# ----------------------------------------------------------------------------
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

urls = []
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair')]")
for hotel in hotels:
    link = hotel.find_element_by_xpath(".//span[@class='tittre_hotel']/a").get_attribute("href")
    urls.append(link)

for url in urls:
    driver.get(url)
    try:
        name = driver.find_element_by_xpath("//div[@class='bloc_titre_hotels']/h2").text
        arropt = driver.find_element_by_xpath("//div[contains(@class,'line_result')][1]")
        opt = arropt.find_element_by_tag_name("b").text
        num = len(arropt.find_elements_by_tag_name("option"))
        optiondata = {}
        achats = {}
        marges = {}
        selection = Select(driver.find_element_by_id("arrangement"))
        for i in range(num):
            try:
                selection = Select(driver.find_element_by_id("arrangement"))
                selection.select_by_index(i)
                time.sleep(2)
                arr = driver.find_element_by_xpath("//select[@id='arrangement']/option[@selected='selected']").text
                prize = driver.find_element_by_id("prix_total").text
                optiondata[arr] = prize
                btn_passe = driver.find_element_by_xpath('//*[@id="resultat"]/div/form/div/div[2]/div[1]/div[2]/div[2]/div/div')
                btn_passe.click()
                sleep(2)
                # params to select
                params = {
                    'civilite_acheteur': 'Mlle',
                    'prenom_acheteur': 'test',
                    'nom_acheteur': 'test',
                    'e_mail_acheteur': 'test@gmail.com',
                    'portable_acheteur': '22222222'
                }
                # select civilite
                civilite_acheteur = Select(driver.find_element_by_id('civilite_acheteur'))
                civilite_acheteur.select_by_value(params['civilite_acheteur'])
                # enter buyer details
                script = f"document.getElementById('prenom_acheteur').value ='{params['prenom_acheteur']}';"
                script += f"document.getElementById('nom_acheteur').value ='{params['nom_acheteur']}';"
                script += f"document.getElementById('e_mail_acheteur').value ='{params['e_mail_acheteur']}';"
                script += f"document.getElementById('portable_acheteur').value ='{params['portable_acheteur']}';"
                driver.execute_script(script)
                # submit form
                btn_rechercher = driver.find_element_by_id('titre_Hammamet')
                btn_rechercher.click()
                sleep(2)
                btn_rechercher = driver.find_element_by_id('boutonr')
                btn_rechercher.click()
                sleep(3)
                achat = driver.find_element_by_xpath('/html/body/header/div[2]/div[1]/div[1]/div[4]/div[2]/div[2]').text.replace(' TND', '')
                achats[arr] = achat
                marge = int(((float(prize) - float(achat)) / float(achat)) * 100)
                marges[arr] = marge
                optiondata[arr] = prize, achat, marge
                driver.get(url)
                btn_passe = driver.find_element_by_xpath('//*[@id="moteur_rech"]/form/div/div[3]/div')
                btn_passe.click()
                sleep(2)
            except StaleElementReferenceException:
                pass
    except NoSuchElementException:
        pass
    print("{} : {} - {}".format(name, opt, optiondata))
Try below code once:
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

urls = []  # collect the hotel detail-page links
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair')]")
for hotel in hotels:
    link = hotel.find_element_by_xpath(".//span[@class='tittre_hotel']/a").get_attribute("href")
    urls.append(link)

for url in urls:
    driver.get(url)
    try:
        name = driver.find_element_by_xpath("//div[@class='bloc_titre_hotels']/h2").text
        arropt = driver.find_element_by_xpath("//div[contains(@class,'line_result')][1]")
        opt = arropt.find_element_by_tag_name("b").text
        num = len(arropt.find_elements_by_tag_name("option"))
        optiondata = {}
        selection = Select(driver.find_element_by_id("arrangement"))
        for i in range(num):
            try:
                selection = Select(driver.find_element_by_id("arrangement"))
                selection.select_by_index(i)
                time.sleep(2)
                arr = driver.find_element_by_xpath("//select[@id='arrangement']/option[@selected='selected']").text
                prize = driver.find_element_by_id("prix_total").text
                optiondata[arr] = prize
            except StaleElementReferenceException:
                pass
    except NoSuchElementException:
        pass
    print("{} : {} - {} - {}".format(name, opt, num, optiondata))
And the output:
Tui Blue Scheherazade Sousse : Double Standard Vue Mer - 1 - {'Demi Pension': '114'}
Golf Residence GAS Sousse : Double--Standard - 2 - {'Demi Pension': '51', 'Petit Dejeuner': '42'}
Sindbad Center GAS Sousse : Chambre Double - 2 - {'Petit Dejeuner': '27', 'Logement seul': '22'}

Web scraping shopee.sg with Selenium and BeautifulSoup in Python

Whenever I try to scrape shopee.sg using Selenium and BeautifulSoup, I am not able to extract all the data from a single page.
For example, for a search result consisting of 50 products, information on the first 15 is extracted while the remaining ones give null values.
I know this has something to do with the scroller, but I have no idea how to make it work. Any idea how to fix this?
Code as of now:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup  # needed for the parsing below
from time import sleep
import csv

# create object for chrome options
chrome_options = Options()
# base_url = 'https://shopee.sg/search?keyword=disinfectant'
# set chrome driver options to disable any popups from the website
# to find the local path for the chrome profile, open the chrome browser
# and in the address bar type "chrome://version"
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('start-maximized')
# chrome_options.add_argument('user-data-dir=C:\\Users\\username\\AppData\\Local\\Google\\Chrome\\User Data\\Default')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")
# Pass the argument 1 to allow and 2 to block
chrome_options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 2
})

def get_url(search_term):
    """Generate a URL from the search term"""
    template = "https://www.shopee.sg/search?keyword={}"
    search_term = search_term.replace(' ', '+')
    # add term query to url
    url = template.format(search_term)
    # add page query placeholder
    url += '&page={}'
    return url

def main(search_term):
    # invoke the webdriver
    driver = webdriver.Chrome(options=chrome_options)
    item_cost = []
    item_name = []
    url = get_url(search_term)
    for page in range(0, 3):
        driver.get(url.format(page))
        delay = 5  # seconds
        try:
            WebDriverWait(driver, delay)
            print("Page is ready")
            sleep(5)
            html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
            # print(html)
            soup = BeautifulSoup(html, "html.parser")
            # find the product description
            for item_n in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
                try:
                    description_soup = item_n.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
                    name = description_soup.text.strip()
                except AttributeError:
                    name = ''
                print(name)
                item_name.append(name)
            # find the price of items
            for item_c in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
                try:
                    price_soup = item_c.find('div', {'class': 'WTFwws _1lK1eK _5W0f35'})
                    price_final = price_soup.find('span', {'class': '_29R_un'})
                    price = price_final.text.strip()
                except AttributeError:
                    price = ''
                print(price)
                item_cost.append(price)
        except TimeoutException:
            print("Loading took too much time! - Try again")
            sleep(5)
    rows = zip(item_name, item_cost)
    with open('shopee_item_list.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Product Description', 'Price'])
        writer.writerows(rows)
The issue is that the products you are trying to scrape load dynamically as you scroll down the page. There may be more elegant solutions than mine, but I implemented a simple JavaScript scroller using driver.execute_script (additional resource: https://www.geeksforgeeks.org/execute_script-driver-method-selenium-python).
The scroller below scrolls by a tenth of the page's height, pauses for 500 milliseconds, and then continues.
driver.execute_script("""
    var scroll = document.body.scrollHeight / 10;
    var i = 0;
    function scrollit(i) {
        window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
        i++;
        if (i < 10) {
            setTimeout(scrollit, 500, i);
        }
    }
    scrollit(i);
""")
Additionally, you had two for loops, for item_n in soup.find_all(...) and for item_c in soup.find_all(...), iterating over divs of the same class. I fixed that in my code so that you can get both the price and the name of each item with a single loop.
You also had try-except statements (in case there was an AttributeError, i.e. if an element returned by soup.find was a NoneType). I simplified those into if statements, like this one:
name = item.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
if name is not None:
    name = name.text.strip()
else:
    name = ''
And finally, you were using zip on two different lists (names and prices) to write the CSV file. Instead of appending to two separate lists and zipping at the end, I build a nested list of rows inside the single loop. This saves a step, though it is optional and may not be what you need.
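In isolation the change looks like this toy comparison (dummy data of my own, just to show the shape of the two approaches):
names = ['soap', 'wipes']
prices = ['2.50', '4.00']

# zip approach: two parallel lists combined at the end
rows_from_zip = list(zip(names, prices))   # [('soap', '2.50'), ('wipes', '4.00')]

# nested-list approach: one row appended per item inside the loop
rows = []
for name, price in zip(names, prices):
    rows.append([name, price])             # [['soap', '2.50'], ['wipes', '4.00']]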
Full (updated) code
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
from time import sleep

# create object for chrome options
chrome_options = Options()
# base_url = 'https://shopee.sg/search?keyword=disinfectant'
# set chrome driver options to disable any popups from the website
# to find the local path for the chrome profile, open the chrome browser
# and in the address bar type "chrome://version"
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('start-maximized')
# chrome_options.add_argument('user-data-dir=C:\\Users\\username\\AppData\\Local\\Google\\Chrome\\User Data\\Default')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")
# Pass the argument 1 to allow and 2 to block
chrome_options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 2
})

def get_url(search_term):
    """Generate a URL from the search term"""
    template = "https://www.shopee.sg/search?keyword={}"
    search_term = search_term.replace(' ', '+')
    # add term query to url
    url = template.format(search_term)
    # add page query placeholder
    url += '&page={}'
    return url

def main(search_term):
    # invoke the webdriver
    driver = webdriver.Chrome(options=chrome_options)
    rows = []
    url = get_url(search_term)
    for page in range(0, 3):
        driver.get(url.format(page))
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
        driver.execute_script("""
            var scroll = document.body.scrollHeight / 10;
            var i = 0;
            function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                    setTimeout(scrollit, 500, i);
                }
            }
            scrollit(i);
        """)
        sleep(5)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
            name = item.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
            if name is not None:
                name = name.text.strip()
            else:
                name = ''
            price = item.find('div', {'class': 'WTFwws _1lK1eK _5W0f35'})
            if price is not None:
                price = price.find('span', {'class': '_29R_un'}).text.strip()
            else:
                price = ''
            print([name, price])
            rows.append([name, price])
    with open('shopee_item_list.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Product Description', 'Price'])
        writer.writerows(rows)
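The snippet defines main() but never calls it; a hypothetical invocation could look like the following (the search term is only an example, echoing the commented-out base_url above):
if __name__ == '__main__':
    # 'disinfectant' is just an example keyword; pass any product search term.
    main('disinfectant')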

Selenium window switching stops after a specific iteration

I don't know why I am getting this error. I tried handling several exceptions, but sometimes the script runs fine until a specific page and then stops, and sometimes it doesn't even start.
Here's my code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common import exceptions
from time import sleep
import pandas as pd

options = Options()

# Create our players salaries DataFrame
weekly_wages_players_dataset = pd.DataFrame(columns=['name', 'team', 'weekly wage'])

# We specify our Chrome driver path
path = "C:/Users/Al4D1N/Documents/ChromeDriver_webscraping/chromedriver.exe"
web_driver = webdriver.Chrome(options=options, executable_path=path)

# Famous championships
premier_league_url = "https://eurofootballrumours.com/premier-league-players-salaries/"
bundesliga_url = "https://eurofootballrumours.com/bundesliga-players-salaries/"
laliga_url = "https://eurofootballrumours.com/la-liga-players-salaries/"
seriea_url = "https://eurofootballrumours.com/serie-a-players-salaries/"
ligue_fr_url = "https://eurofootballrumours.com/ligue-1-players-salaries/"
championships = [premier_league_url, bundesliga_url, laliga_url, seriea_url, ligue_fr_url]

def players_info(driver, url, weekly_wages_players):
    try:
        driver.get(url)
    except exceptions.InvalidSessionIdException as e:
        print(e.message)
    sleep(3)
    # Let's get the team values
    teams = driver.find_elements_by_css_selector("h2")
    for t in teams:
        atags = t.find_elements_by_css_selector('a')
        for atag in atags:
            # In each atag, select the href
            href = atag.get_attribute('href')
            print(href)
            # Open a new window
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[1])
            driver.get(href)
            sleep(2)
            # We get the player's team
            player_team = driver.find_element_by_class_name('wp-caption-text').text
            # We get the table content since it has all player names and their weekly wages
            table_id = driver.find_element(By.TAG_NAME, "table")
            tbody = table_id.find_element(By.TAG_NAME, "tbody")
            rows = tbody.find_elements(By.TAG_NAME, "tr")
            for row in rows:
                player_name = row.find_elements(By.TAG_NAME, "td")[0].text
                player_week_salary = row.find_elements(By.TAG_NAME, "td")[1].text
                print(player_name)
                print(player_week_salary)
                # Now we store our result in our dataframe
                weekly_wages_players = weekly_wages_players.append(
                    {'name': player_name, 'team': player_team, 'weekly wage': player_week_salary}, ignore_index=True)
            # Close the tab with URL B
            driver.close()
            # Switch back to the first tab with URL A
            driver.switch_to.window(driver.window_handles[0])

# We call our function for all the championship links
for championship in championships:
    players_info(web_driver, championship, weekly_wages_players_dataset)
web_driver.close()

# We store our dataframe in an Excel file
weekly_wages_players_dataset.to_excel('Weekly_Players_Wages.xlsx', index=False)
And this is the error I get :
driver.switch_to.window(driver.window_handles[0])
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id
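One common way to make the tab handling more robust is to remember the original window handle before opening the new tab and to guard the switch back. A minimal hedged sketch of my own, built only from calls the script already uses (it is not claimed to be the root cause of the invalid session id):
# Remember the tab we started from instead of assuming it is window_handles[0].
original_handle = driver.current_window_handle

# Open the team page in a new tab and switch to it.
driver.execute_script("window.open('');")
new_handle = [h for h in driver.window_handles if h != original_handle][-1]
driver.switch_to.window(new_handle)
driver.get(href)  # href comes from the surrounding loop in the question's code

# ... scrape the table as before ...

# Close the tab and only switch back if the original handle still exists.
driver.close()
if original_handle in driver.window_handles:
    driver.switch_to.window(original_handle)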

Shifting Python code written in Selenium to Scrapy or Requests

I have working code in Selenium. It works fine: it scrapes the portal and extracts the data from the table. But now I am trying to shift to either Scrapy or Requests.
I tried learning both and failed miserably. The Selenium structure is fixed in my mind, and it will take me a long time to understand the basics of Requests or Scrapy and then use them. The shortcut is to get some tips on how to do it directly, in connection with the present code.
Why am I shifting? I posted the code to seek suggestions for refactoring it (here). Two of the comments suggested that I shift to Requests, which triggered this effort. After some initial searching I realized that I can avoid Selenium, and that Requests or Scrapy could save a huge amount of time.
I checked here, but that does not solve my issue.
Can someone help with this? Thanks in advance.
The code (including URL):
The code (including URL) -
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, \
    TimeoutException, StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from FIR_logging import logger
import os
import time
import pandas as pd

# base function
def get_url(some_url):
    while True:
        try:
            driver.get(some_url)
            break
        except WebDriverException:
            time.sleep(60)
            continue
    driver.refresh()

# Some constants:
URL = r'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'

options = FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--private-window")
driver = webdriver.Firefox(options=options)
get_url(URL)
time.sleep(10)

Download_Directory = r'/some_directory/raw_footage7'
COLUMNS = ['Sr.No.', 'State', 'District', 'Police Station', 'Year', 'FIR No.', 'Registration Date', 'FIR No',
           'Sections']

ALL_Districts = ['AKOLA', 'AMRAVATI CITY', 'AMRAVATI RURAL', 'AURANGABAD CITY',
                 'AURANGABAD RURAL', 'BEED', 'BHANDARA', 'BRIHAN MUMBAI CITY', 'BULDHANA',
                 'CHANDRAPUR', 'DHULE', 'GADCHIROLI', 'GONDIA', 'HINGOLI', 'JALGAON', 'JALNA',
                 'KOLHAPUR', 'LATUR', 'NAGPUR CITY', 'NAGPUR RURAL', 'NANDED', 'NANDURBAR',
                 'NASHIK CITY', 'NASHIK RURAL', 'NAVI MUMBAI', 'OSMANABAD', 'PALGHAR', 'PARBHANI',
                 'PIMPRI-CHINCHWAD', 'PUNE CITY', 'PUNE RURAL', 'RAIGAD', 'RAILWAY AURANGABAD',
                 'RAILWAY MUMBAI', 'RAILWAY NAGPUR', 'RAILWAY PUNE', 'RATNAGIRI', 'SANGLI', 'SATARA',
                 'SINDHUDURG', 'SOLAPUR CITY', 'SOLAPUR RURAL', 'THANE CITY', 'THANE RURAL', 'WARDHA',
                 'WASHIM', 'YAVATMAL']

# other functions
def district_selection(name):
    dist_list = Select(driver.find_element_by_css_selector(
        "#ContentPlaceHolder1_ddlDistrict"))
    dist_list_options = dist_list.options
    names = [o.get_attribute("text")
             for o in dist_list.options if o.get_attribute("text") not in (
                 'Select')]
    if name not in names:
        logger.info(f"{name} is not in list")
        return False
    dist_list.select_by_visible_text(name)
    time.sleep(8)

def enter_date(date):
    # enters start as well as end dates with "action chains."
    WebDriverWait(driver, 160).until(
        EC.presence_of_element_located((By.CSS_SELECTOR,
                                        '#ContentPlaceHolder1_txtDateOfRegistrationFrom')))
    from_date_field = driver.find_element_by_css_selector(
        '#ContentPlaceHolder1_txtDateOfRegistrationFrom')
    to_date_field = driver.find_element_by_css_selector(
        '#ContentPlaceHolder1_txtDateOfRegistrationTo')
    ActionChains(driver).click(from_date_field).send_keys(
        date).move_to_element(to_date_field).click().send_keys(
        date).perform()
    logger.info(f'date entered: {date}')

def search():
    driver.find_element_by_css_selector('#ContentPlaceHolder1_btnSearch').click()

def number_of_records():
    """Captures the text indicating the number of records.
    Converts it to an integer; if 0, returns and appends the district name to the list.
    If the page is not loaded, it retries for up to 15 seconds."""
    time_counter = 1
    while time_counter < 19:
        try:
            records_number = driver.find_element_by_css_selector(
                '#ContentPlaceHolder1_lbltotalrecord').text
            if records_number == '':
                time.sleep(1)
                continue
            else:
                records_number = int(records_number)
                if records_number != 0:
                    logger.info(f"{district}: {records_number}")
                    return records_number
                else:
                    logger.info(f"no records # {district}")
                    return False
        except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
            logger.info("page is not loaded")
            time_counter += 1
            continue

def extract_table_current(name, single):
    # entire table of records to be taken to the list.
    soup = BS(driver.page_source, 'html.parser')
    main_table = soup.find("table", {"id": "ContentPlaceHolder1_gdvDeadBody"})
    time_counter = 1
    while main_table is None:
        if time_counter < 16:
            logger.info(f"the table did not load # {name}")
            time_counter += 1
        else:
            logger.info(f"the table did not load # {name}."
                        f"stopped trying")
            return
    links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
    rows = main_table.find_all("tr")
    if links_for_pages is None:
        for row in rows:
            time.sleep(8)
            if '...' not in row.text:
                cells = row.find_all('td')
                cells = cells[0:9]  # drop the last column
                # store data in list
                single.append([cell.text for cell in cells])
    else:
        for row in rows[0:(len(rows)) - 2]:
            time.sleep(8)
            cells = row.find_all('td')
            cells = cells[0:9]  # drop the last column
            # store data in list
            single.append([cell.text for cell in cells])

def next_page(name, data):
    # check if any link to the next page is available
    # iterate every page.
    try:
        driver.find_element_by_css_selector('.gridPager a')
    except NoSuchElementException:
        return False
    links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
    for page in range(len(links_for_pages)):
        # new list, to bypass stale element exception
        links_for_pages_new = driver.find_elements_by_css_selector('.gridPager a')
        # do not click on link for new page slot
        if links_for_pages_new[page].text != '...':
            links_for_pages_new[page].click()
            # if this can be replaced with some other wait method to save the time
            time.sleep(8)
            extract_table_current(name, data)

def second_page_slot():
    # find the specific link for going to page 11 and click it.
    try:
        link_for_page_slot = driver.find_element_by_link_text('...')
        link_for_page_slot.click()
    except NoSuchElementException:
        return False

# main code
page_data = []
time.sleep(5)
view = Select(driver.find_element_by_css_selector(
    '#ContentPlaceHolder1_ucRecordView_ddlPageSize'))
view.select_by_value('50')
driver.close()

for district in ALL_Districts:
    b = "06"
    c = "2020"
    district_directory = os.path.join(Download_Directory, f'{district}{b}{c}')
    if not os.path.exists(district_directory):
        os.mkdir(district_directory)
    for i in range(1, 30):
        # reopening the page to wipe out the cache.
        options = FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument("--private-window")
        driver = webdriver.Firefox(options=options)
        get_url(URL)
        # entering the date and ensuring that 01 to 09 is entered correctly
        if i < 10:
            i = f'{str("0")}{str(i)}'
        date_from = str(i) + b + c
        enter_date(date_from)
        # select district
        district_selection(district)
        time.sleep(3)
        # start the search
        search()
        time.sleep(7)
        if not number_of_records():
            continue
        extract_table_current(district, page_data)
        time.sleep(3)
        if not next_page(district, page_data):
            district_data = pd.DataFrame(page_data, columns=COLUMNS)
            district_data.to_csv(os.path.join(district_directory, f'{district}{i}{b}{c}.csv'))
            continue
        extract_table_current(district, page_data)
        district_data = pd.DataFrame(page_data, columns=COLUMNS)
        district_data.to_csv(os.path.join(district_directory, f'{district}{i}{b}{c}.csv'))
        driver.close()
Requests is a very nice and simple, but powerful, package. Once you have learned it you will be grateful :) You can use Requests to navigate around a page and sometimes even to log in or send messages.
I don't know Scrapy, but I have been using BeautifulSoup a lot and it is fairly simple to learn as well: you get the "soup" of data from Requests and then use BeautifulSoup to filter it.
My recommendation is to start from scratch, one step at a time.
Start by getting your page and then extract your data little by little :)
page = requests.get('https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx')
soup = BeautifulSoup(page.text, 'lxml')
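Building on that, a minimal hedged sketch of the next step, reusing the table id from the Selenium version of the code (note this only parses whatever the plain GET returns; the search form itself may still require posting the ASP.NET form fields, which is not shown here):
import requests
from bs4 import BeautifulSoup

url = 'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

# Same table id the Selenium code looks for; it may be absent
# until the search form has actually been submitted.
table = soup.find('table', {'id': 'ContentPlaceHolder1_gdvDeadBody'})
if table is not None:
    for row in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        if cells:
            print(cells)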

How to scrape information from a directory site with Selenium

I am scraping contact information from a directory site (this is not a link). I need to scrape it with Selenium, which takes 3 steps:
1. Get the company URLs from the website.
2. Get all company URLs from the next page / all pages.
3. Scrape all contact information such as company name, website, email, etc.
The code is below, but I am facing two problems.
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

results = list()
driver = webdriver.Chrome('D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2

for i in range(1, MAX_PAGE_NUM):
    page_num = str(i)
    url = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + page_num
    driver.get(url)
    sleep(5)
    sel = Selector(text=driver.page_source)
    companies = sel.xpath('//*[@id="categorypagehtml"]/div[1]/div[7]/ul/li/b//@href').extract()
    for i in range(0, len(companies)):
        print(companies[i])
        results.append(companies[i])
        print('---')
    for result in results:
        url1 = "http://www.arabianbusinesscommunity.com" + result
        print(url1)
        driver.get(url1)
        sleep(5)
        sel = Selector(text=driver.page_source)
        name = sel.css('h2::text').extract_first()
        country = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
        if country:
            country = country.strip()
        web = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/@href').extract_first()
        email = sel.xpath('//a[contains(@href, "mailto:")]/@href').extract_first()
        records = []
        records.append((web, email, country, name))
        df = pd.DataFrame(records, columns=['web', 'email', 'country', 'name'])
I wrote the code as above, but I have two problems:
1. I can only get the last company's information.
2. On each iteration of the loop, the script visits all the URLs that were already clicked before.
Can anyone help solve these problems?
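On the first problem specifically: records and the DataFrame are re-created inside the loop, so each iteration overwrites the previous result. A minimal hedged fix (a sketch, not the full answer below) is to build the list once, append inside the loop, and create the DataFrame at the end:
records = []  # create once, before the loop over company pages
for result in results:
    # ... scrape web, email, country, name as in the question's code ...
    records.append((web, email, country, name))

# build the DataFrame once, after all companies have been visited
df = pd.DataFrame(records, columns=['web', 'email', 'country', 'name'])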
Here is code to get all company details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)

wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))

# Get last page number
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])

# Get all URLs for the first page and save them in the companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)

# Iterate through all pages and get all company URLs
for i in range(2, lastPageNum):
    driver.get(baseUrl + "/" + str(i))
    companyUrls.extend(driver.execute_script(js))

# Open each company page and get all details
companies = []
for url in companyUrls:
    driver.get(url)
    company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
    name = company.find_element_by_css_selector("h2").text
    email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
    website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
    phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
    fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
    country = company.find_element_by_xpath(".//li[@class='location']/span[last()]").text.replace(",", "").strip()
    address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[@class='location']/span[position() != last()]")])
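The snippet stops after collecting the fields. A hedged completion that actually stores each record, indented to sit inside the for url loop above (the dictionary keys are my own choice, not from the original answer), could be:
    # Collect the fields for this company; the keys are illustrative only.
    companies.append({
        'name': name,
        'email': email,
        'website': website,
        'phone': phone,
        'fax': fax,
        'country': country,
        'address': address,
    })

print(companies)
driver.quit()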
