I don't know why I'm getting this error. I've tried handling several exceptions, but sometimes the script runs smoothly until a specific page and then stops, and sometimes it doesn't even start.
Here's my code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common import exceptions
from time import sleep
import pandas as pd
options = Options()
# Create our Players Salaries Dictionary
weekly_wages_players_dataset = pd.DataFrame(columns=['name', 'team', 'weekly wage'])
# We specify our Chrome driver path
path = "C:/Users/Al4D1N/Documents/ChromeDriver_webscraping/chromedriver.exe"
web_driver = webdriver.Chrome(options=options, executable_path=path)
# Famous Championships
premier_league_url = "https://eurofootballrumours.com/premier-league-players-salaries/"
bundesliga_url = "https://eurofootballrumours.com/bundesliga-players-salaries/"
laliga_url = "https://eurofootballrumours.com/la-liga-players-salaries/"
seriea_url = "https://eurofootballrumours.com/serie-a-players-salaries/"
ligue_fr_url = "https://eurofootballrumours.com/ligue-1-players-salaries/"
championships = [premier_league_url, bundesliga_url, laliga_url, seriea_url, ligue_fr_url]
def players_info(driver, url, weekly_wages_players):
    try:
        driver.get(url)
    except exceptions.InvalidSessionIdException as e:
        print(e.message)
    sleep(3)
    # Let's get the teams values
    teams = driver.find_elements_by_css_selector("h2")
    for t in teams:
        atags = t.find_elements_by_css_selector('a')
        for atag in atags:
            # In each atag, select the href
            href = atag.get_attribute('href')
            print(href)
            # Open a new window
            driver.execute_script("window.open('');")
            driver.switch_to.window(driver.window_handles[1])
            driver.get(href)
            sleep(2)
            # We get players infos
            player_team = driver.find_element_by_class_name('wp-caption-text').text
            # We get table content since it has all players name and their weekly wages
            table_id = driver.find_element(By.TAG_NAME, "table")
            tbody = table_id.find_element(By.TAG_NAME, "tbody")
            rows = tbody.find_elements(By.TAG_NAME, "tr")
            for row in rows:
                player_name = row.find_elements(By.TAG_NAME, "td")[0].text
                player_week_salary = row.find_elements(By.TAG_NAME, "td")[1].text
                print(player_name)
                print(player_week_salary)
                # Now we store our result to our dataframe
                weekly_wages_players = weekly_wages_players.append(
                    {'name': player_name, 'team': player_team, 'weekly wage': player_week_salary}, ignore_index=True)
            # Close the tab with URL B
            driver.close()
            # Switch back to the first tab with URL A
            driver.switch_to.window(driver.window_handles[0])
# We call our function through all the championships links
for championship in championships:
    players_info(web_driver, championship, weekly_wages_players_dataset)
web_driver.close()
# We store our dataframe in an excel file
weekly_wages_players_dataset.to_excel('Weekly_Players_Wages.xlsx', index=False)
And this is the error I get:
driver.switch_to.window(driver.window_handles[0])
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id
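For context, an InvalidSessionIdException generally means a command was sent after the browser session ended, for example because ChromeDriver crashed or because driver.close() ran on the last remaining window. A defensive pattern (just a sketch, not a verified fix for this exact script) is to close a tab only when another handle is still open, and to keep a single quit() for the very end:

def close_extra_tab(driver):
    # Sketch: close the current tab only if another window handle remains,
    # then switch back to the first one; otherwise leave the session alive.
    if len(driver.window_handles) > 1:
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

# ... call close_extra_tab(web_driver) after scraping each child page ...
# web_driver.quit()  # end the session once, after all championships are done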
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
wait = WebDriverWait(driver, 20)
URL = 'https://mergr.com/firms/search/employees?page=1&firm%5BactiveInvestor%5D=2&sortColumn=employee_weight&sortDirection=asc'
driver.get(URL)
email=driver.find_element(By.CSS_SELECTOR,"input#username")
email.send_keys("timgr8#outlook.com")
password=driver.find_element(By.CSS_SELECTOR,"input#password")
password.send_keys("Cosmos1990$$$$$$$")
login=driver.find_element(By.CSS_SELECTOR,"button.btn").click()
urls=[]
product=[]
soup = BeautifulSoup(driver.page_source,"lxml")
details=soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        pass
    wev = {
        'Name': t1,
    }
    product.append(wev)
page_links =driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
    href = link.get_attribute("href")
    urls.append(href)
for url in urls:
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    data = {
        'website': website
    }
    product.append(data)
df=pd.DataFrame(product)
df.to_csv('firm.csv')
The data ends up shifted down in the CSV file, as shown in the picture. Am I appending the data the wrong way? Why is the data moving down, and where am I going wrong? Kindly point out my mistake.
I want the output in the format shown below; please suggest a solution.
You can't append wev and data separately - you need website and name in the same dictionary for pandas to know that they belong to the same row.
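For instance, pandas builds one row per dictionary and aligns values by key, so a name and a website only land on the same row if they come from the same dict (the names below are made up purely for illustration):

import pandas as pd

# Each dict is one row; keys become columns.
combined = [{'Name': 'Firm A', 'website': 'a.example'},
            {'Name': 'Firm B', 'website': 'b.example'}]
print(pd.DataFrame(combined))

# Appending names and websites as separate dicts instead
# produces alternating half-empty rows (NaN in the gaps),
# which is exactly the "data moving down" effect.
split = [{'Name': 'Firm A'}, {'Name': 'Firm B'},
         {'website': 'a.example'}, {'website': 'b.example'}]
print(pd.DataFrame(split))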
You could add the websites in a separate list like
sites = []
for url in urls:
    # driver.get...
    # soup = ....
    # try:....except:....
    data = {
        'website': website
    }
    sites.append(data)
and then zip and combine:
for pi, dictPair in enumerate(zip(product, sites)):
    product[pi].update(dictPair[1])
df = pd.DataFrame(product)
df.to_csv('firm.csv')
However, I don't think it's the best way to make sure the right Names and Websites are matched up.
You should just add to the same dictionary for each row from the start instead of zipping and merging.
added_urls = []
product = []
soup = BeautifulSoup(driver.page_source,"lxml")
details = soup.select("tbody tr")
for detail in details:
    try:
        t1 = detail.select_one("h5.profile-title a").text
    except:
        # pass # then you'll just be using the previous row's t1
        # [also, if this happens in the first loop, it will raise an error]
        t1 = 'MISSING' # '' #
    wev = {
        'Name': t1,
    }
    href = detail.select_one("h5.profile-title + p a[href]")
    if href and href.get("href", '').startswith('http'):
        wev['page_link'] = href.get("href")
        added_urls.append(href.get("href"))
    product.append(wev)
### IF YOU WANT ROWS THAT CAN'T BE CONNECTED TO NAMES ###
page_links = driver.find_elements(By.CSS_SELECTOR, "h5.profile-title + p a")
for link in page_links:
if href in added_urls: continue # skip links that are already added
href = link.get_attribute("href")
# urls.append(href)
added_urls.append(href)
product.append({"page_link": href})
##########################################################
for pi, prod in enumerate(product):
    if "page_link" not in prod or not prod["page_link"]: continue  ## missing link
    url = prod["page_link"]
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "lxml")
    try:
        website = soup.select_one("p.adress-info a[target='_blank']").text
    except:
        website = ''
    del product[pi]["page_link"]  ## REMOVE this line IF you want a page_link column in csv
    # data={'website':website}
    # product.append(data)
    product[pi]['website'] = website
df=pd.DataFrame(product)
df.to_csv('firm.csv')
I want to scrape the price of every hotel on a tourist site. I'm extracting names and arrangements, but the problem is that the prices only show up after clicking the arrangements, and I don't know how to deal with that.
The output I want to get:
{' Julius ': [('Petit Déjeuner', '216'),('Demi pension','264')]}
I've put my code below in case anyone can help; thank you in advance.
#!/usr/bin/env python
# coding: utf-8
import json
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
# create path and start webdriver
PATH = "C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)
# first get website
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)
# params to select
params = {
    'destination': 'El Jem',
    'date_from': '08/08/2021',
    'date_to': '09/08/2021',
    'bedroom': '1'
}
# select destination
destination_select = Select(driver.find_element_by_id('ville_des'))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(driver.find_element_by_id('select_ch'))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('checkin').value ='{params['date_to']}';"
driver.execute_script(script)
# click bouton search
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)
# click bouton details
#btn_plus = driver.find_element_by_id('plus_res')
#btn_plus.click()
#sleep(10)
# ----------------------------------------------------------------------------
# get list of all hotels
hotels_list = []
hotels_objects = driver.find_elements_by_xpath(
    '//div[contains(@class, "enveloppe_produit")]'
)
for hotel_obj in hotels_objects:
    # get price object
    price_object = hotel_obj.find_element_by_xpath(
        './/div[@class="monaieprix"]'
    )
    price_value = price_object.find_element_by_xpath(
        './/div[1]'
    ).text.replace('\n', '')
    # get title data
    title_data = hotel_obj.find_element_by_xpath(
        './/span[contains(@class, "tittre_hotel")]'
    )
    # get arrangements
    arrangements_obj = hotel_obj.find_elements_by_xpath(
        './/div[contains(@class, "angle")]//u'
    )
    arrangements = [ao.text for ao in arrangements_obj]
    # get prices per arrangement
    prixM_obj = hotel_obj.find_elements_by_xpath(
        './/div[contains(@id, "prixtotal")]'
    )
    prixM = [ao.text for ao in prixM_obj]
    # create new object
    hotels_list.append({
        'name': title_data.find_element_by_xpath('.//a//h3').text,
        'arrangements': arrangements,
        'prixM': prixM,
        'price': f'{price_value}'
    })
# ----------------------------------------------------------------
#for hotel in hotels_list:
# print(json.dumps(hotel, indent=4))
import pandas as pd
df = pd.DataFrame(hotels_list, columns=['name','arrangements','price'])
df.head()
It seems that the DOM keeps changing. So, based on the answers from this question and on StaleElementReferenceException, the code below might be useful for you.
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
import time
driver = webdriver.Chrome(executable_path="path")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://tn.tunisiebooking.com/")
#Code to choose options.
hoteldata = {}
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair')]")
for hotel in hotels:
    name = hotel.find_element_by_tag_name("h3").text
    details = []
    argmts = hotel.find_element_by_class_name("angle_active").text
    prize = hotel.find_element_by_xpath(".//div[contains(@id,'prixtotal_')]").get_attribute("innerText")
    details.append((argmts, prize))
    inactive = hotel.find_elements_by_xpath(".//div[@class='angle_desactive']")
    for item in inactive:
        try:
            n = item.get_attribute("innerText")
            item.click()
            time.sleep(2)
            pri = hotel.find_element_by_xpath(".//div[contains(@id,'prixtotal_')]").get_attribute("innerText")
            details.append((n, pri))
        except StaleElementReferenceException:
            pass
    hoteldata[name] = details
print(hoteldata)
driver.quit()
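If the fixed two-second sleep turns out to be flaky, one alternative (a sketch, not tested against this site) is to poll for the price to change with WebDriverWait instead, ignoring stale references while the card re-renders:

from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait

def price_after_click(driver, hotel, item, old_price, timeout=10):
    # Sketch: after clicking an arrangement, poll until the displayed total
    # differs from the previous value instead of sleeping a fixed 2 seconds.
    # Assumes the hotel card itself stays attached while its price re-renders.
    item.click()
    wait = WebDriverWait(driver, timeout,
                         ignored_exceptions=(StaleElementReferenceException,))

    def new_price(_):
        text = hotel.find_element_by_xpath(
            ".//div[contains(@id,'prixtotal_')]").get_attribute("innerText")
        return text if text and text != old_price else False

    return wait.until(new_price)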
Whenever I try to scrape shopee.sg using Selenium and BeautifulSoup, I am not able to extract all the data from a single page.
For example, for a search result consisting of 50 products, information on the first 15 gets extracted while the remaining ones give null values.
Now, I know this has something to do with the scroller, but I have no idea how to make it work. Any idea how to fix this?
Code as of now
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from time import sleep
import csv
# create object for chrome options
chrome_options = Options()
#base_url = 'https://shopee.sg/search?keyword=disinfectant'
# set chrome driver options to disable any popup's from the website
# to find local path for chrome profile, open chrome browser
# and in the address bar type, "chrome://version"
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('start-maximized')
#chrome_options.add_argument('user-data-dir=C:\\Users\\username\\AppData\\Local\\Google\\Chrome\\User Data\\Default')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")
# Pass the argument 1 to allow and 2 to block
chrome_options.add_experimental_option("prefs", {
"profile.default_content_setting_values.notifications": 2
})
def get_url(search_term):
    """Generate an url from the search term"""
    template = "https://www.shopee.sg/search?keyword={}"
    search_term = search_term.replace(' ', '+')
    # add term query to url
    url = template.format(search_term)
    # add page query placeholder
    url += '&page={}'
    return url
def main(search_term):
    # invoke the webdriver
    driver = webdriver.Chrome(options=chrome_options)
    item_cost = []
    item_name = []
    url = get_url(search_term)
    for page in range(0, 3):
        driver.get(url.format(page))
        delay = 5  # seconds
        try:
            WebDriverWait(driver, delay)
            print("Page is ready")
            sleep(5)
            html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
            # print(html)
            soup = BeautifulSoup(html, "html.parser")
            # find the product description
            for item_n in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
                try:
                    description_soup = item_n.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
                    name = description_soup.text.strip()
                except AttributeError:
                    name = ''
                print(name)
                item_name.append(name)
            # find the price of items
            for item_c in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
                try:
                    price_soup = item_c.find('div', {'class': 'WTFwws _1lK1eK _5W0f35'})
                    price_final = price_soup.find('span', {'class': '_29R_un'})
                    price = price_final.text.strip()
                except AttributeError:
                    price = ''
                print(price)
                item_cost.append(price)
        except TimeoutException:
            print("Loading took too much time! - Try again")
            sleep(5)
    rows = zip(item_name, item_cost)
    with open('shopee_item_list.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Product Description', 'Price'])
        writer.writerows(rows)
The issue was that the products you were trying to scrape load dynamically as you scroll down the page. There may be more elegant solutions than mine, but I implemented a simple JavaScript scroller using driver.execute_script (additional resource: https://www.geeksforgeeks.org/execute_script-driver-method-selenium-python).
Scroller
It scrolls by a tenth of the page's height at a time, pauses for 500 milliseconds, and then continues.
driver.execute_script("""
    var scroll = document.body.scrollHeight / 10;
    var i = 0;
    function scrollit(i) {
        window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
        i++;
        if (i < 10) {
            setTimeout(scrollit, 500, i);
        }
    }
    scrollit(i);
""")
Additionally, you had two for loops, for item_n in soup.find_all(...) and for item_c in soup.find_all(...), iterating over divs of the same class. I fixed that in my code so that you can get both the price and the name of each item using only one for loop.
You also had try-except statements (in case of an AttributeError, i.e. if the elements you were looking for came back as None). I simplified those into if statements, like this one:
name = item.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
if name is not None:
    name = name.text.strip()
else:
    name = ''
And finally, you were using zip for two different lists (names and prices), to add to a csv file. I combined those individual lists into a nested list in the for loop, instead of appending to two separate lists and zipping at the end. This saves a step, though it is optional and may not be what you need.
Full (updated) code
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
from time import sleep
# create object for chrome options
chrome_options = Options()
# base_url = 'https://shopee.sg/search?keyword=disinfectant'
# set chrome driver options to disable any popup's from the website
# to find local path for chrome profile, open chrome browser
# and in the address bar type, "chrome://version"
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('start-maximized')
# chrome_options.add_argument('user-data-dir=C:\\Users\\username\\AppData\\Local\\Google\\Chrome\\User Data\\Default')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument("disable-infobars")
# Pass the argument 1 to allow and 2 to block
chrome_options.add_experimental_option("prefs", {
"profile.default_content_setting_values.notifications": 2
})
def get_url(search_term):
    """Generate an url from the search term"""
    template = "https://www.shopee.sg/search?keyword={}"
    search_term = search_term.replace(' ', '+')
    # add term query to url
    url = template.format(search_term)
    # add page query placeholder
    url += '&page={}'
    return url
def main(search_term):
    # invoke the webdriver
    driver = webdriver.Chrome(options=chrome_options)
    rows = []
    url = get_url(search_term)
    for page in range(0, 3):
        driver.get(url.format(page))
        WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
        driver.execute_script("""
            var scroll = document.body.scrollHeight / 10;
            var i = 0;
            function scrollit(i) {
                window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
                i++;
                if (i < 10) {
                    setTimeout(scrollit, 500, i);
                }
            }
            scrollit(i);
        """)
        sleep(5)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', {'class': 'col-xs-2-4 shopee-search-item-result__item'}):
            name = item.find('div', {'class': 'yQmmFK _1POlWt _36CEnF'})
            if name is not None:
                name = name.text.strip()
            else:
                name = ''
            price = item.find('div', {'class': 'WTFwws _1lK1eK _5W0f35'})
            if price is not None:
                price = price.find('span', {'class': '_29R_un'}).text.strip()
            else:
                price = ''
            print([name, price])
            rows.append([name, price])
    with open('shopee_item_list.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Product Description', 'Price'])
        writer.writerows(rows)
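As written, main() still has to be invoked explicitly, just as in the original script; for example (the search term below simply echoes the one in the commented-out base_url):

if __name__ == '__main__':
    main('disinfectant')  # any search term works here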
I need to get data (a string) from Yahoo Finance. However, the relevant information is "hidden" under a breakdown list.
As you can see, I can access other data, e.g. Total Revenue and Cost of Revenue. The problem occurs when I try to access the data hidden under the breakdown list: Current Assets and Inventory (which sit under the Total Assets and Current Assets sections).
Python raises an AttributeError: 'NoneType' object has no attribute 'find_next' error, which I do not find very illustrative.
P.S. I found that these elements are the problem by commenting out each line.
import urllib.request as url
from bs4 import BeautifulSoup
company = input('enter companies abbreviation')
income_page = 'https://finance.yahoo.com/quote/' + company + '/financials/'
balance_page = 'https://finance.yahoo.com/quote/' + company + '/balance-sheet/'
set_income_page = url.urlopen(income_page).read()
set_balance_page = url.urlopen(balance_page).read()
soup_income = BeautifulSoup(set_income_page, 'html.parser')
soup_balance = BeautifulSoup(set_balance_page, 'html.parser')
revenue_element = soup_income.find('span', string='Total Revenue').find_next('span').text
cogs_element = soup_income.find('span', string='Cost of Revenue').find_next('span').text
ebit_element = soup_income.find('span', string='Operating Income').find_next('span').text
net_element = soup_income.find('span', string='Pretax Income').find_next('span').text
short_assets_element = soup_balance.find('span', string='Current Assets').find_next('span').text
inventory_element = soup_balance.find('span', string='Inventory').find_next('span').text
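Incidentally, a small guard around each lookup (a sketch with a hypothetical helper name, not part of the original post) makes it clearer which labels are simply absent from the downloaded HTML, since rows behind the collapsed breakdown may not be present until the page is rendered in a browser:

def safe_value(soup, label):
    # Hypothetical helper: return the text next to a label,
    # or None if the label isn't in the static HTML at all.
    cell = soup.find('span', string=label)
    return cell.find_next('span').text if cell else None

print(safe_value(soup_balance, 'Current Assets'))  # likely None without a browser render
print(safe_value(soup_balance, 'Inventory'))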
Here is an example of parsing this web page using Selenium. It lets you emulate user behavior: wait until the page is loaded, close the pop-up, expand a tree node by clicking it, and extract information from it.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
company = input('enter companies abbreviation: ')
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
wd = webdriver.Chrome('<<PATH_TO_CHROMEDRIVER>>', options=chrome_options)
# delay (how long selenium waits for element to be loaded)
DELAY = 30
# maximize browser window
wd.maximize_window()
# load page via selenium
wd.get('https://finance.yahoo.com/quote/' + company + '/financials/')
# check for popup, close it
try:
    btn = WebDriverWait(wd, DELAY).until(EC.presence_of_element_located((By.XPATH, '//button[text()="I agree"]')))
    wd.execute_script("arguments[0].scrollIntoView();", btn)
    wd.execute_script("arguments[0].click();", btn)
except:
    pass
# wait for page to load
results = WebDriverWait(wd, DELAY).until(EC.presence_of_element_located((By.ID, 'Col1-1-Financials-Proxy')))
# parse content
soup_income = BeautifulSoup(results.get_attribute('innerHTML'), 'html.parser')
# extract values
revenue_element = soup_income.find('span', string='Total Revenue').find_next('span').text
cogs_element = soup_income.find('span', string='Cost of Revenue').find_next('span').text
ebit_element = soup_income.find('span', string='Operating Income').find_next('span').text
net_element = soup_income.find('span', string='Pretax Income').find_next('span').text
# load page via selenium
wd.get('https://finance.yahoo.com/quote/' + company + '/balance-sheet/')
# wait for page to load
results = WebDriverWait(wd, DELAY).until(EC.presence_of_element_located((By.ID, 'Col1-1-Financials-Proxy')))
# expand total assets
btn = WebDriverWait(wd, DELAY).until(EC.element_to_be_clickable((By.XPATH, '//span[text()="Total Assets"]/preceding-sibling::button')))
wd.execute_script("arguments[0].scrollIntoView();", btn)
wd.execute_script("arguments[0].click();", btn)
# expand inventory
btn = WebDriverWait(wd, DELAY).until(EC.element_to_be_clickable((By.XPATH, '//span[text()="Current Assets"]/preceding-sibling::button')))
wd.execute_script("arguments[0].scrollIntoView();", btn)
wd.execute_script("arguments[0].click();", btn)
# parse content
soup_balance = BeautifulSoup(results.get_attribute('innerHTML'), 'html.parser')
# extract values
short_assets_element = soup_balance.find('span', string='Current Assets').find_next('span').text
inventory_element = soup_balance.find('span', string='Inventory').find_next('span').text
# close webdriver
wd.quit()
print(revenue_element)
print(cogs_element)
print(ebit_element)
print(net_element)
print(short_assets_element)
print(inventory_element)
I have working code in Selenium. It works fine: it scrapes the portal and extracts the data from the table. But now I am trying to shift to either Scrapy or Requests.
I tried learning both and failed miserably. The Selenium structure fits my mental model, and it will take me a long time to understand the basics of Requests or Scrapy and then use them. The shortcut is to get some tips on how to do it directly in connection with the present code.
Why am I shifting?
I posted the code to seek suggestions for refactoring it (here). Two of the comments suggested that I shift to Requests, which triggered this effort. After some preliminary research I realized that I can avoid Selenium, and that Requests or Scrapy could save me a huge amount of time.
I checked here, but that does not solve my issue.
Can someone help with this? Thanks in advance.
The code (including URL) -
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, \
TimeoutException, StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from FIR_logging import logger
import os
import time
import pandas as pd
# base function
def get_url(some_url):
    while True:
        try:
            driver.get(some_url)
            break
        except WebDriverException:
            time.sleep(60)
            continue
    driver.refresh()
# Some constants:
URL = r'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'
options = FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--private-window")
driver = webdriver.Firefox(options=options)
get_url(URL)
time.sleep(10)
Download_Directory = r'/some_directory/raw_footage7'
COLUMNS = ['Sr.No.', 'State', 'District', 'Police Station', 'Year', 'FIR No.', 'Registration Date', 'FIR No',
'Sections']
ALL_Districts = ['AKOLA', 'AMRAVATI CITY', 'AMRAVATI RURAL', 'AURANGABAD CITY',
'AURANGABAD RURAL', 'BEED', 'BHANDARA', 'BRIHAN MUMBAI CITY', 'BULDHANA',
'CHANDRAPUR', 'DHULE', 'GADCHIROLI', 'GONDIA', 'HINGOLI', 'JALGAON', 'JALNA',
'KOLHAPUR', 'LATUR', 'NAGPUR CITY', 'NAGPUR RURAL', 'NANDED', 'NANDURBAR',
'NASHIK CITY', 'NASHIK RURAL', 'NAVI MUMBAI', 'OSMANABAD', 'PALGHAR', 'PARBHANI',
'PIMPRI-CHINCHWAD', 'PUNE CITY', 'PUNE RURAL', 'RAIGAD', 'RAILWAY AURANGABAD',
'RAILWAY MUMBAI', 'RAILWAY NAGPUR', 'RAILWAY PUNE', 'RATNAGIRI', 'SANGLI', 'SATARA',
'SINDHUDURG', 'SOLAPUR CITY', 'SOLAPUR RURAL', 'THANE CITY', 'THANE RURAL', 'WARDHA',
'WASHIM', 'YAVATMAL']
# other functions
def district_selection(name):
    dist_list = Select(driver.find_element_by_css_selector(
        "#ContentPlaceHolder1_ddlDistrict"))
    dist_list_options = dist_list.options
    names = [o.get_attribute("text")
             for o in dist_list.options if o.get_attribute("text") not in (
                 'Select')]
    if name not in names:
        logger.info(f"{name} is not in list")
        return False
    dist_list.select_by_visible_text(name)
    time.sleep(8)
def enter_date(date):
    # enters start as well as end dates with "action chains."
    WebDriverWait(driver, 160).until(
        EC.presence_of_element_located((By.CSS_SELECTOR,
                                        '#ContentPlaceHolder1_txtDateOfRegistrationFrom')))
    from_date_field = driver.find_element_by_css_selector(
        '#ContentPlaceHolder1_txtDateOfRegistrationFrom')
    to_date_field = driver.find_element_by_css_selector(
        '#ContentPlaceHolder1_txtDateOfRegistrationTo')
    ActionChains(driver).click(from_date_field).send_keys(
        date).move_to_element(to_date_field).click().send_keys(
        date).perform()
    logger.info(f'date entered: {date}')
def search():
    driver.find_element_by_css_selector('#ContentPlaceHolder1_btnSearch').click()
def number_of_records():
    """captures the text indicating number of records.
    converts it to integer. if 0 returns and appends name of district to the list
    if page is not loaded. it tries one more time for 15 secs."""
    time_counter = 1
    while time_counter < 19:
        try:
            records_number = driver.find_element_by_css_selector(
                '#ContentPlaceHolder1_lbltotalrecord').text
            if records_number == '':
                time.sleep(1)
                continue
            else:
                records_number = int(records_number)
                if records_number != 0:
                    logger.info(f"{district}: {records_number}")
                    return records_number
                else:
                    logger.info(f"no records # {district}")
                    return False
        except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
            logger.info("page is not loaded")
            time_counter += 1
            continue
def extract_table_current(name, single):
    # entire table of record to be taken to the list.
    soup = BS(driver.page_source, 'html.parser')
    main_table = soup.find("table", {"id": "ContentPlaceHolder1_gdvDeadBody"})
    time_counter = 1
    while main_table is None:
        if time_counter < 16:
            logger.info(f"the table did not load # {name}")
            time_counter += 1
        else:
            logger.info(f"the table did not load # {name}."
                        f"stopped trying")
            return
    links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
    rows = main_table.find_all("tr")
    if links_for_pages is None:
        for row in rows:
            time.sleep(8)
            if '...' not in row.text:
                cells = row.find_all('td')
                cells = cells[0:9]  # drop the last column
                # store data in list
                single.append([cell.text for cell in cells])
    else:
        for row in rows[0:(len(rows)) - 2]:
            time.sleep(8)
            cells = row.find_all('td')
            cells = cells[0:9]  # drop the last column
            # store data in list
            single.append([cell.text for cell in cells])
def next_page(name, data):
    # check if any link to next page is available
    # iterate every page.
    try:
        driver.find_element_by_css_selector('.gridPager a')
    except NoSuchElementException:
        return False
    links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
    for page in range(len(links_for_pages)):
        # new list, to by pass stale element exception
        links_for_pages_new = driver.find_elements_by_css_selector('.gridPager a')
        # do not click on link for new page slot
        if links_for_pages_new[page].text != '...':
            links_for_pages_new[page].click()
            # if this can be replaced with some other wait method to save the time
            time.sleep(8)
            extract_table_current(name, data)
def second_page_slot():
    # find specific link for going to page 11 and click.
    try:
        link_for_page_slot = driver.find_element_by_link_text('...')
        link_for_page_slot.click()
    except NoSuchElementException:
        return False
# main code
page_data = []
time.sleep(5)
view = Select(driver.find_element_by_css_selector(
'#ContentPlaceHolder1_ucRecordView_ddlPageSize'))
view.select_by_value('50')
driver.close()
for district in ALL_Districts:
    b = "06"
    c = "2020"
    district_directory = os.path.join(Download_Directory, f'{district}{b}{c}')
    if not os.path.exists(district_directory):
        os.mkdir(district_directory)
    for i in range(1, 30):
        # reopening the page to wipe out the cache.
        options = FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument("--private-window")
        driver = webdriver.Firefox(options=options)
        get_url(URL)
        # entering date and assuring that 01 to 09 is entered correctly
        if i < 10:
            i = f'{str("0")}{str(i)}'
        date_from = str(i) + b + c
        enter_date(date_from)
        # select district
        district_selection(district)
        time.sleep(3)
        # start the search
        search()
        time.sleep(7)
        if not number_of_records():
            continue
        extract_table_current(district, page_data)
        time.sleep(3)
        if not next_page(district, page_data):
            district_data = pd.DataFrame(page_data, columns=COLUMNS)
            district_data.to_csv(os.path.join(district_directory, f'{district}{i}{b}{c}.csv'))
            continue
        extract_table_current(district, page_data)
        district_data = pd.DataFrame(page_data, columns=COLUMNS)
        district_data.to_csv(os.path.join(district_directory, f'{district}{i}{b}{c}.csv'))
        driver.close()
Requests is a very nice and simple, yet powerful, package. Once you have learned it you will be grateful :) You can use Requests to navigate around the page and sometimes even to log in or send messages.
I don't know Scrapy, but I have been using BeautifulSoup a lot and that one is fairly simple to learn as well: you just get the "soup" of data from Requests and then use BS to filter your data.
My recommendation for you is to start from scratch, just one step at a time.
Start by getting your page and then get your data little by little :)
import requests
from bs4 import BeautifulSoup

page = requests.get('https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx')
soup = BeautifulSoup(page.text, 'lxml')
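One caveat worth flagging: this portal looks like an ASP.NET WebForms page (the .aspx URL and the ContentPlaceHolder1_* IDs suggest as much), so the search results only come back after a form POST that echoes hidden fields such as __VIEWSTATE and __EVENTVALIDATION. A rough sketch of that flow with Requests follows; the district and date field names are inferred from the element IDs in the Selenium code and should be verified against a real request in the browser's network tab:

import requests
from bs4 import BeautifulSoup

URL = 'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'
session = requests.Session()

# First GET: collect the hidden WebForms fields the server expects back.
page = session.get(URL)
soup = BeautifulSoup(page.text, 'lxml')
payload = {tag['name']: tag.get('value', '')
           for tag in soup.select('input[type=hidden][name]')}

# Add the visible form fields. These names are guesses based on the CSS
# selectors used in the Selenium version; adjust them after inspecting
# the actual POST the page sends.
payload.update({
    'ctl00$ContentPlaceHolder1$txtDateOfRegistrationFrom': '01/06/2020',
    'ctl00$ContentPlaceHolder1$txtDateOfRegistrationTo': '01/06/2020',
    'ctl00$ContentPlaceHolder1$ddlDistrict': 'AKOLA',
    'ctl00$ContentPlaceHolder1$btnSearch': 'Search',
})

# POST the search and look for the same results table the Selenium code parses.
result = session.post(URL, data=payload)
table = BeautifulSoup(result.text, 'lxml').find(
    'table', {'id': 'ContentPlaceHolder1_gdvDeadBody'})
print(table is not None)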