In particular, I am trying to scrape this table (https://whalewisdom.com/filer/berkshire-hathaway-inc#tabholdings_tab_link), but I would like to scrape only the first 50 rows via Python code.
For this reason I need to set the option value so that the page shows 50 rows per page.
My current code is:
test = {}
dict_scr = {}
for ii in range(0, 12):
    options = webdriver.FirefoxOptions()
    options.binary_location = r'C:/Users/Mozilla Firefox/firefox.exe'
    driver = selenium.webdriver.Firefox(executable_path='C:/Users/geckodriver.exe', options=options)
    driver.execute("get", {'url': link_scr['Links'][ii]})
    Select(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='50']"))))
    test[link_scr.index[ii]] = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table#current_holdings_table"))).get_attribute("outerHTML")
    dict_scr[link_scr.index[ii]] = pd.read_html(test[link_scr.index[ii]])
    print(test[link_scr.index[ii]])
How can I modify this code so that the scraped dataframe contains the first 50 rows?
I wrote two samples; you can refer to GitHub:
Sample 1:
from time import sleep
from clicknium import clicknium as cc, locator

tab = cc.chrome.open("https://whalewisdom.com/filer/berkshire-hathaway-inc#tabholdings_tab_link")
tab.find_element(locator.chrome.whalewisdom.button_25).click()
tab.find_element(locator.chrome.whalewisdom.a_50).click()
sleep(3)  # wait for the table to load
elems_sector = tab.find_elements(locator.chrome.whalewisdom.td_informationtechnology)
elems_shares = tab.find_elements(locator.chrome.whalewisdom.td_890923410)
count = len(elems_sector)
for idx in range(count):
    sector = elems_sector[idx].get_text()
    shares = elems_shares[idx].get_text()
    print({'sector': sector, 'shares': shares})
Sample 2: don't change the page size; scrape two pages of data:
from time import sleep
from clicknium import clicknium as cc, locator

tab = cc.chrome.open("https://whalewisdom.com/filer/berkshire-hathaway-inc#tabholdings_tab_link")
i = 0
while True:
    elems_sector = tab.find_elements(locator.chrome.whalewisdom.td_informationtechnology)
    elems_shares = tab.find_elements(locator.chrome.whalewisdom.td_890923410)
    count = len(elems_sector)
    for idx in range(count):
        sector = elems_sector[idx].get_text()
        shares = elems_shares[idx].get_text()
        print({'sector': sector, 'shares': shares})
    i += 1
    if i > 1:
        break
    tab.find_element(locator.chrome.whalewisdom.a).click()  # go to the next page
    sleep(2)  # wait for the table to load
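Since the goal is a dataframe, the rows collected in either sample can be loaded into pandas; here is a minimal sketch built on the variables from the samples above (assuming pandas is installed):
import pandas as pd

rows = []
for idx in range(count):
    rows.append({
        'sector': elems_sector[idx].get_text(),
        'shares': elems_shares[idx].get_text(),
    })

# With the 50-rows-per-page option selected, this dataframe should hold
# the first 50 holdings from the table.
df = pd.DataFrame(rows)
print(df.head())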
I'm trying to scrape rental listing data on Zillow. Specifically, I want the link, price, and address of each property. However, after scraping the first page successfully and clicking the next arrow button, it just displays the same listings even though the page shows I'm on page 2, 3, etc. How do I get the next page(s) listings? The project is supposed to use BeautifulSoup and Selenium, but after some research it looks like using only selenium is the easiest way to do this since Zillow uses lazy-loading.
main.py code:
from time import sleep
from enter_data import DataEntry

DRIVER_PATH = r"D:\chromedriver.exe"
FORM_URL = "HIDDEN"
WEBPAGE = "https://www.zillow.com/toronto-on/rentals/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-79.40771727189582%2C%22east%22%3A-79.35750631913703%2C%22south%22%3A43.639155005365474%2C%22north%22%3A43.66405824004801%7D%2C%22mapZoom%22%3A15%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A792680%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22days%22%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22fr%22%3A%7B%22value%22%3Atrue%7D%2C%22sf%22%3A%7B%22value%22%3Afalse%7D%2C%22tow%22%3A%7B%22value%22%3Afalse%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D"
data_entry = DataEntry(DRIVER_PATH)
# Opens the webpage and gets the count of total pages (via self.next_btns_len)
data_entry.open_webpage(WEBPAGE)
# n is the iterator for the number of pages on the site.
n = 1
# Scrapes link, price, and address data, adds each to its class list, and then goes to the next page.
while n < (data_entry.next_btns_len + 1):
    # Scrapes one page of data and adds the data to the lists in the class object
    data_entry.scrape_data()
    # Goes to the next page for scraping
    sleep(5)
    data_entry.next_page()
    n += 1
enter_data.py code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from time import sleep
class DataEntry:
    """Enters the data from soup into a Google Form"""

    def __init__(self, driver_path):
        # "detach" keeps the browser open after execution.
        self.chrome_options = Options()
        self.chrome_options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(executable_path=driver_path, chrome_options=self.chrome_options)
        self.links = []
        self.prices = []
        self.addresses = []
        self.next_btns_len = 0

    def open_webpage(self, webpage):
        # Opens the desired webpage and gives it two seconds to load
        self.driver.get(webpage)
        sleep(2)
        # Gets the total page count for the main.py while loop
        page_nums = self.driver.find_element(By.CSS_SELECTOR, '.Text-c11n-8-69-2__sc-aiai24-0.gCvDSp')
        self.next_btns_len = int(page_nums.text.split()[3])

    def scrape_data(self):
        # Scrolls to each listing to make it visible to Selenium.
        n = 1
        while n < 41:
            listing = self.driver.find_element(By.XPATH, f'/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[{n}]')
            self.driver.execute_script("arguments[0].scrollIntoView(true);", listing)
            print(n)
            n += 1
        # todo: Create a list of links for all the listings you scraped.
        links = self.driver.find_elements(By.CSS_SELECTOR, ".list-card-info .list-card-link")
        link_list = [link.get_attribute("href") for link in links]
        # The if statement checks whether the DOM class name has changed, which produces an empty list.
        # If the list is empty, the CSS selector is switched. The website alternates between two.
        if len(link_list) == 0:
            links = self.driver.find_elements(By.CSS_SELECTOR, ".StyledPropertyCardDataArea-c11n-8-69-2__sc-yipmu-0.dZxoFm.property-card-link")
            link_list = [link.get_attribute("href") for link in links]
        self.links.extend(link_list)
        print(len(self.links))
        print(self.links)
        # todo: Create a list of prices for all the listings you scraped.
        prices = self.driver.find_elements(By.CSS_SELECTOR, ".list-card-price")
        price_list = [price.text for price in prices]
        if len(price_list) == 0:
            prices = self.driver.find_elements(By.CSS_SELECTOR, ".StyledPropertyCardDataArea-c11n-8-69-2__sc-yipmu-0.kJFQQX")
            price_list = [price.text for price in prices]
        split_price_list = [price.split() for price in price_list]
        final_price_list = [price[0].strip("C+/mo") for price in split_price_list]
        self.prices.extend(final_price_list)
        print(len(self.prices))
        print(self.prices)
        # todo: Create a list of addresses for all the listings you scraped.
        addresses = self.driver.find_elements(By.CSS_SELECTOR, ".list-card-addr")
        address_list = [address.text for address in addresses]
        if len(address_list) == 0:
            addresses = self.driver.find_elements(By.CSS_SELECTOR, ".StyledPropertyCardDataArea-c11n-8-69-2__sc-yipmu-0.dZxoFm.property-card-link address")
            address_list = [address.text for address in addresses]
        self.addresses.extend(address_list)
        print(len(self.addresses))
        print(self.addresses)

    def next_page(self):
        # Clicks the next arrow and waits for the page to load
        next_arrow = self.driver.find_element(By.XPATH, "//a[@title='Next page']")
        next_arrow.click()
        sleep(5)

    def close_webpage(self):
        self.driver.quit()

    def enter_data(self, form_url, address, rent, link):
        # Opens the Google Form and waits two seconds for it to load.
        self.driver.get(form_url)
        sleep(2)
        # Enters each address, rent, and link into the form, then clicks submit.
        address_input = self.driver.find_element(By.XPATH, '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[1]/div/div/div['
                                                           '2]/div/div[1]/div/div[1]/input')
        address_input.send_keys(address)
        rent_input = self.driver.find_element(By.XPATH, '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[2]/div/div/div['
                                                        '2]/div/div[1]/div/div[1]/input')
        rent_input.send_keys(rent)
        link_input = self.driver.find_element(By.XPATH, '//*[@id="mG61Hd"]/div[2]/div/div[2]/div[3]/div/div/div['
                                                        '2]/div/div[1]/div/div[1]/input')
        link_input.send_keys(link)
        submit_btn = self.driver.find_element(By.XPATH, '//*[@id="mG61Hd"]/div[2]/div/div[3]/div[1]/div['
                                                        '1]/div/span/span')
        submit_btn.click()
There is a less complex way to obtain the data you're looking for, using cloudscraper and pandas (and tqdm for convenience). You might also be in for a surprise, considering the time taken to get the data:
import cloudscraper
import pandas as pd
from tqdm import tqdm
scraper = cloudscraper.create_scraper()
df_list = []
for current_page in tqdm(range(1, 21)):
    url = f'https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%22currentPage%22%3A{current_page}%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-79.44174913987678%2C%22east%22%3A-79.32347445115607%2C%22south%22%3A43.57772225826024%2C%22north%22%3A43.7254027835563%7D%2C%22mapZoom%22%3A13%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A792680%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22isForSaleForeclosure%22%3A%7B%22value%22%3Afalse%7D%2C%22isAllHomes%22%3A%7B%22value%22%3Atrue%7D%2C%22sortSelection%22%3A%7B%22value%22%3A%22days%22%7D%2C%22isAuction%22%3A%7B%22value%22%3Afalse%7D%2C%22isNewConstruction%22%3A%7B%22value%22%3Afalse%7D%2C%22isForRent%22%3A%7B%22value%22%3Atrue%7D%2C%22isSingleFamily%22%3A%7B%22value%22%3Afalse%7D%2C%22isTownhouse%22%3A%7B%22value%22%3Afalse%7D%2C%22isForSaleByOwner%22%3A%7B%22value%22%3Afalse%7D%2C%22isComingSoon%22%3A%7B%22value%22%3Afalse%7D%2C%22isForSaleByAgent%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D&wants=%7B%22cat1%22:[%22listResults%22,%22mapResults%22]%7D&requestId=6'
    r = scraper.get(url)
    for x in r.json()['cat1']['searchResults']['listResults']:
        status = x['statusText']
        address = x['address']
        try:
            price = x['units'][0]['price']
        except Exception as e:
            price = x['price']
        if 'https://www.' not in x['detailUrl']:
            url = 'https://zillow.com' + x['detailUrl']
        else:
            url = x['detailUrl']
        df_list.append((address, price, url))
df = pd.DataFrame(df_list, columns=['Address', 'Price', 'Url'])
df.to_csv('renting_in_toronto.csv')
print(df)
This will save the data in a csv file, and print out:
100% 20/20 [00:16<00:00, 1.19it/s]
Address Price Url
0 2221 Yonge St, Toronto, ON C$1,900+ https://zillow.com/b/Toronto-ON/43.70606,-79.3...
1 10 Yonge St, Toronto, ON C$2,100+ https://zillow.com/b/10-yonge-st-toronto-on-BM...
2 924 Avenue Rd, Toronto, ON M5P 2K6 C$1,895/mo https://www.zillow.com/homedetails/924-Avenue-...
3 797 Don Mills Rd, Toronto, ON C$1,850+ https://zillow.com/b/Toronto-ON/43.71951,-79.3...
4 15 Queens Quay E, Toronto, ON C$2,700+ https://zillow.com/b/Toronto-ON/43.64202,-79.3...
... ... ...
You can install the packages with pip install cloudscraper and pip install tqdm. The URL accessed is visible in Dev Tools, under the Network tab; it returns the JSON data that JavaScript loads into the page.
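If you would rather not hand-edit that very long URL, note that the searchQueryState parameter is just URL-encoded JSON, so it can be rebuilt from a dict. Here is a rough sketch of that idea; the map bounds, region id and the rent-related filter flags are taken from the URL above, but the filterState here is trimmed, so whether Zillow accepts the shorter version is an assumption:
import json
from urllib.parse import urlencode

def build_url(current_page):
    # searchQueryState mirrors the JSON visible in Dev Tools (trimmed filterState)
    search_query_state = {
        "pagination": {"currentPage": current_page},
        "mapBounds": {"west": -79.44174913987678, "east": -79.32347445115607,
                      "south": 43.57772225826024, "north": 43.7254027835563},
        "mapZoom": 13,
        "regionSelection": [{"regionId": 792680, "regionType": 6}],
        "isMapVisible": True,
        "filterState": {"isForRent": {"value": True}, "isAllHomes": {"value": True},
                        "sortSelection": {"value": "days"}},
        "isListVisible": True,
    }
    wants = {"cat1": ["listResults", "mapResults"]}
    # urlencode percent-encodes the JSON strings, producing the same shape of URL as above
    query = urlencode({
        "searchQueryState": json.dumps(search_query_state),
        "wants": json.dumps(wants),
        "requestId": 6,
    })
    return f"https://www.zillow.com/search/GetSearchPageState.htm?{query}"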
I am trying to scrape the price of allergy products on Target. For each product, I input every US zip code to see the effect of the zip code on the price, and I use Selenium to enter the zip code for each product. However, I have more than 40,000 zip codes and 200 products to scrape. If I run my code as is, the runtime will be far too long (almost 90 days), because Selenium needs about 2 seconds to input each zip code. What should I do to reduce the running time?
while True:
    priceArray = []
    nameArray = []
    zipCodeArray = []
    GMTArray = []
    wait_imp = 10
    CO = webdriver.ChromeOptions()
    CO.add_experimental_option('useAutomationExtension', False)
    CO.add_argument('--ignore-certificate-errors')
    CO.add_argument('--start-maximized')
    wd = webdriver.Chrome(r'D:\chromedriver\chromedriver_win32new\chromedriver_win32 (2)\chromedriver.exe', options=CO)
    for url in urlList:
        wd.get(url)
        wd.implicitly_wait(wait_imp)
        for zipcode in zipCodeList:
            try:
                # click the delivery address
                address = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[1]/button[2]")
                address.click()
                # click the Edit location
                editLocation = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[2]/button")
                editLocation.click()
            except:
                # directly click the Edit location
                editLocation = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div[1]/div/div[1]/button")
                editLocation.click()
            # input the zip code
            inputZipCode = wd.find_element(by=By.XPATH, value="//*[@id='enter-zip-or-city-state']")
            inputZipCode.clear()
            inputZipCode.send_keys(zipcode)
            # click submit
            clickSubmit = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[4]/div/div[2]/div/div/div[3]/div/button[1]")
            clickSubmit.click()
            # start scraping
            name = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[1]/h1/span").text
            nameArray.append(name)
            price = wd.find_element(by=By.XPATH, value="//*[@id='pageBodyContainer']/div[1]/div[2]/div[2]/div/div[1]/div[1]/span").text
            priceArray.append(price)
            currentZipCode = zipcode
            zipCodeArray.append(currentZipCode)
            tz = pytz.timezone('Europe/London')
            GMT = datetime.now(tz)
            GMTArray.append(GMT)
    data = {'prod-name': nameArray,
            'Price': priceArray,
            'currentZipCode': zipCodeArray,
            'GMT': GMTArray}
    df = pd.DataFrame(data, columns=['prod-name', 'Price', 'currentZipCode', 'GMT'])
    df.to_csv(r'C:\Users\12987\PycharmProjects\Network\priceingAlgoriCoding\export_Target_dataframe.csv', mode='a', index=False, header=True)
For Selenium:
Use Python's concurrent.futures to run drivers in parallel.
You may check out this answer: link
Here is a snippet using ThreadPoolExecutor:
from selenium import webdriver
from concurrent import futures

def selenium_title(url):
    wdriver = webdriver.Chrome()  # Chrome webdriver
    wdriver.get(url)
    title = wdriver.title
    wdriver.quit()
    return title

links = ["https://www.amazon.com", "https://www.google.com"]

with futures.ThreadPoolExecutor() as executor:  # default/optimized number of threads, or pass a max_workers value like max_workers=10
    titles = list(executor.map(selenium_title, links))
You can also use ProcessPoolExecutor:
with futures.ProcessPoolExecutor() as executor:  # default/optimized number of processes
    titles = list(executor.map(selenium_title, links))
Thus you can achieve roughly an x-fold speedup (x = number of workers).
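Applied to the Target question above, a rough, untested sketch might split the zip-code list into chunks and give each worker its own browser. Here, scrape_product_for_zipcodes is a hypothetical helper that would wrap the per-zip-code Selenium steps from the question, and urlList/zipCodeList are the lists already used there:
from concurrent import futures

def chunk(lst, size):
    # split lst into consecutive chunks of the given size
    return [lst[i:i + size] for i in range(0, len(lst), size)]

def scrape_product_for_zipcodes(args):
    url, zipcodes = args
    # hypothetical helper: open one Chrome instance, run the question's
    # per-zip-code steps for this url, and return the collected rows
    ...

# one job per (product URL, chunk of zip codes); 8 workers means roughly
# 8 browsers typing zip codes at the same time
jobs = [(url, zc) for url in urlList for zc in chunk(zipCodeList, 5000)]
with futures.ThreadPoolExecutor(max_workers=8) as executor:
    results = list(executor.map(scrape_product_for_zipcodes, jobs))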
I am trying to scrape this webpage: https://www.oddsportal.com/moving-margins/
But the code only works sometimes, and even when it does, it doesn't scrape all the data I need for each match.
u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.maximize_window()
driver.get(u)
# Use an explicit wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
table_data = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//tr[@class='odd' or @class='dark']")
table = []
# Creating a list of lists, where each inner list holds all the data in a row with class 'dark' or 'odd'
for data in table_data:
    row = []
    dark_row = data.find_elements_by_xpath(".//th//a")
    for col in dark_row:
        row.append(col.text.replace("\n", " "))
    odd_row = data.find_elements_by_xpath(".//following-sibling::tr[@class='odd']//td")
    for col in odd_row:
        row.append(col.text.replace("\n", " "))
    table.append(row)
My goal is to store the data in a CSV file with these columns:
sport    country  competition  handicap    match_date   match                  hdp_open  hdp_close  bookmaker
Tennis   Czech    Ostrava..    AH 0 Games  Today 12:00  Karatsev A. - Otte O.  0.5       -1.5       Nordicbet
I think the problem in your code is that the page has, in some cases, a single "dark" row for many "odd" rows. So when you loop over the elements, you create a single record for a table that actually has more records.
This code should fit your needs, but keep in mind that it's not optimal, since it doesn't handle possible exceptions; it's a starting point:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By

u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
driver.maximize_window()
driver.get(u)

# Use an explicit wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")

tables = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//table")
tableData = []
for table in tables:
    trDark = table.find_element_by_xpath('.//tr[@class="dark"]')
    trOdds = table.find_elements_by_xpath('.//tr[@class="odd"]')
    row = [trDark.text.strip().replace("\n", " ")]
    for odd in trOdds:
        tds = [
            td.text.strip().replace("\n", " ")
            for td in odd.find_elements_by_xpath('.//td')
        ]
        row = row + tds
    tableData.append(row)
print(tableData)
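To get from tableData to the CSV layout described in the question, one option is pandas. Here is a minimal sketch, assuming each row in tableData ends up with the nine fields in the order listed above (rows that don't would need extra parsing of the combined "dark" text):
import pandas as pd

columns = ['sport', 'country', 'competition', 'handicap', 'match_date',
           'match', 'hdp_open', 'hdp_close', 'bookmaker']

# Keep only rows that actually have nine fields, then write them out.
clean_rows = [row for row in tableData if len(row) == len(columns)]
df = pd.DataFrame(clean_rows, columns=columns)
df.to_csv('moving_margins.csv', index=False)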
I have written code in Selenium. It works fine: it scrapes the portal and extracts the data from the table. But now I am trying to shift to either Scrapy or requests.
I tried learning both and failed miserably. The Selenium structure is fixed in my mind, and it will take me a long time to understand the basics of requests or Scrapy and then use them. The shortcut is to get some tips on how to do it directly, in connection with the present code.
Why am I shifting?
I posted the code to seek suggestions for refactoring it (here). Two of the comments suggested I shift to requests, which triggered this effort. After some preliminary searching I realized I can avoid Selenium, and that requests or Scrapy could save a huge amount of time.
I checked here, but that does not solve my issue.
Can someone help with this? Thanks in advance.
The code (including the URL):
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, \
TimeoutException, StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from FIR_logging import logger
import os
import time
import pandas as pd
# base function
def get_url(some_url):
    while True:
        try:
            driver.get(some_url)
            break
        except WebDriverException:
            time.sleep(60)
            continue
    driver.refresh()
# Some constants:
URL = r'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'
options = FirefoxOptions()
options.add_argument("--headless")
options.add_argument("--private-window")
driver = webdriver.Firefox(options=options)
get_url(URL)
time.sleep(10)
Download_Directory = r'/some_directory/raw_footage7'
COLUMNS = ['Sr.No.', 'State', 'District', 'Police Station', 'Year', 'FIR No.', 'Registration Date', 'FIR No',
'Sections']
ALL_Districts = ['AKOLA', 'AMRAVATI CITY', 'AMRAVATI RURAL', 'AURANGABAD CITY',
'AURANGABAD RURAL', 'BEED', 'BHANDARA', 'BRIHAN MUMBAI CITY', 'BULDHANA',
'CHANDRAPUR', 'DHULE', 'GADCHIROLI', 'GONDIA', 'HINGOLI', 'JALGAON', 'JALNA',
'KOLHAPUR', 'LATUR', 'NAGPUR CITY', 'NAGPUR RURAL', 'NANDED', 'NANDURBAR',
'NASHIK CITY', 'NASHIK RURAL', 'NAVI MUMBAI', 'OSMANABAD', 'PALGHAR', 'PARBHANI',
'PIMPRI-CHINCHWAD', 'PUNE CITY', 'PUNE RURAL', 'RAIGAD', 'RAILWAY AURANGABAD',
'RAILWAY MUMBAI', 'RAILWAY NAGPUR', 'RAILWAY PUNE', 'RATNAGIRI', 'SANGLI', 'SATARA',
'SINDHUDURG', 'SOLAPUR CITY', 'SOLAPUR RURAL', 'THANE CITY', 'THANE RURAL', 'WARDHA',
'WASHIM', 'YAVATMAL']
# other functions
def district_selection(name):
    dist_list = Select(driver.find_element_by_css_selector(
        "#ContentPlaceHolder1_ddlDistrict"))
    dist_list_options = dist_list.options
    names = [o.get_attribute("text")
             for o in dist_list.options if o.get_attribute("text") not in ('Select',)]
    if name not in names:
        logger.info(f"{name} is not in list")
        return False
    dist_list.select_by_visible_text(name)
    time.sleep(8)
def enter_date(date):
    # enters the start as well as the end date with action chains.
    WebDriverWait(driver, 160).until(
        EC.presence_of_element_located((By.CSS_SELECTOR,
                                        '#ContentPlaceHolder1_txtDateOfRegistrationFrom')))
    from_date_field = driver.find_element_by_css_selector(
        '#ContentPlaceHolder1_txtDateOfRegistrationFrom')
    to_date_field = driver.find_element_by_css_selector(
        '#ContentPlaceHolder1_txtDateOfRegistrationTo')
    ActionChains(driver).click(from_date_field).send_keys(
        date).move_to_element(to_date_field).click().send_keys(
        date).perform()
    logger.info(f'date entered: {date}')
def search():
    driver.find_element_by_css_selector('#ContentPlaceHolder1_btnSearch').click()
def number_of_records():
    """Captures the text indicating the number of records and converts it to an integer.
    Returns False if it is 0, so the district can be recorded as empty.
    If the page is not loaded, it keeps retrying for a short while."""
    time_counter = 1
    while time_counter < 19:
        try:
            records_number = driver.find_element_by_css_selector(
                '#ContentPlaceHolder1_lbltotalrecord').text
            if records_number == '':
                time.sleep(1)
                continue
            else:
                records_number = int(records_number)
                if records_number != 0:
                    logger.info(f"{district}: {records_number}")
                    return records_number
                else:
                    logger.info(f"no records # {district}")
                    return False
        except (NoSuchElementException, TimeoutException, StaleElementReferenceException):
            logger.info("page is not loaded")
            time_counter += 1
            continue
def extract_table_current(name, single):
    # the entire table of records is appended to the list.
    soup = BS(driver.page_source, 'html.parser')
    main_table = soup.find("table", {"id": "ContentPlaceHolder1_gdvDeadBody"})
    time_counter = 1
    while main_table is None:
        if time_counter < 16:
            logger.info(f"the table did not load # {name}")
            time_counter += 1
        else:
            logger.info(f"the table did not load # {name}."
                        f" stopped trying")
            return
    links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
    rows = main_table.find_all("tr")
    if links_for_pages is None:
        for row in rows:
            time.sleep(8)
            if '...' not in row.text:
                cells = row.find_all('td')
                cells = cells[0:9]  # drop the last column
                # store data in the list
                single.append([cell.text for cell in cells])
    else:
        for row in rows[0:(len(rows)) - 2]:
            time.sleep(8)
            cells = row.find_all('td')
            cells = cells[0:9]  # drop the last column
            # store data in the list
            single.append([cell.text for cell in cells])
def next_page(name, data):
    # check whether any link to the next page is available and iterate over every page.
    try:
        driver.find_element_by_css_selector('.gridPager a')
    except NoSuchElementException:
        return False
    links_for_pages = driver.find_elements_by_css_selector('.gridPager a')
    for page in range(len(links_for_pages)):
        # a fresh list, to bypass the stale element exception
        links_for_pages_new = driver.find_elements_by_css_selector('.gridPager a')
        # do not click on the link for the next page slot
        if links_for_pages_new[page].text != '...':
            links_for_pages_new[page].click()
            # could be replaced with some other wait method to save time
            time.sleep(8)
            extract_table_current(name, data)
def second_page_slot():
    # find the specific link for going to page 11 and click it.
    try:
        link_for_page_slot = driver.find_element_by_link_text('...')
        link_for_page_slot.click()
    except NoSuchElementException:
        return False
# main code
page_data = []
time.sleep(5)
view = Select(driver.find_element_by_css_selector(
    '#ContentPlaceHolder1_ucRecordView_ddlPageSize'))
view.select_by_value('50')
driver.close()

for district in ALL_Districts:
    b = "06"
    c = "2020"
    district_directory = os.path.join(Download_Directory, f'{district}{b}{c}')
    if not os.path.exists(district_directory):
        os.mkdir(district_directory)
    for i in range(1, 30):
        # reopening the page to wipe out the cache.
        options = FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument("--private-window")
        driver = webdriver.Firefox(options=options)
        get_url(URL)
        # entering the date and ensuring that 01 to 09 is entered correctly
        if i < 10:
            i = f'{str("0")}{str(i)}'
        date_from = str(i) + b + c
        enter_date(date_from)
        # select the district
        district_selection(district)
        time.sleep(3)
        # start the search
        search()
        time.sleep(7)
        if not number_of_records():
            continue
        extract_table_current(district, page_data)
        time.sleep(3)
        if not next_page(district, page_data):
            district_data = pd.DataFrame(page_data, columns=COLUMNS)
            district_data.to_csv(os.path.join(district_directory, f'{district}{i}{b}{c}.csv'))
            continue
        extract_table_current(district, page_data)
        district_data = pd.DataFrame(page_data, columns=COLUMNS)
        district_data.to_csv(os.path.join(district_directory, f'{district}{i}{b}{c}.csv'))
        driver.close()
requests is a very nice, simple, yet powerful package. Once you have learned it you will be grateful :) You can use requests to navigate around the page and sometimes even to log in or send messages.
I don't know Scrapy, but I have been using BeautifulSoup a lot and it is fairly simple to learn as well: you get the "soup" of data from requests and then use BS to filter your data.
My recommendation for you is to start from scratch, just one step at a time.
Start by getting your page and then get your data little by little :)
import requests
from bs4 import BeautifulSoup

page = requests.get('https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx')
soup = BeautifulSoup(page.text, 'lxml')
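One thing to be aware of: this portal is an ASP.NET WebForms page (the ContentPlaceHolder1_... ids in your Selenium code suggest as much), so selecting a district and searching with requests means replaying the form POST, including the hidden __VIEWSTATE-style fields. A rough, untested sketch of that pattern; the field names are read from the page rather than hard-coded, since the exact POST names are not confirmed here:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.mhpolice.maharashtra.gov.in/Citizen/MH/PublishedFIRs.aspx'

session = requests.Session()
page = session.get(URL)
soup = BeautifulSoup(page.text, 'lxml')

# Collect every form field (including hidden ones like __VIEWSTATE) with its
# current value, so the POST looks like a normal postback to the server.
payload = {}
for inp in soup.select('form input'):
    if inp.get('name'):
        payload[inp['name']] = inp.get('value', '')

# Overwrite the fields you care about. The exact names must be taken from the
# "name" attributes of the page's <select>/<input> elements in your browser's
# dev tools; the selectors used by the Selenium code above are only element ids.
# payload[district_field_name] = 'AKOLA'
# payload[from_date_field_name] = '01062020'
# payload[to_date_field_name] = '01062020'

response = session.post(URL, data=payload)
result_soup = BeautifulSoup(response.text, 'lxml')
table = result_soup.find('table', {'id': 'ContentPlaceHolder1_gdvDeadBody'})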
I wrote some code to extract insider trading information from the Toronto Stock Exchange. Using Selenium, it opens this link and then, using a list of stocks, inputs each one into the form, retrieves the data and puts it into another list, then does the same for the next stock.
Here is the code:
from selenium import webdriver

stocks = ['RKN', 'MG', 'GTE', 'IMO', 'REI.UN', 'RY']
dt = []
url = 'https://app.tmxmoney.com/research/insidertradesummaries?locale=EN'

driver = webdriver.Firefox()
driver.get(url)
search = driver.find_element_by_xpath('//ul[@class="nav nav-pills"]/li[3]')
search.click()
stock_form = driver.find_element_by_name('text')
for stock in stocks:
    stock_form.clear()
    stock_form.send_keys(stock)
    stock_form.submit()
    data = driver.find_element_by_xpath('//div[@class="insider-trades-symbol-search-container"]/div[@class="ng-binding"]')
    a = data.text.split('\n')
    if len(a) > 1:
        dt.append(a[-1].split())
    else:
        dt.append([])
driver.close()
If you run the code, you can see each stock being input into the form; the data pops up and I attempt to retrieve it. However, when I get the text from "data", it's as if it was retrieved from what was visible on the page prior to submitting the form. I tried adding a wait to the code, to no avail.
I added a time.sleep(1) and the code works as intended:
from selenium import webdriver
import time

stocks = ['RKN', 'MG', 'GTE', 'IMO', 'REI.UN', 'RY']
dt = []
url = 'https://app.tmxmoney.com/research/insidertradesummaries?locale=EN'

driver = webdriver.Firefox()
driver.get(url)
search = driver.find_element_by_xpath('//ul[@class="nav nav-pills"]/li[3]')
search.click()
stock_form = driver.find_element_by_name('text')
for stock in stocks:
    stock_form.clear()
    stock_form.send_keys(stock)
    stock_form.submit()
    data = driver.find_element_by_xpath('//div[@class="insider-trades-symbol-search-container"]/div[@class="ng-binding"]')
    a = data.text.split('\n')
    time.sleep(1)  # the added wait
    if len(a) > 1:
        dt.append(a[-1].split())
    else:
        dt.append([])
driver.close()
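If a fixed one-second sleep ever proves flaky, an explicit wait on the result text changing is a more robust variant. Here is a sketch of that idea, reusing the variables from the code above and waiting until the results block shows something different from the previous stock:
from selenium.webdriver.support.ui import WebDriverWait

previous_text = ''
for stock in stocks:
    stock_form.clear()
    stock_form.send_keys(stock)
    stock_form.submit()
    # wait up to 10 seconds for the results block to show something new
    WebDriverWait(driver, 10).until(
        lambda d: d.find_element_by_xpath(
            '//div[@class="insider-trades-symbol-search-container"]/div[@class="ng-binding"]'
        ).text not in ('', previous_text)
    )
    data = driver.find_element_by_xpath(
        '//div[@class="insider-trades-symbol-search-container"]/div[@class="ng-binding"]')
    previous_text = data.text
    a = data.text.split('\n')
    dt.append(a[-1].split() if len(a) > 1 else [])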