How to scrape a table with Selenium? - Python

I'm having a weird issue trying to scrape a table with selenium. For reference, the table is the item table here, although ideally I would like to be able to scrape any item table for any hero on this site.
self.item_table_xpath = '//table[descendant::thead[descendant::tr[descendant::th[contains(text(), "Item")]]]]'

def retrieve_hero_stats(self, url):
    self.driver.get(url)

    try:
        win_rate_span = self.driver.find_element(by=By.XPATH, value='//dd[descendant::*[@class = "won"]]/span')
    except:
        win_rate_span = self.driver.find_element(by=By.XPATH, value='//dd[descendant::*[@class = "lost"]]/span')
    win_rate = win_rate_span.text

    hero_name = url.split('/')[-1]
    values = list()
    for i in range(1, 13):
        values.append({
            'Item Name': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[2]').text,
            'Matches Played': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[3]').text,
            'Matches Won': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[4]').text,
            'Win Rate': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[5]').text
        })

    print(hero_name)
    print(values)
The issue is that the output of the code is inconsistent; sometimes the fields in the values list are populated, and sometimes they are not, and this changes each time I run my code. I don't necessarily need someone to write this code for me, in fact I'd prefer you didn't; I'm just stumped as to why the output changes every time I run it.
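One common cause of this kind of flakiness is reading the table before the JavaScript that fills it has finished rendering, so the cells come back empty on some runs. Below is a minimal sketch of an explicit wait, assuming that is what's happening here (the wait itself is an addition for illustration, not part of the question's code):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the first data cell of the item table to exist
# before reading any cell text; if the rows render late, .text comes back empty.
WebDriverWait(self.driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, self.item_table_xpath + '/tbody/tr[1]/td[2]')
    )
)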

Related

Unable to iterate a list

I am trying to web-scrape the information using Selenium. The code works for a single item, but when I pass the list I get the output below.
Actual Output
Expected output
term=["Atta","Sugar"]

def get_link(term,page):
    for term in term:
        pin(Pincode)
        grocery="https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=GROCERY&as-show=on&as=off"
        term = term.replace(' ', '+')
        stem = grocery.format(term)
        url_template = stem + '&as-pos=1&as-type=HISTORY&as-backfill=on&page='
        next=url_template+str(page)
        #print(next)
        return next

def PID():
    for page in range(1,5):
        path=get_link(term,page)
        driver.get(path)
        id=driver.find_elements_by_xpath('//div[@data-id]')
        for i in id:
            results=i.get_attribute('data-id')
            #print(results)
            PIDs.append(results)
            Search_Term.append(term)

PID()
ID={'Query':Search_Term,'PID_s':PIDs}
Output=pd.DataFrame(ID)
print(Output)
Maybe it would be better to put the for loop over the terms inside the PID function. Try it like below:
terms = ["Atta", "Sugar"]

def get_link(term, page):
    # Not sure what the pin(Pincode) line is doing
    grocery = "https://www.flipkart.com/search?q={}&otracker=search&otracker1=search&marketplace=GROCERY&as-show=on&as=off"
    term = term.replace(' ', '+')
    # print(term)
    stem = grocery.format(term)
    url_template = stem + '&as-pos=1&as-type=HISTORY&as-backfill=on&page='
    next = url_template + str(page)
    # print(next)
    return next

def PID():
    for term in terms:
        for page in range(1, 5):
            path = get_link(term, page)
            driver.get(path)
            id = driver.find_elements_by_xpath('//div[@data-id]')
            for i in id:
                results = i.get_attribute('data-id')
                print(f"{term}:{results}")
                # PIDs.append(results)
                # Search_Term.append(term)

PID()
Atta:FLRFDPRFNGYJ95KD
Atta:FLRETEFHENWKNJQE
...
Sugar:SUGG4SFGSP6TCQ48
Sugar:SUGEUD25B6YCCNGM
...

How to optimize a Selenium webdriver crawler?

So, I have to crawl a table on each web page of a website. There are 324 pages (meaning 324 tables), and each table has 1000 rows and 7 columns, though one column is useless and I didn't use it.
The code mostly works, but the problem is that it's very slow and takes a lot of time. I was wondering if I could make some changes to the code to speed it up!
Here's the code:
driver = webdriver.Chrome('./chromedriver.exe')
driver.get('https://beheshtezahra.tehran.ir/Default.aspx?tabid=92')
driver.maximize_window()

part_count = 1
li = []

for i in range(0, 324):
    start = timeit.default_timer()

    firstname = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='dnn$ctr1877$DeadSearch$txtname']")))
    lastname = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='dnn$ctr1877$DeadSearch$txtFamily']")))
    part = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='dnn$ctr1877$DeadSearch$txtPart']")))

    firstname.clear()
    firstname.send_keys("%")
    lastname.clear()
    lastname.send_keys("%")
    part.clear()
    part.send_keys(str(part_count))

    driver.find_element_by_xpath('//*[@id="dnn_ctr1877_DeadSearch_btnSearch"]').click()

    print('Saving the information..')
    first_name = driver.find_elements_by_xpath('//table/tbody/tr/td[2]')
    last_name = driver.find_elements_by_xpath('//table/tbody/tr/td[3]')
    fathers_name = driver.find_elements_by_xpath('//table/tbody/tr/td[4]')
    birth_date = driver.find_elements_by_xpath('//table/tbody/tr/td[5]')
    death_date = driver.find_elements_by_xpath('//table/tbody/tr/td[6]')
    grave_info = driver.find_elements_by_xpath('//table/tbody/tr/td[7]')

    print('Appending the information..')
    for j in range(0, 1000):
        li.append(first_name[j].text)
        li.append(last_name[j].text)
        li.append(fathers_name[j].text)
        li.append(birth_date[j].text)
        li.append(death_date[j].text)
        li.append(grave_info[j].text)

    print('Page ' + str(part_count) + ' is crawled!')
    stop = timeit.default_timer()
    part_count += 1
    print('Time: ', stop - start)
And in the end, I wrote the list into a CSV file. Any suggestions would be appreciated!
What you could do after the print('Saving the information..') part:
print('Saving the information..')
page_snapshot = lxml.html.document_fromstring(driver.page_source)
first_name = page_snapshot.xpath('//table/tbody/tr/td[2]')
last_name = page_snapshot.xpath('//table/tbody/tr/td[3]')
fathers_name = page_snapshot.xpath('//table/tbody/tr/td[4]')
birth_date = page_snapshot.xpath('//table/tbody/tr/td[5]')
death_date = page_snapshot.xpath('//table/tbody/tr/td[6]')
grave_info = page_snapshot.xpath('//table/tbody/tr/td[7]')

print('Appending the information..')
for j in range(0, 1000):
    li.append(first_name[j].text)
    li.append(last_name[j].text)
    li.append(fathers_name[j].text)
    li.append(birth_date[j].text)
    li.append(death_date[j].text)
    li.append(grave_info[j].text)
lxml is blazing fast; just import lxml.html (after pip install, of course) and make sure the page is completely loaded before you grab the snapshot.
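A minimal sketch of that idea, assuming the result rows are the ones matched by //table/tbody/tr (the explicit wait and the row-wise loop are additions for illustration, not part of the original answer):

import lxml.html
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait until the result cells exist, then parse the whole page once with lxml
# instead of issuing one WebDriver call per cell.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//table/tbody/tr/td[2]'))
)
page_snapshot = lxml.html.document_fromstring(driver.page_source)

for row in page_snapshot.xpath('//table/tbody/tr'):
    # td[2]..td[7] hold the six useful columns
    cells = row.xpath('./td[position() >= 2 and position() <= 7]')
    li.extend(cell.text_content().strip() for cell in cells)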

Selenium - Get text inside of table cells

Trying to get the text inside of the table cells, but have no luck.
I am trying to get the text inside of these cells:
(th and td)
The code works, kind of. It prints out the value as a normal " " (space).
code:
driver.get('https://www.komplett.se/product/1165487/datorutrustning/datorkomponenter/chassibarebone/big-tower/phanteks-eclipse-p500-air')
parent_table = driver.find_element_by_xpath("/html/body/div[2]/main/div[2]/div[2]/div[3]/div/div[2]/div/section[2]/div/div/div")
count_of_tables = len(parent_table.find_elements_by_xpath("./table"))

for x in range(count_of_tables):
    parent_tr = driver.find_element_by_xpath(f"/html/body/div[2]/main/div[2]/div[2]/div[3]/div/div[2]/div/section[2]/div/div/div/table[{x + 1}]/tbody")
    count_of_tr = len(parent_tr.find_elements_by_xpath("./tr"))
    print(count_of_tr)

    for y in range(count_of_tr):
        th = driver.find_element_by_xpath(f'/html/body/div[2]/main/div[2]/div[2]/div[3]/div/div[2]/div/section[2]/div/div/div/table[{x + 1}]/tbody/tr[{y+1}]/th')
        td = driver.find_element_by_xpath(f'/html/body/div[2]/main/div[2]/div[2]/div[3]/div/div[2]/div/section[2]/div/div/div/table[{x + 1}]/tbody/tr[{y + 1}]/td')
        print(th.text)
        print(td.text)
for y in range(count_of_tr):
    th = driver.find_element_by_xpath(
        f'/html/body/div[2]/main/div[2]/div[2]/div[3]/div/div[2]/div/section[2]/div/div/div/table[{x + 1}]/tbody/tr[{y+1}]/th')
    td = driver.find_element_by_xpath(
        f'/html/body/div[2]/main/div[2]/div[2]/div[3]/div/div[2]/div/section[2]/div/div/div/table[{x + 1}]/tbody/tr[{y + 1}]/td')
    print(th.get_attribute("textContent"))
    print(td.get_attribute("textContent"))
Use get_attribute("textContent") instead, as .text will only retrieve text that is visible in the viewport.
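As a side note, the same tables can also be walked with relative XPaths from the parent element the question already locates, instead of repeating the absolute path each time; a rough sketch (the loop structure here is an illustration, not the original answer's code):

# Iterate the spec tables relative to the already-located parent element,
# reading textContent so that cells outside the viewport are included too.
for table in parent_table.find_elements_by_xpath("./table"):
    for row in table.find_elements_by_xpath("./tbody/tr"):
        th = row.find_element_by_xpath("./th")
        td = row.find_element_by_xpath("./td")
        print(th.get_attribute("textContent"), td.get_attribute("textContent"))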

Python writing on wrong row

I'm coding a scraper that sends client codes to a page, and if the code is right it must write some info from the page that loads onto the sheet. If the client code is wrong, it must move on to the next code on the sheet.
The same sheet sends and receives the info. The client codes are in the first column, and each one has its own row for the info.
The problem is that when a client code is wrong, the script is not skipping that row when it writes the info it previously got from the page. So in the end all the info is written on consecutive rows (2, 3, 4, 5...) without skipping any of them (which should have been skipped when the client code doesn't return any info).
k_bot.py
def search_cpfs(self):
    # SEARCH THROUGH THE LIST OF CLIENT CODES (1ST COLUMN OF THE SPREADSHEET), AND OBTAIN THESE INFO
    nomes = []
    idades = []
    beneficios = []
    concessoes = []
    salarios = []
    bancoss = []
    bancoscard = []
    consigs = []
    cards = []

    for cpf in self.cpfs:
        print(f"Procurando {cpf}.")
        self.driver.get(self.bot_url)

        # IF THE CLIENT CODE IS RIGHT
        try:
            cpf_input = self.driver.find_element_by_xpath('//*[@id="search"]/div/div[1]/input')
            cpf_input.send_keys(cpf)
            cpfButton = self.driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
            cpfButton.click()

            time.sleep(2)
            self.delay = 2  # seconds

            nome = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[1]/div[1]/h2").text
            idade = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[1]/div[1]/ul/li[2]").text
            age = re.search(r'\((.*?)Anos', idade).group(1)
            beneficio = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[1]/div[2]/div[5]/span/b").text
            concessao = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[1]/div[2]/div[2]/span").text
            salario = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[2]/div/div[3]/div[1]/div[1]/span").text
            bancos = self.driver.find_element_by_xpath('//*[@id="loans"]').text
            bancosw = re.findall(r'(?<=Banco )(\w+)', bancos)
            bankslist = ', '.join(bancosw)
            bancocard = self.driver.find_element_by_xpath('//*[@id="cards"]').text
            bcardw = re.findall(r'(?<=Banco )(\w+)', bancocard)
            bcardlist = ', '.join(bcardw)
            consig = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[3]/div[2]/span").text
            card = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[3]/div[3]/span").text

            print('CPF Valido')
            print(nome, age, beneficio, concessao, salario, bankslist, bcardlist, consig, card)

        # IF THE CLIENT CODE IS WRONG
        except (NoSuchElementException, UnexpectedAlertPresentException):
            print('CPF Invalido')
            continue

        nomes.append(nome)
        idades.append(age)
        beneficios.append(beneficio)
        concessoes.append(concessao)
        salarios.append(salario)
        bancoss.append(bankslist)
        bancoscard.append(bcardlist)
        consigs.append(consig)
        cards.append(card)

    return nomes, idades, beneficios, concessoes, salarios, bancoss, bancoscard, consigs, cards
cpf_updater.py
class CpfSearch(object):
    def __init__(self, spreadsheet_name):
        self.cpf_col = 1
        self.nome_col = 2
        self.age_col = 3
        self.beneficio_col = 4
        self.concessao_col = 5
        self.salario_col = 6
        self.bancos_col = 7
        self.bancocard_col = 8
        self.consig_col = 9
        self.card_col = 15

        scope = ['https://www.googleapis.com/auth/spreadsheets',
                 'https://www.googleapis.com/auth/drive.readonly']
        creds = ServiceAccountCredentials.from_json_keyfile_name('CONSULTAS.json', scope)
        client = gspread.authorize(creds)
        self.sheet = client.open(spreadsheet_name).sheet1

    def process_cpf_list(self):
        # SKIP OVER COLUMN HEADING IN THE SPREADSHEET
        cpfs = self.sheet.col_values(self.cpf_col)[1:]
        bot_url = BOT(cpfs)
        nomes, idades, beneficios, concessoes, salarios, bancoss, bancoscard, consigs, cards = bot_url.search_cpfs()

        # UPDATE THE SHEET
        print("Atualizando...")
        for cpfs in range(len(nomes)):
            self.sheet.update_cell(cpfs + 2, self.nome_col, nomes[cpfs])
            self.sheet.update_cell(cpfs + 2, self.age_col, idades[cpfs])
            self.sheet.update_cell(cpfs + 2, self.beneficio_col, beneficios[cpfs])
            self.sheet.update_cell(cpfs + 2, self.concessao_col, concessoes[cpfs])
            self.sheet.update_cell(cpfs + 2, self.salario_col, salarios[cpfs])
            self.sheet.update_cell(cpfs + 2, self.bancos_col, bancoss[cpfs])
            self.sheet.update_cell(cpfs + 2, self.bancocard_col, bancoscard[cpfs])
            self.sheet.update_cell(cpfs + 2, self.consig_col, consigs[cpfs])
            self.sheet.update_cell(cpfs + 2, self.card_col, cards[cpfs])

cpf_updater = CpfSearch('TESTE')
cpf_updater.process_cpf_list()
Issue:
You are using the lists nomes, idades, beneficios, etc. to store the data to write to your sheet. These lists are just a succession of values, and don't contain any information on which row is supposed to belong to each value. You are using the index of each element in the list to keep track of that (the first value in the nomes list should be written to the first row, and so on).
On the other side, you are appending values to nomes, idades, etc. only if the code is "right". If the code is "wrong", no value is appended to these lists because the continue keyword ends the current iteration. This is a problem, because these lists should also keep track of the rows in which the code is "wrong" (that is, where the cells should remain empty).
Solution:
You shouldn't just ignore the iterations in which the code is "wrong". Values should be appended to the corresponding lists regardless of whether the iteration's code is right or wrong: nomes, idades and so on should have empty elements in between the non-empty ones to account for the rows in which the cells should remain empty.
An option in this case, if you want to keep blank cells for those rows, is to do the following in the except block: (1) assign nome, idade, and so on to empty strings, and (2) remove the continue keyword so that these empty strings get appended to the lists nomes, idades, etc.
Code sample:
# IF THE CLIENT CODE IS WRONG
except (NoSuchElementException, UnexpectedAlertPresentException):
    print('CPF Invalido')
    nome = ""
    idade = ""
    beneficio = ""
    # Assign remaining variables to empty string...
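Putting that together, a minimal sketch of how the end of the loop body could look (the remaining empty-string defaults are filled in here by analogy with the variable names from the question's code; the original answer leaves them elided):

# IF THE CLIENT CODE IS WRONG
except (NoSuchElementException, UnexpectedAlertPresentException):
    print('CPF Invalido')
    # Placeholders so this row still occupies one slot in every list;
    # age (rather than idade) is blanked because age is what gets appended below.
    nome = age = beneficio = concessao = salario = ""
    bankslist = bcardlist = consig = card = ""
    # note: no `continue` here, so the appends below still run

nomes.append(nome)
idades.append(age)
beneficios.append(beneficio)
# ... and so on for the remaining lists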

Scraping a widget

I am scraping data, and it was scraping and printing what appeared on the first page; however, there was tons more data below. So next I added code to scroll down to the bottom of the page so everything could be scraped. The problem now is that it scrolls to the bottom, but then it just waits and never prints. Does anyone know how to get this to print? Eventually I'd also like the results to go to an Excel file, if anyone knows how to do that too. Thanks so much.
from selenium import webdriver
import time

url = 'http://www.tradingview.com/screener'
driver = webdriver.Firefox()
driver.get(url)

SCROLL_PAUSE_TIME = 2

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# will give a list of all tickers
tickers = driver.find_elements_by_css_selector('a.tv-screener__symbol')
# will give a list of all company names
company_names = driver.find_elements_by_css('span.tv-screener__description')
# will give a list of all close values
close_values = driver.find_elements_by_xpath("//td[@class = 'tv-data-table__cell tv-screener-table__cell tv-screener-table__cell--numeric']/span")
# will give a list of all percentage changes
percentage_changes = driver.find_elements_by_xpath('//tbody/tr/td[3]')
# will give a list of all value changes
value_changes = driver.find_elements_by_xpath('//tbody/tr/td[4]')
# will give a list of all ranks
ranks = driver.find_elements_by_xpath('//tbody/tr/td[5]/span')
# will give a list of all volumes
volumes = driver.find_elements_by_xpath('//tbody/tr/td[6]')
# will give a list of all market caps
market_caps = driver.find_elements_by_xpath('//tbody/tr/td[7]')
# will give a list of all PEs
pes = driver.find_elements_by_xpath('//tbody/tr/td[8]')
# will give a list of all EPSs
epss = driver.find_elements_by_xpath('//tbody/tr/td[9]')
# will give a list of all EMPs
emps = driver.find_elements_by_xpath('//tbody/tr/td[10]')
# will give a list of all sectors
sectors = driver.find_elements_by_xpath('//tbody/tr/td[11]')

for index in range(len(tickers)):
    print("Row " + str(index) + " " + tickers[index].text + " " + company_names[index].text + " ")
You are trying to locate a wrong element. This:
element = driver.find_elements_by_id('js-screener-container')
should be replaced with:
# will give a list of all tickers
tickers = driver.find_elements_by_css_selector('a.tv-screener__symbol')
# will give a list of all company names
company_names = driver.find_elements_by_css_selector('span.tv-screener__description')
# will give a list of all close values
close_values = driver.find_elements_by_xpath("//td[@class = 'tv-data-table__cell tv-screener-table__cell tv-screener-table__cell--numeric']/span")
# will give a list of all percentage changes
percentage_changes = driver.find_elements_by_xpath('//tbody/tr/td[3]')
# will give a list of all value changes
value_changes = driver.find_elements_by_xpath('//tbody/tr/td[4]')
# will give a list of all ranks
ranks = driver.find_elements_by_xpath('//tbody/tr/td[5]/span')
# will give a list of all volumes
volumes = driver.find_elements_by_xpath('//tbody/tr/td[6]')
# will give a list of all market caps
market_caps = driver.find_elements_by_xpath('//tbody/tr/td[7]')
# will give a list of all PEs
pes = driver.find_elements_by_xpath('//tbody/tr/td[8]')
# will give a list of all EPSs
epss = driver.find_elements_by_xpath('//tbody/tr/td[9]')
# will give a list of all EMPs
emps = driver.find_elements_by_xpath('//tbody/tr/td[10]')
# will give a list of all sectors
sectors = driver.find_elements_by_xpath('//tbody/tr/td[11]')
So now you have all the data stored in lists. If you want to build rows of data, you can use something like this:
for index in range(len(tickers)):
    print("Row " + tickers[index].text + " " + company_names[index].text + " " + ....)
Output will be something like this:
Row AAPL APPLE INC. 188.84 -1.03% -1.96 Neutral 61.308M 931.386B 17.40 10.98 123K Technology
Row AMZN AMAZON.COM INC 1715.97 -0.46% -7.89 Buy 4.778M 835.516B 270.53 6.54 566K Consumer Cyclicals
...
PS:
I think
SCROLL_PAUSE_TIME = 0.5
is too small an amount of time, since loading new content after scrolling to the bottom of the page can sometimes take longer than 0.5 seconds. I would increase this value to make sure that all content gets loaded.
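Since the question also mentions wanting the results in an Excel file, here is a minimal sketch using pandas (an illustration only, not part of the original answer; the column set is assumed from the selectors above, and to_excel needs openpyxl installed):

import pandas as pd

# Build one row per ticker from the parallel lists collected above, then write
# them out; use to_csv if a plain CSV file is enough.
rows = []
for i in range(len(tickers)):
    rows.append({
        'Ticker': tickers[i].text,
        'Company': company_names[i].text,
        'Close': close_values[i].text,
        'Change %': percentage_changes[i].text,
        'Volume': volumes[i].text,
    })

df = pd.DataFrame(rows)
df.to_csv('screener.csv', index=False)
# df.to_excel('screener.xlsx', index=False)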
