Python writing to the wrong row

I'm coding a scraper that sends client codes to a page. If a code is valid, the scraper must write some info from the page that loads back to the sheet. If the client code is invalid, it must move on to the next code on the sheet.
The same sheet both sends and receives the info: the client codes are in the first column, and each code has its own row where its info should be written.
The problem is that when a client code is invalid, the script does not skip that row when writing the info scraped from the previous pages. So in the end all the info is written to consecutive rows (2, 3, 4, 5...), and none of the rows that should stay empty (because their client code returned no info) are skipped.
k_bot.py
def search_cpfs(self):
    # SEARCH THROUGH THE LIST OF CLIENT CODES (1ST COLUMN OF THE SPREADSHEET), AND OBTAIN THESE INFO
    nomes = []
    idades = []
    beneficios = []
    concessoes = []
    salarios = []
    bancoss = []
    bancoscard = []
    consigs = []
    cards = []
    for cpf in self.cpfs:
        print(f"Procurando {cpf}.")
        self.driver.get(self.bot_url)
        # IF THE CLIENT CODE IS RIGHT
        try:
            cpf_input = self.driver.find_element_by_xpath('//*[@id="search"]/div/div[1]/input')
            cpf_input.send_keys(cpf)
            cpfButton = self.driver.find_element_by_xpath('//*[@id="search"]/div/div[2]/button')
            cpfButton.click()
            time.sleep(2)
            self.delay = 2  # seconds
            nome = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[1]/div[1]/h2").text
            idade = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[1]/div[1]/ul/li[2]").text
            age = re.search(r'\((.*?)Anos', idade).group(1)
            beneficio = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[1]/div[2]/div[5]/span/b").text
            concessao = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[1]/div[2]/div[2]/span").text
            salario = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[2]/div/div[3]/div[1]/div[1]/span").text
            bancos = self.driver.find_element_by_xpath('//*[@id="loans"]').text
            bancosw = re.findall(r'(?<=Banco )(\w+)', bancos)
            bankslist = ', '.join(bancosw)
            bancocard = self.driver.find_element_by_xpath('//*[@id="cards"]').text
            bcardw = re.findall(r'(?<=Banco )(\w+)', bancocard)
            bcardlist = ', '.join(bcardw)
            consig = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[3]/div[2]/span").text
            card = self.driver.find_element_by_xpath(
                "/html/body/main[1]/div[1]/div[1]/div[3]/div[3]/span").text
            print('CPF Valido')
            print(nome, age, beneficio, concessao, salario, bankslist, bcardlist, consig, card)
        # IF THE CLIENT CODE IS WRONG
        except (NoSuchElementException, UnexpectedAlertPresentException):
            print('CPF Invalido')
            continue
        nomes.append(nome)
        idades.append(age)
        beneficios.append(beneficio)
        concessoes.append(concessao)
        salarios.append(salario)
        bancoss.append(bankslist)
        bancoscard.append(bcardlist)
        consigs.append(consig)
        cards.append(card)
    return nomes, idades, beneficios, concessoes, salarios, bancoss, bancoscard, consigs, cards
cpf_updater.py
class CpfSearch(object):

    def __init__(self, spreadsheet_name):
        self.cpf_col = 1
        self.nome_col = 2
        self.age_col = 3
        self.beneficio_col = 4
        self.concessao_col = 5
        self.salario_col = 6
        self.bancos_col = 7
        self.bancocard_col = 8
        self.consig_col = 9
        self.card_col = 15
        scope = ['https://www.googleapis.com/auth/spreadsheets',
                 'https://www.googleapis.com/auth/drive.readonly']
        creds = ServiceAccountCredentials.from_json_keyfile_name('CONSULTAS.json', scope)
        client = gspread.authorize(creds)
        self.sheet = client.open(spreadsheet_name).sheet1

    def process_cpf_list(self):
        # SKIP OVER COLUMN HEADING IN THE SPREADSHEET
        cpfs = self.sheet.col_values(self.cpf_col)[1:]
        bot_url = BOT(cpfs)
        nomes, idades, beneficios, concessoes, salarios, bancoss, bancoscard, consigs, cards = bot_url.search_cpfs()

        # UPDATE THE SHEET
        print("Atualizando...")
        for cpfs in range(len(nomes)):
            self.sheet.update_cell(cpfs + 2, self.nome_col, nomes[cpfs])
            self.sheet.update_cell(cpfs + 2, self.age_col, idades[cpfs])
            self.sheet.update_cell(cpfs + 2, self.beneficio_col, beneficios[cpfs])
            self.sheet.update_cell(cpfs + 2, self.concessao_col, concessoes[cpfs])
            self.sheet.update_cell(cpfs + 2, self.salario_col, salarios[cpfs])
            self.sheet.update_cell(cpfs + 2, self.bancos_col, bancoss[cpfs])
            self.sheet.update_cell(cpfs + 2, self.bancocard_col, bancoscard[cpfs])
            self.sheet.update_cell(cpfs + 2, self.consig_col, consigs[cpfs])
            self.sheet.update_cell(cpfs + 2, self.card_col, cards[cpfs])

cpf_updater = CpfSearch('TESTE')
cpf_updater.process_cpf_list()

Issue:
You are using the lists nomes, idades, beneficios, etc. to store the data to write to your sheet. These lists are just successions of values and carry no information about which row each value belongs to; you rely on the index of each element to keep track of that (the first value in nomes is written to the first data row, and so on).
On the other side, you append values to nomes, idades, etc. only when the code is "right". When the code is "wrong", nothing is appended to these lists, because the continue statement ends the current iteration. This is the problem: the lists should also keep track of the rows whose code is "wrong" (that is, the rows whose cells should remain empty).
Solution:
Don't simply skip the iterations in which the code is "wrong". Values should be appended to the corresponding lists regardless of whether the iteration's code is right or wrong, so that nomes, idades, and so on contain empty elements in between the non-empty ones to account for the rows whose cells should remain empty.
One option, if you want those rows to keep blank cells, is to do the following in the except block: (1) assign nome, age, and the other variables that get appended to empty strings, and (2) remove the continue statement so that these empty strings are appended to nomes, idades, etc.
Code sample:
        # IF THE CLIENT CODE IS WRONG
        except (NoSuchElementException, UnexpectedAlertPresentException):
            print('CPF Invalido')
            nome = ""
            age = ""
            beneficio = ""
            concessao = ""
            salario = ""
            bankslist = ""
            bcardlist = ""
            consig = ""
            card = ""
            # No continue here, so the appends below still run and the lists stay aligned with the rows
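To see why this keeps the rows aligned, here is a minimal, self-contained illustration of the same pattern (plain Python, no Selenium or gspread; the client codes and the failure condition below are made up):

cpfs = ["111", "222-bad", "333"]           # made-up client codes
nomes = []
for cpf in cpfs:
    try:
        if "bad" in cpf:                   # stands in for NoSuchElementException
            raise ValueError("CPF Invalido")
        nome = f"Cliente {cpf}"            # stands in for the scraped value
    except ValueError:
        nome = ""                          # keep the slot, just leave the cell blank
    nomes.append(nome)                     # note: no continue before the append
for i, nome in enumerate(nomes):
    print(f"row {i + 2}: {nome!r}")        # row 2 -> 'Cliente 111', row 3 -> '', row 4 -> 'Cliente 333'

Because every iteration appends exactly one value, index i in each list always maps to spreadsheet row i + 2, whether the client code was valid or not.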

Related

How to scrape a table with selenium?

I'm having a weird issue trying to scrape a table with selenium. For reference, the table is the item table here, although ideally I would like to be able to scrape any item table for any hero on this site.
self.item_table_xpath = '//table[descendant::thead[descendant::tr[descendant::th[contains(text(), "Item")]]]]'

def retrieve_hero_stats(self, url):
    self.driver.get(url)
    try:
        win_rate_span = self.driver.find_element(by=By.XPATH, value='//dd[descendant::*[@class = "won"]]/span')
    except:
        win_rate_span = self.driver.find_element(by=By.XPATH, value='//dd[descendant::*[@class = "lost"]]/span')
    win_rate = win_rate_span.text
    hero_name = url.split('/')[-1]
    values = list()
    for i in range(1, 13):
        values.append({
            'Item Name': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[2]').text,
            'Matches Played': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[3]').text,
            'Matches Won': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[4]').text,
            'Win Rate': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[5]').text
        })
    print(hero_name)
    print(values)
The issue is that the output of the code is inconsistent: sometimes the fields in the values list are populated, and sometimes they are not, and this changes each time I run my code. I don't necessarily need someone to write this code for me (in fact, I'd prefer you didn't); I'm just stumped as to why the output changes every time I run it.
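A common cause of this kind of run-to-run inconsistency is that the table is rendered by JavaScript after the initial page load, so whether the cells exist yet depends on timing. One way to rule that out (a sketch, not a verified fix for this particular site) is to replace the immediate lookups with Selenium's explicit waits; the XPath below is the one from the question:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

item_table_xpath = '//table[descendant::thead[descendant::tr[descendant::th[contains(text(), "Item")]]]]'

def wait_for_item_rows(driver, timeout=10):
    # Block until at least one data cell of the item table exists,
    # instead of reading the cells right after driver.get().
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, item_table_xpath + '/tbody/tr[1]/td[2]'))
    )

If the cells still come back empty after a generous timeout, the cause is probably something other than timing (for example, the XPath matching a different table than expected).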

Show excel data in only one Sheet

I'm having some doubts about the following function. I want it to write the result into a single Excel tab, but I can't get it to.
def create_df_from_table(c, tab, excelWriter):
    list_name = str(c) + "_result_list"
    list_name = []
    for i, each_row in enumerate(each_tab.rows):
        text = (each_cell.text for each_cell in each_row.cells)
        if i == -1:
            keys = tuple(text)
        else:
            each_dict_val = tuple(text)
            list_name.append(each_dict_val)
    list_name_copy = list_name.copy()
    result_df = pd.DataFrame(list_name)
    print(result_df)
    result_df.to_excel(excelWriter, sheet_name=str(c))
    return result_df

excelWriter = pd.ExcelWriter('tablasFromDocx1.xlsx')
for c, each_tab in enumerate(file.tables):
    globals()[f'result_df_{c}'] = create_df_from_table(c, each_tab, excelWriter)
excelWriter.save()
The result_df.to_excel() call above writes the dataframe to Excel, but each table ends up in its own tab, and I need all the data in a single one.
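One way to get everything onto a single tab (a sketch of the idea, not a tested drop-in for the code above) is to collect the per-table DataFrames and write them once, either concatenated or stacked on the same sheet:

import pandas as pd

# frames stands for the DataFrames returned by create_df_from_table for each table.
frames = [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3, 4]})]   # made-up data

with pd.ExcelWriter('tablasFromDocx1.xlsx') as excelWriter:
    combined = pd.concat(frames, ignore_index=True)
    combined.to_excel(excelWriter, sheet_name='all_tables', index=False)

Alternatively, keep sheet_name fixed and pass an increasing startrow= to to_excel for each table if the tables should stay visually separated within the one sheet.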

How to add an auto unfollow module to the following code?

This is Python code that lists the users whom only I follow (i.e. they don't follow me back). I would like to reuse the same list by adding a module for auto-unfollowing:
...
def get_unfollowers(browser):
    """
    Opens the profile, obtains the follower and following list
    Returns the names of the users who are in the following list, but not in the follower list
    """
    to_profile = browser.find_element_by_xpath("//a[contains(@href, '/{}')]".format(username))
    to_profile.click()
    sleep(3)
    to_following = browser.find_element_by_xpath("//a[contains(@href, '/following')]")
    to_following.click()
    following_list = get_name(browser)
    to_followers = browser.find_element_by_xpath("//a[contains(@href, '/followers')]")
    to_followers.click()
    followers_list = get_name(browser)
    not_following_back = [user for user in following_list if user not in followers_list]
    print(not_following_back)  # prints a list with every name separated by a comma

def get_name(browser):
    sleep(2)
    scroll_box = browser.find_element_by_class_name('isgrP')
    p_height, height = 0, 1
    while p_height != height:
        p_height = height
        sleep(2)
        height = browser.execute_script(
            "arguments[0].scrollTo(0, arguments[0].scrollHeight); return arguments[0].scrollHeight;", scroll_box)
    total_list = scroll_box.find_elements_by_tag_name('li')
    names = [name.text for name in total_list if name.text != '']
    close_dub = browser.find_element_by_xpath("/html/body/div[4]/div/div/div[1]/div/div[2]/button")
    close_dub.click()
    return names

get_unfollowers(browser)
This is the code that I wrote, and it works to unfollow the list I want:
def do_unfolow(list):
    for x in list:
        print(x)
        browser.get('https://www.instagram.com/' + x)
        # tempy2 = browser.find_element_by_css_selector(".vBF20._1OSdk")
        # if not tempy2:
        tempy2 = browser.find_element_by_css_selector(".glyphsSpriteFriend_Follow")
        # tempy2 = browser.find_element_by_css_selector("Igw0E.rBNOH.YBx95._4EzTm")
        tempy2.click()
        tempy2 = browser.find_element_by_xpath("//button[contains(text(), 'Unfollow')]")
        tempy2.click()
        sleep(10)
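To hook the two pieces together, one straightforward option (glue code sketched here, not a tested module) is to make get_unfollowers return its list instead of only printing it, and then feed that list into do_unfolow:

# In get_unfollowers, add `return not_following_back` after the print call, then:
def unfollow_non_followers(browser):
    # Glue: collect the users who don't follow back, then unfollow each of them.
    not_following_back = get_unfollowers(browser)
    do_unfolow(not_following_back)

unfollow_non_followers(browser)

Keep in mind that do_unfolow sleeps 10 seconds between unfollows, so driving it from a long list will be slow by design.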

How to convert text table to dataframe

I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linked text file and convert it to a CSV file. Right now I am only half successful: I can locate the table and parse it, but somehow I cannot convert the text table to a standard one. My code is attached. Can someone help me with it?
url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'

# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'

def find_no_line_start_table(table_title, splited_data):
    found_no_lines = []
    for index, line in enumerate(splited_data):
        if table_title in line:
            found_no_lines.append(index)
    return found_no_lines

table_start = find_no_line_start_table(table_title, splited_data)
# I need help with locating the table. If I locate the table using the above function,
# it returns two locations and I have to manually choose the correct one.
table_start = table_start[1]

def get_start_data_table(table_start, splited_data):
    for index, row in enumerate(splited_data[table_start:]):
        if '<C>' in row:
            return table_start + index

def get_end_table(start_table_data, splited_data):
    for index, row in enumerate(splited_data[start_table_data:]):
        if END_TABLE_LINE in row:
            return start_table_data + index

def row(l):
    l = l.split()
    number_columns = 8
    if len(l) >= number_columns:
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if not first_column_done:
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
            else:
                index += 1
                data_row[index] = w
        return data_row

start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line:end_line]

# I also need help with converting the text table to a CSV file;
# somehow the following function does not recognize my columns.
def take_table(table):
    owner = []
    Num_share = []
    middle = []
    middle_1 = []
    middle_2 = []
    middle_3 = []
    prior_offering = []
    after_offering = []
    for r in table:
        data_row = row(r)
        if data_row:
            col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
            owner.append(col_1)
            Num_share.append(col_2)
            middle.append(col_3)
            middle_1.append(col_4)
            middle_2.append(col_5)
            middle_3.append(col_6)
            prior_offering.append(col_7)
            after_offering.append(col_8)
    table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
                  'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
                  'after_offering': after_offering}
    return table_data

# print(table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)
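For reference, a minimal, self-contained illustration of that suggestion (the values below are made up and stand in for dict_table, which maps each column name to a list of cell values):

import pandas as pd

dict_table = {
    'owner': ['Smith, John:', 'Doe, Jane:'],
    'Num_share': ['1,000', '2,500'],
    'prior_offering': ['10%', '25%'],
}

a = pd.DataFrame.from_dict(dict_table)   # one column per key, one row per list element
a.to_csv('trail.csv', index=False)
print(a)

Note that both pd.DataFrame(...) and pd.DataFrame.from_dict(...) require every list in the dict to have the same length; in the code above, lines for which row(r) returns None are skipped by the `if data_row:` check, so the eight lists do stay the same length.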

How to get a large amount of data as fast as possible

I am trying to return an array of constructed objects that are built on top of objects I retrieve from one URL, plus some other fields that I get from another URL.
I have an array that consists of two arrays, each of which has about 8000 objects...
I have tried to make each object construction a thread, but it still takes a lot of time...
Any solution? Here is my code:
def get_all_players_full_data(ea_players_json):
    all = []
    ea_players_json = list(ea_players_json.values())
    for i in range(len(ea_players_json)):
        for player_obj in ea_players_json[i]:
            all.append(player_obj)
    for player_obj in range(len(all)):
        all_data = []
        with concurrent.futures.ThreadPoolExecutor(len(all)) as executor:
            for player_data in all:
                future = executor.submit(build_full_player_data_obj, player_data)
                print(future.result())
                all_data.append(future.result())

def build_full_player_data_obj(ea_player_data):
    if ea_player_data.get("c") is not None:
        player_full_name = ea_player_data.get("c")
    else:
        player_full_name = ea_player_data.get("f") + " " + ea_player_data.get("l")
    player_id = ea_player_data.get("id")

    # go to futhead to find all cards of that player
    futhead_url_player_data = f'{FUTHEAD_PLAYER}{player_full_name}'
    details_of_specific_player = json.loads(requests.get(futhead_url_player_data).content)
    cards_from_the_same_id = []
    for player_in_json_futhead in details_of_specific_player:
        if player_in_json_futhead["player_id"] == player_id:
            rating = player_in_json_futhead["rating"]
            specific_card_id = player_in_json_futhead["def_id"]
            revision = player_in_json_futhead["revision_type"]
            name = player_in_json_futhead["full_name"]
            nation = player_in_json_futhead["nation_name"]
            position = player_in_json_futhead["position"]
            club = player_in_json_futhead["club_name"]
            cards_from_the_same_id.append(Player(specific_card_id, name, rating, revision, nation,
                                                 position, club))
    return cards_from_the_same_id
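One thing worth noting about the threaded loop above: calling future.result() right after executor.submit() blocks until that single task finishes, which effectively serializes the downloads. A sketch of the usual concurrent.futures pattern (schedule everything first, then collect results), assuming build_full_player_data_obj stays as in the question:

import concurrent.futures

def get_all_players_full_data(ea_players_json, max_workers=32):
    # Flatten the two inner arrays into a single list of player objects.
    players = [p for group in ea_players_json.values() for p in group]
    # executor.map schedules all calls up front and yields their results in input
    # order, so the HTTP requests actually run in parallel instead of one by one.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(build_full_player_data_obj, players))

A bounded pool (e.g. max_workers=32) also tends to behave better than one thread per player, which with roughly 16000 players would create far more threads than the network can usefully use.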
