Making function iterative instead of recursive [closed] - python

Closed. This question needs to be more focused. It is not currently accepting answers.
Closed 4 years ago.
This loop is using massive amounts of RAM for a 20 KB text file. Can anyone help me make it iterative instead of recursive? I keep getting recursion errors once it gets into 3-4 GB of RAM usage. I tried using with open to close the stream and make it more Pythonic, but this loop can only read data for about 10 minutes before it quits on me.
def getgameticks():
    gameticksurl = 'https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId=' + variable + '&sectionId=0'
    print(gameticksurl)
    # options = Options()
    # options.add_argument("--headless")
    # browser = webdriver.Firefox()#firefox_options=options)
    browser.get(gameticksurl)
    global wait
    wait = WebDriverWait(browser, 30)
    sleep(3)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(3)
    wait.until(expected_conditions.presence_of_element_located((By.ID, 'listingsPerPage')))
    browser.find_element_by_id('listingsPerPage').click()
    sleep(2)
    select = Select(browser.find_element_by_id('listingsPerPage'))
    select.select_by_visible_text('150')
    gameinfo()

global trip
trip = False

def gameinfo():
    wait.until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="filterBtn"]')))
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    html_doc = browser.page_source
    soup = BeautifulSoup(html_doc, 'html.parser')
    wait.until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="listingPageNumber"]')))
    try:
        select = Select(browser.find_element_by_xpath('//*[@id="listingPageNumber"]'))
        current = select.all_selected_options[0].text
        last = [option.text for option in select.options][-1]
        pronto = False
    except:
        print('Something broke...Getting around it though...')
        gameinfo()
    if current == last:
        global trip
        trip = True
        browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)
        wait.until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="filterBtn"]')))
        browser.find_element_by_xpath('//*[@id="filterBtn"]').click()
        wait.until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="filterBtn"]')))
        gameinfo()
    else:
        wait.until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="listingNextBtn"]')))
        browser.find_element_by_xpath('//*[@id="listingNextBtn"]').click()
        pass
    dir_path = os.path.dirname(os.path.realpath(__file__))
    file_path = (dir_path + '\Sheets')
    try:
        os.makedirs(file_path)
    except:
        pass
    #######################
    for mytable in soup.find_all('table'):
        for trs in mytable.find_all('tr'):
            tds = trs.find_all('td')
            row1 = [elem.text.strip() for elem in tds]
            row = str(row1)
            cool = row.replace("[", "")
            coolp = cool.replace("]", "")
            cool2 = coolp.replace("'", "")
            cool3 = cool2.replace(" , ", "")
            row = cool3
            rowtest = (row.split(','))
            if len(rowtest) != 5:
                rowtest = ['NULL', 'NULL', 'NULL', 'NULL', 'NULL']
                row = (','.join(rowtest))
            rowtest0 = rowtest[:4]   # LISTING WITHOUT DAYS LISTED
            rowtest1 = rowtest[0:1]  # SECTION LOCATION
            rowtest2 = rowtest[1:2]  # TICKET PRICE
            rowtest3 = rowtest[2:3]  # ROW
            rowtest4 = rowtest[3:4]  # TICKET QTY
            rowtest5 = rowtest[4:5]  # DAYS LISTED
            ### TABLE STUFF #
            row0 = (','.join(rowtest0))  # ROW STRING WITHOUT DAYS LISTED
            with open(file_path + '\\' + variable + '.txt', "a+") as openit:
                pass
            # TABLE STUFF
            with open(file_path + '\\' + variable + '.txt', "r+") as file:
                for line in file:
                    linez = (line.split(','))  # LINE AS LIST
                    linezprice = (linez[-3])   # LINE PRICE
                    if row0 + "\n" in line:
                        break
                else:
                    file.write(row0 + "\n")
                    print(row)
                    if trip == False:
                        pass
                    else:
                        slack_token1 = 'xoxb-420561995540-420693438947-JAZmP1pdfg6FkqnTTziPdggr'
                        sc1 = SlackClient(slack_token1)
                        sc1.api_call(
                            "chat.postMessage",
                            channel=channame,
                            text=row
                        )

while True:
    gameinfo()

It seems like you want to continuously scrape some site. Just remove all the calls to gameinfo() besides the one in the endless loop; there's no reason to do this as a recursion.
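A rough sketch of that shape (keeping the question's names; the waiting, paging and parsing logic stays inside gameinfo(), which now simply returns instead of calling itself):

def gameinfo():
    # ... wait for the page, parse the current table, click "next" if there is one ...
    # On the last page or on an error, just return instead of recursing;
    # the loop below will call gameinfo() again.
    return

getgameticks()   # open the browser and set up the first page once
while True:
    gameinfo()   # one call per page, driven by the loop, no recursion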

Related

How to scrape a table with selenium?

I'm having a weird issue trying to scrape a table with selenium. For reference, the table is the item table here, although ideally I would like to be able to scrape any item table for any hero on this site.
self.item_table_xpath = '//table[descendant::thead[descendant::tr[descendant::th[contains(text(), "Item")]]]]'

def retrieve_hero_stats(self, url):
    self.driver.get(url)
    try:
        win_rate_span = self.driver.find_element(by=By.XPATH, value='//dd[descendant::*[@class = "won"]]/span')
    except:
        win_rate_span = self.driver.find_element(by=By.XPATH, value='//dd[descendant::*[@class = "lost"]]/span')
    win_rate = win_rate_span.text
    hero_name = url.split('/')[-1]
    values = list()
    for i in range(1, 13):
        values.append({
            'Item Name': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[2]').text,
            'Matches Played': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[3]').text,
            'Matches Won': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[4]').text,
            'Win Rate': self.driver.find_element(by=By.XPATH, value=self.item_table_xpath + f'/tbody/tr[{i}]' + '/td[5]').text
        })
    print(hero_name)
    print(values)
The issue is that the output of the code is inconsistent: sometimes the fields in the values list are populated and sometimes they are not, and this changes each time I run the code. I don't necessarily need someone to write this code for me (in fact, I'd prefer you didn't); I'm just stumped as to why the output changes every time I run it.

Scraping beach volleyball data on multiple pages

I am trying to scrape all the possible data from this webpage Gstaad 2017
Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.support.ui import Select

# Starts the driver and goes to our starting webpage
driver = webdriver.Chrome("C:/Users/aldi/Downloads/chromedriver.exe")
driver.get('http://www.bvbinfo.com/Tournament.asp?ID=3294&Process=Matches')
# Imports HTML into python
page = requests.get('http://www.bvbinfo.com/Tournament.asp?ID=3294&Process=Matches')
soup = BeautifulSoup(driver.page_source, 'lxml')
stages = soup.find_all('div')
stages = driver.find_elements_by_class_name('clsTournBracketHeader')[-1].text

# TODO the first row (country quota matches) has no p tag and therefore it is not included in the data
rows = []
paragraphs = []
empty_paragraphs = []
for x in soup.find_all('p'):
    if len(x.get_text(strip=True)) != 0:
        paragraph = x.extract()
        paragraphs.append(paragraph)
    if len(x.get_text(strip=True)) == 0:
        empty_paragraph = x.extract()
        empty_paragraphs.append(empty_paragraph)

# players
home_team_player_1 = ''
home_team_player_2 = ''
away_team_player_1 = ''
away_team_player_2 = ''

for i in range(0, len(paragraphs)):
    # round and stage of the competition
    round_n = paragraphs[i].find('u').text
    paragraph_rows = paragraphs[i].text.split('\n')[1:-1]
    counter = 0
    for j in range(0, len(paragraph_rows)):
        # TODO tournament info, these can vary from tournament to tournament
        tournament_info = soup.find('td', class_='clsTournHeader').text.strip().split()
        tournament_category = [' '.join(tournament_info[0:2])][0]
        tournament_prize_money = tournament_info[2]
        # TODO tournament city can also have two elements, not just one
        tournament_city = tournament_info[3]
        tournament_year = tournament_info[-1]
        tournament_days = tournament_info[-2][:-1].split("-")
        tournament_starting_day = tournament_days[0]
        tournament_ending_day = tournament_days[-1]
        tournament_month = tournament_info[-3]
        tournament_stars = [' '.join(tournament_info[5:7])][0]
        players = paragraphs[i].find_all('a', {'href': re.compile('.*player.*')})
        home_team_player_1 = players[counter + 0].text
        home_team_player_2 = players[counter + 1].text
        away_team_player_1 = players[counter + 2].text
        away_team_player_2 = players[counter + 3].text
        # matches
        match = paragraph_rows[j].split(":")[0].split()[-1].strip()
        # nationalities
        nationalities = ["United", "States"]
        if paragraph_rows[j].split("def.")[0].split("/")[1].split("(")[0].split(" ")[3] in nationalities:
            home_team_country = "United States"
        else:
            home_team_country = paragraph_rows[j].split("def.")[0].split("/")[1].split("(")[0].split(" ")[-2]
        if paragraph_rows[j].split("def.")[1].split("/")[1].split(" ")[3] in nationalities:
            away_team_country = "United States"
        else:
            away_team_country = paragraph_rows[j].split("def.")[1].split("/")[1].split("(")[0].split(" ")[-2]
        parentheses = re.findall(r'\(.*?\)', paragraph_rows[j])
        if "," in parentheses[0]:
            home_team_ranking = parentheses[0].split(",")[0]
            home_team_ranking = home_team_ranking[1:-1]
            home_team_qualification_round = parentheses[0].split(",")[1]
            home_team_qualification_round = home_team_qualification_round[1:-1]
        else:
            home_team_ranking = parentheses[0].split(",")[0]
            home_team_ranking = home_team_ranking[1:-1]
            home_team_qualification_round = None
        if "," in parentheses[1]:
            away_team_ranking = parentheses[1].split(",")[0]
            away_team_ranking = away_team_ranking[1:-1]
            away_team_qualification_round = parentheses[1].split(",")[1]
            away_team_qualification_round = away_team_qualification_round[1:-1]
        else:
            away_team_ranking = parentheses[1].split(",")[0]
            away_team_ranking = away_team_ranking[1:-1]
            match_duration = parentheses[2]
            match_duration = match_duration[1:-1]
            away_team_qualification_round = None
        # sets
        sets = re.findall(r'\).*?\(', paragraph_rows[j])
        sets = sets[1][1:-1]
        if len(sets.split(",")) == 2:
            score_set1 = sets.split(",")[0]
            score_set2 = sets.split(",")[1]
            score_set3 = None
        if len(sets.split(",")) == 3:
            score_set1 = sets.split(",")[0]
            score_set2 = sets.split(",")[1]
            score_set3 = sets.split(",")[2]
        row = {"home_team_player_1": home_team_player_1,
               "home_team_player_2": home_team_player_2,
               "away_team_player_1": away_team_player_1,
               "away_team_player_2": away_team_player_2,
               "match": match,
               "home_team_country": home_team_country,
               "away_team_country": away_team_country,
               "home_team_ranking": home_team_ranking,
               "away_team_ranking": away_team_ranking,
               "match_duration": match_duration,
               "home_team_qualification_round": home_team_qualification_round,
               "away_team_qualification_round": away_team_qualification_round,
               "score_set1": score_set1,
               "score_set2": score_set2,
               "score_set3": score_set3,
               "tournament_category": tournament_category,
               "tournament_prize_money": tournament_prize_money,
               "tournament_city": tournament_city,
               "tournament_year": tournament_year,
               "tournament_starting_day": tournament_starting_day,
               "tournament_ending_day": tournament_ending_day,
               "tournament_month": tournament_month,
               "tournament_stars": tournament_stars,
               "round_n": round_n
               }
        counter += 4
        rows.append(row)

data = pd.DataFrame(rows)
data.to_csv("beachvb.csv", index=False)
I am not really experienced in web scraping; I have just started teaching myself, and I find the HTML source code quite messy and poorly structured.
I want to improve my code in two ways:
Include all the missing matches (country quota matches, semifinals, bronze medal, and gold medal) and the respective category for each match (country quota matches, pool, winner's bracket, semifinals, bronze medal, and gold medal)
Iterate the code over more years and tournaments from the dropdown menu at the top of the webpage
I have tried to iterate through different years, but my code does not work:
tournament_years = {"FIVB 2015", "FIVB 2016"}
dfs = []
for year in tournament_years:
    # select desired tournament
    box_year = Select(driver.find_element_by_xpath("/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[1]/td[2]/select"))
    box_year.select_by_visible_text(year)
    box_matches = Select(driver.find_element_by_xpath("/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[2]/td[2]/select"))
    box_matches.select_by_visible_text("Matches")
The main idea was to create a list of dataframes for each year and each tournament by adding a new loop at the beginning of the code, as in the sketch below.
If someone has a better idea or technique to do so, it would be really appreciated!
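A minimal sketch of that outer loop (hypothetical: it assumes the parsing code above is wrapped in a helper scrape_current_tournament(driver) that returns a DataFrame, and that the dropdown labels really read "FIVB 2015" and "FIVB 2016"):

from selenium.webdriver.support.ui import Select
import pandas as pd

tournament_years = ["FIVB 2015", "FIVB 2016"]  # hypothetical dropdown labels
dfs = []
for year in tournament_years:
    # Re-locate the year dropdown each time, since changing it reloads the page.
    year_box = Select(driver.find_element_by_xpath(
        "/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[1]/td[2]/select"))
    year_box.select_by_visible_text(year)
    matches_box = Select(driver.find_element_by_xpath(
        "/html/body/table[3]/tbody/tr/td/table[1]/tbody/tr[2]/td[2]/select"))
    matches_box.select_by_visible_text("Matches")
    dfs.append(scrape_current_tournament(driver))  # hypothetical helper

data = pd.concat(dfs, ignore_index=True)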

Only after performing the click method on the WebElement I am getting selenium.common.exceptions.StaleElementReferenceException

I have been scraping data from https://jamabandi.nic.in/land%20records/NakalRecord; the code is below.
After clicking on one option in ownerName_options, I am unable to click or get the text of the next option. I have tried WebDriverWait, but it's still not working. I can get the info if I just extract the data, but I am unable to get data when performing a click() operation. Please help me here.
# ################### FOR DISTRICT NAMES AND SUBMITTING IN THE FORMS ###################
districts = {}
dist_names = driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddldname')
dist_options = [x for x in dist_names.find_elements_by_tag_name("option")]
for option in dist_options:
    districts[option.get_attribute('value')] = option.text
    # districts.append(option.text)
del districts[next(iter(districts))]
print(districts)
for dist_key, dist_value in districts.items():
    dist_select = Select(driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddldname'))
    dist_select.select_by_visible_text(dist_value)
    time.sleep(1)
    tehsils = {}
    tehsil_names = driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddltname')
    tehsil_options = [x for x in tehsil_names.find_elements_by_tag_name("option")]
    for option in tehsil_options:
        tehsils[option.get_attribute('value')] = option.text
        # districts.append(option.text)
    del tehsils[next(iter(tehsils))]
    print(tehsils)
    for tehsil_key, tehsil_value in tehsils.items():
        tehsil_select = Select(driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddltname'))
        tehsil_select.select_by_visible_text(tehsil_value)
        time.sleep(1)
        # ******************** FOR VILLAGE NAMES WITH VALUE ********************
        villages = {}
        village_names = driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddlvname')
        village_options = [x for x in village_names.find_elements_by_tag_name("option")]
        for option in village_options:
            villages[option.get_attribute('value')] = option.text
            time.sleep(0.2)
        del villages[next(iter(villages))]
        print(villages)
        for villa_key, villa_value in villages.items():
            village_select = Select(driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddlvname'))
            village_select.select_by_visible_text(villa_value)
            time.sleep(1)
            # ******************** FOR Years ********************
            years = []
            year_select = driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddlPeriod')
            year_options = [x for x in year_select.find_elements_by_tag_name("option")]
            for option in year_options:
                years.append(option.text)
                time.sleep(1)
            years.pop(0)
            print(years)
            for year in years:
                year_select = Select(driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddlPeriod'))
                year_select.select_by_visible_text(year)
                time.sleep(1)
                dict_header = True
                # ******************** FOR OWNER NAMES WITH VALUE ********************
                owners = {}
                owner_names = driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddlOwner')
                owner_options = [x for x in owner_names.find_elements_by_tag_name("option")]
                for option in owner_options:
                    owners[option.get_attribute('value')] = option.text
                    time.sleep(0.2)
                del owners[next(iter(owners))]
                print(owners)
                for owner_key, owner_value in owners.items():
                    owner_select = Select(driver.find_element_by_id('ctl00_ContentPlaceHolder1_ddlOwner'))
                    owner_select.select_by_visible_text(owner_value)
                    time.sleep(2)
                    # ******************** FOR Names in the box ********************
                    ownerNames = []
                    ownerName_select = driver.find_element_by_id('ctl00_ContentPlaceHolder1_ListBox1')
                    ownerName_options = [x for x in ownerName_select.find_elements_by_tag_name("option")]
                    time.sleep(1)
                    for option in ownerName_options:
                        time.sleep(3)
                        val = option.text
                        if val.find('??') != -1:
                            continue
                        print("text in the box = ", val)
                        option.click()
                        # time.sleep(2)
                        # driver.switch_to.default_content()
                        print(driver.current_url)
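Not from the question, just a pattern that sometimes avoids the stale reference: re-locate the list box and the option by index on every pass, instead of holding on to option elements across a click that refreshes the DOM. A sketch using the same element id:

# Sketch: re-find the list box and the idx-th option after every click,
# so no reference is reused after the page refresh has invalidated it.
list_box = driver.find_element_by_id('ctl00_ContentPlaceHolder1_ListBox1')
option_count = len(list_box.find_elements_by_tag_name("option"))
for idx in range(option_count):
    list_box = driver.find_element_by_id('ctl00_ContentPlaceHolder1_ListBox1')
    option = list_box.find_elements_by_tag_name("option")[idx]
    val = option.text
    if '??' in val:
        continue
    print("text in the box = ", val)
    option.click()
    time.sleep(1)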

How to optimize a Selenium webdriver crawler?

So, I have to crawl a table on each web page of a website. There are 324 web pages (meaning 324 tables), and each table has 1000 rows and 7 columns, but one column is useless and I didn't use it.
The code is kind of okay, but the problem is that it's very slow and takes a lot of time. I was wondering if I could make some changes to the code to speed it up!
Here's the code:
driver = webdriver.Chrome('./chromedriver.exe')
driver.get('https://beheshtezahra.tehran.ir/Default.aspx?tabid=92')
driver.maximize_window()
part_count = 1
li = []
for i in range(0, 324):
    start = timeit.default_timer()
    firstname = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='dnn$ctr1877$DeadSearch$txtname']")))
    lastname = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='dnn$ctr1877$DeadSearch$txtFamily']")))
    part = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='dnn$ctr1877$DeadSearch$txtPart']")))
    firstname.clear()
    firstname.send_keys("%")
    lastname.clear()
    lastname.send_keys("%")
    part.clear()
    part.send_keys(str(part_count))
    driver.find_element_by_xpath('//*[@id="dnn_ctr1877_DeadSearch_btnSearch"]').click()
    print('Saving the information..')
    first_name = driver.find_elements_by_xpath('//table/tbody/tr/td[2]')
    last_name = driver.find_elements_by_xpath('//table/tbody/tr/td[3]')
    fathers_name = driver.find_elements_by_xpath('//table/tbody/tr/td[4]')
    birth_date = driver.find_elements_by_xpath('//table/tbody/tr/td[5]')
    death_date = driver.find_elements_by_xpath('//table/tbody/tr/td[6]')
    grave_info = driver.find_elements_by_xpath('//table/tbody/tr/td[7]')
    print('Appending the information..')
    for j in range(0, 1000):
        li.append(first_name[j].text)
        li.append(last_name[j].text)
        li.append(fathers_name[j].text)
        li.append(birth_date[j].text)
        li.append(death_date[j].text)
        li.append(grave_info[j].text)
    print('Page ' + str(part_count) + ' is crawled!')
    stop = timeit.default_timer()
    part_count += 1
    print('Time: ', stop - start)
And in the end, I wrote the list into a CSV file. Any suggestions would be appreciated!
What you could do after the print('Saving the information..') part:
print('Saving the information..')
page_snapshot = lxml.html.document_fromstring(driver.page_source)
first_name = page_snapshot.xpath('//table/tbody/tr/td[2]')
last_name = page_snapshot.xpath('//table/tbody/tr/td[3]')
fathers_name = page_snapshot.xpath('//table/tbody/tr/td[4]')
birth_date = page_snapshot.xpath('//table/tbody/tr/td[5]')
death_date = page_snapshot.xpath('//table/tbody/tr/td[6]')
grave_info = page_snapshot.xpath('//table/tbody/tr/td[7]')
print('Appending the information..')
for j in range(0, 1000):
    li.append(first_name[j].text)
    li.append(last_name[j].text)
    li.append(fathers_name[j].text)
    li.append(birth_date[j].text)
    li.append(death_date[j].text)
    li.append(grave_info[j].text)
lxml is blazing fast; just import lxml.html (after pip install, of course :)) and make sure the page is completely loaded before you grab the snapshot.
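For instance, one way to make sure of that before grabbing the snapshot (a sketch, assuming the same result-table XPaths as above):

import lxml.html
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Wait until the result rows exist, then parse a single snapshot with lxml.
WebDriverWait(driver, 30).until(
    EC.presence_of_all_elements_located((By.XPATH, '//table/tbody/tr/td[2]')))
page_snapshot = lxml.html.document_fromstring(driver.page_source)
first_name = [td.text_content() for td in page_snapshot.xpath('//table/tbody/tr/td[2]')]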

Python reach variable inside loop [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Closed 3 years ago.
I am writing a function to get the length of words and count them. Inside this function, I tried to take the intersection of two dictionaries, but I couldn't reach one of them properly.
My problem is on line 37 (before the last line): I can't reach di_Title, which is defined on line 13 as di_Title = dict().
I have tried a global variable, but it didn't work for me!
My function:
def Text_Analyze(Raw_Text):
    Title_Length = []
    Title_Dictionary = []
    Article_Length = []
    Article_Dictionary = []
    Intersection_Dictionary = []
    Title_Info = Raw_Text.PageTitle
    for Each_Line in Title_Info:
        Title = remove_punctuation(Each_Line)
        Title = Title.lower()
        Title = Title.split()
        Title_Length.append(len(Title))
        di_Title = dict()
        for w_Title in Title:
            Root_Title_Split = TurkishStemmer()
            Root_Title_Word = Root_Title_Split.stem(w_Title)
            if Root_Title_Word in di_Title:
                di_Title[Root_Title_Word] = di_Title[Root_Title_Word] + 1
            else:
                di_Title[Root_Title_Word] = 1
        Title_Dictionary.append(di_Title)
    Article_Info = Raw_Text.PageArticle
    for each_Line in Article_Info:
        Article = remove_punctuation(each_Line)
        Article = Article.lower()
        Article = Article.split()
        Article_Length.append(len(Article))
        di_Article = dict()
        for w_Article in Article:
            root_Article_Split = TurkishStemmer()
            root_Article_Word = root_Article_Split.stem(w_Article)
            if root_Article_Word in di_Article:
                di_Article[root_Article_Word] = di_Article[root_Article_Word] + 1
            else:
                di_Article[root_Article_Word] = 1
        Article_Dictionary.append(di_Article)
        Int_Word_Dic = intersect(di_Title, di_Article)
        Intersection_Dictionary.append(Int_Word_Dic)
A variable that is only assigned inside a loop will be undefined if the loop never runs, and after the loop it only holds the value from its last iteration.
You can initialize it at a higher scope (in your case, before the loop) and change its value depending on your needs.
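A minimal, self-contained sketch of that idea (toy data, not the original code):

# Initialize the dict before the loop so it exists afterwards,
# even if the loop body never runs.
di_Title = {}
for word in ["a", "b", "a"]:
    di_Title[word] = di_Title.get(word, 0) + 1
print(di_Title)  # {'a': 2, 'b': 1} -- di_Title is still reachable here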
I don't fully get your question, but this might do it, as long as each Title_Info row matches up with the same row in Article_Info:
def Text_Analyze(Raw_Text):
    Title_Length = []
    Title_Dictionary = []
    Article_Length = []
    Article_Dictionary = []
    Intersection_Dictionary = []
    Title_Info = Raw_Text.PageTitle
    for Each_Line in Title_Info:
        Title = remove_punctuation(Each_Line)
        Title = Title.lower()
        Title = Title.split()
        Title_Length.append(len(Title))
        di_Title = dict()
        for w_Title in Title:
            Root_Title_Split = TurkishStemmer()
            Root_Title_Word = Root_Title_Split.stem(w_Title)
            if Root_Title_Word in di_Title:
                di_Title[Root_Title_Word] = di_Title[Root_Title_Word] + 1
            else:
                di_Title[Root_Title_Word] = 1
        Title_Dictionary.append(di_Title)
    Article_Info = Raw_Text.PageArticle
    for c, each_Line in enumerate(Article_Info):
        Article = remove_punctuation(each_Line)
        Article = Article.lower()
        Article = Article.split()
        Article_Length.append(len(Article))
        di_Article = dict()
        for w_Article in Article:
            root_Article_Split = TurkishStemmer()
            root_Article_Word = root_Article_Split.stem(w_Article)
            if root_Article_Word in di_Article:
                di_Article[root_Article_Word] = di_Article[root_Article_Word] + 1
            else:
                di_Article[root_Article_Word] = 1
        Article_Dictionary.append(di_Article)
        Int_Word_Dic = intersect(Title_Dictionary[c], di_Article)
        Intersection_Dictionary.append(Int_Word_Dic)
