I have developed the following code. Normally it works fine, but sometimes, while downloading the records (record()), it fails because the server stops responding or the internet disconnects. To keep the script from crashing I used try/except, but then I cannot break out of record() and the outer function at the same time, so I am looking for a solution. The code around line 197 doesn't help in continuing from the outer while loop. The code is here:
import glob
import datetime
import cv2
import base64
from PIL import Image
from io import BytesIO
import time
import selenium
from pytesseract import pytesseract
from selenium.webdriver.common.keys import Keys
import os
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException, \
WebDriverException, ElementNotInteractableException, UnexpectedAlertPresentException
main_Directory = r'/home/sangharshmanuski/Documents/e_courts/mha/downloads4'
log_Directory = r'/home/sangharshmanuski/Documents/e_courts/mha/log'
driver = selenium.webdriver.Firefox()
url = r'https://districts.ecourts.gov.in/'
driver.get(url)
# create wait variables for regular and short waits
wait = WebDriverWait(driver, 180)
waitShort = WebDriverWait(driver, 3)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#sateist > option:nth-child(22)")))
select = Select(driver.find_element_by_css_selector('#sateist'))
options = select.options
select.select_by_visible_text('Maharashtra')
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.region')))
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#sateist')))
districtListDropdown = Select(driver.find_element_by_css_selector("#sateist"))
distOptions = districtListDropdown.options
# iterate over each district
i = 1
while i < len(distOptions):
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#sateist')))
        newDistDropDown = Select(driver.find_element_by_css_selector("#sateist"))
    except:
        continue
    newDistOptions = newDistDropDown.options
    distName = newDistOptions[i].text
    print(distName)
    newDistDropDown.select_by_index(i)
    # create a directory for each district
    district_directory = os.path.join(main_Directory, distName)
    if not os.path.exists(district_directory):  # if the directory does not exist, create it
        os.mkdir(district_directory)
    district_log_directory = os.path.join(log_Directory, distName)
    if not os.path.exists(district_log_directory):  # if the directory does not exist, create it
        os.mkdir(district_log_directory)
    headingDist = driver.find_element_by_css_selector('.heading')
    if headingDist.text.lower() == distName.lower():
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.accordion2:nth-child(2)'))).click()
        current = driver.window_handles[0]
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR,
             'div.panel:nth-child(3) > ul:nth-child(1) > li:nth-child(6) > a:nth-child(1)'))).click()
        # wait until the new tab opens
        wait.until(EC.number_of_windows_to_be(2))
        # identify the new tab by excluding the current one
        newWindow = [window for window in driver.window_handles if window != current][0]
        # switch to the new tab. ref: https://stackoverflow.com/questions/41571217/python-3-5-selenium-how-to-handle-a-new-window-and-wait-until-it-is-fully-lo
        driver.switch_to.window(newWindow)
        # wait till the court complex list appears
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#court_complex_code')))
        # create list of all court complexes
        # 2 approaches - 1 select 2 click
        time.sleep(3)

        def complex_and_act():
            this = driver.current_window_handle

            def imgtotxt():
                elem = driver.find_element_by_id("captcha_image")
                loc = elem.location
                size = elem.size
                left = loc['x']
                top = loc['y']
                width = size['width']
                height = size['height']
                box = (int(left), int(top), int(left + width), int(top + height))
                screenshot = driver.get_screenshot_as_base64()
                img = Image.open(BytesIO(base64.b64decode(screenshot)))
                area = img.crop(box)
                area.save('/home/sangharshmanuski/Documents/e_courts/captcha/file_trial.png', 'PNG')
                fullPath = r'/home/sangharshmanuski/Documents/e_courts/captcha'
                f = os.listdir(fullPath)
                desPath = r"/home/sangharshmanuski/Documents/e_courts/editC"
                img = cv2.imread(os.path.join(fullPath, 'file_trial.png'), 0)
                ret, thresh1 = cv2.threshold(img, 111, 255, cv2.THRESH_BINARY)
                cv2.imwrite('/home/sangharshmanuski/Documents/e_courts/editC/oneDisNoLoop.png', thresh1)
                # read the captcha text with pytesseract
                captchaText = pytesseract.image_to_string(
                    Image.open('/home/sangharshmanuski/Documents/e_courts/editC/oneDisNoLoop.png'))
                captcha = driver.find_element_by_id('captcha')
                captcha.send_keys(captchaText)
                driver.find_element_by_css_selector('input.button:nth-child(1)').click()
                time.sleep(1)

            def proceed():
                while True:
                    try:
                        waitShort.until(EC.alert_is_present())
                        driver.switch_to.alert.accept()
                        driver.switch_to.window(this)
                        driver.find_element_by_css_selector(
                            '#captcha_container_2 > div:nth-child('
                            '1) > div:nth-child(1) > span:nth-child(3) > a:nth-child(7) > img:nth-child(1)').click()
                        log_file = open(os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                        log_file.write('alert was present' + '\n')
                        print('alert was present')
                        imgtotxt()
                    except:
                        # if the wait message is showing, wait up to 7 seconds
                        log_file = open(os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                        log_file.write('no alert' + '\n')
                        print('no alert')
                        waitmsg = 0
                        while driver.find_element_by_css_selector('#waitmsg').is_displayed():
                            if waitmsg < 7:
                                log_file = open(
                                    os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                                log_file.write('wait' + '\n')
                                print('waitmsg')
                                time.sleep(1)
                                waitmsg += 1
                            else:
                                log_file = open(os.path.join(
                                    log_Directory, nameCourtComp + '.txt'), 'a')
                                log_file.write('waiting finished' + '\n')
                                print('waiting finished')
                                break
                        invalidCaptcha = "Invalid Captcha"
                        norecord = "Record Not Found"
                        try:
                            waitShort.until(
                                EC.presence_of_element_located((By.CSS_SELECTOR, '#errSpan > p:nth-child(1)')))
                            incorrect = driver.find_element_by_css_selector('#errSpan > p:nth-child(1)').text
                            if incorrect == invalidCaptcha:
                                log_file = open(
                                    os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                                log_file.write('Invalid Captcha' + '\n')
                                print('invalid captcha')
                                imgtotxt()
                                continue
                            else:
                                if incorrect == norecord:
                                    log_file = open(
                                        os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                                    log_file.write('Record not Found' + '\n')
                                    return print('record not found')
                        except:
                            pass

                        def record():
                            log_file = open(
                                os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                            log_file.write('Record Found' + '\n')
                            print('record fun started')
                            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.someclass')))
                            listAllView = driver.find_elements_by_css_selector('a.someclass')
                            # make a new directory named after the court complex
                            distDir2 = os.path.join(main_Directory, distName, nameCourtComp)
                            if not os.path.exists(distDir2):
                                os.makedirs(distDir2)
                            x = 0
                            for view in listAllView:
                                try:
                                    view.click()
                                    wait.until(EC.presence_of_element_located((By.ID, 'back_top')))
                                    openFile = open(
                                        os.path.join(distDir2, "file_" + str(x) + ".html"), "w")
                                    openFile.write(driver.page_source)
                                    openFile.close()
                                    back = driver.find_element_by_id('back_top')
                                    back.click()
                                    x += 1
                                except (TimeoutException, ElementNotInteractableException):
                                    driver.refresh()
                                    log_file = open(
                                        os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                                    log_file.write(
                                        'While Downloading record for '
                                        + nameCourtComp + ' error occurred, retrying now...' + '\n')
                                    nonlocal courtComp
                                    courtComp -= 1
                                    return print(
                                        'While Downloading record for '
                                        + nameCourtComp + ' error occurred, retrying now...')
                            log_file = open(
                                os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                            log_file.write('record completed, ' + str(x) + ' records found' + '\n')
                            print('record completed, ' + str(x) + ' records found')
                            return

                        record()
                        return

            courtComp = 1
            courtComplexDownload = Select(
                driver.find_element_by_css_selector('#court_complex_code'))
            courtComplexDownloadList = courtComplexDownload.options
            courtComplexLen = len(courtComplexDownloadList)
            while courtComp < courtComplexLen:
                nameCourtComp = courtComplexDownloadList[courtComp].text
                log_file = open(os.path.join(log_Directory, nameCourtComp + '.txt'), 'w')
                log_file.write(nameCourtComp + '\n' + '\n')
                print(nameCourtComp)
                courtComplexDownload.select_by_index(courtComp)
                acts = Select(driver.find_element_by_css_selector('#actcode'))
                actsOpt = acts.options
                act = 0
                while len(actsOpt) < 2:
                    if act < 10:
                        time.sleep(1)
                        act += 1
                    else:
                        # if there is no list to populate, break out of this loop & go to the next complex
                        raise Exception()
                try:
                    acts.select_by_value('33')
                except NoSuchElementException:
                    print('PoA not applicable')
                    log_file = open(
                        os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
                    log_file.write('No PoA' + '\n')
                    courtComp += 1
                    continue
                imgtotxt()
                proceed()
                courtComp += 1

        complex_and_act()
        driver.close()
        print("all court complexes in " + distName + " completed")
        driver.switch_to.window(current)
        driver.back()
    else:
        time.sleep(5)
        continue
    i += 1
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#sateist > option:nth-child(22)")))
    select = Select(driver.find_element_by_css_selector('#sateist'))
    options = select.options
    select.select_by_visible_text('Maharashtra')
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.region')))
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#sateist')))
This is exactly what exceptions are for. Whenever you raise an exception, it's going to go to the containing scope, and if that scope doesn't catch it, it'll stop whatever it's doing and the exception will go up to the next scope, et cetera. It's a very easy way to make sure that if something goes wrong, you can break out of multiple levels of loop or multiple levels of function call without having to have each level check a return value.
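For instance, here is a tiny self-contained illustration of that propagation (nothing selenium-specific; the function names are made up):

def level3():
    raise ValueError('something went wrong')

def level2():
    level3()  # no try/except here, so the exception passes straight through

def level1():
    level2()

try:
    level1()
except ValueError as e:
    print('caught at the top:', e)  # prints: caught at the top: something went wrong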
When you swallow an exception, as you're doing here:
except (TimeoutException, ElementNotInteractableException):
    ...
    return print(
        'While Downloading record for '
        + nameCourtComp + ' error occurred, retrying now...')
you're thwarting that process. (Note that you're returning None, so the caller gets no information whatsoever!) Instead, maybe do something like:
except (TimeoutException, ElementNotInteractableException):
    ...
    print(
        'While Downloading record for '
        + nameCourtComp + ' error occurred, retrying now...')
    raise
The raise will just re-raise the same exception up to the next level, where they can do their own handling. Alternatively, you could create your own exception (maybe with more information) and raise that instead; it's completely valid to catch an exception and then raise a different type of exception as a way of translating it for the caller.
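To make that concrete, here is a minimal sketch of the custom-exception option: the inner function translates the selenium error into its own exception, and the outer loop catches it and decides whether to retry. The trimmed-down bodies below are illustrative, not your exact code:

from selenium.common.exceptions import TimeoutException, ElementNotInteractableException

class RecordDownloadError(Exception):
    """Raised when downloading records for a court complex fails mid-way."""
    def __init__(self, court_complex):
        super().__init__('download failed for ' + court_complex)
        self.court_complex = court_complex

def record(nameCourtComp):
    try:
        ...  # click each view link and save the page, as in your code
    except (TimeoutException, ElementNotInteractableException):
        driver.refresh()
        # translate the selenium error for the caller instead of returning None
        raise RecordDownloadError(nameCourtComp)

courtComp = 1
while courtComp < courtComplexLen:
    nameCourtComp = courtComplexDownloadList[courtComp].text
    try:
        record(nameCourtComp)
    except RecordDownloadError:
        continue  # retry the same court complex without advancing the counter
    courtComp += 1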
Related
I've got the following code, which I'm running in Spyder to extract comments using the Chrome webdriver.
I am getting this error message: bad operand type for unary -: 'WebElement'
Could you please help me fix this problem?
Thanks in advance.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from sys import exit

def log(log_text):
    log_text = str(time.strftime("%Y.%m.%d %H:%M:%S")) + " ➾ " + log_text
    print(log_text)
    log_file = open("log.txt", "a", encoding='utf-8')
    log_file.write(log_text + "\n")
    log_file.close()

global_delay = 0.5
driver = webdriver.Chrome()
log('Bu program Can Tarafından Yapılmıştır.')
log('https://fastuptime.com ve https://speedsmm.com üzerinden bize ulaşabilirsiniz.')
log('Program başlatıldı')
urun_url = input("Ürün Linki Sonuna / Koymayın: ")  # e.g. 'https://www.trendyol.com/trendypassion/sirt-pusula-baskili-tshirt-p-260271556' -- product URL, do NOT put a trailing /
try:
    driver.get(urun_url + "/yorumlar")
    time.sleep(5)
    kac_yorum_var = -driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[3]/div/div/div[2]/div/div[2]/div[1]/div/div[2]/span[2]')
    kac_yorum_var = kac_yorum_var.replace(" Yorum", "")
    log('Toplam ' + kac_yorum_var + ' yorum var.')
    for i in range(int(kac_yorum_var)):
        try:
            yorum = driver.find_element_by_xpath('/html/body/div[1]/div[3]/div/div/div[2]/div/div[2]/div[3]/div[4]/div[' + str(i) + ']/div[1]/div/p').text
            log('Yorum: ' + yorum)
            yorum_file = open("yorumlar.txt", "a", encoding='utf-8')
            yorum_file.write(yorum + "\n")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(global_delay)
        except:
            continue
except Exception as e:
    log('Hata: ' + str(e))

log('Program sonlandı')
driver.quit()
exit()
I think I have to change something in the url link but I couldn't find an answer on here.
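The error message itself points elsewhere, though: the stray unary minus in front of driver.find_element tries to negate a WebElement, and replace() needs a string anyway, so the line presumably should read the element's .text instead. A hedged sketch of the fix, reusing the same XPath as above:

# Hedged fix: drop the stray '-' and read .text before calling replace(),
# which only works on strings.
kac_yorum_var = driver.find_element(
    by=By.XPATH,
    value='/html/body/div[1]/div[3]/div/div/div[2]/div/div[2]/div[1]/div/div[2]/span[2]').text
kac_yorum_var = kac_yorum_var.replace(" Yorum", "")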
I have created a simple automation which first reads the subdomain folder, then goes into it and looks for a specific html file. If it finds it (i.e. .html), it should edit it; I want to add a specific tag somewhere in that file, but I am unable to do it.
staticWord = "Hair"
htmlTag = "<a href='" + staticRandomPathList[0] + "'>" + staticWord + "</a>"
print(htmlTag)
# Now on we are working statically
folderfound = 0
filefound = 0
for domainNameFolder in range(len(staticRandomPathList)):
    subDomainSelectedFilesAddress = driver.find_element(By.XPATH, "//table/tbody/tr[" + str(domainNameFolder + 1) + "]/td[" + str(1) + "]")
    subDomainName = new_list[domainNameFolder] + '.' + domain_list[domain_variable]
    if subDomainSelectedFilesAddress.text == "logs" or subDomainSelectedFilesAddress.text == "public_html":
        continue
    else:
        if subDomainSelectedFilesAddress.text == "test1.testlab.com":
            action = ActionChains(driver)
            action.double_click(subDomainSelectedFilesAddress).perform()
            time.sleep(1)
            for file in range(0, 10):
                time.sleep(1)
                selectedFile = driver.find_element(By.XPATH, "//table/tbody/tr[" + str(
                    file + 1) + "]/td[" + str(1) + "]")
                if selectedFile.text == "5.html":
                    selectedFile.click()
                    editFile = driver.find_element(By.XPATH, "//a[@ng-click='showHTMLEditorModal()']")
                    editFile.click()
                    # addHtmlTag = WebDriverWait(driver, 20).until(
                    #     EC.visibility_of_element_located((By.CLASS_NAME, "ace_content")))
                    # insertAnchorTag = driver.find_element(By.CLASS_NAME, "ace_content")
                    # insertAnchorTag.click()
                    #
                    time.sleep(2)
                    textinput = driver.find_element(By.CLASS_NAME, "ace_text-layer")
                    print(textinput.text)
                    gettingTextFromServer = textinput.text
                    Html_file = open("HTMLParsing.html", "w")
                    newHTMLFile = Html_file.write(gettingTextFromServer)
                    html = newHTMLFile
                    print(html)
                    # soup = Soup(html)
                    # bodyTag = soup.find('body')
                    # anchor = soup.new_tag('a')
                    # anchor['href'] = staticRandomPathList[0]
                    # bodyTag.insert(anchor)
                    Html_file.close()
                    # print(insertAnchorTag.text)
                    # mapHTMLTag = driver.find_element(By.ID, "id='htmlEditorContent'")
                    # mapHTMLTag.send_keys(htmlTag)
                    # addHtmlTag.send_keys(htmlTag)
                    filefound = 1
                    break
                else:
                    continue
            if filefound == 1:
                break
            folderfound = 1
            break
        else:
            continue
print("Successfully Outside Loop")
I am attaching the picture so you would be able to see where I want to place that tag.
This is nodejs selenium code that works. Maybe you can reproduce it in python. Don't forget to navigate to a page before editing the page html. Good luck.
async navigateToHomePage() {
    logger.info(`inside navigateToHomePage`)
    await driver.get(this.baseUrl)
}

async insertHtmlIntoDocumentBody(html) {
    logger.info(`inside insertHtmlIntoDocumentBody`)
    let htmlElement = driver.findElement(By.css('html'))
    let headElement = driver.findElement(By.css('head'))
    let bodyElement = driver.findElement(By.css('body'))
    logger.info(`starting html edition`)
    await driver.executeScript(`let div = document.createElement('html');
        div.innerHTML='${JSON.stringify(html)}';
        arguments[0].removeChild(arguments[1]);
        arguments[0].removeChild(arguments[2]);
        arguments[0].appendChild(div)`, htmlElement, headElement, bodyElement)
    logger.info(`completed html edition`)
}
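A rough Python equivalent of the same idea, as a sketch rather than a drop-in fix (the URL and markup below are placeholders, and it assumes a working selenium driver):

from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://example.com')  # navigate to a page before editing its html

html_tag = "<a href='https://example.com/hair'>Hair</a>"  # placeholder markup
# build an element in the page, fill it with the markup, attach it to <body>
driver.execute_script(
    "var div = document.createElement('div');"
    "div.innerHTML = arguments[0];"
    "document.body.appendChild(div);",
    html_tag)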
I have used the following code (not mine) to scrape status_ids of different users because of the 3200 limit. It was working fine and I have done most of the scraping, but lately (for the last month) I am not able to scrape and a "try again" error comes up in the browser. I am new to coding and this is my first go at scraping.
Error:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, WebDriverException
from time import sleep
import json
import datetime

# edit these three variables
user = 'TheDemocrats'
start = datetime.datetime(2008, 4, 13)  # year, month, day
end = datetime.datetime(2019, 1, 31)  # year, month, day
# only edit these if you're having problems
delay = 1  # time to wait on each page load before reading the page
driver = webdriver.Firefox(executable_path=r'C:\ProgramData\Microsoft\Windows\Start Menu\Programs\geckodriver.exe')  # options are Chrome() Firefox() Safari()
# don't mess with this stuff
twitter_ids_filename = 'all_ids_dem.json'
days = (end - start).days + 1
id_selector = '.time a.tweet-timestamp'
tweet_selector = 'li.js-stream-item'
user = user.lower()
ids = []

def format_day(date):
    day = '0' + str(date.day) if len(str(date.day)) == 1 else str(date.day)
    month = '0' + str(date.month) if len(str(date.month)) == 1 else str(date.month)
    year = str(date.year)
    return '-'.join([year, month, day])

def form_url(since, until):
    p1 = 'https://twitter.com/search?f=tweets&vertical=default&q=from%3A'
    p2 = user + '%20since%3A' + since + '%20until%3A' + until + 'include%3Aretweets&src=typd'
    return p1 + p2

def increment_day(date, i):
    return date + datetime.timedelta(days=i)

for day in range(days):
    d1 = format_day(increment_day(start, 0))
    d2 = format_day(increment_day(start, 1))
    url = form_url(d1, d2)
    print(url)
    print(d1)
    driver.get(url)
    sleep(delay)
    try:
        found_tweets = driver.find_elements_by_css_selector(tweet_selector)
        increment = 10
        while len(found_tweets) >= increment:
            print('scrolling down to load more tweets')
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(delay)
            found_tweets = driver.find_elements_by_css_selector(tweet_selector)
            increment += 10
        print('{} tweets found, {} total'.format(len(found_tweets), len(ids)))
        for tweet in found_tweets:
            try:
                id = tweet.find_element_by_css_selector(id_selector).get_attribute('href').split('/')[-1]
                ids.append(id)
            except StaleElementReferenceException as e:
                print('lost element reference', tweet)
    except NoSuchElementException:
        print('no tweets on this day')
    except WebDriverException as e:
        if "networkProtocolError" in e.__str__():
            print('failed to load page on this day, retrying')
            start = increment_day(start, -1)
        else:
            raise e
    start = increment_day(start, 1)

try:
    with open(twitter_ids_filename) as f:
        all_ids = ids + json.load(f)
        data_to_write = list(set(all_ids))
        print('tweets found on this scrape: ', len(ids))
        print('total tweet count: ', len(data_to_write))
except FileNotFoundError:
    with open(twitter_ids_filename, 'w') as f:
        all_ids = ids
        data_to_write = list(set(all_ids))
        print('tweets found on this scrape: ', len(ids))
        print('total tweet count: ', len(data_to_write))

with open(twitter_ids_filename, 'w') as outfile:
    json.dump(data_to_write, outfile)

print('all done here')
driver.close()
1) I have a list of product links containing 3385 links.
2) I have a function get_pro_info(link) that takes a product link and appends the item to a json file.
3) I want selenium to open 5 browsers and 5 links in parallel, get the product information and append it to a file or list,
or 3) selenium opens 1 browser with 5 tabs (5 links) and appends to the file.
Question: how can I apply threading to my code?
My code:
new_url = ''

def get_pro_info(pro_url):
    driver = webdriver.Chrome(executable_path=r'C:\Users\Beenu\PycharmProjects/chromedriver.exe')
    try:
        new_url = 'https://pk.studiobytcs.com' + pro_url
        print('new product URL: ' + new_url)
        driver.execute_script("window.open('');")
        sleep(1)
        # use to switch control
        driver.switch_to.window(driver.window_handles[0])
        # sleep(1)
        driver.get(new_url)
    except (WebDriverException, selenium.common.exceptions.TimeoutException, Exception) as e:
        print('There is error in getting Product by URL in get_pro_info()! \n' + str(e.stacktrace))
        pass
    description_source_code = ''
    # description_soup = BeautifulSoup()
    description_soup: BeautifulSoup = object
    # global description_soup
    try:
        # description_soup = BeautifulSoup('html.parser')
        description: WebElement = driver.find_element_by_xpath(
            '//*[@id="shopify-section-product-template"]/div[2]/div[1]/div/div[2]')
        description_source_code = description.get_attribute("innerHTML")
        description_soup: BeautifulSoup = BeautifulSoup(description_source_code, 'html.parser')
    except NoSuchElementException as e:
        print('Product description tag not found! \n' + str(e.stacktrace))
        pass
    # 179 here
    # This is for getting the heading product name
    head = ''
    r_j_title = ''
    try:
        head = description_soup.find_all("h1", class_="product_name")
        # print(head)
        r_j_title = head[0].string.strip()
        print("Title: " + r_j_title)
    except (HTMLParser, IndexError):
        print('Fail to get heading/title Tag! \n' + str(HTMLParser))
    # This is for getting the brand name from the heading/title
    r_j_brand_and_designer = ''
    try:
        brand_and_designer = head[0].string.strip().split("-")[0]
        r_j_brand_and_designer = str(brand_and_designer).strip()
        print('Brand and designer: ' + r_j_brand_and_designer)
    except (IndexError, ValueError) as e:
        print('Fail to Split Brand from heading/title ! \n' + str(e.stacktrace))
    # This is for getting the price as an integer
    r_j_price_in_int = ''
    try:
        price = description_soup.find_all("span", class_="money")
        # print(price)
        price_new = price[0].string.strip()
        print("New price: " + price_new)
        # this is for getting the price from the string
        r_c_price = price[0].string.strip().split(".")[1]
        r_j_price_in_int = str(r_c_price).replace(",", "")
        # price could have a ,
        print('Price: ' + r_j_price_in_int)
    except (HTMLParser, IndexError, ValueError) as e:
        print('Fail to get Tag or failed to Split Brand from heading/title ! \n' + str(e.stacktrace))
    # this is for getting the full description
    description_all = ''
    r_j_desc = ''
    try:
        description_all = description_soup.find_all("div", class_="description")
        final_des = str(description_all[0].get_text())
        ch = final_des.split()
        r_j_desc = str(' '.join(ch))
        print("with split ch : " + r_j_desc)  # addition of .string.strip()
    except (HTMLParser, IndexError, ValueError) as e:
        print('Fail to get all description Tag or failed to Split and removing endline chr from description ! \n' + str(
            e.stacktrace))
    # This is for the case where the fabric tag is not available
    try:
        get_split_fibric = description_all[0].get_text().split("Fabric", 1)[1]
        get_split_des = get_split_fibric.split("Disclaimer")[0]
        r_j_fabric = str(get_split_des).strip()
        print("getting fabric: " + r_j_fabric)
    except IndexError as e:
        r_j_fabric = 'N/A'
        print('Fabric is not available: ' + r_j_fabric)
    item['brand_name'] = str(r_j_brand_and_designer)
    item['designer'] = str(r_j_brand_and_designer)
    item['title'] = str(r_j_title)
    item['description'] = str(r_j_desc)
    item['price'] = int(r_j_price_in_int)
    item['currency'] = "PKR"
    item['product_id'] = str(r_j_title)
    item['source'] = str(new_url)
    item['fabric'] = str(r_j_fabric)
    item['gender'] = "woman"
    print(item)
    cloth = {
        "cloth": item
    }
    # instruction
    print(cloth)
    list_before_dump.append(cloth)
    driver.close()
    driver.quit()

with open('product_link_read.txt', 'r') as file:
    data = file.readlines()
    # rd_pro_link_list = rd_pro_link_list + data.replace('\n', '')
    print(data)
for line in data:
    # fap=
    rd_pro_link_list.append(str(line).strip())
print(rd_pro_link_list)
print(len(rd_pro_link_list))
for pro_link in rd_pro_link_list:
    get_pro_info(pro_link)
    print('Pro count = ' + str(pro_count))
    pro_count = pro_count + 1
list_before_dump_file.write(json.dumps(list_before_dump))
driver.close()
list_before_dump_file.close()
If you want to iterate the list and always take 20 links at a time, then you can use range(start, stop, step) with step=20:
all_t = []

for i in range(0, len(list_of_product_link), 20):
    twenty_links = list_of_product_link[i:i+20]
    t = threading.Thread(target=get_product_info, args=(twenty_links,))
    t.start()
    all_t.append(t)

# --- later ---

for t in all_t:
    t.join()
or
for i in range(0, len(list_of_product_link), 20):
    twenty_links = list_of_product_link[i:i+20]
    all_t = []
    for link in twenty_links:
        t = threading.Thread(target=get_product_info, args=(link,))
        t.start()
        all_t.append(t)
    # --- inside first `for` loop ---
    for t in all_t:
        t.join()
The other method is good if you won't need the list later:
all_t = []

while list_of_product_link:
    twenty_links = list_of_product_link[:20]
    list_of_product_link = list_of_product_link[20:]
    t = threading.Thread(target=get_product_info, args=(twenty_links,))
    t.start()
    all_t.append(t)

# --- later ---

for t in all_t:
    t.join()
or
while list_of_product_link:
    twenty_links = list_of_product_link[:20]
    list_of_product_link = list_of_product_link[20:]
    all_t = []
    for link in twenty_links:
        t = threading.Thread(target=get_product_info, args=(link,))
        t.start()
        all_t.append(t)
    # --- inside first `while` loop ---
    for t in all_t:
        t.join()
BTW: args= needs a tuple - even if you have only one argument, so you need a , inside ( ) to create a one-element tuple.
BTW: if you want only 20 threads running at any moment, then better see multiprocessing and Pool(20):
from multiprocessing import Pool

def get_product_info(link):
    result = ....
    return result

if __name__ == '__main__':
    with Pool(20) as p:
        all_results = p.map(get_product_info, list_of_product_link)
medium_timeout = Data.medium_timeout
# click multiple buttons
# i = 0
for steps in buttonData:
    if steps == "wait":
        time.sleep(2)
    elif steps == "back":
        driver.back()
    else:
        try:
            # wait for the element to show up
            WebDriverWait(driver, medium_timeout).until(EC.presence_of_element_located((By.XPATH, steps)))
            WebDriverWait(driver, medium_timeout).until(EC.visibility_of_element_located((By.XPATH, steps)))
            # crashpicturename = "before" + "error" + str(i) + ".png"
            # driver.save_screenshot(crashpicturename)
        except TimeoutException:
            logging.debug(str(datetime.datetime.now()) + " TimeoutException: Unable to locate " + steps)
            return driver
        # Input some data where necessary
        if steps == "//input[@id = 'fulfill']":
            driver.find_element_by_xpath("//div[@id= 'orderItemsContainer']//input").send_keys(str(randint(10000, 99999)))
        logging.debug(str(datetime.datetime.now()) + " Clicking on " + steps)
        try:
            driver.find_element_by_xpath(steps).click()
            logging.debug(str(datetime.datetime.now()) + " after Clicking on " + steps)
            crashpicturename = "after" + "error" + str(i) + ".png"
            driver.save_screenshot(crashpicturename)
            i += 1
        except Exception as e:
            logging.debug(str(datetime.datetime.now()) + format(e))
            time.sleep(10)
            driver.find_element_by_xpath(steps).click()
return driver
During this testing, I plan to click "user" first, then "edit", then "change email" to test that these steps work normally, and to save a screenshot after every step. But save_screenshot for PhantomJS does not seem to work correctly: for example, the screenshot after clicking "user" is the same as the screenshot after clicking "change email". It is messy.
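One common cause (not specific to PhantomJS) is that the screenshot races the page change: click() returns before the next page has rendered, so consecutive captures show the same old page. A hedged workaround is to wait for something that only exists on the destination page before capturing; the marker XPath below is a placeholder, and the other names are reused from the snippet above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver.find_element_by_xpath(steps).click()
# wait until a marker element of the destination page is present before capturing;
# '//h1[@class="page-title"]' is a hypothetical selector for that page
WebDriverWait(driver, medium_timeout).until(
    EC.presence_of_element_located((By.XPATH, '//h1[@class="page-title"]')))
driver.save_screenshot(crashpicturename)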