Infinite loop exits automatically when using multithreading and Queues - Python

I'm trying to run a function that has an infinite loop (to check data after a few seconds of delay) using multithreading. Since I read data from a CSV file, I'm also using Queues.
My current function works fine when I do not use multithreading/queues, but when I use them, the function only loops once and then stops.
Here's my function with the infinite loop. Please note that the first while True loop is for the threads (in case I use fewer threads than there are rows in the CSV); the function itself only requires the second while True loop.
def doWork(q):
    while True:
        #logging.info('Thread Started')
        row = q.get()
        url = row[0]
        target_price = row[1]
        #logging.info('line 79')
        while True:
            delay = randint(5, 10)
            headers = {'User-Agent': generate_user_agent()}
            print datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S') + ': ' + 'Sleeping for ' + str(delay) + ' seconds'
            #logging.info('line 81')
            eventlet.sleep(delay)
            try:
                #logging.info('line 85')
                with requests.Session() as s:
                    #logging.info('line 87')
                    with eventlet.Timeout(10, False):
                        page = s.get(url, headers=headers, proxies=proxyDict, verify=False)
                    #logging.info('line 89')
                    tree = html.fromstring(page.content)
                    #logging.info('line 91')
                    price = tree.xpath('//div[@class="a-row a-spacing-mini olpOffer"]/div[@class="a-column a-span2 olpPriceColumn"]/span[@class="a-size-large a-color-price olpOfferPrice a-text-bold"]/text()')[0]
                    title = tree.xpath('//h1/text()')[0]
                    #logging.info('line 93')
                    new_price = re.findall("[-+]?\d+[\.]?\d+[eE]?[-+]?\d*", price)[0]
                    #logging.info('line 95')
                    old_price = new_price
                    #logging.info('line 97')
                    #print price
                    print new_price
                    print title + 'Current price:' + new_price
                    if float(new_price) < float(target_price):
                        print 'Lower price found!'
                        mydriver = webdriver.Chrome()
                        send_simple_message()
                        login(mydriver)
                        print 'Old Price: ' + old_price
                        print 'New Price: ' + new_price
                    else:
                        print 'Trying again'
                    q.task_done()
            except Exception as e:
                print e
                print 'Error!'
                q.task_done()
And here is the code that drives the threads:
q = Queue(concurrent * 2)
if __name__ == "__main__":
    for i in range(concurrent):
        t = Thread(target=doWork, args=(q,))
        t.daemon = True
        t.start()
    try:
        with open('products.csv', 'r') as f:
            reader = csv.reader(f.read().splitlines())
            for row in reader:
                q.put((row[0], row[1]))
        q.join()
    except KeyboardInterrupt:
        sys.exit(1)

For anyone facing the same issue, here's how I solved it.
I removed q.task_done() from the inner while loop and put it after the loop instead. This is working as intended, but I'm not sure if it is the right approach.
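A likely explanation for why this works: q.join() only blocks until task_done() has been called once for every put(). With task_done() inside the monitoring loop, the unfinished-task counter reaches zero after the first pass over the rows, q.join() returns, the main thread exits, and the daemon worker threads are killed with it. With task_done() placed after the inner loop it is never reached, so q.join() blocks forever and the workers keep polling. A minimal sketch of that structure (untested; check_price is a hypothetical helper standing in for the request/parse code above):

import eventlet
from random import randint

def doWork(q):
    while True:                       # outer loop: a thread can pick up another row when
        url, target_price = q.get()   # there are fewer threads than rows in the CSV
        while True:                   # inner loop: poll the price forever
            check_price(url, target_price)   # hypothetical helper wrapping the request/parse code above
            eventlet.sleep(randint(5, 10))
            # Do NOT call q.task_done() here: once it has been called for every row,
            # q.join() returns, the main thread exits, and the daemon workers die with it.
        q.task_done()                 # only reached if monitoring ever stops, so q.join() keeps blocking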

Related

Python multiprocessing.Process cannot stop after connecting to the network

When I try to crawl thesis information in multiple processes, I cannot close the processes after they have fetched the information (screenshot: error).
When I comment out the code that fetches the information from the network, the processes end normally (screenshot: normal).
This error troubles me and I have no idea what causes it. The network connection is made with requests and I already call response.close().
Can anyone help this confused person? Thanks.
This is the whole code (my Python is Python 3.7):
from multiprocessing import Process, Queue, Pool, Manager, Value
import time, random
import requests
import re
from bs4 import BeautifulSoup

headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    'Connection': 'close'
}

## Just get the html text
def GetUrlInfo(url):
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    response.close()
    SoupData = BeautifulSoup(response.text, 'lxml')
    return SoupData
def GetVolumeUrlfromUrl(url: str) -> str:
    """input is Journal's url and output is a link and a text description to each issue of the journal"""
    url = re.sub('http:', 'https:', url)
    SoupDataTemp = GetUrlInfo(url + 'index.html')
    SoupData = SoupDataTemp.find_all('li')
    UrlALL = []
    for i in SoupData:
        if i.find('a') != None:
            volumeUrlRule = '<a href="(.*?)">(.*?)</a>'  # NOTE: the tag pattern was stripped when the question was rendered; two groups (href, link text) are needed for u[0]/u[1] below
            volumeUrlTemp = re.findall(volumeUrlRule, str(i), re.I)
            # u = i.find('a')['href']
            # # print(u)
            for u in volumeUrlTemp:
                if re.findall(url, u[0]):
                    # print(u)
                    UrlALL.append((u[0], u[1]), )
    # print(UrlALL)
    return UrlALL
def GetPaperBaseInfoFromUrlAll(url: str) -> str:
    """The input is the url and the output is all the paper information obtained from the web page,
    including doi, title, author, and the date about this volume"""
    soup = GetUrlInfo(url)
    temp1 = soup.find_all('li', class_='entry article')
    temp2 = soup.find_all('h2')
    temp2 = re.sub('\\n', ' ', temp2[1].text)
    # print(temp2)
    volumeYear = re.split(' ', temp2)[-1]
    paper = []
    for i in temp1:
        if i.find('div', class_='head').find('a') == None:
            paperDoi = ''
        else:
            paperDoi = i.find('div', class_='head').find('a')['href']
        title = i.find('cite').find('span', class_='title').text[:-2]
        paper.append([paperDoi, title])
    return paper, volumeYear
# test start
url = 'http://dblp.uni-trier.de/db/journals/talg/'
UrlALL = GetVolumeUrlfromUrl(url)
UrlLen = len(UrlALL)

# put the url into the queue
def Write(query, value, num):
    for count in range(num):
        query.put(value[count][0], True)
        # time.sleep(random.random())
    print('write end')
# from the queue get the url and get the paper info with this url
def Read(query, num, PaperInfo1, COUNT, i, paperNumber):
    while True:
        count = COUNT.get(True)
        # print("before enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
        COUNT.put(count, True)
        if not query.empty():
            value = query.get(True)
            count = COUNT.get(True)
            count = count + 1
            COUNT.put(count, True)
            paper, thisYear = GetPaperBaseInfoFromUrlAll(value)  # just commented
            print("connected " + str(i) + ' - ' + str(count) + ' - ' + str(num))
            numb = paperNumber.get(True)
            numb = numb + len(paper)
            paperNumber.put(numb)  # just commented
            # print(paper,thisYear)
            PaperInfo1.put((paper, thisYear), )  # just commented
            print("the process " + str(i) + ' - ' + str(count) + ' : ' + value)
        if not COUNT.empty():
            count = COUNT.get(True)
            # print("after enter" + str(i) + ' - ' + str(count) + ' - ' + str(num))
            COUNT.put(count, True)
            if int(count) == int(num):
                print("the process " + str(i) + " end ")
                break
    print('read end')
# print the paper info
def GetPaperInfo(PaperInfo1, paperNumber):
    for i in range(paperNumber.get(True)):
        value = PaperInfo1.get(True)
        print(value)
if __name__ == '__main__':
    r_num = 10               # the read process number
    w_num = 1                # the write process number
    w_cnt = UrlLen           # the write counter
    q = Queue(UrlLen)        # the volume url queue
    paperNumber = Queue(1)   # the total paper number
    COUNT = Queue(1)         # the end tag
    COUNT.put(int(0))        # first is zero
    paperNumber.put(int(0))  # first is zero
    PaperInfo1 = Queue()
    r_list = [Process(target=Read, args=(q, w_cnt, PaperInfo1, COUNT, i, paperNumber)) for i in range(r_num)]
    w_list = [Process(target=Write, args=(q, UrlALL, w_cnt))]
    time_start = time.time()
    [task.start() for task in w_list]
    [task.start() for task in r_list]
    [task.join() for task in w_list]
    [task.join() for task in r_list]
    time_used = time.time() - time_start
    GetPaperInfo(PaperInfo1, paperNumber)
    print('time_used:{}s'.format(time_used))
I have no idea. When debugging, the process finally enters process.py -> line 297: try: self.run(), then enters line 300: util._exit_function(), and prints just one "connected" (screenshot: the debug).
I don't know why the network request can cause this error or how to solve it.
That's all, thank you!
Hi, this is me again. I tried a concurrent implementation with threads, and for threads global variables are much more comfortable than sharing data between processes through queues. With threads the fetching is indeed implemented, but now my main function can't be stopped: previously, with processes, it was not possible to proceed to the next step while fetching concurrently; now the fetching of the data is done in threads and the main function continues, but the main function can't be stopped any more. How interesting!
I have designed three functions similar to the previous ones.
GetUrlintoQueue writes the fetched URLs UrlALL to the queue UrlQueue; UrlLen is the number of URLs.
import threading
import queue

count = 0        # Record the number of times a value is fetched from the queue
paperNumber = 0  # Record the number of papers

def GetUrlintoQueue(UrlQueue, UrlALL, UrlLen):
    for index in range(UrlLen):
        UrlQueue.put(UrlALL[index][0], True)
    print('Write End')
    UrlQueue.task_done()
The other is GetPaperInfofromUrl: it gets a URL from UrlQueue and writes the information from the corresponding page to PaperInfo; index is the thread number.
def GetPaperInfofromUrl(UrlQueue, PaperInfo, index, UrlLen):
    global count, paperNumber
    while True:
        if not UrlQueue.empty():
            url = UrlQueue.get(True)
            count = count + 1
            paper, thisYear = GetPaperBaseInfoFromUrlAll(url)  # just commented
            print("connected " + str(index) + '-nd - ' + str(count) + ' - ' + str(UrlLen))
            print(paper, thisYear)
            paperNumber = paperNumber + len(paper)
            PaperInfo.put((paper, thisYear), True)
            if count == UrlLen:
                print("the process " + str(index) + " end ")
                break
        UrlQueue.task_done()
        PaperInfo.task_done()
    print('the process ' + str(index) + ' get paper info end')
GetPaperInfo shows the results from PaperInfo, and it doesn't change.
def GetPaperInfo(PaperInfo, paperNumber):
    for i in range(paperNumber):
        value = PaperInfo.get(True)
        print(value)
The main function first sets up the corresponding variables, then does the writing, then 10 threads crawl the paper information, and finally it shows the results; but after displaying the results it still cannot exit, and I cannot understand why.
if __name__ == '__main__':
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    UrlALL = GetVolumeUrlfromUrl(url)
    UrlLen = len(UrlALL)
    UrlQueue = queue.Queue(UrlLen)
    PaperInfo = queue.Queue(1000)
    WriteThread = 1
    ReadThread = 10

    # url write
    GetUrlThread = [threading.Thread(target=GetUrlintoQueue, args=(UrlQueue, UrlALL, UrlLen,))]
    time_start = time.time()
    [geturl.start() for geturl in GetUrlThread]
    [geturl.join() for geturl in GetUrlThread]
    time_used = time.time() - time_start
    print('time_used:{}s'.format(time_used))
    # url write end

    # paperinfo get
    PaperinfoGetThread = [threading.Thread(target=GetPaperInfofromUrl, args=(UrlQueue, PaperInfo, index, UrlLen,)) for index in range(ReadThread)]
    time_start = time.time()
    [getpaper.start() for getpaper in PaperinfoGetThread]
    [getpaper.join() for getpaper in PaperinfoGetThread]
    time_used = time.time() - time_start
    print('time_used:{}s'.format(time_used))
    # paperinfo get end

    GetPaperInfo(PaperInfo, paperNumber)  # show the results

    import sys  # it does not work
    sys.exit()
The debugger shows: debug.gif (I don't have 10 reputation, so the picture is only a link).
Here is how your process might look using concurrent.futures to manage all the threads and the data transport (not tested), adapting an example from the documentation.
import concurrent.futures                      # needed for as_completed below
from concurrent.futures import ThreadPoolExecutor

def GetPaperInfofromUrl(index, url):
    paper, thisYear = GetPaperBaseInfoFromUrlAll(url)
    return (index, url, paper, thisYear)

if __name__ == "__main__":
    url = 'http://dblp.uni-trier.de/db/journals/talg/'
    urls, descr = zip(*GetVolumeUrlfromUrl(url))
    results = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futs = [executor.submit(GetPaperInfofromUrl, index, url) for index, url in enumerate(urls)]
        for future in concurrent.futures.as_completed(futs):
            results.append(future.result())
GetPaperInfofromUrl seems superfluous; you could probably refactor GetPaperBaseInfoFromUrlAll and avoid a function call.
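For example, continuing the snippet above (untested), you could submit GetPaperBaseInfoFromUrlAll directly and use a futures dict to remember which URL produced which result:

results = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futs = {executor.submit(GetPaperBaseInfoFromUrlAll, u): u for u in urls}
    for future in concurrent.futures.as_completed(futs):
        paper, thisYear = future.result()
        results.append((futs[future], paper, thisYear))   # (url, papers, year)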

How to execute a single function in multiple threads, creating the thread instances in a loop, in Python?

1) I have a list of product links containing 3385 links.
2) I have a function get_pro_info(link) that takes a product link and appends the item to a JSON file.
3) I want Selenium to open 5 browsers with 5 links in parallel, get the product information and append it to a file or list,
or 3) Selenium opens 1 browser with 5 tabs (having 5 links) and appends to the file.
Question: how can I apply threading to my code?
My code:
new_url = ''

def get_pro_info(pro_url):
    driver = webdriver.Chrome(executable_path=r'C:\Users\Beenu\PycharmProjects/chromedriver.exe')
    try:
        new_url = 'https://pk.studiobytcs.com' + pro_url
        print('new product URL: ' + new_url)
        driver.execute_script("window.open('');")
        sleep(1)
        # use to switch control
        driver.switch_to.window(driver.window_handles[0])
        # sleep(1)
        driver.get(new_url)
    except (WebDriverException, selenium.common.exceptions.TimeoutException, Exception) as e:
        print('There is error in getting Product by URL in get_pro_info()! \n' + str(e.stacktrace))
        pass
    description_source_code = ''
    # description_soup = BeautifulSoup()
    description_soup: BeautifulSoup = object
    # global description_soup
    try:
        # description_soup = BeautifulSoup('html.parser')
        description: WebElement = driver.find_element_by_xpath(
            '//*[@id="shopify-section-product-template"]/div[2]/div[1]/div/div[2]')
        description_source_code = description.get_attribute("innerHTML")
        description_soup: BeautifulSoup = BeautifulSoup(description_source_code, 'html.parser')
    except NoSuchElementException as e:
        print('Product description taag not found! \n' + str(e.stacktrace))
        pass
    # 179 here
    # This is for getting heading product name
    head = ''
    r_j_title = ''
    try:
        head = description_soup.find_all("h1", class_="product_name")
        # print(head)
        r_j_title = head[0].string.strip()
        print("Title: " + r_j_title)
    except (HTMLParser, IndexError):
        print('Fail to get heading/title Tag! \n' + str(HTMLParser))
    # This is for get brand name from heading/title
    r_j_brand_and_designer = ''
    try:
        brand_and_designer = head[0].string.strip().split("-")[0]
        r_j_brand_and_designer = str(brand_and_designer).strip()
        print('Brand and designer: ' + r_j_brand_and_designer)
    except (IndexError, ValueError) as e:
        print('Fail to Split Brand from heading/title ! \n' + str(e.stacktrace))
    # This is for getting price in integer
    r_j_price_in_int = ''
    try:
        price = description_soup.find_all("span", class_="money")
        # print(price)
        price_new = price[0].string.strip()
        print("New price: " + price_new)
        # this is for getting price from string
        r_c_price = price[0].string.strip().split(".")[1]
        r_j_price_in_int = str(r_c_price).replace(",", "")
        # price could ha ,
        print('Price: ' + r_j_price_in_int)
    except (HTMLParser, IndexError, ValueError) as e:
        print('Fail to get Tag or failed to Split Brand from heading/title ! \n' + str(e.stacktrace))
    # this is for getting full description
    description_all = ''
    r_j_desc = ''
    try:
        description_all = description_soup.find_all("div", class_="description")
        final_des = str(description_all[0].get_text())
        ch = final_des.split()
        r_j_desc = str(' '.join(ch))
        print("with split ch : " + r_j_desc)  # addtion of .string.strip()
    except (HTMLParser, IndexError, ValueError) as e:
        print('Fail to get all description Tag or failed to Split and removing endline chr from description ! \n' + str(
            e.stacktrace))
    # This is for trying if fibric tag is not avaliable
    try:
        get_split_fibric = description_all[0].get_text().split("Fabric", 1)[1]
        get_split_des = get_split_fibric.split("Disclaimer")[0]
        r_j_fabric = str(get_split_des).strip()
        print("getting fibric: " + r_j_fabric)
    except IndexError as e:
        r_j_fabric = 'N/A'
        print('Fabric is not avaliable: ' + r_j_fabric)
    item['brand_name'] = str(r_j_brand_and_designer)
    item['designer'] = str(r_j_brand_and_designer)
    item['title'] = str(r_j_title)
    item['description'] = str(r_j_desc)
    item['price'] = int(r_j_price_in_int)
    item['currency'] = "PKR"
    item['product_id'] = str(r_j_title)
    item['source'] = str(new_url)
    item['fabric'] = str(r_j_fabric)
    item['gender'] = "woman"
    print(item)
    cloth = {
        "cloth": item
    }
    # instruction
    print(cloth)
    list_before_dump.append(cloth)
    driver.close()
    driver.quit()


with open('product_link_read.txt', 'r') as file:
    data = file.readlines()
    # rd_pro_link_list=rd_pro_link_list+data.replace('\n', '')
    print(data)
    for line in data:
        # fap=
        rd_pro_link_list.append(str(line).strip())

print(rd_pro_link_list)
print(len(rd_pro_link_list))

for pro_link in rd_pro_link_list:
    get_pro_info(pro_link)
    print('Pro count = ' + str(pro_count))
    pro_count = pro_count + 1

list_before_dump_file.write(json.dumps(list_before_dump))
driver.close()
list_before_dump_file.close()
If you want to iterate over the list and always take 20 links at a time, then you can use range(start, stop, step) with step=20:
all_t = []

for i in range(0, len(list_of_product_link), 20):
    twenty_links = list_of_product_link[i:i+20]
    t = threading.Thread(target=get_product_info, args=(twenty_links,))
    t.start()
    all_t.append(t)

# --- later ---

for t in all_t:
    t.join()
or
for i in range(0, len(list_of_product_link), 20):
    twenty_links = list_of_product_link[i:i+20]
    all_t = []
    for link in twenty_links:
        t = threading.Thread(target=get_product_info, args=(link,))
        t.start()
        all_t.append(t)

    # --- inside first `for` loop ---

    for t in all_t:
        t.join()
Another method is good if you won't need your list later:
all_t = []

while list_of_product_link:
    twenty_links = list_of_product_link[:20]
    list_of_product_link = list_of_product_link[20:]
    t = threading.Thread(target=get_product_info, args=(twenty_links,))
    t.start()
    all_t.append(t)

# --- later ---

for t in all_t:
    t.join()
or
while list_of_product_link:
    twenty_links = list_of_product_link[:20]
    list_of_product_link = list_of_product_link[20:]
    all_t = []
    for link in twenty_links:
        t = threading.Thread(target=get_product_info, args=(link,))
        t.start()
        all_t.append(t)

    # --- inside first `for` loop ---

    for t in all_t:
        t.join()
BTW: args= needs a tuple; even if you have only one argument, you need a , inside ( ) to create a tuple with one element.
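For example (link here is just an illustrative variable):

t = threading.Thread(target=get_product_info, args=(link,))   # (link,) is a one-element tuple
# args=(link) would be treated as args=link, not as a tuple, so the target gets the wrong arguments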
BTW: If you want only 20 workers running at any moment, then better look at multiprocessing and Pool(20):
from multiprocessing import Pool

def get_product_info(link):
    result = ...
    return result

if __name__ == '__main__':
    with Pool(20) as p:
        all_results = p.map(get_product_info, list_of_product_link)
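Since the work is mostly waiting on the browser and the network, a thread-based pool follows the same pattern; multiprocessing.pool.ThreadPool offers the same map() interface (a sketch under the same assumptions as above, i.e. get_product_info takes a single link):

from multiprocessing.pool import ThreadPool

if __name__ == '__main__':
    with ThreadPool(20) as p:     # 20 threads instead of 20 processes
        all_results = p.map(get_product_info, list_of_product_link)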

web-scraping multiprocessing doesn't work

I'm trying to web-scrape a large number of URLs and I apply multiprocessing to speed it up, but I don't know why it doesn't speed up at all. Here is part of my code:
def scrape(url, output_path):
    page = urlopen(URL)
    soup = BeautifulSoup(page, 'html.parser')
    item_text = soup.select('#scatter6001 script')[0].text
    table = soup.find_all('table', {'class': 'noborder dark'})
    df1 = pd.read_html(str(table), header=0)
    df1 = pd.DataFrame(df1[0])
    ...
# function for scraping the data from url

rootPath = '...'
urlp1 = "https://www.proteinatlas.org/"

try:
    df1 = pd.read_csv(rootPath + "cancer_list1_2(1).csv", header=0);
except Exception as e:
    print("File " + f + " doesn't exist")
    print(str(e))
    sys.exit()

cancer_list = df1.as_matrix().tolist()

URLs = []
for cancer in cancer_list:
    urlp2 = "/pathology/tissue/" + cancer[1]
    f = cancer[0]
    try:
        df1 = pd.read_csv(rootPath + f + ".csv", header=0);
    except Exception as e:
        print("File " + f + " doesn't exist")
        print(str(e))
        sys.exit()
    ...
# list of URLs

if __name__ == '__main__':
    pool = multiprocessing.Pool(processes=6)
    records = p.map(scrape(url, output_path))
    p.terminate()
    p.join()
Not sure how to speed up the web-scraping using multiprocessing.
You're not actually using multiprocessing. You're running the scrape function once and passing the result as an argument to p.map(). Instead, you need to pass a callable taking one argument, for example:
func = lambda url: scrape(url, output_path)
p.map(func, list_of_urls)
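One caveat with that example: Pool.map pickles the callable in order to send it to the worker processes, and lambdas cannot be pickled, so in practice a module-level function or functools.partial is needed (a minimal sketch; p is the Pool object and list_of_urls stands for the URLs built earlier):

from functools import partial

func = partial(scrape, output_path=output_path)   # picklable, unlike a lambda
records = p.map(func, list_of_urls)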

Python loop slowing down

I have a Python loop which uses Selenium to get some data from a website and store it in a SQL database. At the beginning every loop iteration takes about one second, but after some time it slows down more and more... I think it is a memory problem, but I don't know how to solve it.
This is my code:
count = 0
driver = webdriver.PhantomJS()
driver.set_window_size(1120, 550)
con = sql.connect(user="user", passwd="passwd", db="db", host="localhost")
cur = con.cursor()

def create():
    if random.random() < 0.5:
        driver.get('http://www.example.com/w')
    else:
        driver.get('http://www.example.com/p')
    name = driver.find_element_by_xpath("//div[@class='address']/h3").text
    name1 = name.split(" ")[0]
    name2 = name.split(" ")[1]
    test = driver.find_element_by_xpath("//div[@class='adr']").text
    test2 = test.replace("\n", " ")
    dd = driver.find_element_by_xpath("(//dl[@class='dl-horizontal'])[1]/dd").text
    dd2 = driver.find_element_by_xpath("(//dl[@class='dl-horizontal'])[2]/dd/a").text
    day = driver.find_element_by_xpath("(//dl[@class='dl-horizontal'])[5]/dd").text
    i = "','"
    try:
        values = unidecode("'" + name1 + i + name2 + i + dd + i + dd2 + i + day + i + test2 + "'")
        cur.execute("INSERT INTO accounts (name1,name2,dd,dd2,day,test2) VALUES (" + values + ")")
        con.commit()
        global anzahl
        anzahl += 1
        sys.stdout.write('.')
        sys.stdout.flush()
        gc.collect()
    except sql.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        gc.collect()

start = time.time()
for _ in range(200):
    create()
cur.close()
con.close()
end = time.time()
I don't see anything that would slow down the loop. I tried gc.collect() but it doesn't change anything.
What can I do so that my loop does not slow down after some time?
Things that can slow down your code:
The web server, which can reduce bandwidth to prevent DoS,
Your driver object?
The network (database) can be slow,
I/O access (with sys.stdout.write and print), depending on the real stream. Is it a console?
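One way to narrow this down is to time the individual steps of create() and see which duration grows over the iterations (a rough sketch; the calls in the comments are taken from the question):

import time

def timed(label, func, *args, **kwargs):
    # run func, print how long it took, and pass the result through
    start = time.time()
    result = func(*args, **kwargs)
    print('%s: %.3fs' % (label, time.time() - start))
    return result

# inside create(), for example:
# timed('page load', driver.get, 'http://www.example.com/w')
# timed('xpath lookup', driver.find_element_by_xpath, "//div[@class='address']/h3")
# timed('commit', con.commit)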

Python Exception Handling with in a loop

An exception occurs when my program can't find the element it's looking for. I want to log the event within the CSV, display a message that the error occurred, and continue. I have successfully logged the event in the CSV and displayed the message, but then my program jumps out of the loop and stops. How can I instruct Python to continue? Please check out my code.
sites = ['TCF00670','TCF00671','TCF00672','TCF00674','TCF00675','TCF00676','TCF00677']

with open('list4.csv','wb') as f:
    writer = csv.writer(f)
    try:
        for s in sites:
            adrs = "http://turnpikeshoes.com/shop/" + str(s)
            driver = webdriver.PhantomJS()
            driver.get(adrs)
            time.sleep(5)
            LongDsc = driver.find_element_by_class_name("productLongDescription").text
            print "Working.." + str(s)
            writer.writerows([[LongDsc]])
    except:
        writer.writerows(['Error'])
        print ("Error Logged..")
        pass

driver.quit()
print "Complete."
Just put the try/except block inside the loop. And there is no need for that pass statement at the end of the except block.
with open('list4.csv','wb') as f:
    writer = csv.writer(f)
    for s in sites:
        try:
            adrs = "http://turnpikeshoes.com/shop/" + str(s)
            driver = webdriver.PhantomJS()
            driver.get(adrs)
            time.sleep(5)
            LongDsc = driver.find_element_by_class_name("productLongDescription").text
            print "Working.." + str(s)
            writer.writerows([[LongDsc]])
        except:
            writer.writerows(['Error'])
            print ("Error Logged..")
NOTE: It's generally bad practice to use except without a particular exception class; at a minimum you should do except Exception:.
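For example, catching only the errors you expect here and still logging which site failed (NoSuchElementException and TimeoutException come from selenium.common.exceptions; the other names are from the question):

from selenium.common.exceptions import NoSuchElementException, TimeoutException

try:
    LongDsc = driver.find_element_by_class_name("productLongDescription").text
    writer.writerows([[LongDsc]])
except (NoSuchElementException, TimeoutException) as e:
    writer.writerows([['Error', s, str(e)]])   # keep a note of which site failed and why
    print("Error Logged.. " + str(s))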
