Python ProcessPoolExecutor doesn't want to execute the function - python

I am using Python 3 and mcstatus.
My source code is:
from mcstatus import JavaServer
import concurrent.futures
import json

imput_file_name = r"dockerformcstatus\dockerMyScript\pyscript\input.txt"

def scaner(data):
    print("Processing data: ", data)
    global serverJsonData
    global timeout
    server = JavaServer.lookup(data, timeout)
    print(server)
    try:
        serverJson = server.status().raw
        print(serverJson)
        serverJsonData.append(serverJson)
    except:
        print("can't connect")

if __name__ == "__main__":
    serverJsonData = []
    timeout = 5
    data = None
    with open(imput_file_name, "r") as f:
        data = f.readlines()
    #print(data)
    with concurrent.futures.ProcessPoolExecutor() as executor:
        executor.map(scaner, data)
    #scaner("168.119.4.46:25619")
    with open("serverdata.json", "w") as f:
        f.write(json.dumps(serverJsonData))
The problem is that if I use concurrent.futures.ProcessPoolExecutor() with executor.map(scaner, data), nothing gets written into the list serverJsonData, but if I call scaner("168.119.4.46:25619") directly, it works.
I tried to debug it but could not find the reason for this "bug".
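Worker processes do not share the parent's globals, so appends made inside scaner never reach the parent's serverJsonData list. A minimal, untested sketch (keeping the names and the JavaServer.lookup call from the code above, with the input path shortened) that collects the results through the values returned by executor.map instead:

from mcstatus import JavaServer
import concurrent.futures
import json

def scaner(data, timeout=5):
    # Runs in a worker process: return the result instead of appending
    # to a global list, which the parent process never sees.
    try:
        server = JavaServer.lookup(data.strip(), timeout)
        return server.status().raw
    except Exception:
        print("can't connect:", data.strip())
        return None

if __name__ == "__main__":
    with open("input.txt") as f:   # path shortened for the sketch
        lines = f.readlines()
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = [r for r in executor.map(scaner, lines) if r is not None]
    with open("serverdata.json", "w") as f:
        json.dump(results, f)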

Related

Why is my code still slow after threading for only 15k records, and how do I fix this?

I have a script that takes links from a file, visits them, gets the redirected links, and stores them back. But it is too slow on a file with 15k records. How can I make it faster? I have already used threading.
Please help me fix this. I've tried multiple approaches with threading, but I cannot make it fast. Is there any solution to my problem by any chance? Any expert who could help me out?
import concurrent.futures
import sys
import pandas as pd
import requests
from threading import Thread
from queue import Queue

out_put_file = ""
linkes = None
out = []
urls = []
old = []
file_name = None
concurrent = 10000
q = None
count = 0
df = None

def do_work():
    while True:
        global q
        url = q.get()
        res = get_status(url)
        q.task_done()

def get_status(o_url):
    try:
        res = requests.get(o_url)
        if res:
            out.append(res.url)
            old.append(o_url)
            print(count)
            count = count + 1
            return [res.status_code, res.url, o_url]
    except:
        pass
    return [ans.status_code, ans.url, url]

def process_data():
    global q
    global file_name
    global linkes
    global df
    file_name = input("Enter file name : ")
    file_name = file_name.strip()
    print("Generating .......")
    df = pd.read_csv(file_name + ".csv")
    old_links = df["shopify"]
    for i in old_links:
        if type(i) != str:
            urls.append(i)
            continue
        if not i.startswith("http"):
            linkes = "http://" + i
            urls.append(linkes)
        else:
            urls.append(i)
    df["shopify"] = urls
    q = Queue(concurrent * 2)
    for i in range(concurrent):
        t = Thread(target=do_work)
        t.daemon = True
        t.start()
    try:
        for url in urls:
            if type(url) != str:
                continue
            q.put(url.strip())
        q.join()
    except KeyboardInterrupt:
        sys.exit(1)

process_data()
for i in range(len(df['shopify'])):
    for j in range(len(old)):
        if df['shopify'][i] == old[j]:
            df['shopify'][i] = out[j]
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name + "-new.csv", index=False)
The CSV file looks like this:
Email,shopify,Proofy_Status_Name
hello@knobblystudio.com,http://puravidabracelets.myshopify.com,Deliverable
service@cafe-select.co.uk,cafe-select.co.uk,Deliverable
mtafich@gmail.com,,Deliverable
whoopies@stevessnacks.com,stevessnacks.com,Deliverable
customerservice@runwayriches.com,runwayriches.com,Deliverable
shop@blackdogride.com.au,blackdogride.com.au,Deliverable
anavasconcelos.nica@gmail.com,grass4you.com,Deliverable
info@prideandprestigehair.com,prideandprestigehair.com,Deliverable
info@dancinwoofs.com,dancinwoofs.com,Deliverable
Threads in Python do not run simultaneously due to the Global Interpreter Lock. You might want to use the multiprocessing module instead, or ProcessPoolExecutor() from concurrent.futures. If you decide to use ProcessPoolExecutor, pass the URLs to the callback and have the callback return the old and redirected URLs, which you then retrieve through the result() method of the future returned by executor.submit(). When using processes, global variables are not shared, unlike with threads.
There has been an attempt to remove the global interpreter lock, but without the GIL Python doesn't run quite as fast, or something like that, if I remember correctly.
Something like the following might work. I renamed the concurrent variable because it would shadow the concurrent module and probably cause an error. This code is untested because I don't have the CSV file to test with.
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor
import sys
import pandas as pd
import requests
import numpy as np
from threading import Thread
from queue import Queue

out_put_file = ""
linkes = None
out = []
urls = []
old = []
futures = []
file_name = None
concurrent_ = 10000
q = None
count = 0
df = None

def do_work(urls):
    results = []
    for url in urls:
        res = get_status(url)
        if res:
            results.append((res[2], res[1]))
        else:
            results.append((url, url))
    return results

def get_status(o_url):
    try:
        res = requests.get(o_url)
        if res:
            out.append(res.url)
            old.append(o_url)
            #print(count)
            #count=count+1
            return [res.status_code, res.url, o_url]
    except:
        pass

def load_url(url, timeout):
    ans = requests.get(url, timeout=timeout)
    return [ans.status_code, ans.url, url]

def process_data():
    global q
    global file_name
    global linkes
    global df
    global urls
    file_name = input("Enter file name : ")
    file_name = file_name.strip()
    print("Generating .......")
    df = pd.read_csv(file_name + ".csv")
    old_links = df["shopify"]
    for i in old_links:
        if type(i) != str:
            urls.append(i)
            continue
        if not i.startswith("http"):
            linkes = "http://" + i
            urls.append(linkes)
        else:
            urls.append(i)
    df["shopify"] = urls
    workers = 50
    with ProcessPoolExecutor(max_workers=workers) as executor:
        url_arrays = np.array_split(urls, workers)
        for urls in url_arrays:
            f = executor.submit(do_work, urls)
            futures.append(f)

process_data()
df['shopify'] = [res[1] for f in concurrent.futures.as_completed(futures) for res in f.result()]
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name + "-new.csv", index=False)

How to get simple threading to work Python

i want to know how i can add simple threading to my code. At the moment it checks just one by one, and if some site isnt reachable it will wait for the timeout before it will continue with the next one this slows everything down.
import requests
import sys
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
with open("websites.txt", 'r') as websites:
websites = websites.read().splitlines()
with open("para1.txt", 'r') as para1:
para1 = para1.read().splitlines()
with open("para2.txt", 'r') as para2:
para2 = para2.read().splitlines()
def main():
for i in para1:
for j in para2:
for m in websites:
try:
res = requests.get(m + i + j, verify=False, timeout=10)
print(m + i + j)
if res.status_code == 200:
print('Yes')
else:
print('No')
except Exception as e:
print(e)
except KeyboardInterrupt:
sys.exit()
finally:
res.close()
time.sleep(1)
if __name__ == '__main__':
main()
You can apply ThreadPoolExecutor by moving the part of the code that performs the requests into a separate function and passing it as an argument:
import urllib3
import requests
from concurrent.futures import ThreadPoolExecutor, CancelledError, as_completed

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def check_func(url):
    response = requests.get(url, verify=False, timeout=10)
    return response.status_code == 200

def main():
    with open("websites.txt") as website_f, open("para1.txt") as para1_f, \
            open("para2.txt", 'r') as para2_f, ThreadPoolExecutor(max_workers=4) as executor:
        # Read the parameter files once; file objects can only be iterated a single time.
        para1_list = [line.rstrip() for line in para1_f]
        para2_list = [line.rstrip() for line in para2_f]
        tasks = {}
        for website in website_f:
            for para1 in para1_list:
                for para2 in para2_list:
                    url = website.rstrip() + para1 + para2
                    tasks[executor.submit(check_func, url)] = url
        for task in as_completed(tasks):
            url = tasks[task]
            try:
                result = task.result()
            except KeyboardInterrupt:  # handling Ctrl + C
                for task in tasks:
                    task.cancel()  # won't cancel already finished or running futures
            except CancelledError:  # will never happen (normally)
                pass
            except Exception as e:
                print(url, "-", "ERROR", e)
            else:
                print(url, "-", "GOOD" if result else "BAD")

if __name__ == "__main__":
    main()
P.S. I haven't tested the entire code, so if there are any problems with it, write in the comments.

Python Flask chunk data upload not working

I am trying to upload a large file (say ~1GB) from a client (using Python requests.post) to a Flask server.
When the client sends the request to the server in chunks of 1024 bytes, the server does not read the whole file and saves 0 KB.
Can you please help me debug what exactly I am doing wrong here?
Server - Flask Code:
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads/'

@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    fileFullPath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    with open(fileFullPath, "wb") as f:
        chunk_size = 1024
        chunk = request.stream.read(chunk_size)
        f.write(chunk)
    return jsonify({'filename': filename})

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=int("8080"), debug=True)
Client - Request Code
import os
import requests

def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, 'r') as f:
        try:
            r = requests.post(url, data=read_in_chunks(f))
            print "r: {0}".format(r)
        except Exception, e:
            print e

if __name__ == '__main__':
    filename = 'bigfile.zip'  # ~1GB
    url = 'http://localhost:8080/upload/{0}'.format(filename)
    main(filename, url)
Kindly use 'file.stream.read(chunk_size)' instead of request.stream.read(chunk_size). It works for me!
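This assumes the client sends the file as a multipart upload (for example requests.post(url, files={'file': f})), so the server can take the part from request.files and read its stream in chunks. A minimal, untested sketch along those lines:

from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads/'

@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    file = request.files['file']          # multipart part sent by the client
    full_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    chunk_size = 1024
    with open(full_path, "wb") as f:
        while True:
            chunk = file.stream.read(chunk_size)  # read the upload in chunks
            if not chunk:
                break
            f.write(chunk)
    return jsonify({'filename': filename})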
Old thread, but I was looking for something similar, so I'll post here anyway.
The server opens the file in write mode, which overwrites it with each chunk. Prefer append mode:
with open(fileFullPath, "ab") as f:
The client needs to open the file in binary mode:
with open(content_path, "rb") as f:
Finally, the generator read_in_chunks needs to be consumed in a loop before being passed to the request:
def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, "rb") as f:
        try:
            for data in read_in_chunks(f):
                r = requests.post(url, data=data)
                print("r: {0}".format(r))
        except Exception as e:
            print(e)
Then you have your two files.
Server
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = "uploads/"

@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    fileFullPath = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    with open(fileFullPath, "ab") as f:
        chunk_size = 1024
        chunk = request.stream.read(chunk_size)
        f.write(chunk)
    return jsonify({"filename": filename})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=int("8080"), debug=True)
Client
import os
import requests

def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, "rb") as f:
        try:
            for data in read_in_chunks(f):
                r = requests.post(url, data=data)
                print("r: {0}".format(r))
        except Exception as e:
            print(e)

if __name__ == "__main__":
    filename = "bigfile.zip"  # ~1GB
    url = "http://localhost:8080/upload/{0}".format(filename)
    main(filename, url)
Note that posting in chunks usually requires the total number of chunks and a hash of the file to validate the upload.
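As a rough illustration of that note (not taken from the answer above), the client could compute the chunk count and a SHA-256 digest up front and send them alongside each chunk; the header names and the upload URL here are made up for the example:

import hashlib
import os
import requests

def upload_in_chunks(path, url, chunk_size=1024):
    # Pre-compute what the server would need to validate the upload.
    total_size = os.path.getsize(path)
    total_chunks = -(-total_size // chunk_size)   # ceiling division
    sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            sha256.update(block)

    with open(path, "rb") as f:
        for index, chunk in enumerate(iter(lambda: f.read(chunk_size), b"")):
            headers = {
                "X-Chunk-Index": str(index),          # hypothetical header names
                "X-Total-Chunks": str(total_chunks),
                "X-File-Sha256": sha256.hexdigest(),
            }
            requests.post(url, data=chunk, headers=headers)

if __name__ == "__main__":
    upload_in_chunks("bigfile.zip", "http://localhost:8080/upload/bigfile.zip")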
Flask depends on Werkzeug to process streams, and Werkzeug demands a content length for a stream. There's a thread on this here, but no real solution is currently available other than taking another framework approach.
The example below should work very well for you. If you use Redis, you can also pub/sub the chunk currently being processed to drive a progress bar in another API.
import json
import tempfile
import uuid

import flask
from flask import Flask, request, jsonify, copy_current_request_context

app = Flask(__name__)

# settings, db_apn_logging (a Redis client) and user_id come from the
# author's own environment and are not defined in this excerpt.

@app.route("/submit_vdo", methods=['POST'])
def submit_vdo():
    @copy_current_request_context
    def receive_chunk(stream, full_file_path):
        if full_file_path is None:
            tmpfile = tempfile.NamedTemporaryFile('wb+', prefix=str(uuid.uuid4()) + "_")
            full_file_path = tmpfile.name
            print('Write temp to ', full_file_path)
        with open(full_file_path, "wb") as f:
            max_chunk_size = settings.VIDEO_MAX_SIZE_CHUNK  # config.MAX_UPLOAD_BYTE_LENGHT
            count_chunks = 0
            total_uploaded = 0
            try:
                while True:
                    print('Chunk ', count_chunks)
                    chunk = stream.read(max_chunk_size)
                    if chunk is not None and len(chunk) > 0:
                        total_uploaded += len(chunk)
                        count_chunks += 1
                        f.write(chunk)
                        temp = {}
                        temp['chunk_counts'] = count_chunks
                        temp['total_bytes'] = total_uploaded
                        temp['status'] = 'uploading...'
                        temp['success'] = True
                        db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                        print(temp)
                    else:
                        f.close()
                        temp = {}
                        temp['chunk_counts'] = count_chunks
                        temp['total_bytes'] = total_uploaded
                        temp['status'] = 'DONE'
                        temp['success'] = True
                        db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                        break
            except Exception as e:
                temp = {}
                temp['chunk_counts'] = count_chunks
                temp['total_bytes'] = total_uploaded
                temp['status'] = e
                temp['success'] = False
                db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                return None
        return full_file_path

    stream = flask.request.files['file']
    stream.seek(0)
    full_file_path = receive_chunk(stream, None)
    return "DONE !"

Using WGET or Python to download and rename attachments from CSV requiring basic authentication

I scraped a ticketing website that we were using, and I now have a CSV file which looks like this: ID, Attachment_URL, Ticket_URL. What I now need to do is download every attachment and rename the file with the Ticket_URL. The main issue I have is that when navigating to the Attachment_URL you must use basic authentication, and then you are redirected to an AWS S3 link. I have been able to download individual files using wget, but I have not been able to iterate through the entire list (35k rows or so), and I am not sure how I would name each file after its ticket_id. Any advice would be appreciated.
Got it.
To open the authenticated session:
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
import time

s = requests.session()
payload = {
    'user': '',
    'pw': ''
}
s.post('login.url.here', data=payload)

for i in range(1, 6000):
    testURL = s.get(
        'https://urlhere.com/efw/stuff&page={}'.format(i))
    soup = BeautifulSoup(testURL.content)
    table = soup.find("table", {"class": "table-striped"})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')[1:]
    print "The current page is: " + str(i)
    for row in rows:
        cols = row.find_all('a', attrs={'href': re.compile("^/helpdesk/")})
        # time.sleep(1)
        with open('fd.csv', 'a') as f:
            writer = csv.writer(f)
            writer.writerow(cols)
            print cols
        print cols
Then I cleaned the links a bit in R, and used the following to download the files:
#! /usr/bin/env python
import threading
import os
from time import gmtime, strftime
from Queue import Queue
import requests

s = requests.session()
payload = {
    'user': '',
    'pw': ''
}
s.post('login', data=payload)

class log:
    def info(self, message):
        self.__message("info", message)
    def error(self, message):
        self.__message("error", message)
    def debug(self, message):
        self.__message("debug", message)
    def __message(self, log_level, message):
        date = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print "%s [%s] %s" % (date, log_level, message)

class fetch:
    def __init__(self):
        self.temp_dir = "/tmp"
    def run_fetcher(self, queue):
        while not queue.empty():
            url, ticketid = queue.get()
            if ticketid.endswith("NA"):
                fileName = url.split("/")[-1] + 'NoTicket'
            else:
                fileName = ticketid.split("/")[-1]
            response = s.get(url)
            with open(os.path.join('/Users/Desktop/FolderHere', fileName + '.mp3'), 'wb') as f:
                f.write(response.content)
            print fileName
            queue.task_done()

if __name__ == '__main__':
    # load in classes
    q = Queue()
    log = log()
    fe = fetch()
    # get bucket name
    # Read in input file
    with open('/Users/name/csvfilehere.csv', 'r') as csvfile:
        for line in csvfile:
            id, url, ticket = line.split(",")
            q.put([url.strip(), ticket.strip()])
    # spin up fetcher workers
    threads = []
    for i in range(8):
        t = threading.Thread(target=fe.run_fetcher, args=(q,))
        t.daemon = True
        threads.append(t)
        t.start()
    # close threads
    [x.join() for x in threads]
    # close queue
    q.join()
    log.info("End")
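The question also mentions that the attachment URLs sit behind HTTP basic authentication before redirecting to S3. If those URLs accept basic auth directly, a much smaller, untested sketch using requests' built-in auth could iterate the CSV like this (placeholder credentials and file names, assuming a header row with the columns named in the question):

import csv
import os
import requests

session = requests.Session()
session.auth = ("username", "password")   # placeholder basic-auth credentials

os.makedirs("attachments", exist_ok=True)

with open("tickets.csv", newline="") as f:     # hypothetical CSV file name
    for row in csv.DictReader(f):              # assumes header: ID,Attachment_URL,Ticket_URL
        response = session.get(row["Attachment_URL"], allow_redirects=True)
        # Name the saved file after the last path segment of the ticket URL.
        out_name = row["Ticket_URL"].rstrip("/").split("/")[-1]
        with open(os.path.join("attachments", out_name), "wb") as out:
            out.write(response.content)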

with gevent script doesn't finish

The problem is that when the script finishes, it doesn't print "Process finished with exit code 1". But print("exiting") works. What's the problem?
from gevent import monkey
import gevent
monkey.patch_all()

threads_count = 2
urls_dic = {}
for filename in search_term:
    if os.path.isfile(filename):
        urls_dic[filename] = []
        f = open(filename, mode='r')
        for line in f:
            urls_dic[filename].append(line.strip())
        f.close()

new_urls = []
for i in range(threads_count):
    new_urls.insert(i, {})
    for key in urls_dic:
        new_urls[i][key] = []
        for url in range(i, len(urls_dic[key]), threads_count):
            new_urls[i][key].append(urls_dic[key][url])

threads = [gevent.spawn(start, new_urls[i], i + 1) for i in range(threads_count)]
gevent.joinall(threads)
for job in threads:
    print("job=", job)
    for key in job.value:
        write_excel(job.value[key], key)

print("exiting")
exit(1)
