Python appending to binary file works in Unix, not in Windows

The function below receives file chunks from web requests and assembles them. It works perfectly in Unix (OS X), but on Windows it doesn't. Specifically, the file does assemble, but it always ends up too small, just a few KB. I cannot figure out what is causing this. No exceptions are raised and everything appears to work, except that the final file is not all there. I've included the entire function for context, but I've marked the section that appears not to be working correctly. (Python 2.7 and Windows Server 2008 R2)
@view_config(route_name='upload', renderer='json')
def upload(request):
    r = request.response
    final_dir = 'w:\\foobar'
    filename = request.params.get('flowFilename')
    chunk_number = request.params.get('flowChunkNumber')
    total_chunks = request.params.get('flowTotalChunks')
    try:
        temp_dir = os.path.join(final_dir, request.params.get('flowIdentifier'))
        file_part = os.path.join(temp_dir, '%s.part.%s' % (filename, chunk_number))
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
    except TypeError:
        pass
    if request.method == 'GET':
        if file_part:
            if os.path.isfile(file_part):
                r.status = 200
            else:
                r.status = 404
        return r
    if request.POST:
        try:
            fo = request.params.get('file').file
            f = open(file_part, 'wb')
            f.write(fo.read())
            f.close()
            if chunk_number == total_chunks:
                final_filename = os.path.join(final_dir, filename)
                temp_filename = filename + '_INCOMPLETE'
                #####################################################################
                # This is where it appears to be going wrong...
                final_file = open(temp_filename, 'a+b')
                try:
                    for i in range(1, int(total_chunks) + 1):
                        ff = open(os.path.join(temp_dir, '%s.part.%s' % (filename, i)))
                        final_file.write(ff.read())
                        ff.close()
                    final_file.close()
                    os.rename(temp_filename, final_filename)  # rename to final filename
                    shutil.rmtree(temp_dir)  # clean up temp part files
                except:
                    raise
                ####################################################################
            r.status = 200
        except Exception, e:
            print 'ERROR', e.message
            r.status = 404
    return r
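
One likely culprit, although it is not confirmed in the post above, is that the part files are opened for reading without the 'rb' flag. On Unix that makes no difference, but on Windows text mode translates line endings and stops at the first EOF (0x1A) byte, which would produce exactly this kind of silently truncated output. A minimal sketch of the marked section with explicit binary reads, using the same variable names as above:

final_file = open(temp_filename, 'ab')
for i in range(1, int(total_chunks) + 1):
    part_path = os.path.join(temp_dir, '%s.part.%s' % (filename, i))
    ff = open(part_path, 'rb')  # 'rb' avoids newline translation and EOF truncation on Windows
    final_file.write(ff.read())
    ff.close()
final_file.close()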

Related

Value not appending to global array

I am trying to run a multithreaded email checker to see whether the emails are valid Office 365 accounts.
Looking over and over my code, I cannot find the reason it isn't working correctly.
It should be appending each email to a GOOD or BAD list.
Instead, it isn't appending anything!
This is my code:
...
currentDirectory = os.getcwd()  # set the current directory - /new/

# Locations
location_emails_goods = currentDirectory + '/contacts/goods/'
location_emails_bads = currentDirectory + '/contacts/bads/'
location_emails = currentDirectory + '/contacts/contacts.txt'

now = datetime.now()
todayString = now.strftime('%d-%m-%Y-%H-%M-%S')

FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None

ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'

# Get all emails
def get_contacts(filename):
    emails = []
    with open(filename, mode='r', encoding='utf-8') as contacts_file:
        for a_contact in contacts_file:
            emails.append(a_contact.strip())
    return emails

def saveLogs():
    global GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY, file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS
    # print(GOOD_EMAILS_ARRAY)
    for good in GOOD_EMAILS_ARRAY:
        file_goods.write(good + '\n')
    file_goods.close()
    for bad in BAD_EMAILS_ARRAY:
        file_bads.write(bad + '\n')
    file_bads.close()

def newChecker(email):
    global url, GOOD_EMAILS_ARRAY, BAD_EMAILS_ARRAY
    s = req.session()
    body = '{"Username":"%s"}' % email
    request = req.post(url, data=body)
    response = request.text
    valid = re.search('"IfExistsResult":0,', response)
    invalid = re.search('"IfExistsResult":1,', response)
    if invalid:
        BAD_EMAILS_ARRAY.append(email)
        if valid:
            GOOD_EMAILS_ARRAY.append(email)
    else:
        if valid:
            GOOD_EMAILS_ARRAY.append(email)
        else:
            BAD_EMAILS_ARRAY.append(email)
    # The following prints an empty array even though GOOD_EMAILS_ARRAY is defined globally, so it should be updating
    print(GOOD_EMAILS_ARRAY)

def mp_handler(p):
    global ALL_EMAILS
    p.map(newChecker, ALL_EMAILS)

if __name__ == '__main__':
    # For each email, parse it into our checker
    # Define a filename to save to
    FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
    FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')
    file_bads = open(FILE_NAME_DATE_BADS, 'a')
    file_goods = open(FILE_NAME_DATE_GOODS, 'a')
    p = multiprocessing.Pool(500)
    mp_handler(p)
    saveLogs()
    p.close()
As you can see, I am trying to append each email to either GOOD_EMAILS_ARRAY or BAD_EMAILS_ARRAY.
BAD_EMAILS_ARRAY and GOOD_EMAILS_ARRAY are global variables, but for some reason nothing gets appended to them.
I am running this through multiprocessing, if that matters.
Any ideas what is wrong in my code?
Processes do not share memory; global variables with the same name in two processes are two different objects.
If you need to share state between processes, see this:
https://docs.python.org/3/library/multiprocessing.html#sharing-state-between-processes
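
A minimal sketch of what this answer describes, independent of the original checker code: appends made inside a Pool worker land in the child process's copy of the list, so the parent never sees them.

import multiprocessing

RESULTS = []  # global list in the parent process

def worker(n):
    RESULTS.append(n)  # modifies the child's copy only

if __name__ == '__main__':
    p = multiprocessing.Pool(4)
    p.map(worker, range(10))
    p.close()
    p.join()
    print(RESULTS)  # prints [] in the parent, even though every worker appended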
Okay, so it turns out that I just needed to use the Manager from multiprocessing:

from multiprocessing import Manager, Pool

Then I could use a normal list through the manager, such as:

# Set empty lists using the manager so we can carry them over between processes
manager = Manager()
bad_list = manager.list()
good_list = manager.list()

This allowed me to use my script as it was, just with these new Manager-backed lists, which work exactly how I wanted :)
...
FILE_NAME_DATE_GOODS = None
FILE_NAME_DATE_BADS = None

# Set empty lists using the manager so we can carry them over between processes
manager = Manager()
bad_list = manager.list()
good_list = manager.list()

# Get all emails
def get_contacts(filename):
    emails = []
    with open(filename, mode='r', encoding='utf-8') as contacts_file:
        for a_contact in contacts_file:
            emails.append(a_contact.strip())
    return emails

ALL_EMAILS = get_contacts(location_emails)
url = 'https://login.microsoftonline.com/common/GetCredentialType'

def saveLogs():
    global file_bads, file_goods, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS, good_list, bad_list
    for good in good_list:
        file_goods.write(good + '\n')
    file_goods.close()
    for bad in bad_list:
        file_bads.write(bad + '\n')
    file_bads.close()
    print('{} => Fully completed email scanning'.format(Fore.CYAN))
    print('{} => Good emails [{}] || Bad emails [{}]'.format(Fore.GREEN, FILE_NAME_DATE_GOODS, FILE_NAME_DATE_BADS))

def newChecker(email):
    global url, good_list, bad_list
    s = req.session()
    body = '{"Username":"%s"}' % email
    request = req.post(url, data=body)
    response = request.text
    valid = re.search('"IfExistsResult":0,', response)
    invalid = re.search('"IfExistsResult":1,', response)
    if invalid:
        bad_list.append(email)
        if valid:
            good_list.append(email)
    else:
        if valid:
            good_list.append(email)
        else:
            bad_list.append(email)

def mp_handler(p):
    global ALL_EMAILS
    p.map(newChecker, ALL_EMAILS)

if __name__ == '__main__':
    # For each email, parse it into our checker
    # Define a filename to save to
    FILE_NAME_DATE_GOODS = '{}{}{}'.format(location_emails_goods, todayString, '.txt')
    FILE_NAME_DATE_BADS = '{}{}{}'.format(location_emails_bads, todayString, '.txt')
    file_bads = open(FILE_NAME_DATE_BADS, 'a')
    file_goods = open(FILE_NAME_DATE_GOODS, 'a')
    p = multiprocessing.Pool(500)
    mp_handler(p)
    saveLogs()
    p.close()

Python Flask chunk data upload not working

I am trying to upload a large file (say ~1 GB) from a client (using Python requests.post) to the Flask server.
When the client sends the request to the server in chunks of 1024 bytes, the server does not read the whole file and saves 0 KB.
Can you please help me debug what exactly I am doing wrong here?
Server - Flask Code:
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads/'

@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    fileFullPath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    with open(fileFullPath, "wb") as f:
        chunk_size = 1024
        chunk = request.stream.read(chunk_size)
        f.write(chunk)
    return jsonify({'filename': filename})

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=int("8080"), debug=True)
Client - Requests Code:
import os
import requests

def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, 'r') as f:
        try:
            r = requests.post(url, data=read_in_chunks(f))
            print "r: {0}".format(r)
        except Exception, e:
            print e

if __name__ == '__main__':
    filename = 'bigfile.zip'  # ~1GB
    url = 'http://localhost:8080/upload/{0}'.format(filename)
    main(filename, url)
Kindly use file.stream.read(chunk_size) instead of request.stream.read(chunk_size). It works for me!
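
For what it's worth, a minimal sketch of that suggestion, assuming the client uploads the file as a multipart form field named 'file' (which is not what the client code in the question does) and reusing the imports from the server code above:

@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    upload = request.files['file']  # werkzeug FileStorage object
    file_full_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    with open(file_full_path, "wb") as f:
        while True:
            chunk = upload.stream.read(1024)  # read the uploaded stream 1 KB at a time
            if not chunk:
                break
            f.write(chunk)
    return jsonify({'filename': filename})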
Old thread, but I was looking for something similar, so I'll post here anyway.
The server opens the file in write mode, which overwrites it on each chunk. Prefer append mode:

with open(fileFullPath, "ab") as f:

The client needs to open the file in binary mode:

with open(content_path, "rb") as f:

Finally, the generator read_in_chunks needs to be consumed in a loop rather than passed directly to the request:
def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, "rb") as f:
        try:
            for data in read_in_chunks(f):
                r = requests.post(url, data=data)
                print("r: {0}".format(r))
        except Exception as e:
            print(e)
Then you have your two files:
Server
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = "uploads/"

@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    fileFullPath = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    with open(fileFullPath, "ab") as f:
        chunk_size = 1024
        chunk = request.stream.read(chunk_size)
        f.write(chunk)
    return jsonify({"filename": filename})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=int("8080"), debug=True)
Client
import os
import requests

def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, "rb") as f:
        try:
            for data in read_in_chunks(f):
                r = requests.post(url, data=data)
                print("r: {0}".format(r))
        except Exception as e:
            print(e)

if __name__ == "__main__":
    filename = "bigfile.zip"  # ~1GB
    url = "http://localhost:8080/upload/{0}".format(filename)
    main(filename, url)
Note that posting in chunks usually requires sending the total number of chunks and a hash of the file so the upload can be validated.
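
A small sketch of how a client might compute those two values before uploading; the header names are made up for illustration, and any agreed-upon scheme works:

import hashlib
import math
import os

def upload_metadata(path, chunk_size=1024):
    # total number of chunks the client will send
    total_chunks = int(math.ceil(os.path.getsize(path) / float(chunk_size)))
    # hash of the whole file so the server can validate the reassembled upload
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            md5.update(block)
    return {"X-Total-Chunks": str(total_chunks), "X-File-MD5": md5.hexdigest()}

# e.g. requests.post(url, data=data, headers=upload_metadata(content_path))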
Flask depends on Werkzeug to process streams, and Werkzeug demands a content length for a stream. There's a thread on this here, but no real solution is currently available, other than taking another framework approach.
The example below should work well for you. If you use Redis, you can also publish the chunk currently being processed so another API can drive a progress bar.
from flask import Flask, request, jsonify
# Assumed additional imports for this snippet (not shown in the original post):
import flask
import json
import tempfile
import uuid
from flask import copy_current_request_context
# `app`, `settings`, `db_apn_logging` (a Redis client) and `user_id` are assumed
# to be defined elsewhere in the author's project.

@app.route("/submit_vdo", methods=['POST'])
def submit_vdo():

    @copy_current_request_context
    def receive_chunk(stream, full_file_path):
        if full_file_path is None:
            tmpfile = tempfile.NamedTemporaryFile('wb+', prefix=str(uuid.uuid4()) + "_")
            full_file_path = tmpfile.name
            print('Write temp to ', full_file_path)
        with open(full_file_path, "wb") as f:
            max_chunk_size = settings.VIDEO_MAX_SIZE_CHUNK  # config.MAX_UPLOAD_BYTE_LENGHT
            count_chunks = 0
            total_uploaded = 0
            try:
                while True:
                    print('Chunk ', count_chunks)
                    chunk = stream.read(max_chunk_size)
                    if chunk is not None and len(chunk) > 0:
                        total_uploaded += len(chunk)
                        count_chunks += 1
                        f.write(chunk)
                        temp = {}
                        temp['chunk_counts'] = count_chunks
                        temp['total_bytes'] = total_uploaded
                        temp['status'] = 'uploading...'
                        temp['success'] = True
                        db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                        print(temp)
                    else:
                        f.close()
                        temp = {}
                        temp['chunk_counts'] = count_chunks
                        temp['total_bytes'] = total_uploaded
                        temp['status'] = 'DONE'
                        temp['success'] = True
                        db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                        break
            except Exception as e:
                temp = {}
                temp['chunk_counts'] = count_chunks
                temp['total_bytes'] = total_uploaded
                temp['status'] = str(e)
                temp['success'] = False
                db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                return None
        return full_file_path

    stream = flask.request.files['file']
    stream.seek(0)
    full_file_path = receive_chunk(stream, None)  # start with no temp path; receive_chunk creates one

    return "DONE !"

How to optimize the memory usage of my python crawler

I am learning Python crawling these days, and I wrote a simple crawler to get pictures from Pixiv by Pixiv ID.
It works quite well, but here comes a big problem: when it is running, it takes up nearly 1.2 GB of memory on my computer.
However, sometimes it takes up only about 10 MB, and I really don't know which part of the code causes such large memory usage.
I have uploaded the script to my VPS (a Vultr server with only 768 MB of memory) and tried to run it. As a result, I get a MemoryError.
So I wonder how to optimize the memory usage (even if it takes more time to run).
Here is my code:
(I have rewritten all the code to make it pass PEP 8; if anything is still unclear, please tell me which part confuses you.)
from lxml import etree
import re
import os
import requests

# Get a single picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
    Filter_Server = re.compile("[\d]+")
    Filter_Posttime = re.compile("img\/[^_]*_p0")
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Picture_Type = [".png", ".jpg", ".gif"]
    for i in range(len(Picture_Type)):
        Original_URL = "http://i" + str(Server) + ".pixiv.net/img-original/"\
            + Posttime + Picture_Type[i]
        Picture = requests.get(Original_URL, headers=Headers, stream=True)
        if Picture.status_code == 200:
            break
    if Picture.status_code != 200:
        return -1
    Filename = "./pic/"\
        + str(Pixiv_ID) + "_p0"\
        + Picture_Type[i]
    Picture_File = open(Filename, "wb+")
    for chunk in Picture.iter_content(None):
        Picture_File.write(chunk)
    Picture_File.close()
    Picture.close()
    return 200

# Get a manga, which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
    os.mkdir("./pic/" + str(Pixiv_ID))
    Filter_Server = re.compile("[\d]+")
    Filter_Posttime = re.compile("img\/[^_]*_p")
    Manga_URL = "http://www.pixiv.net/" + Tag_a_href
    Manga_HTML = requests.get(Manga_URL, headers=Headers)
    Manga_XML = etree.HTML(Manga_HTML.content)
    Manga_Pages = Manga_XML.xpath('/html/body'
                                  '/nav[@class="page-menu"]'
                                  '/div[@class="page"]'
                                  '/span[@class="total"]/text()')[0]
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Manga_HTML.close()
    Picture_Type = [".png", ".jpg", ".gif"]
    for Number in range(int(Manga_Pages)):
        for i in range(len(Picture_Type)):
            Original_URL = "http://i" + str(Server) + \
                ".pixiv.net/img-original/"\
                + Posttime + str(Number) + Picture_Type[i]
            Picture = requests.get(Original_URL, headers=Headers, stream=True)
            if Picture.status_code == 200:
                break
        if Picture.status_code != 200:
            return -1
        Filename = "./pic/" + str(Pixiv_ID) + "/"\
            + str(Pixiv_ID) + "_p"\
            + str(Number) + Picture_Type[i]
        Picture_File = open(Filename, "wb+")
        for chunk in Picture.iter_content(None):
            Picture_File.write(chunk)
        Picture_File.close()
        Picture.close()
    return 200

# Main function.
def get_pic(Pixiv_ID):
    Index_URL = "http://www.pixiv.net/member_illust.php?"\
        "mode=medium&illust_id=" + str(Pixiv_ID)
    Headers = {'referer': Index_URL}
    Index_HTML = requests.get(Index_URL, headers=Headers, stream=True)
    if Index_HTML.status_code != 200:
        return Index_HTML.status_code
    Index_XML = etree.HTML(Index_HTML.content)
    Tag_a_href_List = Index_XML.xpath('/html/body'
                                      '/div[@id="wrapper"]'
                                      '/div[@class="newindex"]'
                                      '/div[@class="newindex-inner"]'
                                      '/div[@class="newindex-bg-container"]'
                                      '/div[@class="cool-work"]'
                                      '/div[@class="cool-work-main"]'
                                      '/div[@class="img-container"]'
                                      '/a/@href')
    Tag_img_src_List = Index_XML.xpath('/html/body'
                                       '/div[@id="wrapper"]'
                                       '/div[@class="newindex"]'
                                       '/div[@class="newindex-inner"]'
                                       '/div[@class="newindex-bg-container"]'
                                       '/div[@class="cool-work"]'
                                       '/div[@class="cool-work-main"]'
                                       '/div[@class="img-container"]'
                                       '/a/img/@src')
    if Tag_a_href_List == [] or Tag_img_src_List == []:
        return 404
    else:
        Tag_a_href = Tag_a_href_List[0]
        Tag_img_src = Tag_img_src_List[0]
    Index_HTML.close()
    if Tag_a_href.find("manga") != -1:
        return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
    else:
        return get_single(Pixiv_ID, Tag_img_src, Headers)

# Check whether the picture already exists.
def check_exist(Pixiv_ID):
    if not os.path.isdir("Pic"):
        os.mkdir("Pic")
    if os.path.isdir("./Pic/" + str(Pixiv_ID)):
        return True
    Picture_Type = [".png", ".jpg", ".gif"]
    Picture_Exist = False
    for i in range(len(Picture_Type)):
        Path = "./Pic/" + str(Pixiv_ID)\
            + "_p0" + Picture_Type[i]
        if os.path.isfile(Path):
            return True
    return Picture_Exist

# The script starts here.
for i in range(0, 38849402):
    Pixiv_ID = 38849402 - i
    Picture_Exist = check_exist(Pixiv_ID)
    if not Picture_Exist:
        Return_Code = get_pic(Pixiv_ID)
        if Return_Code == 200:
            print str(Pixiv_ID), "finish!"
        elif Return_Code == -1:
            print str(Pixiv_ID), "got an unknown error."
        elif Return_Code == 404:
            print str(Pixiv_ID), "not found. Maybe deleted."
    else:
        print str(Pixiv_ID), "picture exists!"
OMG!
Finally, I know what went wrong.
I used mem_top() to see what was taking up the memory.
Guess what?
It was for i in range(0, 38849402):
In memory there is a list [0, 1, 2, 3 ... 38849401], which eats up my memory.
I changed it to:

Pixiv_ID = 38849402
while Pixiv_ID > 0:
    # some code here
    Pixiv_ID = Pixiv_ID - 1

Now the memory usage is no more than 20 MB.
Feeling excited!
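
For completeness, since this is Python 2, xrange would also fix it: it yields numbers lazily instead of building the whole 38-million-item list (in Python 3, range already behaves this way):

# Lazy alternative to the while loop, Python 2 only
for i in xrange(0, 38849402):
    Pixiv_ID = 38849402 - i
    # ... rest of the loop body unchanged ...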

How to display image on web browser using Tornado Python server?

The error says the indentation problem is at the "if __name__" part. I'm not sure why it would indicate this.
class GetFileHandler(tornado.web.RequestHandler):
    def get(self):
        fileid = self.get_argument('fileid', "")
        cur.execute("""SELECT filepath FROM files_table WHERE file_id = %s""", (fileid, ))
        m = cur.fetchall()
        y = m[0]
        x = y[0]
        path = x + "/" + fileid + ".jpg"
        try:
            with open(path, 'rb') as f:
                data = f.read()
                self.write(data)
                self.finish()

if __name__ == "__main__":
    tornado.options.parse_command_line()
    app = tornado.web.Application(handlers=[(r"/getit", GetFileHandler)])
    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(options.port)
    tornado.ioloop.IOLoop.instance().start()
A try expects an except:

try:
    with open(path, 'rb') as f:
        data = f.read()
        self.write(data)
        self.finish()
except IOError:
    print "Failed!!"

In order to get it to show up as an image, you will need to set the content header to reflect that it is an image MIME type.
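
A minimal sketch of that last point, assuming the files are always JPEGs as in the question:

# Inside GetFileHandler.get(), just before writing the file bytes:
self.set_header("Content-Type", "image/jpeg")  # tell the browser this is an image
self.write(data)
self.finish()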
Where is the except block to match the try in the get function? You should add, at a minimum, something like the following to the function.

except IOError as xcpt:
    # IO error handling
    raise xcpt  # if you want to propagate the exception
If you have a separate folder for images and want to serve them as static content with a URL, e.g.

http://yourwebsite.com/images/yourimage.jpg

then you can make use of tornado.web.StaticFileHandler:

handlers = [
    (r"/images/(.*)", tornado.web.StaticFileHandler, {'path': "./images"}),
    (r"/", WebHandler)
]
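
For context, a minimal application wired up with that handler list might look like the sketch below; WebHandler here is just a stand-in for whatever default handler the site uses, not something from the question.

import tornado.ioloop
import tornado.web

class WebHandler(tornado.web.RequestHandler):
    def get(self):
        self.write("index")

handlers = [
    (r"/images/(.*)", tornado.web.StaticFileHandler, {'path': "./images"}),
    (r"/", WebHandler),
]

if __name__ == "__main__":
    app = tornado.web.Application(handlers)
    app.listen(8888)
    tornado.ioloop.IOLoop.instance().start()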

Python URL download

The code below returns None. How can I fix it? I'm using Python 2.6.
import urllib

URL = "http://download.finance.yahoo.com/d/quotes.csv?s=%s&f=sl1t1v&e=.csv"
symbols = ('GGP', 'JPM', 'AIG', 'AMZN', 'GGP', 'JPM', 'AIG', 'AMZN')
# symbols = ('GGP')

def fetch_quote(symbols):
    url = URL % '+'.join(symbols)
    fp = urllib.urlopen(url)
    try:
        data = fp.read()
    finally:
        fp.close()

def main():
    data_fp = fetch_quote(symbols)
    # print data_fp

if __name__ == '__main__':
    main()
You have to explicitly return the data from the fetch_quote function. Something like this:

def fetch_quote(symbols):
    url = URL % '+'.join(symbols)
    fp = urllib.urlopen(url)
    try:
        data = fp.read()
    finally:
        fp.close()
    return data  # <======== Return
In the absence of an explicit return statement, Python returns None, which is what you are seeing.
Your method doesn't explicitly return anything, so it returns None.
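
A two-line illustration of that default behavior:

def no_return():
    pass

print no_return()  # prints None because the function never returns a value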
