I am trying to upload a large file (~1GB) from a client (using Python requests.post) to a Flask server.
When the client sends the request in chunks of 1024 bytes, the server does not read the whole file and saves a 0 KB file.
Can you please help me debug what exactly I am doing wrong here?
Server - Flask Code:
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads/'

@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    fileFullPath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    with open(fileFullPath, "wb") as f:
        chunk_size = 1024
        chunk = request.stream.read(chunk_size)
        f.write(chunk)
    return jsonify({'filename': filename})

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=int("8080"), debug=True)
Client - Request Code:
import os
import requests

def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, 'r') as f:
        try:
            r = requests.post(url, data=read_in_chunks(f))
            print "r: {0}".format(r)
        except Exception, e:
            print e

if __name__ == '__main__':
    filename = 'bigfile.zip'  # ~1GB
    url = 'http://localhost:8080/upload/{0}'.format(filename)
    main(filename, url)
Kindly use file.stream.read(chunk_size) instead of request.stream.read(chunk_size), where file is the object taken from request.files. It works for me!
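For context, a minimal sketch of how that could look on the server; the /upload route, the multipart field name 'file', and the 1024-byte chunk size are all illustrative assumptions, not part of the original code:
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads/'

@app.route("/upload", methods=["POST"])
def upload():
    # 'file' is an assumed multipart field name; adjust to match your client
    file = request.files['file']
    filename = secure_filename(file.filename)
    fileFullPath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    with open(fileFullPath, "wb") as f:
        while True:
            chunk = file.stream.read(1024)  # read the upload 1 KB at a time
            if not chunk:
                break
            f.write(chunk)
    return jsonify({'filename': filename})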
Old thread, but I was looking for something similar so I'll post here anyway.
The server opens the file in write mode, which overwrites it on each chunk. Prefer append mode:
with open(fileFullPath, "ab") as f:
The client needs to open the file in binary mode:
with open(content_path, "rb") as f:
Finally, the generator read_in_chunks needs to be iterated in a loop, sending one request per chunk, rather than passed directly to a single request:
def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, "rb") as f:
        try:
            for data in read_in_chunks(f):
                r = requests.post(url, data=data)
                print("r: {0}".format(r))
        except Exception as e:
            print(e)
Then you have your two files:
Server
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = "uploads/"

@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    fileFullPath = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    with open(fileFullPath, "ab") as f:
        chunk_size = 1024
        chunk = request.stream.read(chunk_size)
        f.write(chunk)
    return jsonify({"filename": filename})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=int("8080"), debug=True)
Client
import os
import requests

def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, "rb") as f:
        try:
            for data in read_in_chunks(f):
                r = requests.post(url, data=data)
                print("r: {0}".format(r))
        except Exception as e:
            print(e)

if __name__ == "__main__":
    filename = "bigfile.zip"  # ~1GB
    url = "http://localhost:8080/upload/{0}".format(filename)
    main(filename, url)
Note that posting in chunks usually requires the total number of chunks and a hash of the file so the server can validate the upload.
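As a rough, hedged illustration of that idea on the client side (the X-Chunk-Index/X-Chunk-Total headers and the /complete endpoint are hypothetical, not part of the code above):
import hashlib
import os
import requests

def upload_with_validation(fname, url, chunk_size=1024):
    total_size = os.path.getsize(fname)
    total_chunks = (total_size + chunk_size - 1) // chunk_size
    md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for index in range(total_chunks):
            data = f.read(chunk_size)
            md5.update(data)
            headers = {
                "X-Chunk-Index": str(index),        # hypothetical header
                "X-Chunk-Total": str(total_chunks),  # hypothetical header
            }
            requests.post(url, data=data, headers=headers)
    # a final request lets the server compare its own hash against the client's
    requests.post(url + "/complete", data=md5.hexdigest())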
Flask depends on werkzeug to process streams, and werkzeug demands a content length for a stream. There's a thread on this here, but no real solution is currently available, other than taking another framework approach.
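One workaround that is sometimes suggested, sketched here with no guarantees: when the client sends Transfer-Encoding: chunked (so there is no Content-Length), werkzeug exposes an empty stream unless the WSGI environ flag wsgi.input_terminated is set. A small middleware can set that flag, assuming the WSGI server itself de-chunks the body (gunicorn does; the Flask development server does not):
class ChunkedTransferMiddleware(object):
    """Mark chunked request bodies as terminated so werkzeug will read them."""
    def __init__(self, app):
        self.app = app

    def __call__(self, environ, start_response):
        if environ.get('HTTP_TRANSFER_ENCODING', '').lower() == 'chunked':
            environ['wsgi.input_terminated'] = True
        return self.app(environ, start_response)

# usage: app.wsgi_app = ChunkedTransferMiddleware(app.wsgi_app)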
The example below should work very well for you. If you use Redis, you can also publish the chunk currently being processed, so another API can drive a progress bar.
import json
import os
import tempfile
import uuid

from flask import Flask, request, copy_current_request_context

# settings, user_id and db_apn_logging (a Redis client) are assumed to be
# provided by the surrounding application
app = Flask(__name__)

@app.route("/submit_vdo", methods=['POST'])
def submit_vdo():
    @copy_current_request_context
    def receive_chunk(stream, full_file_path):
        if full_file_path is None:
            tmpfile = tempfile.NamedTemporaryFile('wb+', prefix=str(uuid.uuid4()) + "_")
            full_file_path = tmpfile.name
            print('Write temp to ', full_file_path)
        with open(full_file_path, "wb") as f:
            max_chunk_size = settings.VIDEO_MAX_SIZE_CHUNK  # config.MAX_UPLOAD_BYTE_LENGHT
            count_chunks = 0
            total_uploaded = 0
            try:
                while True:
                    print('Chunk ', count_chunks)
                    chunk = stream.read(max_chunk_size)
                    if chunk is not None and len(chunk) > 0:
                        total_uploaded += len(chunk)
                        count_chunks += 1
                        f.write(chunk)
                        temp = {}
                        temp['chunk_counts'] = count_chunks
                        temp['total_bytes'] = total_uploaded
                        temp['status'] = 'uploading...'
                        temp['success'] = True
                        db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                        print(temp)
                    else:
                        temp = {}
                        temp['chunk_counts'] = count_chunks
                        temp['total_bytes'] = total_uploaded
                        temp['status'] = 'DONE'
                        temp['success'] = True
                        db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                        break
            except Exception as e:
                temp = {}
                temp['chunk_counts'] = count_chunks
                temp['total_bytes'] = total_uploaded
                temp['status'] = str(e)
                temp['success'] = False
                db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                return None
        return full_file_path

    stream = request.files['file']
    stream.seek(0)
    full_file_path = receive_chunk(stream, None)  # None -> write to a temp file
    return "DONE !"
I've been working on editing a webhook that was originally meant for a weather API so that it works with a postcode/zipcode API instead. The original file is here: https://github.com/dialogflow/fulfillment-webhook-weather-python/blob/master/app.py
I can't understand where mine is different; I thought I had solved it when I replaced urlencode with quote, but alas, it wasn't enough.
The problem is very unlikely to be with the source JSON request that collects the postcode in postcodeValue(). The API URL comes out correct when you enter it into a browser and is presented quite simply:
https://api.getaddress.io/find/SW11%201th?api-key=I98umgPiu02GEMmHdmfg3w12959
Is it in the correct format? Maybe I need to convert it to become even more JSON than it already is. This question is essentially an end-of-day brain dump that I'm hoping someone can save me with.
from __future__ import print_function
from future.standard_library import install_aliases
install_aliases()

from urllib.parse import urlparse, urlencode, quote
from urllib.request import urlopen, Request
from urllib.error import HTTPError

import json
import os

from flask import Flask
from flask import request
from flask import make_response

# Flask app should start in global layout
app = Flask(__name__)

# this line is just naming conventions I reckon with a reference to expect to receive data as POST
@app.route('/webhook', methods=['POST'])
def webhook():
    req = request.get_json(silent=True, force=True)
    # who knows where this is getting printed
    print("Request:")
    print(json.dumps(req, indent=4))
    res = processRequest(req)
    res = json.dumps(res, indent=4)
    # print(res)
    r = make_response(res)
    r.headers['Content-Type'] = 'application/json'
    return r

def processRequest(req):
    if req.get("result").get("action") != "yahooWeatherForecast":
        return {}
    baseurl = "https://api.getaddress.io/find/"
    apikey = "?api-key=I98umgPiu02GEMmHdmfg3w12959"
    yql_query = postcodeValue(req)
    if yql_query is None:
        return {}
    # this line is the actual api request
    yql_url = baseurl + quote(yql_query) + apikey
    result = urlopen(yql_url).read()
    data = json.loads(result)
    res = makeWebhookResult(data)
    return res

# this function extracts an individual parameter and turns it into a string
def postcodeValue(req):
    result = req.get("result")
    parameters = result.get("parameters")
    postcode = parameters.get("postcode")
    if postcode is None:
        return None
    return postcode

# def housenoValue(req):
#     result = req.get("result")
#     parameters = result.get("parameters")
#     houseno = parameters.get("houseno")
#     if houseno is None:
#         return None
#     return houseno

def makeWebhookResult(data):
    longitude = data.get("longitude")
    if longitude is None:
        return {}
    # def makeWebhookResult(data):
    #     query = data.get('query')
    #     if query is None:
    #         return {}
    #     result = query.get('results')
    #     if result is None:
    #         return {}
    #     channel = result.get('channel')
    #     if channel is None:
    #         return {}
    #     item = channel.get('item')
    #     location = channel.get('location')
    #     units = channel.get('units')
    #     if (location is None) or (item is None) or (units is None):
    #         return {}
    #     condition = item.get('condition')
    #     if condition is None:
    #         return {}
    #     print(json.dumps(item, indent=4))
    speech = "Sausage face " + longitude
    print("Response:")
    print(speech)
    return {
        "speech": speech,
        "displayText": speech,
        # "data": data,
        # "contextOut": [],
        "source": "apiai-weather-webhook-sample"
    }

# More flask specific stuff
if __name__ == '__main__':
    port = int(os.getenv('PORT', 5000))
    print("Starting app on port %d" % port)
    app.run(debug=False, port=port, host='0.0.0.0')
Here is a bit cleaner version of your code:
from urllib.request import urlopen
import os

from flask import Flask

app = Flask(__name__)

@app.route('/webhook', methods=['GET'])
def webhook():
    res = processRequest()
    return res

def processRequest():
    try:
        result = urlopen("https://api.getaddress.io/find/SW11%201th?api-key=I98umgPiu02GEMmHdmfg3w12959").read()
        return result
    except:
        return "Error fetching data"

if __name__ == '__main__':
    port = int(os.getenv('PORT', 5000))
    print("Starting app on port %d" % port)
    app.run(debug=False, port=port, host='0.0.0.0')
Open your browser and go to http://localhost:5000/webhook and you should see a response.
I scraped a ticketing website that we were using, and I now have a CSV file which looks like this: ID, Attachment_URL, Ticket_URL. What I now need to do is download every attachment and rename the file with the Ticket_URL. The main issue is that when navigating to the Attachment_URL you must use basic authentication, and then you are redirected to an AWS S3 link. I have been able to download individual files using wget, but I have not been able to iterate through the entire list (35k rows or so), and I am not sure how I would be able to name the file as the ticket ID. Any advice would be appreciated.
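For what it's worth, a minimal sketch of the iterate-and-rename part with requests; the CSV layout (id, attachment_url, ticket_url), credentials, and output directory are assumptions. requests follows the redirect to S3 on its own and drops the basic-auth header when the redirect leaves the original host, which is what S3 expects:
import csv
import os
import requests

AUTH = ('username', 'password')  # placeholder credentials
OUT_DIR = 'attachments'          # assumed output directory

with open('tickets.csv') as f:   # assumed columns: id, attachment_url, ticket_url
    for ticket_id, attachment_url, ticket_url in csv.reader(f):
        r = requests.get(attachment_url, auth=AUTH, stream=True)
        # name the file after the last path segment of the ticket URL
        name = ticket_url.rstrip('/').split('/')[-1]
        with open(os.path.join(OUT_DIR, name), 'wb') as out:
            for chunk in r.iter_content(65536):
                out.write(chunk)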
Got it.
To open the authenticated session:
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
import time

s = requests.session()
payload = {
    'user': '',
    'pw': ''
}
s.post('login.url.here', data=payload)

for i in range(1, 6000):
    testURL = s.get(
        'https://urlhere.com/efw/stuff&page={}'.format(i))
    soup = BeautifulSoup(testURL.content)
    table = soup.find("table", {"class": "table-striped"})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')[1:]
    print "The current page is: " + str(i)
    for row in rows:
        cols = row.find_all('a', attrs={'href': re.compile("^/helpdesk/")})
        # time.sleep(1)
        with open('fd.csv', 'a') as f:
            writer = csv.writer(f)
            writer.writerow(cols)
            print cols
Then I cleaned the links a bit in R, and used the following to download the files.
#! /usr/bin/env python
import threading
import os
from time import gmtime, strftime
from Queue import Queue

import requests

s = requests.session()
payload = {
    'user': '',
    'pw': ''
}
s.post('login', data=payload)

class log:
    def info(self, message):
        self.__message("info", message)

    def error(self, message):
        self.__message("error", message)

    def debug(self, message):
        self.__message("debug", message)

    def __message(self, log_level, message):
        date = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print "%s [%s] %s" % (date, log_level, message)

class fetch:
    def __init__(self):
        self.temp_dir = "/tmp"

    def run_fetcher(self, queue):
        while not queue.empty():
            url, ticketid = queue.get()
            if ticketid.endswith("NA"):
                fileName = url.split("/")[-1] + 'NoTicket'
            else:
                fileName = ticketid.split("/")[-1]
            response = s.get(url)
            with open(os.path.join('/Users/Desktop/FolderHere', fileName + '.mp3'), 'wb') as f:
                f.write(response.content)
            print fileName
            queue.task_done()

if __name__ == '__main__':
    # load in classes
    q = Queue()
    log = log()
    fe = fetch()

    # get bucket name
    # Read in input file
    with open('/Users/name/csvfilehere.csv', 'r') as csvfile:
        for line in csvfile:
            id, url, ticket = line.split(",")
            q.put([url.strip(), ticket.strip()])

    # spin up fetcher workers
    threads = []
    for i in range(8):
        t = threading.Thread(target=fe.run_fetcher, args=(q,))
        t.daemon = True
        threads.append(t)
        t.start()

    # close threads
    [x.join() for x in threads]

    # close queue
    q.join()

    log.info("End")
I want to POST a large file from a Python client to CherryPy. I'm using the requests library.
This is my client code:
import sys
import requests

def upload(fileName=None):
    url = 'http://localhost:8080/upload'
    files = {'myFile': (fileName, open(fileName, 'rb'))}
    r = requests.post(url, files=files)
    # with open(fileName, 'rb') as payload:
    #     headers = {'content-type': 'multipart/form-data'}
    #     r = requests.post('http://127.0.0.1:8080', data=payload, verify=False, headers=headers)

if __name__ == '__main__':
    upload(sys.argv[1])
The problem is that this puts the whole file in RAM. Is there any way to POST the file in pieces?
import cherrypy

class FileDemo(object):
    @cherrypy.expose
    def upload(self, myFile):
        print myFile.filename
        # size = 0
        # decoder = MultipartDecoder(myFile, 'image/jpeg')
        # for part in decoder.parts:
        #     print(part.header['content-type'])
        # while True:
        #     # advances to the content that hasn't been read
        #     myFile.file.seek(size, 0)
        #     # reads 100mb at a time so it doesn't fill up the RAM
        #     data = myFile.file.read(10240000)
        #     newFile = open("/home/ivo/Desktop/" + str(myFile.filename), 'a+')
        #     newFile.write(data)
        #     newFile.close
        #     size += len(data)
        #     if len(data) < 10240000:
        #         break

if __name__ == '__main__':
    cherrypy.quickstart(FileDemo())
This is the code on the server side. It has a lot of comments because I've been trying a lot of stuff. Right now I'm just printing the file name, and the client still transfers the whole file to RAM.
I don't know what else to try. Thank you in advance for your help.
If it's a CherryPy-specific upload, you can skip the multipart/form-data encoding obstacles and just send a streaming POST body of file contents.
client
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import io
import os

class FileLenIO(io.FileIO):
    def __init__(self, name, mode='r', closefd=True):
        io.FileIO.__init__(self, name, mode, closefd)
        self.__size = os.stat(name).st_size

    def __len__(self):
        return self.__size

f = FileLenIO('/home/user/Videos/video.mp4', 'rb')

request = urllib2.Request('http://127.0.0.1:8080/upload', f)
request.add_header('Content-Type', 'application/octet-stream')
# you can add custom header with filename if you need it

response = urllib2.urlopen(request)
print response.read()
server
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import tempfile
import shutil

import cherrypy

config = {
    'global': {
        'server.socket_host': '127.0.0.1',
        'server.socket_port': 8080,
        'server.thread_pool': 8,
        # remove any limit on the request body size; cherrypy's default is 100MB
        'server.max_request_body_size': 0,
        # increase server socket timeout to 60s; cherrypy's default is 10s
        'server.socket_timeout': 60
    }
}

class App:

    @cherrypy.config(**{'response.timeout': 3600})  # default is 300s
    @cherrypy.expose()
    def upload(self):
        '''Handle non-multipart upload'''
        destination = os.path.join('/home/user/test-upload')
        with open(destination, 'wb') as f:
            shutil.copyfileobj(cherrypy.request.body, f)
        return 'Okay'

if __name__ == '__main__':
    cherrypy.quickstart(App(), '/', config)
Tested on a 1.3GiB video file. Server-side memory consumption stays under 10MiB, the client's under 5MiB.
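For what it's worth, the same streaming client can be sketched with the requests library instead of urllib2; passing a file object to data= makes requests stream the body and derive Content-Length from the file size, so the FileLenIO helper isn't needed (URL and path are the same placeholders as above):
import requests

with open('/home/user/Videos/video.mp4', 'rb') as f:
    # requests streams a file object passed as data= and sets Content-Length
    r = requests.post('http://127.0.0.1:8080/upload',
                      data=f,
                      headers={'Content-Type': 'application/octet-stream'})
    print(r.text)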
This is how I solved the problem:
client
import sys
import urllib2

# the poster library handles streaming multipart encoding
from poster.encode import multipart_encode
from poster.streaminghttp import register_openers

def upload(fileName=None):
    register_openers()
    url = 'http://localhost:8080/upload'
    data, headers = multipart_encode({"myFile": open(fileName, "rb")})
    request = urllib2.Request(url, data, headers)
    request.unverifiable = True
    response = urllib2.urlopen(request)
    the_page = response.read()

if __name__ == '__main__':
    upload(sys.argv[1])
server
@cherrypy.expose
def upload(self, myFile):
    cherrypy.response.timeout = 3600
    newFile = open("/home/ivo/Desktop/" + str(myFile.filename), 'a+')
    newFile.write(myFile.file.read())
    newFile.close()
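A caveat: poster is Python 2 only. On Python 3, a comparable streaming multipart upload can be sketched with the requests-toolbelt package (the field name and URL mirror the example above):
import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder

def upload(fileName):
    encoder = MultipartEncoder(
        fields={'myFile': (fileName, open(fileName, 'rb'),
                           'application/octet-stream')})
    # the encoder streams the file instead of loading it into memory
    return requests.post('http://localhost:8080/upload',
                         data=encoder,
                         headers={'Content-Type': encoder.content_type})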
I have a very simple Python script using gevent.pool to download URLs (see below). The script runs fine for a couple of days and then locks up. I noticed that the memory usage is very high at that time. Am I using gevent incorrectly?
import sys

from gevent import monkey
monkey.patch_all()

import urllib2
from gevent.pool import Pool

inputFile = open(sys.argv[1], 'r')
urls = []
counter = 0
for line in inputFile:
    counter += 1
    urls.append(line.strip())
inputFile.close()

outputDirectory = sys.argv[2]

def fetch(url):
    try:
        body = urllib2.urlopen("http://" + url, None, 5).read()
        if len(body) > 0:
            outputFile = open(outputDirectory + "/" + url, 'w')
            outputFile.write(body)
            outputFile.close()
            print "Success", url
    except:
        pass

pool = Pool(int(sys.argv[3]))
pool.map(fetch, urls)
body = urllib2.urlopen("http://" + url, None, 5).read()
The line above reads the entire content into memory as a string. To prevent that, change fetch() as follows:
def fetch(url):
    try:
        u = urllib2.urlopen("http://" + url, None, 5)
        try:
            with open(outputDirectory + "/" + url, 'w') as outputFile:
                while True:
                    chunk = u.read(65536)
                    if not chunk:
                        break
                    outputFile.write(chunk)
        finally:
            u.close()
        print "Success", url
    except:
        print "Fail", url
I want to stream a big file via werkzeug.
Currently my wsgi application looks like this:
from werkzeug.wrappers import Request, Response
from werkzeug.wsgi import ClosingIterator, wrap_file
from werkzeug.exceptions import HTTPException  # import was missing
import os

class Streamer(object):
    def __init__(self):
        pass

    def __call__(self, environ, start_response):
        request = Request(environ)
        filename = os.getcwd() + "/bigfile.xml"
        try:
            response = wrap_file(environ, open(filename))
            return response
        except HTTPException, e:
            response = e
        return ClosingIterator(response(environ, start_response))
I'm not sure what I should do with the object returned by the wrap_file function.
I haven't tried it myself, but I think the following will work:
g = file(path_to_bigfile)  # or any generator
return Response(g, direct_passthrough=True)
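To tie this back to the original question: the object wrap_file returns is an iterable that uses the server's wsgi.file_wrapper when available, and the usual pattern is to hand it to a Response with direct_passthrough=True so werkzeug passes it through untouched. A sketch of the Streamer above rewritten that way:
from werkzeug.wrappers import Response
from werkzeug.wsgi import wrap_file
import os

class Streamer(object):
    def __call__(self, environ, start_response):
        f = open(os.path.join(os.getcwd(), "bigfile.xml"), 'rb')
        # wrap_file defers to the WSGI server's file_wrapper if it has one
        response = Response(wrap_file(environ, f),
                            mimetype='application/xml',
                            direct_passthrough=True)
        return response(environ, start_response)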
Just in case one would additionally like to:
1. preserve the file name
2. issue the download without a page redirect
# file_name assumed to be known
# file_path assumed to be known
file_size = os.path.getsize(file_path)
fh = file(file_path, 'rb')
return Response(fh,
                mimetype='application/octet-stream',
                headers=[
                    ('Content-Length', str(file_size)),
                    ('Content-Disposition', "attachment; filename=\"%s\"" % file_name),
                ],
                direct_passthrough=True)