I have the code below and want to write the stream of tweets to a text file. Is there a way to include the output-to-text-file step within the same code and save the file in the working directory? I am an IDE lover and really don't like using the console. I am new to Python (2 weeks); I am an R / RStudio user.
I know I could use:
filename.py > output.txt
I am currently using Rodeo, Python 3.6.1.
import oauth2 as oauth
import urllib.request as urllib
api_key = "##"
api_secret = "##"
access_token_key = "##-##"
access_token_secret = "##"
_debug = 0
oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)
signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()
http_method = "GET"
http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)
'''
Construct, sign, and open a twitter request
using the hard-coded credentials above.
'''
def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url,
                                                parameters=parameters)

    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)

    headers = req.to_header()

    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    response = opener.open(url, encoded_post_data)
    return response
f = open("output.txt", "wb")
def fetchsamples():
    url = "https://stream.twitter.com/1.1/statuses/sample.json"
    parameters = []
    response = twitterreq(url, "GET", parameters)
    for line in response:
        f.write(line)

if __name__ == '__main__':
    fetchsamples()
    # f.close()
Besides the comment I made previously, I would suggest checking out this Stack Overflow question: how to direct output into a txt file in Python on Windows.
To quote:
If you want to do it in Python then you would write:
with open('out.txt', 'w') as f:
    f.write(something)
Obviously this is just a trivial example. You'd clearly do more inside the with block.
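Putting the two together, here is a minimal sketch of the same idea applied to the streaming code above (it assumes the twitterreq function and credentials from the question); the with block replaces the module-level f handle and closes output.txt automatically:

def fetchsamples():
    url = "https://stream.twitter.com/1.1/statuses/sample.json"
    parameters = []
    response = twitterreq(url, "GET", parameters)
    # output.txt is created in the current working directory and closed
    # automatically when the with block exits (e.g. on Ctrl+C).
    with open("output.txt", "wb") as out:
        for line in response:
            out.write(line)

if __name__ == '__main__':
    fetchsamples()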
I am trying to upload a large file (say ~1 GB) from a client (using Python requests.post) to a Flask server.
When the client sends the request to the server in chunks of 1024 bytes, the server does not read the whole file; the saved file is 0 KB.
Can you please help me debug what exactly I am doing wrong here?
Server - Flask Code:
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads/'
@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    fileFullPath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    with open(fileFullPath, "wb") as f:
        chunk_size = 1024
        chunk = request.stream.read(chunk_size)
        f.write(chunk)
    return jsonify({'filename': filename})

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=int("8080"), debug=True)
Client - Request Code
import os
import requests
def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, 'r') as f:
        try:
            r = requests.post(url, data=read_in_chunks(f))
            print "r: {0}".format(r)
        except Exception, e:
            print e

if __name__ == '__main__':
    filename = 'bigfile.zip'  # ~1GB
    url = 'http://localhost:8080/upload/{0}'.format(filename)
    main(filename, url)
Kindly use 'file.stream.read(chunk_size)' instead of request.stream.read(chunk_size). It works for me!
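For context, a minimal sketch of what that suggestion could look like on the server side; the 'file' form-field name and the multipart client call shown in the comment are assumptions, not part of the original question:

from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads/'

@app.route("/upload", methods=["POST"])
def upload_process():
    # Assumes the client posts a multipart form with a part named 'file',
    # e.g. requests.post(url, files={'file': open('bigfile.zip', 'rb')})
    file = request.files['file']
    filename = secure_filename(file.filename)
    file_full_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    with open(file_full_path, "wb") as f:
        chunk_size = 1024
        while True:
            chunk = file.stream.read(chunk_size)
            if not chunk:
                break
            f.write(chunk)
    return jsonify({'filename': filename})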
Old thread but I was looking for something similar so I'll post here anyway.
The server opens the file in write mode, which overwrites it on each chunk (each chunk arrives in its own request). Prefer append mode:
with open(fileFullPath, "ab") as f:
The client needs to read the file in byte mode:
with open(content_path, "rb") as f:
Finally, the read_in_chunks generator needs to be consumed in a loop, with each chunk passed to its own request:
def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, "rb") as f:
        try:
            for data in read_in_chunks(f):
                r = requests.post(url, data=data)
                print("r: {0}".format(r))
        except Exception as e:
            print(e)
Then you have your two files:
Server
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
import os
app = Flask(__name__)
app.config["UPLOAD_FOLDER"] = "uploads/"
@app.route("/upload/<filename>", methods=["POST", "PUT"])
def upload_process(filename):
    filename = secure_filename(filename)
    fileFullPath = os.path.join(app.config["UPLOAD_FOLDER"], filename)
    with open(fileFullPath, "ab") as f:
        chunk_size = 1024
        chunk = request.stream.read(chunk_size)
        f.write(chunk)
    return jsonify({"filename": filename})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=int("8080"), debug=True)
Client
import os
import requests
def read_in_chunks(file_object, chunk_size=1024):
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

def main(fname, url):
    content_path = os.path.abspath(fname)
    with open(content_path, "rb") as f:
        try:
            for data in read_in_chunks(f):
                r = requests.post(url, data=data)
                print("r: {0}".format(r))
        except Exception as e:
            print(e)

if __name__ == "__main__":
    filename = "bigfile.zip"  # ~1GB
    url = "http://localhost:8080/upload/{0}".format(filename)
    main(filename, url)
Note that posting in chunks usually requires the total number of chunks and a hash of the file to validate the upload.
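As a rough illustration of that validation idea (the X-Chunk-Count, X-File-SHA256, and X-Chunk-Index header names are invented for this sketch; the server would need matching logic):

import hashlib
import math
import os

import requests

def upload_with_validation(path, url, chunk_size=1024):
    total_size = os.path.getsize(path)
    chunk_count = math.ceil(total_size / chunk_size)

    # Hash the whole file first so the server can verify the reassembled upload.
    sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(65536), b""):
            sha256.update(block)

    base_headers = {
        "X-Chunk-Count": str(chunk_count),    # hypothetical header
        "X-File-SHA256": sha256.hexdigest(),  # hypothetical header
    }
    with open(path, "rb") as f:
        for index, chunk in enumerate(iter(lambda: f.read(chunk_size), b"")):
            headers = dict(base_headers, **{"X-Chunk-Index": str(index)})  # hypothetical header
            r = requests.post(url, data=chunk, headers=headers)
            r.raise_for_status()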
Flask depends on werkzeug to process streams, and werkzeug demands a content length for a stream. There's a thread on this here, but no real solution is currently available, other than taking another framework approach.
The example below should work well for you. If you use Redis, you can also publish the chunk being processed via pub/sub to drive a progress bar in another API.
from flask import Flask, request, jsonify, copy_current_request_context
import flask
import json
import tempfile
import uuid

app = Flask(__name__)

# settings, db_apn_logging (a Redis client) and user_id come from the author's
# application and are not defined in this excerpt.

@app.route("/submit_vdo", methods=['POST'])
def submit_vdo():

    @copy_current_request_context
    def receive_chunk(stream, full_file_path):
        if full_file_path is None:
            tmpfile = tempfile.NamedTemporaryFile('wb+', prefix=str(uuid.uuid4()) + "_")
            full_file_path = tmpfile.name
        print('Write temp to ', full_file_path)
        with open(full_file_path, "wb") as f:
            max_chunk_size = settings.VIDEO_MAX_SIZE_CHUNK  # config.MAX_UPLOAD_BYTE_LENGTH
            count_chunks = 0
            total_uploaded = 0
            try:
                while True:
                    print('Chunk ', count_chunks)
                    chunk = stream.read(max_chunk_size)
                    if chunk is not None and len(chunk) > 0:
                        total_uploaded += len(chunk)
                        count_chunks += 1
                        f.write(chunk)
                        temp = {}
                        temp['chunk_counts'] = count_chunks
                        temp['total_bytes'] = total_uploaded
                        temp['status'] = 'uploading...'
                        temp['success'] = True
                        db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                        print(temp)
                    else:
                        f.close()
                        temp = {}
                        temp['chunk_counts'] = count_chunks
                        temp['total_bytes'] = total_uploaded
                        temp['status'] = 'DONE'
                        temp['success'] = True
                        db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                        break
            except Exception as e:
                temp = {}
                temp['chunk_counts'] = count_chunks
                temp['total_bytes'] = total_uploaded
                temp['status'] = str(e)
                temp['success'] = False
                db_apn_logging.set(user_id + "#CHUNK_DOWNLOAD", json.dumps(temp), ex=5)
                return None
        return full_file_path

    stream = flask.request.files['file']
    stream.seek(0)
    full_file_path = None  # or an existing destination path
    full_file_path = receive_chunk(stream, full_file_path)

    return "DONE !"
I scraped a ticketing website that we were using and I now have a CSV file which looks like this: ID, Attachment_URL, Ticket_URL. What I now need to do is download every attachment and rename the file with the Ticket_URL. The main issue I have is that when navigating to the Attachment_URL you must use basic authentication and then you are redirected to an aws s3 link. I have been able to download individual files using wget, but I have not been able to iterate through the entire list (35k rows or so), and I am not sure how I would be able to name the file as the ticket_id. Any advice would be appreciated.
Got it.
To open the authenticated session:
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
import time
s = requests.session()
payload = {
    'user': '',
    'pw': ''
}

s.post('login.url.here', data=payload)

for i in range(1, 6000):
    testURL = s.get(
        'https://urlhere.com/efw/stuff&page={}'.format(i))
    soup = BeautifulSoup(testURL.content)
    table = soup.find("table", {"class": "table-striped"})
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')[1:]
    print "The current page is: " + str(i)
    for row in rows:
        cols = row.find_all('a', attrs={'href': re.compile("^/helpdesk/")})
        # time.sleep(1)
        with open('fd.csv', 'a') as f:
            writer = csv.writer(f)
            writer.writerow(cols)
        print cols
Then I cleaned the links a bit in R, and used the following to download the files:
#! /usr/bin/env python
import threading
import os
from time import gmtime, strftime
from Queue import Queue
import requests
s = requests.session()
payload = {
    'user': '',
    'pw': ''
}

s.post('login', data=payload)

class log:
    def info(self, message):
        self.__message("info", message)

    def error(self, message):
        self.__message("error", message)

    def debug(self, message):
        self.__message("debug", message)

    def __message(self, log_level, message):
        date = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print "%s [%s] %s" % (date, log_level, message)

class fetch:
    def __init__(self):
        self.temp_dir = "/tmp"

    def run_fetcher(self, queue):
        while not queue.empty():
            url, ticketid = queue.get()
            if ticketid.endswith("NA"):
                fileName = url.split("/")[-1] + 'NoTicket'
            else:
                fileName = ticketid.split("/")[-1]
            response = s.get(url)
            with open(os.path.join('/Users/Desktop/FolderHere', fileName + '.mp3'), 'wb') as f:
                f.write(response.content)
            print fileName
            queue.task_done()

if __name__ == '__main__':
    # load in classes
    q = Queue()
    log = log()
    fe = fetch()

    # get bucket name
    # Read in input file
    with open('/Users/name/csvfilehere.csv', 'r') as csvfile:
        for line in csvfile:
            id, url, ticket = line.split(",")
            q.put([url.strip(), ticket.strip()])

    # spin up fetcher workers
    threads = []
    for i in range(8):
        t = threading.Thread(target=fe.run_fetcher, args=(q,))
        t.daemon = True
        threads.append(t)
        t.start()

    # close threads
    [x.join() for x in threads]

    # close queue
    q.join()

    log.info("End")
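If the attachment URLs only require HTTP Basic Authentication, as described in the question, a simpler single-threaded sketch could look like the following; the credentials, CSV filename, and column order are placeholders:

import csv
import os

import requests
from requests.auth import HTTPBasicAuth

auth = HTTPBasicAuth('user', 'password')  # placeholder credentials
out_dir = 'attachments'
os.makedirs(out_dir, exist_ok=True)

with open('tickets.csv', newline='') as csvfile:  # assumed columns: ID, Attachment_URL, Ticket_URL
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the header row if there is one
    for ticket_id, attachment_url, ticket_url in reader:
        # requests authenticates against the ticketing site and then follows
        # the redirect to the S3 link automatically.
        response = requests.get(attachment_url, auth=auth)
        response.raise_for_status()
        with open(os.path.join(out_dir, ticket_id), 'wb') as f:
            f.write(response.content)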
I have a very simple Python script using gevent.pool to download URLs (see below). The script runs fine for a couple of days and then locks up. I noticed that the memory usage is very high at that time. Am I using gevent incorrectly?
import sys
from gevent import monkey
monkey.patch_all()
import urllib2
from gevent.pool import Pool
inputFile = open(sys.argv[1], 'r')
urls = []
counter = 0
for line in inputFile:
    counter += 1
    urls.append(line.strip())
inputFile.close()

outputDirectory = sys.argv[2]

def fetch(url):
    try:
        body = urllib2.urlopen("http://" + url, None, 5).read()
        if len(body) > 0:
            outputFile = open(outputDirectory + "/" + url, 'w')
            outputFile.write(body)
            outputFile.close()
            print "Success", url
    except:
        pass
pool = Pool(int(sys.argv[3]))
pool.map(fetch, urls)
body = urllib2.urlopen("http://" + url, None, 5).read()
The line above reads the entire content into memory as a string. To prevent that, change fetch() as follows:
def fetch(url):
    try:
        u = urllib2.urlopen("http://" + url, None, 5)
        try:
            with open(outputDirectory + "/" + url, 'w') as outputFile:
                while True:
                    chunk = u.read(65536)
                    if not chunk:
                        break
                    outputFile.write(chunk)
        finally:
            u.close()
        print "Success", url
    except:
        print "Fail", url
I am uploading a large file using the Python requests package, and I can't find any way to get data back about the progress of the upload. I have seen a number of progress meters for downloading a file, but these will not work for a file upload.
The ideal solution would be some sort of callback method such as:
def progress(percent):
print percent
r = requests.post(URL, files={'f':hugeFileHandle}, callback=progress)
Thanks in advance for your help :)
requests doesn't support upload streaming, e.g. the following fails:
import os
import sys
import requests # pip install requests
class upload_in_chunks(object):
    def __init__(self, filename, chunksize=1 << 13):
        self.filename = filename
        self.chunksize = chunksize
        self.totalsize = os.path.getsize(filename)
        self.readsofar = 0

    def __iter__(self):
        with open(self.filename, 'rb') as file:
            while True:
                data = file.read(self.chunksize)
                if not data:
                    sys.stderr.write("\n")
                    break
                self.readsofar += len(data)
                percent = self.readsofar * 1e2 / self.totalsize
                sys.stderr.write("\r{percent:3.0f}%".format(percent=percent))
                yield data

    def __len__(self):
        return self.totalsize

# XXX fails
r = requests.post("http://httpbin.org/post",
                  data=upload_in_chunks(__file__, chunksize=10))
By the way, if you don't need to report progress, you could use a memory-mapped file to upload a large file.
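For example, a minimal sketch of the memory-mapped approach (the URL and filename are placeholders); requests treats the mmap object like a file with a known length:

import mmap
import requests

with open("bigfile.bin", "rb") as f:
    # mmap exposes the file as a buffer without loading it all into memory at once;
    # requests reads it like a regular file object and can set Content-Length from len().
    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        r = requests.post("http://httpbin.org/post", data=mm)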
To work around it, you could create a file adapter similar to the one from urllib2 POST progress monitoring:
class IterableToFileAdapter(object):
    def __init__(self, iterable):
        self.iterator = iter(iterable)
        self.length = len(iterable)

    def read(self, size=-1):  # TBD: add buffer for `len(data) > size` case
        return next(self.iterator, b'')

    def __len__(self):
        return self.length
Example
it = upload_in_chunks(__file__, 10)
r = requests.post("http://httpbin.org/post", data=IterableToFileAdapter(it))

# pretty print
import json
json.dump(r.json(), sys.stdout, indent=4, ensure_ascii=False)
I recommend using a tool package named requests-toolbelt, which makes monitoring upload bytes very easy, like:
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
import requests
def my_callback(monitor):
    # Your callback function
    print monitor.bytes_read

e = MultipartEncoder(
    fields={'field0': 'value', 'field1': 'value',
            'field2': ('filename', open('file.py', 'rb'), 'text/plain')}
)
m = MultipartEncoderMonitor(e, my_callback)

r = requests.post('http://httpbin.org/post', data=m,
                  headers={'Content-Type': m.content_type})
And you may want to read this to show a progress bar.
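As a rough illustration (not taken from the linked post), the callback can turn bytes_read into a percentage because the encoder knows the total body size up front; the field name and file are placeholders:

import sys

import requests
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor

encoder = MultipartEncoder(
    fields={'file': ('file.py', open('file.py', 'rb'), 'text/plain')}
)
total = encoder.len  # total body size, including the multipart boundaries

def progress_callback(monitor):
    percent = monitor.bytes_read * 100.0 / total
    sys.stderr.write("\r{0:.1f}%".format(percent))

monitor = MultipartEncoderMonitor(encoder, progress_callback)
r = requests.post('http://httpbin.org/post', data=monitor,
                  headers={'Content-Type': monitor.content_type})
sys.stderr.write("\n")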
I got it working with the code from here: Simple file upload progressbar in PyQt.
I changed it a bit, to use BytesIO instead of StringIO.
from io import BytesIO

import requests


class CancelledError(Exception):
    def __init__(self, msg):
        self.msg = msg
        Exception.__init__(self, msg)

    def __str__(self):
        return self.msg

    __repr__ = __str__


class BufferReader(BytesIO):
    def __init__(self, buf=b'',
                 callback=None,
                 cb_args=(),
                 cb_kwargs={}):
        self._callback = callback
        self._cb_args = cb_args
        self._cb_kwargs = cb_kwargs
        self._progress = 0
        self._len = len(buf)
        BytesIO.__init__(self, buf)

    def __len__(self):
        return self._len

    def read(self, n=-1):
        chunk = BytesIO.read(self, n)
        self._progress += int(len(chunk))
        self._cb_kwargs.update({
            'size': self._len,
            'progress': self._progress
        })
        if self._callback:
            try:
                self._callback(*self._cb_args, **self._cb_kwargs)
            except:  # catches exception from the callback
                raise CancelledError('The upload was cancelled.')
        return chunk


def progress(size=None, progress=None):
    print("{0} / {1}".format(size, progress))


files = {"upfile": ("file.bin", open("file.bin", 'rb').read())}

(data, ctype) = requests.packages.urllib3.filepost.encode_multipart_formdata(files)

headers = {
    "Content-Type": ctype
}

body = BufferReader(data, progress)
requests.post(url, data=body, headers=headers)
The trick is to generate the data and headers from the files dict manually, using encode_multipart_formdata() from urllib3.
I know this is an old question, but I couldn't find an easy answer anywhere else, so hopefully this will help somebody else:
import requests
from tqdm import tqdm

with open(file_name, 'rb') as f:
    r = requests.post(url, data=tqdm(f.readlines()))
This solution uses requests_toolbelt and tqdm, both well-maintained and popular libraries.
from pathlib import Path
from tqdm import tqdm
import requests
from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
def upload_file(upload_url, fields, filepath):
    path = Path(filepath)
    total_size = path.stat().st_size
    filename = path.name

    with tqdm(
        desc=filename,
        total=total_size,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        with open(filepath, "rb") as f:
            fields["file"] = ("filename", f)
            e = MultipartEncoder(fields=fields)
            m = MultipartEncoderMonitor(
                e, lambda monitor: bar.update(monitor.bytes_read - bar.n)
            )
            headers = {"Content-Type": m.content_type}
            requests.post(upload_url, data=m, headers=headers)
Example usage
upload_url = 'https://uploadurl'
fields = {
    "field1": value1,
    "field2": value2
}
filepath = '97a6fce8_owners_2018_Van Zandt.csv'
upload_file(upload_url, fields, filepath)
Usually you would build a streaming data source (a generator) that reads the file in chunks and reports its progress along the way (see kennethreitz/requests#663). This does not work with the requests file API, because requests doesn't support streaming uploads (see kennethreitz/requests#295) – a file to upload needs to be complete in memory before it starts getting processed.
However, requests can stream content from a generator, as J.F. Sebastian has shown above, but this generator needs to produce the complete data stream including the multipart encoding and boundaries. This is where poster comes into play.
poster was originally written to be used with Python's urllib2 and supports streaming generation of multipart requests, providing progress indication as it goes along. poster's homepage provides examples of using it together with urllib2, but you really don't want to use urllib2. Check out this example code on how to do HTTP Basic Authentication with urllib2. Horrible.
So we really want to use poster together with requests to do file uploads with tracked progress. And here is how:
# load requests-module, a streamlined http-client lib
import requests

# load poster's encode-function
from poster.encode import multipart_encode


# an adapter which makes the multipart-generator issued by poster accessible to requests
# based upon code from http://stackoverflow.com/a/13911048/1659732
class IterableToFileAdapter(object):
    def __init__(self, iterable):
        self.iterator = iter(iterable)
        self.length = iterable.total

    def read(self, size=-1):
        return next(self.iterator, b'')

    def __len__(self):
        return self.length


# define a helper function simulating the interface of poster's multipart_encode()-function
# but wrapping its generator with the file-like adapter
def multipart_encode_for_requests(params, boundary=None, cb=None):
    datagen, headers = multipart_encode(params, boundary, cb)
    return IterableToFileAdapter(datagen), headers


# this is your progress callback
def progress(param, current, total):
    if not param:
        return

    # check out http://tcd.netinf.eu/doc/classnilib_1_1encode_1_1MultipartParam.html
    # for a complete list of the properties param provides to you
    print "{0} ({1}) - {2:d}/{3:d} - {4:.2f}%".format(param.name, param.filename, current, total, float(current) / float(total) * 100)


# generate headers and data-generator in a requests-compatible format
# and provide our progress-callback
datagen, headers = multipart_encode_for_requests({
    "input_file": open('recordings/really-large.mp4', "rb"),
    "another_input_file": open('recordings/even-larger.mp4', "rb"),
    "field": "value",
    "another_field": "another_value",
}, cb=progress)

# use the requests-lib to issue a post-request with our data attached
r = requests.post(
    'https://httpbin.org/post',
    auth=('user', 'password'),
    data=datagen,
    headers=headers
)

# show response-code and -body
print r, r.text
My upload server doesn't support chunked transfer encoding, so I came up with this solution. It is basically just a wrapper around Python's IOBase that allows tqdm.wrapattr to work seamlessly.
import io
import os

import requests
from typing import Iterable, Union
from tqdm import tqdm
from tqdm.utils import CallbackIOWrapper


class UploadChunksIterator(Iterable):
    """
    This is an interface between python requests and tqdm.
    Makes tqdm accessible just like IOBase for the requests lib.
    """

    def __init__(
        self, file: Union[io.BufferedReader, CallbackIOWrapper], total_size: int, chunk_size: int = 16 * 1024
    ):  # 16 KiB
        self.file = file
        self.chunk_size = chunk_size
        self.total_size = total_size

    def __iter__(self):
        return self

    def __next__(self):
        data = self.file.read(self.chunk_size)
        if not data:
            raise StopIteration
        return data

    # we don't retrieve len from io.BufferedReader because CallbackIOWrapper only has a read() method.
    def __len__(self):
        return self.total_size


fp = "data/mydata.mp4"
s3url = "example.com"
_quiet = False

with open(fp, "rb") as f:
    total_size = os.fstat(f.fileno()).st_size
    if not _quiet:
        f = tqdm.wrapattr(f, "read", desc=fp, miniters=1, total=total_size, ascii=True)

    with f as f_iter:
        res = requests.put(
            url=s3url,
            data=UploadChunksIterator(f_iter, total_size=total_size),
        )
    res.raise_for_status()
Improving @jfs's answer with a more informative progress bar.
import math
import os
import sys

import requests


class ProgressUpload:
    def __init__(self, filename, chunk_size=1250):
        self.filename = filename
        self.chunk_size = chunk_size
        self.file_size = os.path.getsize(filename)
        self.size_read = 0
        self.divisor = min(math.floor(math.log(self.file_size, 1000)) * 3, 9)  # cap unit at a GB
        self.unit = {0: 'B', 3: 'KB', 6: 'MB', 9: 'GB'}[self.divisor]
        self.divisor = 10 ** self.divisor

    def __iter__(self):
        progress_str = f'0 / {self.file_size / self.divisor:.2f} {self.unit} (0 %)'
        sys.stderr.write(f'\rUploading {self.filename}: {progress_str}')
        with open(self.filename, 'rb') as f:
            for chunk in iter(lambda: f.read(self.chunk_size), b''):
                self.size_read += len(chunk)
                yield chunk
                sys.stderr.write('\b' * len(progress_str))
                percentage = self.size_read / self.file_size * 100
                completed_str = f'{self.size_read / self.divisor:.2f}'
                to_complete_str = f'{self.file_size / self.divisor:.2f} {self.unit}'
                progress_str = f'{completed_str} / {to_complete_str} ({percentage:.2f} %)'
                sys.stderr.write(progress_str)
            sys.stderr.write('\n')

    def __len__(self):
        return self.file_size


# sample usage
requests.post(upload_url, data=ProgressUpload('file_path'))
The key is the __len__ method. Without it, I was getting connection closed errors. That's the only reason you can't just use tqdm + iter to get a simple progress bar.
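As a quick sanity check, you can ask requests how it will size the body; super_len is the internal helper requests uses when deciding whether it can send a Content-Length (the 'file_path' argument is a placeholder):

from requests.utils import super_len

# With __len__ defined, requests can compute a Content-Length up front
# instead of falling back to chunked transfer encoding.
print(super_len(ProgressUpload('file_path')))  # placeholder path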
My Python code that works great. Credit: twine.
import sys

import requests
import requests_toolbelt
import tqdm


class ProgressBar(tqdm.tqdm):
    def update_to(self, n: int) -> None:
        self.update(n - self.n)


with open("test.zip", "rb") as fp:
    data_to_send = []
    session = requests.session()

    data_to_send.append(
        ("files", ("test.zip", fp))
    )

    encoder = requests_toolbelt.MultipartEncoder(data_to_send)
    with ProgressBar(
        total=encoder.len,
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
        miniters=1,
        file=sys.stdout,
    ) as bar:
        monitor = requests_toolbelt.MultipartEncoderMonitor(
            encoder, lambda monitor: bar.update_to(monitor.bytes_read)
        )

        r = session.post(
            'http://httpbin.org/post',
            data=monitor,
            headers={"Content-Type": monitor.content_type},
        )
print(r.text)