Python ZipFile - ValueError: I/O operation on closed file

When I use the zipfile object in multiple functions, it works fine. However, when I run one of the functions in a thread, it fails with "ValueError: I/O operation on closed file".
The code below works fine; it validates and extracts the zip file:
from zipfile import ZipFile
from threading import Thread

def extract_data(file):
    zip_file = ZipFile(file)
    validate = validate_function(zip_file)
    if validate.status_code == 200:
        data = extract_function(zip_file)
However, if I run extract_function in a thread, it gives me "ValueError: I/O operation on closed file":
def extract_data(file):
    zip_file = ZipFile(file)
    validate = validate_function(zip_file)
    if validate.status_code == 200:
        extract = Thread(target=extract_function, args=[zip_file])
        extract.start()
Please help me understand the root cause of this issue.
Update:
Here is the sample code to reproduce the issue:
from zipfile import ZipFile
from threading import Thread
import pandas as pd
from flask import Flask, request, Response
from werkzeug.middleware.proxy_fix import ProxyFix
from werkzeug.datastructures import FileStorage
from flask_restplus import Api, Resource, reqparse, cors
from flask_cors import cross_origin

app = Flask(__name__)
app.wsgi_app = ProxyFix(app.wsgi_app)
api = Api(app,
          version='1.0.0',
          doc='/',
          )

def validate_function(zip_file):
    try:
        error = ZipFile.testzip(zip_file)
        if error is None:
            return Response('Zip file is validated', 200)
        else:
            return Response('Invalid Zip file', 601)
    except Exception as e:
        return Response('Error :' + str(e), 601)

def extract_function(zip_file):
    df_list = []
    try:
        for file in zip_file.namelist():
            if file.endswith('.csv'):
                df_list.append(pd.read_csv(zip_file.open(file)))
            else:
                excel_df = pd.read_excel(zip_file.open(file), None)
                if type(excel_df) == dict:
                    df_list.extend(list(excel_df.values()))
                else:
                    df_list.append(excel_df)
        print(len(df_list))
    except Exception as e:
        print('Error in converting to dataframe', str(e))

def extract_data(file):
    zip_file = ZipFile(file)
    resp = validate_function(zip_file)
    if resp.status_code == 200:
        data = Thread(target=extract_function, args=[zip_file])
        data.start()
        # extract_function(zip_file) --> This works
    return resp

process_data = reqparse.RequestParser()
process_data.add_argument('file', location='files', type=FileStorage, required=True, help='Input file in Zip format')

@api.route('/process-data')
@api.expect(process_data)
class DataExtract(Resource):
    @cors.crossdomain(origin='*')
    @cross_origin()
    def post(self):
        file = request.files['file']
        resp = extract_data(file)
        return resp

app.run()

Use with to open files, so they are closed correctly after you are done. You also have to open the zip file separately in the new thread:
def extract_data(file):
    with ZipFile(file) as zip_file:
        validate = validate_function(zip_file)
        if validate.status_code == 200:
            extract = Thread(target=extract_function, args=[file])
            extract.start()

def extract_function(file):
    with ZipFile(file) as zip_file:
        # extract ...
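One caveat: in Flask, the uploaded file is a werkzeug FileStorage whose underlying stream is typically closed when the request ends, so a background thread that reopens the same file object can still hit this error. A defensive variant (a sketch based on the reproduction code above, assuming the same validate_function and routes, not a tested fix) copies the upload into memory first:
import io

def extract_data(file):
    # Copy the upload into memory while the request is still alive;
    # a BytesIO buffer does not depend on the request lifecycle.
    buffer = io.BytesIO(file.read())
    with ZipFile(buffer) as zip_file:
        resp = validate_function(zip_file)
    if resp.status_code == 200:
        buffer.seek(0)  # ZipFile does not close a file object it was handed
        extract = Thread(target=extract_function, args=[buffer])
        extract.start()
    return resp

def extract_function(buffer):
    with ZipFile(buffer) as zip_file:
        ...  # extract as before, using this thread's own ZipFile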

Related

Django rest framework html/url to docx

I am creating a Django API that converts any URL or HTML file into PDF and docx. The code below already renders PDF output using the pdfkit package. I want to use python-docx to generate the docx version, but I don't know how to handle it. I don't have deep knowledge of it, so any help will be appreciated.
Here is my convert.py file:
import io
from pydoc import doc
from tempfile import NamedTemporaryFile
from typing import IO
from urllib.parse import urlparse

import pdfkit
from docx import Document


class ConvertingError(Exception):
    """
    This exception represents an error during converting,
    for example when the host of a url is unreachable.
    In other words, this is a wrapper for wkhtmltopdf errors.
    """
    pass


def url_to_pdf(url: str) -> IO:
    """Fetch HTML from url and convert the page to pdf."""
    with NamedTemporaryFile('w+b') as tmpf:
        try:
            pdfkit.from_url(url, tmpf.name)
        except OSError as e:
            raise ConvertingError from e
        pdf = io.BytesIO(tmpf.read())
    return pdf


def html_to_pdf(html: str) -> IO:
    """Convert HTML string to pdf."""
    with NamedTemporaryFile('w+b') as tmpf:
        try:
            pdfkit.from_string(html, tmpf.name)
        except OSError as e:
            raise ConvertingError from e
        pdf = io.BytesIO(tmpf.read())
    return pdf


def filename_from_url(url: str) -> str:
    """
    Generate pdf filename using the hostname of a URL.
    If no hostname is provided, return 'default.pdf' as filename.
    """
    parsed = urlparse(url)
    return (parsed.hostname or 'default') + '.pdf'


def url_to_docx(url: str) -> IO:
    pass


def html_to_docx(html: str) -> IO:
    pass
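For the missing docx functions, here is a minimal sketch of one possible html_to_docx (an illustration under stated assumptions, not the asker's code): it extracts plain text with the standard library HTML parser and writes it out with python-docx. Faithful conversion of styles, tables, and images would need a dedicated HTML-to-docx converter.
import io
from html.parser import HTMLParser
from docx import Document

class _TextExtractor(HTMLParser):
    """Collects the text chunks found between HTML tags."""
    def __init__(self):
        super().__init__()
        self.chunks = []
    def handle_data(self, data):
        if data.strip():
            self.chunks.append(data.strip())

def html_to_docx(html: str) -> IO:
    parser = _TextExtractor()
    parser.feed(html)
    document = Document()
    for chunk in parser.chunks:
        document.add_paragraph(chunk)   # one paragraph per text chunk
    buf = io.BytesIO()
    document.save(buf)                  # python-docx can save to a file-like object
    buf.seek(0)
    return buf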
And here is my views.py file:
from fileinput import filename
from typing import IO

from django.http import FileResponse
from rest_framework.exceptions import ValidationError
from rest_framework.parsers import MultiPartParser
from rest_framework.viewsets import ViewSet

from .converter import (filename_from_url, html_to_pdf, url_to_pdf,
                        ConvertingError, url_to_docx, html_to_docx)
from .serializers import HtmlFileInputSerializer, UrlInputSerializer


def generate_from_html(self, request):
    serializer = HtmlFileInputSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    file: IO = serializer.validated_data['file']
    content = str(file.read())
    try:
        pdf = html_to_pdf(content)
    except ConvertingError:
        raise ValidationError('The file is of inappropriate type or corrupted.')
    response = FileResponse(pdf)
    response["Content-Type"] = 'application/pdf'
    return response


def generate_docx_from_html(self, request):
    pass


# class UrlConverterViewSet(ViewSet):
def generate_from_url(self, request):
    serializer = UrlInputSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    url: str = serializer.validated_data['url']
    try:
        pdf = url_to_pdf(url)
    except ConvertingError:
        raise ValidationError('The url is invalid or unreachable.')
    filename = serializer.validated_data.get('filename') or filename_from_url(url)
    response = FileResponse(pdf, filename=filename)
    response["Content-Type"] = 'application/pdf'
    return response


def generate_docx_from_url(self, request):
    pass


class GeneratePdf(ViewSet):
    # generate pdf view from html file and URL
    parser_classes = (MultiPartParser,)

    def create(self, request):
        if request.data.get('file'):
            return generate_from_html(self, request)
        elif request.data.get('url'):
            return generate_from_url(self, request)
        else:
            raise ValidationError('The file or url is invalid or unreachable.')

Why is my code to download files producing a 404 error?

I have created an app, generated client credentials, and trusted the app for my SharePoint online site.
I have created a file test.txt and it is placed under https://company.sharepoint.com/sites/testsite/Shared%20Documents/General/test.txt
Additionally, I have installed the latest version of the module
pip freeze | grep Office
Office365-REST-Python-Client==2.3.11
class SharePoint:
    def __init__(self):
        # SITE_URL = 'https://company.sharepoint.com/sites/testsite/'
        context_auth = AuthenticationContext(Configs.SITE_URL)
        context_auth.acquire_token_for_app(client_id=Configs.OAUTH_CLIENT_ID, client_secret=Configs.OAUTH_CLIENT_SECRET)
        self.ctx = ClientContext(Configs.SITE_URL, context_auth)

    def download_files(self):
        file_url = "/sites/testsite/Shared%20Documents/General/test.txt"
        download_path = os.path.join(tempfile.mkdtemp(), os.path.basename(file_url))
        print(download_path)
        with open(download_path, "wb") as local_file:
            file = self.ctx.web.get_file_by_server_relative_url(file_url).download(local_file).execute_query()
        print("[Ok] file has been downloaded into: {0}".format(download_path))


if __name__ == '__main__':
    s = SharePoint()
    s.download_files()
However, it throws an error, and I am not able to get my head around this.
office365.runtime.client_request_exception.ClientRequestException: ('-2130575338, Microsoft.SharePoint.SPException', 'The file /sites/testsite/Shared%20Documents/General/test.txt does not exist.', "404 Client Error: Not Found for url: https://company.sharepoint.com/sites/testsite/_api/Web/getFileByServerRelativeUrl('%2Fsites%2Ftestsite%2FShared%2520Documents%2FGeneral%2Ftest.txt')?$select=ServerRelativePath")
You seem to be basing this off of the example shown here.
I was having similar issues at first, until I made all function inputs absolute paths, inclusive of url scheme and site. This just removes a lot of room for error. Note also that the %20 in your file_url gets percent-encoded a second time by the client (the error message shows Shared%2520Documents), so use a literal space in the path instead of %20.
My current script is similar to this:
from urllib.parse import urlparse
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext


def download_file(local_absolute_path: str, global_absolute_path: str, client_context: ClientContext) -> None:
    print(f"The file {global_absolute_path} is being prepared for download.")
    download_location = urlparse(global_absolute_path)
    file_to_download = client_context.web.get_file_by_server_relative_url(download_location.path)
    with open(local_absolute_path, "wb") as local_file:
        file_to_download.download_session(local_file).execute_query()
    print(f"──► Download successful. The file has been saved as {local_absolute_path}\n")
Note that self.ctx in your code is equivalent to client_context in mine.
I recommend writing a bunch of helper functions to convert the paths back and forth between absolute, relative and the file name. The ones I currently use can be found below:
import os
from urllib.parse import urlparse


class PathHandler(object):
    def __init__(self, absolute_path: str) -> None:
        self.absolute_path = absolute_path

    def get_filename_from_absolute(self) -> str:
        parsed_url = urlparse(self.absolute_path)
        return os.path.basename(parsed_url.path)

    def get_relative_from_absolute(self) -> str:
        parsed_url = urlparse(self.absolute_path)
        return parsed_url.path

    def get_parent_folder_from_absolute(self) -> str:
        parsed_url = urlparse(self.absolute_path)
        return os.path.dirname(parsed_url.path)

    def get_scheme_and_root_from_absolute(self) -> str:
        parsed_url = urlparse(self.absolute_path)
        return f"{parsed_url.scheme}://{parsed_url.netloc}"

    def convert_to_absolute_local(self, local_root: str, global_root: str) -> str:
        return local_root + os.sep + self.absolute_path[len(global_root):].replace("/", os.sep)

    def convert_to_absolute_global(self, local_root: str, global_root: str) -> str:
        return global_root + "/" + self.absolute_path[len(local_root):].replace(os.sep, "/")
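To tie the two snippets together, a hypothetical usage (the URL is a placeholder, and client_context is assumed to be the authenticated ClientContext from the question) might look like:
import os
import tempfile

file_url = "https://company.sharepoint.com/sites/testsite/Shared Documents/General/test.txt"
handler = PathHandler(file_url)
# Save into a temp folder under the file's own name.
local_path = os.path.join(tempfile.mkdtemp(), handler.get_filename_from_absolute())
download_file(local_path, file_url, client_context)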

Handle multiple requests at the same time on a Flask web app

What I have: I have a Flask web app deployed on Heroku, consisting of only one web process, app.py. Here it is:
# imports
from flask import Flask, render_template, current_app, send_file, request, json
import os

# working functions
# json write
def json_write(dictionary):
    with open("./json/info.json", "w+") as f:
        json.dump(dictionary, f, indent=4)

# make file name
def make_file_name(name):
    filename = "tube4u_"
    for t in str(name):
        if t.isalnum():
            filename += t
    filename += ".mp4"
    return filename

# application initialisation
app = Flask(__name__)

# home
@app.route("/")
def home():
    return render_template("index.html")

# processor
@app.route("/process/", methods=["GET"])
def process():
    # get url
    url = request.args["url"]
    # import & initialisation
    from pytube import YouTube
    import pickle
    json_dict = {}
    try:
        yt = YouTube(url)
    except:
        return "<h1>Invalid URL</h1>"
    all_videos = yt.streams.filter(type="video", progressive=True)
    json_dict["title"] = yt.title
    json_dict["thumbnail"] = yt.thumbnail_url
    json_dict["name"] = make_file_name(yt.title)
    with open("./pickle/ytobj.pkl", "wb") as f:
        pickle.dump(all_videos, f)
    # videos with itag
    json_dict["videos"] = [{"itag": item.itag, "res": item.resolution} for item in all_videos]
    json_write(json_dict)
    return render_template("menu.html")

# download
@app.route("/download/", methods=["GET"])
def download():
    import pickle
    itag = int(request.args["itag"])
    with open("./json/info.json") as f:
        json_dict = json.load(f)
    with open("./pickle/ytobj.pkl", "rb") as f:
        all_videos = pickle.load(f)
    video = all_videos.get_by_itag(itag)
    video.download(output_path="./video", filename=f"{json_dict['name']}")
    return render_template("thank.html")

# return video
@app.route("/video/", methods=["GET"])
def video():
    filename = request.args["filename"]
    return send_file(f"./video/{filename}", as_attachment=True)

# return json
@app.route("/json")
def fetchjson():
    with open("./json/info.json") as f:
        content = json.load(f)
    return content

# get name
@app.route("/name")
def fetchname():
    with open("./json/info.json") as f:
        content = json.load(f)
    return content

@app.route("/list")
def listall():
    return f"{os.listdir('./video')}"

# running the app
if __name__ == "__main__":
    app.run(debug=True)
How it works: whenever someone enters a URL and clicks Go, the app creates a json file named info.json; once everything is gathered, later steps perform their tasks on the given URL by reading from that file.
My problem:
If I make a request, the app writes a json file with my given URL. If someone else makes a request at the same time, the server loses my information and rewrites the json file with the other client's input URL, so my task is performed with their URL instead. It's really weird.
How do I fix it? For example, is there any way to create the info.json file on a separate path for each client and delete it once the work is done?
There are many ways to solve this, in my view:
When the server gets a client request, check whether the file already exists; if it does, add a timestamp or some other unique token to the filename so the file is not overwritten (see the sketch below).
Ask the user for a file name, add a timestamp to it, and save it under that name.
You could also use a database to store each client's data; for example, build a login system, give every user an id, and store each user's data in the database accordingly.
And so on: as you can see, there are plenty of ways to solve this.
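A minimal sketch of the first suggestion (the helper names are illustrative, not taken from the question's code): embed a unique token in each request's filename so concurrent clients cannot overwrite one another, and remove the file when the work is done:
import json
import os
import uuid

def json_write(dictionary):
    token = uuid.uuid4().hex                 # unique per request
    with open(f"./json/info_{token}.json", "w") as f:
        json.dump(dictionary, f, indent=4)
    return token                             # later routes use it to find the file

def json_read(token):
    with open(f"./json/info_{token}.json") as f:
        return json.load(f)

def json_delete(token):
    os.remove(f"./json/info_{token}.json")   # clean up after the work is done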

Azure Durable Functions : Http Trigger error

As a newbie in Azure,
I am following the Microsoft Azure Functions tutorial page
https://learn.microsoft.com/en-us/azure/azure-functions/durable/durable-functions-cloud-backup?tabs=python
and github page
https://github.com/Azure/azure-functions-durable-python/tree/master/samples/fan_in_fan_out .
HttpStart code
import logging
import json

import azure.functions as func
import azure.durable_functions as df


async def main(req: func.HttpRequest, starter: str) -> func.HttpResponse:
    client = df.DurableOrchestrationClient(starter)
    payload: str = json.loads(req.get_body().decode())  # Load JSON post request data
    instance_id = await client.start_new(req.route_params["functionName"], client_input=payload)
    logging.info(f"Started orchestration with ID = '{instance_id}'.")
    return client.create_check_status_response(req, instance_id)
E2_BackupSiteContent
import azure.functions as func
import azure.durable_functions as df


def orchestrator_function(context: df.DurableOrchestrationContext):
    root_directory: str = context.get_input()
    if not root_directory:
        raise Exception("A directory path is required as input")
    files = yield context.call_activity("E2_GetFileList", root_directory)
    tasks = []
    for file in files:
        tasks.append(context.call_activity("E2_CopyFileToBlob", file))
    results = yield context.task_all(tasks)
    total_bytes = sum(results)
    return total_bytes

main = df.Orchestrator.create(orchestrator_function)
E2_CopyFileToBlob
import os
import pathlib

from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError

connect_str = os.getenv('AzureWebJobsStorage')


def main(filePath: str) -> str:
    # Create the BlobServiceClient object which will be used to create a container client
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    # Create a unique name for the container
    container_name = "backups"
    # Create the container if it does not exist
    try:
        blob_service_client.create_container(container_name)
    except ResourceExistsError:
        pass
    # Create a blob client using the local file name as the name for the blob
    parent_dir, fname = pathlib.Path(filePath).parts[-2:]  # Get last two path components
    blob_name = parent_dir + "_" + fname
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    # Count bytes in file
    byte_count = os.path.getsize(filePath)
    # Upload the created file
    with open(filePath, "rb") as data:
        blob_client.upload_blob(data)
    return byte_count
E2_GetFileList
import os
from os.path import dirname
from typing import List


def main(rootDirectory: str) -> List[str]:
    all_file_paths = []
    # We walk the file system
    for path, _, files in os.walk(rootDirectory):
        # We copy the code for activities and orchestrators
        if "E2_" in path:
            # For each file, we add their full-path to the list
            for name in files:
                if name == "__init__.py" or name == "function.json":
                    file_path = os.path.join(path, name)
                    all_file_paths.append(file_path)
    return all_file_paths
When I trigger the HTTP endpoint with the Postman app,
POST http://localhost:7071/api/orchestrators/E2_BackupSiteContent?req="D:\Tmp"
I get the following error messages:
[2021-11-12T02:13:42.432Z] Worker process started and initialized.
[2021-11-12T02:13:46.489Z] Host lock lease acquired by instance ID '000000000000000000000000AE48769C'.
[2021-11-12T02:13:52.529Z] Executing 'Functions.HttpStart' (Reason='This function was programmatically called via the host APIs.', Id=748996d0-1f84-4597-86ea-768467eb36e3)
[2021-11-12T02:13:52.560Z] Executed 'Functions.HttpStart' (Failed, Id=748996d0-1f84-4597-86ea-768467eb36e3, Duration=5433ms)
[2021-11-12T02:13:52.562Z] System.Private.CoreLib: Exception while executing function: Functions.HttpStart. Microsoft.Azure.WebJobs.Host: Exception binding parameter 'req'. Microsoft.AspNetCore.Server.Kestrel.Core: Reading the request body timed out due to data arriving too slowly. See MinRequestBodyDataRate.
What should I do to solve this problem?
(I tested with Linux and Windows.)
Update: Postman capture (screenshot omitted).
Instead of passing the directory in the query string of the URL, you should pass the path in the HTTP request body in Postman. The Microsoft doc page itself shows how to do it; see the Run the sample section.
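Since the HttpStart code above runs json.loads(req.get_body().decode()), the body must be valid JSON; for a single directory path that means a quoted JSON string with escaped backslashes. A sketch of the raw Postman request (reusing the path from the question):
POST http://localhost:7071/api/orchestrators/E2_BackupSiteContent
Content-Type: application/json

"D:\\Tmp"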

Downloading csv data from an API

I am attempting to download csv data from an API, which I will then edit. I am struggling to get the different functions to work together, i.e. passing the export link through to download the file and then on to opening it.
'''
File name: downloadAWR.py
Author: Harry&Joe
Date created: 3/10/17
Date last modified: 5/10/17
Version: 3.6
'''
import requests
import json
import urllib2
import zipfile
import io
import csv
import os
from urllib2 import urlopen, URLError, HTTPError
geturl() is used to create a download link for the csv data. One link is created from the user input data, in this case the project name and dates; the resulting link lets us download the data. The link is stored in export_link.
def geturl():
    # getProjectName
    project_name = 'BIMM'
    # getApiToken
    api_token = "API KEY HERE"
    # getStartDate
    start_date = '2017-01-01'
    # getStopDate
    stop_date = '2017-09-01'
    url = "https://api.awrcloud.com/get.php?action=export_ranking&project=%s&token=%s&startDate=%s&stopDate=%s" % (project_name, api_token, start_date, stop_date)
    export_link = requests.get(url).content
    return export_link
dlfile() is used to actually use the link and get a file we can manipulate and edit, e.g. removing columns and some of the data.
def dlfile(export_link):
    # Open the url
    try:
        f = urlopen(export_link)
        print("downloading " + export_link)
        # Open our local file for writing
        with open(os.path.basename(export_link), "wb") as local_file:
            local_file.write(f.read())
    # handle errors
    except HTTPError as e:
        print("HTTP Error:", e.code, export_link)
    except URLError as e:
        print("URL Error:", e.reason, export_link)
    return f
readdata() is used to go into the downloaded file and open it for use.
def readdata():
    with zipfile.ZipFile(io.BytesIO(zipdata)) as z:
        for f in z.filelist:
            csvdata = z.read(f)
            # reader = csv.reader(io.StringIO(csvdata.decode()))


def main():
    # Do something with the csv data
    export_link = geturl()
    data = dlfile(export_link)
    csvdata = data.readdata()


if __name__ == '__main__':
    main()
Generally I'm finding that the code works independently but struggles when I try to put it all together synchronously.
You need to clean up your code and call it appropriately. It seems you copy-pasted from different sources and now have a salad bowl of code that isn't mixing well.
If the task is just to read and open a remote file to do something to it:
import io
import zipfile

import requests


def get_csv_file(project, api_token, start_date, end_date):
    url = "https://api.awrcloud.com/get.php"
    params = {'action': 'export_ranking',
              'project': project,
              'token': api_token,
              'startDate': start_date,
              'stopDate': end_date}
    r = requests.get(url, params)
    r.raise_for_status()
    # The first request returns the export link; the second fetches the zip itself.
    return zipfile.ZipFile(io.BytesIO(requests.get(r.content).content))


def process_csv_file(zip_file):
    zip_file.extractall()  # extracts the archive members to the working directory
    # do stuff with the contents


if __name__ == '__main__':
    process_csv_file(get_csv_file('BIMM', 'api-key', '2017-01-01', '2017-09-01'))
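If you would rather keep the members in memory instead of extracting them to disk (an alternative sketch, not part of the answer above), you can read each CSV straight out of the archive:
import csv
import io
import zipfile


def process_csv_file(zip_file: zipfile.ZipFile):
    # Iterate over the archive members and parse each CSV in memory.
    for name in zip_file.namelist():
        if name.endswith('.csv'):
            with zip_file.open(name) as member:
                reader = csv.reader(io.TextIOWrapper(member, encoding='utf-8'))
                for row in reader:
                    ...  # edit rows here, e.g. drop unwanted columns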
