I am creating a Django API that converts any URL or HTML file into pdf and Docx. The implemented code below already renders in pdf format using pdfkit package. I'm using python-docx to generate in Docx, but I don't know how to handle it. I would like to have any support, please. I don't have deep knowledge and any help will be appreciated.
Here is my convert.py file:
import io
from pydoc import doc
from tempfile import NamedTemporaryFile
from typing import IO
from urllib.parse import urlparse
import pdfkit
from docx import Document
class ConvertingError(Exception):
"""
This exception represents an error during converting.
In example, when Host of a url is unreachable.
In other words, this is a wrapper for wkhtmltopdf errors.
"""
pass
def url_to_pdf(url: str) -> IO:
"""Fetch HTML from url and convert the page to pdf,"""
with NamedTemporaryFile('w+b') as tmpf:
try:
pdfkit.from_url(url, tmpf.name)
except OSError as e:
raise ConvertingError from e
pdf = io.BytesIO(tmpf.read())
return pdf
def html_to_pdf(html: str) -> IO:
"""Convert HTML string to pdf."""
with NamedTemporaryFile('w+b') as tmpf:
try:
pdfkit.from_string(html, tmpf.name)
except OSError as e:
raise ConvertingError from e
pdf = io.BytesIO(tmpf.read())
return pdf
def filename_from_url(url: str) -> str:
"""
Generate pdf filename using a hostname of a URL.
If no hostname is provided, return 'default.pdf' as filename.
"""
parsed = urlparse(URL)
return (parsed.hostname or 'default') + '.pdf'
def url_to_docx(url: str) -> IO:
pass
def html_to_docx(html: str) -> IO:
pass
And my views.py file
from fileinput import filename
from typing import IO
from django.http import FileResponse
from rest_framework.exceptions import ValidationError
from rest_framework.parsers import MultiPartParser
from rest_framework.viewsets import ViewSet
from .converter import filename_from_url, html_to_pdf, url_to_pdf, ConvertingError,
url_to_docx, html_to_docx
from .serializers import HtmlFileInputSerializer, UrlInputSerializer
def generate_from_html(self, request):
serializer = HtmlFileInputSerializer(data=request.data)
serializer.is_valid(raise_exception=True)
file: IO = serializer.validated_data['file']
content = str(file.read())
try:
pdf = html_to_pdf(content)
except ConvertingError:
raise ValidationError('The file is of inappropriate type or corrupted.')
response = FileResponse(pdf)
response["Content-Type"] = 'application/pdf'
return response
def generate_docx_from_html(self, request):
pass
# class UrlConverterViewSet(ViewSet):
def generate_from_url(self, request):
serializer = UrlInputSerializer(data=request.data)
serializer.is_valid(raise_exception=True)
url: str = serializer.validated_data['url']
try:
pdf = url_to_pdf(URL)
except ConvertingError:
raise ValidationError('The url is invalid or unreachable.')
filename = serializer.validated_data.get('filename') or filename_from_url(URL)
response = FileResponse(pdf, filename=filename)
response["Content-Type"] = 'application/pdf'
return response
def generate_docx_from_url(self, request):
pass
class GeneratePdf(ViewSet):
# generate pdf view from html file and URL
parser_classes = (MultiPartParser,)
def create(self, request):
if request.data.get('file'):
return generate_from_html(self, request)
elif request.data.get('url'):
return generate_from_url(self, request)
else:
raise ValidationError('The file or url is invalid or unreachable.')
Related
Im trying to generate pdf from given html file but get_template function is not working i guess.
from io import BytesIO
from django.template.loader import get_template
from xhtml2pdf import pisa
def render_to_pdf(context_dict={}):
try:
template = get_template('invoice.html')
html = template.render(context_dict)
result = BytesIO()
pdf = pisa.pisaDocument(BytesIO(html.encode("ISO-8859-1")), result)
if not pdf.err:
return result.getvalue()
return None
except Exception as e:
print('ERROR', e)
The Except block returns None.
Change line to:
pdf = pisa.pisaDocument(BytesIO(html.encode("utf-8")), result)
It was my logical error issue. I didn't added 'templates' folder to DIRS in settings.py file.
I have created an app, generated client credentials, and trusted the app for my SharePoint online site.
I have created a file test.txt and it is placed under -https://company.sharepoint.com/sites/testsite/Shared%20Documents/General/test.txt
Additionally, I have installed the latest version of the module
pip freeze | grep Office
Office365-REST-Python-Client==2.3.11
class SharePoint:
def __init__(self):
context_auth = AuthenticationContext(Configs.SITE_URL) ---> SITE_URL='https://company.sharepoint.com/sites/testsite/'
context_auth.acquire_token_for_app(client_id=Configs.OAUTH_CLIENT_ID, client_secret=Configs.OAUTH_CLIENT_SECRET)
self.ctx = ClientContext(Configs.SITE_URL, context_auth)
def download_files(self):
file_url = "/sites/testsite/Shared%20Documents/General/test.txt"
download_path = os.path.join(tempfile.mkdtemp(), os.path.basename(file_url))
print(download_path)
with open(download_path, "wb") as local_file:
file = self.ctx.web.get_file_by_server_relative_url(file_url).download(local_file).execute_query()
print("[Ok] file has been downloaded into: {0}".format(download_path))
if __name__ == '__main__':
s = SharePoint()
s.download_files()
However, it throws an error, not able to get my head around this.
office365.runtime.client_request_exception.ClientRequestException: ('-2130575338, Microsoft.SharePoint.SPException', 'The file /sites/testsite/Shared%20Documents/General/test.txt does not exist.', "404 Client Error: Not Found for url: https://company.sharepoint.com/sites/testsite/_api/Web/getFileByServerRelativeUrl('%2Fsites%2Ftestsite%2FShared%2520Documents%2FGeneral%2Ftest.txt')?$select=ServerRelativePath")
You seem to be basing this off of the example shown here.
I was having similar issues at first, until I made all function inputs be absolute paths, inclusive of url scheme and site. This just removes a lot of room for error.
My current script is similar to this:
from urllib.parse import urlparse
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
def download_file(local_absolute_path:str, global_absolute_path:str, client_context:ClientContext) -> None:
print(f"The file {global_absolute_path} is being prepared for download.")
download_location = urlparse(global_absolute_path)
file_to_download = client_context.web.get_file_by_server_relative_url(download_location)
with open(local_absolute_path, "wb") as local_file:
file_to_download.download_session(local_file).execute_query()
print(f"──► Download successful. The file has been saved as {local_absolute_path}\n")
Note that self.ctx in your code is equivalent to client_context in mine.
I recommend writing a bunch of helper functions to convert the paths back and forth between absolute, relative and the file name. The ones I currently use can be found below:
import os
from urllib.parse import urlparse
class PathHandler(object):
def __init__(self, absolute_path:str) -> None:
self.absolute_path = absolute_path
def get_filename_from_absolute(self) -> str:
parsed_url = urlparse(self.absolute_path)
return os.path.basename(parsed_url.path)
def get_relative_from_absolute(self) -> str:
parsed_url = urlparse(self.absolute_path)
return parsed_url.path
def get_parent_folder_from_absolute(self) -> str:
parsed_url = urlparse(self.absolute_path)
return os.path.dirname(parsed_url.path)
def get_scheme_and_root_from_absolute(self) -> str:
parsed_url = urlparse(self.absolute_path)
return f"{parsed_url.scheme}//{parsed_url.netloc}"
def convert_to_absolute_local(self, local_root:str, global_root:str) -> str:
return local_root + os.sep + self.absolute_path[len(global_root):].replace("/", os.sep)
def convert_to_absolute_global(self, local_root:str, global_root:str) -> str:
return global_root + "/" + self.absolute_path[len(local_root):].replace(os.sep, "/")
When I use the zipfile object in multiple functions, it works fine. However, when I try to run the one of the functions in thread, it gives the error "I/O operation on closed file".
Below code works fine which validates and extracts the zipfile
from zipfile import ZipFile
from threading import Thread
def extract_data(file):
zip_file = Zipfile(file)
validate = validate_function(zip_file)
if validate.status_code == 200:
data = extract_function(zip_file)
However, If I run the extract_function in thread, It gives me "ValueError: I/O operation on closed file"
def extract_data(file):
zip_file = Zipfile(file)
validate = validate_function(zip_file)
if validate.status_code == 200:
extract = Thread(target=extract_function,args=[zip_file])
extract.start()
Please guide me for understanding the root cause of this issue.
Update:
Here is the sample code to reproduce the issue:
from zipfile import ZipFile
from threading import Thread
import pandas as pd
from flask import Flask, request, Response
from werkzeug.middleware.proxy_fix import ProxyFix
from werkzeug.datastructures import FileStorage
from flask_restplus import Api, Resource, reqparse, cors
from flask_cors import cross_origin
app = Flask(__name__)
app.wsgi_app = ProxyFix(app.wsgi_app)
api = Api(app,
version='1.0.0',
doc='/',
)
def validate_function(zip_file):
try:
error = ZipFile.testzip(zip_file)
if error is None:
return Response('Zip file is validated',200)
else:
return Response('Invalid Zip file',601)
except Exception as e:
return Response('Error :' + str(e),601)
def extract_function(zip_file):
df_list = []
try:
for file in zip_file.namelist():
if file.endswith('.csv'):
df_list.append(pd.read_csv(zip_file.open(file)))
else:
excel_df = pd.read_excel(zipfile.open(file),None)
if type(excel_df) == dict:
df_list.extend(list(excel_df.values()))
else:
df_list.append(excel_df)
print(len(df_list))
except Exception as e:
print('Error in converting to dataframe', str(e))
def extract_data(file):
zip_file = ZipFile(file)
resp = validate_function(zip_file)
if resp.status_code == 200:
data = Thread(target= extract_function, args=[zip_file])
data.start()
#extract_function(zip_file) --> This works
return resp
process_data = reqparse.RequestParser()
process_data.add_argument('file', location='files', type=FileStorage, required=True, help='Input file in Zip format')
#api.route('/process-data')
#api.expect(process_data)
class DataExtract(Resource):
#cors.crossdomain(origin='*')
#cross_origin()
def post(self):
file = request.files['file']
resp = extract_data(file)
return resp
app.run()
Use with to open files, so they are closed correctly after you are done. You also have to open the file separately in the new thread:
def extract_data(file):
with ZipFile(file) as zip_file:
validate = validate_function(zip_file)
if validate.status_code == 200:
extract = Thread(target=extract_function,args=[file])
extract.start()
def extract_function(file):
with ZipFile(file) as zip_file:
# extract ...
in my Django 'views, I create a pdf file and I want to download it.
The file exist (path: /app/data/4.pdf) and i launch this command:
def download_line(request):
if not request.is_ajax() and not request.method == 'GET':
raise Http404
try:
fs =FileSystemStorage('/app/data')
with fs.open('4.pdf') as pdf:
response =HttpResponse(pdf,content_type='application/pdf')
response['Content-Disposition']='attachment; filename="4.pdf"'
except Exception as e:
logger.warning("Download Line | Erreur : " + e.message)
return response
But the download doesn't start and no error. Have you got a solution?
Thanks.
You can download existing file in your app by a link and static, like this
<a href="{% static 'questions/import_files/import_questions.xlsx' %}" download>Excel Format File </a>
I use FileResponse to serve file download, when the file already exists. FileResponse has been around since Django 1.7.4.
from django.core.files.storage import FileSystemStorage
from django.http import FileResponse
def download_line(request):
fs = FileSystemStorage('/absolute/folder/name')
FileResponse(fs.open('filename.pdf', 'rb'), content_type='application/force-download')
response['Content-Disposition'] = 'attachment; filename="filename.pdf"'
return response
Try this, I use this lines to download files
from django.http import HttpResponse
from wsgiref.util import FileWrapper
import os
def download(request, file_path):
"""
e.g.: file_path = '/tmp/file.pdf'
"""
try:
wrapper = FileWrapper(open(file_path, 'rb'))
response = HttpResponse(wrapper, content_type='application/force-download')
response['Content-Disposition'] = 'inline; filename=' + os.path.basename(file_path)
return response
except Exception as e:
return None
def sample_download_client_excel(request):
"""
e.g.: file_path = '/tmp/file.pdf'
"""
try:
obj = SampleFile.objects.all().first()
file_path = obj.file_name.url.strip('/')
wrapper = FileWrapper(open(file_path, 'rb'))
response = HttpResponse(
wrapper,
content_type='application/force-download'
)
response['Content-Disposition'] = 'inline; filename=' + os.path.basename(file_path)
return response
except Exception as e:
return None
Im doing an application that uses Django in server-side.
Im trying to do that:
import uuid
from base64 import b64decode
from django.core.files.base import ContentFile
#staticmethod
def add_photo(user, person, image_base64):
photo = DatabasePersonPhoto()
photo.user = user
photo.person = person
image_data = b64decode(image_base64)
image_name = str(uuid.uuid4())+".jpg"
photo.image = ContentFile(image_data, image_name)
photo.save()
return photo
This is my Base64 String:

The image file is generated, but I cant open it like an image.
I think this will be a best approach tried it and tested in django 1.10. based on this SO answer: https://stackoverflow.com/a/28036805/6143656
I made a function for decoded base64 file.
def decode_base64_file(data):
def get_file_extension(file_name, decoded_file):
import imghdr
extension = imghdr.what(file_name, decoded_file)
extension = "jpg" if extension == "jpeg" else extension
return extension
from django.core.files.base import ContentFile
import base64
import six
import uuid
# Check if this is a base64 string
if isinstance(data, six.string_types):
# Check if the base64 string is in the "data:" format
if 'data:' in data and ';base64,' in data:
# Break out the header from the base64 content
header, data = data.split(';base64,')
# Try to decode the file. Return validation error if it fails.
try:
decoded_file = base64.b64decode(data)
except TypeError:
TypeError('invalid_image')
# Generate file name:
file_name = str(uuid.uuid4())[:12] # 12 characters are more than enough.
# Get the file name extension:
file_extension = get_file_extension(file_name, decoded_file)
complete_file_name = "%s.%s" % (file_name, file_extension, )
return ContentFile(decoded_file, name=complete_file_name)
Then you can call the function
import decode_base64_file
p = Post(content='My Picture', image=decode_based64_file(your_base64_file))
p.save()
I found the solution.
I need to use only the parte without data:image/jpeg;base64,
In Python, we can do it with something like this:
image_base64 = image_base64.split('base64,', 1 )
fh = open("imageToSave.png", "wb")
fh.write(imgData.decode('base64'))
fh.close()
Edit (klaus-d): The code above gives an example, how to store an image file from BASE64 encoded data. It opens a file imageToSave.png in binary mode for writing, then decodes the base64 image data and write the result to the file. At the end it closes the file descriptor.