Disabling OCR when uploading to Google Drive via API - python

I was running into an issue with unoconv where it would periodically hang indefinitely when converting random documents to PDFs, so I wrote a small Python script that uploads documents to Google Drive and downloads them again as PDFs to work around the issue.
The problem I've run into is that Google Drive automatically tries to OCR images that get uploaded, which I don't want, but I've so far been unable to find documentation on how to disable the OCR.
One thing I did notice: I'm using the create function from v3 of the API, while in the v2 API there is an insert function that takes an ocr flag. Is the equivalent possible with the v3 API?
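For reference, this is roughly what I mean by the v2-style call (a sketch only, not code I am running; convert and ocr are v2 query parameters and service would be a client built for v2):

# Rough sketch of a v2-style upload where OCR can be suppressed explicitly.
# Assumes a v2 client: service = discovery.build('drive', 'v2', http=http_auth)
media = MediaFileUpload('scan.png', mimetype='image/png', resumable=True)
uploaded = service.files().insert(
    body={'title': 'scan.png'},
    media_body=media,
    convert=True,  # ask Drive to convert the upload to a Google Doc
    ocr=False      # the v2 flag that, as I understand it, suppresses OCR
).execute()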
Here is my code:
from __future__ import print_function
import httplib2
import magic
import io
import sys
import argparse
import subprocess as sp
from apiclient import discovery
from oauth2client.service_account import ServiceAccountCredentials
from httplib2 import Http
from googleapiclient.http import MediaFileUpload
from googleapiclient.http import MediaIoBaseDownload
from settings import *
"""
This script exists to mask unoconv for JUST pdf conversion. If it gets flags for anything else, it will fallback on unoconv.
Otherwise, it uploads the document to google drive, download it as a pdf, and then delete the file out of the drive.
"""
MIMETYPE_MAPPING = {
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "application/vnd.google-apps.document",
    "application/rtf": "application/vnd.google-apps.document",
    "text/richtext": "application/vnd.google-apps.document",
    "text/plain": "application/vnd.google-apps.document",
    "text/html": "application/vnd.google-apps.document",
    "application/vnd.oasis.opendocument.text": "application/vnd.google-apps.document",
    "application/x-iwork-pages-sffpages": "application/vnd.google-apps.document",
    "application/msword": "application/vnd.google-apps.document",
    "application/vnd.ms-excel": "application/vnd.google-apps.spreadsheet",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "application/vnd.google-apps.spreadsheet",
    "text/csv": "application/vnd.google-apps.spreadsheet",
    "text/tab-separated-values": "application/vnd.google-apps.spreadsheet",
    "application/vnd.oasis.opendocument.spreadsheets": "application/vnd.google-apps.spreadsheet",
    "application/vnd.oasis.opendocument.spreadsheet": "application/vnd.google-apps.spreadsheet",
    "application/vnd.ms-powerpoint": "application/vnd.google-apps.presentation",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": "application/vnd.google-apps.presentation",
    "application/vnd.oasis.opendocument.presentation": "application/vnd.google-apps.presentation",
    "image/png": "application/vnd.google-apps.document",
    "image/x-citrix-png": "application/vnd.google-apps.document",
    "image/x-png": "application/vnd.google-apps.document",
    "image/jpeg": "application/vnd.google-apps.document",
    "image/x-citrix-jpeg": "application/vnd.google-apps.document",
    "image/gif": "application/vnd.google-apps.document",
    "image/bmp": "application/vnd.google-apps.document",
    "application/pdf": "application/vnd.google-apps.document",
}
SERVICE = None


def get_service():
    """
    Establishes the connection to the google drive APIs.
    """
    global SERVICE
    if SERVICE is None:
        credentials = ServiceAccountCredentials.from_json(JSON_KEY)
        http_auth = credentials.authorize(Http())
        SERVICE = discovery.build('drive', 'v3', http=http_auth)
    return SERVICE
def drive_upload(fp, fn):
    """
    Uploads the file found at fp to root of google drive account as a google doc with name fn
    Returns the id of the new file
    """
    mimetype = magic.from_file(fp, mime=True)
    drive_service = get_service()
    file_metadata = {
        'name': fn,
        'mimeType': MIMETYPE_MAPPING.get(mimetype, 'application/vnd.google-apps.document'),
    }
    media = MediaFileUpload(fp,
                            mimetype=mimetype,
                            resumable=True)
    import inspect
    print(inspect.getargspec(drive_service.files().create)[0])
    file = drive_service.files().create(body=file_metadata,
                                        media_body=media,
                                        fields='id').execute()
    return file.get('id')
def download_pdf(file_id, dlp):
    """
    Downloads file from google drive specified by file_id to the filepath in dlp
    Will download file as pdf
    """
    drive_service = get_service()
    request = drive_service.files().export_media(fileId=file_id,
                                                 mimeType='application/pdf')
    resp = request.execute()
    f = open(dlp, 'w')
    f.write(resp)
    f.close()
def convert_to_pdf(inputf, outputf):
    """
    Converts input file to pdf located at output file and cleans up file from google drive
    """
    fid = drive_upload(inputf, inputf.split('/')[-1])
    download_pdf(fid, outputf)
    # Now delete the file from drive
    service = get_service()
    service.files().delete(fileId=fid).execute()
def pass_through():
    """
    Calls unoconv with same args that were passed to this script
    """
    print("PASSING THROUGH", file=sys.stderr)
    cmd = PATH_TO_UNOCONV + " " + " ".join(sys.argv[1:])
    child = sp.Popen(cmd.split(), stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = child.communicate()
    print(stdout, end='')
    print(stderr, file=sys.stderr, end='')
    sys.exit(child.returncode)
class ArgParse(argparse.ArgumentParser):
    """
    This subclass of ArgumentParser exists to change the default behaviour of the exit function.
    If the exit function is called with a status other than 0 (usually because unsupported flags are used),
    a call is made to pass_through to let unoconv handle this invocation.
    """
    def exit(self, status=0, message=None):
        if status != 0:
            pass_through()
        else:
            return super(ArgParse, self).exit(status=status, message=message)
if __name__ == '__main__':
    parser = ArgParse(description="Wrapper for unoconv that farms PDF conversions out to Google Drive; using any args other than the supported ones causes a fallback to unoconv")
    parser.add_argument('-f', metavar='format', help='Desired output format')
    parser.add_argument('-o', metavar='output_file', help='Path to output file')
    parser.add_argument('fname', metavar='inputf', type=str, nargs=1, help='Path to file to convert')
    args = parser.parse_args()
    fmt = args.f
    output_file = args.o
    input_file = args.fname[0]
    if fmt.upper() == "PDF":
        try:
            convert_to_pdf(input_file, output_file)
        except:
            pass_through()
    else:
        # If we aren't converting the file to a PDF, let unoconv handle it
        pass_through()

Related

Can you download multiple files from Google Drive asynchronously?

My problem is the following:
I am sending queries via the Google Drive API that fetch all files matching certain criteria. I won't post the entire code here as it's quite extensive, but the criterion is simply to get all files that reside in folders with a certain name (for example: "I want all files that reside in folders whose name contains the string 'meet'").
The code I have written for this particular part is the following:
import json
import environ
import os
import google.auth
import io
from apiclient import discovery
from httplib2 import Http
from google.cloud import secretmanager
from googleapiclient.http import MediaIoBaseDownload
from oauth2client.service_account import ServiceAccountCredentials
# Imported functions from a local file. Just writing to database and establishing connection
from firestore_drive import add_file, establish_db_connection
.... some other code here ...
def update_files_via_parent_folder(self, parent_id, parent_name):
    page_token = None
    # Set a query that fetches all files based on the ID of its parent folder
    # E.g. "get all files from folder whose ID is parent_id"
    query = f"'{parent_id}' in parents"
    response = self.execute_query(query, page_token)
    files = response.get('files', [])

    while True:
        # Execute the query, and extract all resulting files in the folder
        for file in files:
            file_id = file['id']
            filename = file['name']

            # Start requesting the current file from Drive, and download through a byte-stream
            request = self.service.files().get_media(fileId=file_id)
            fh = io.BytesIO()
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            dl_counter = 0

            while done is False:
                # Start downloading the file from Drive, and convert it to JSON (dictionary)
                status, done = downloader.next_chunk()
                prefab_json = json.loads(fh.getvalue())

                # Find the proper collection-name and then add the file to database
                collection_name = next(type_name for type_name in self.possible_types if type_name in parent_name)
                add_file(self.db, collection_name, filename, file_content=prefab_json)

        # Find out if there are more files to download in the same folder
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            if len(files) == 0:
                print(f'Folder found, but contained no files.')
            break

        response = self.execute_query(query, page_token)
        files = response.get('files', [])


def execute_query(self, query, page_token):
    """
    Helper function for executing a query to Google Drive. Implemented as a function due to repeated usage.
    """
    return self.service.files().list(
        q=query,
        spaces='drive',
        fields='nextPageToken, files(id, name)',
        pageToken=page_token).execute()
Now my question is this:
Is there a way to download the files asynchronously or in parallel in the following section?
for file in files:
    file_id = ...
    filename = ...
    # Same as above; start download and write to database...
For reference, the point of the code is to extract files that are located on Google Drive, and copy them over to another database. I'm not concerned with local storage, only fetching from Drive and writing to a database (if this is even possible to do in parallel).
I've tried various options such as multiprocessing.pool, multiprocessing.ThreadPool, and asyncio, but I'm not sure if I actually used them correctly (a rough sketch of what I am aiming for is below). I can also mention that the database used is Firestore.
Additional note: the reason I want to do this is that the sequential operation is extremely slow, and I want to deploy it as a Cloud Function (which has a maximum time limit of 540 seconds, i.e. 9 minutes).
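For concreteness, the thread-pool shape I am picturing is roughly the following (a sketch only, reusing the imports and helpers already shown above; download_and_store is a hypothetical wrapper around the per-file body of update_files_via_parent_folder, and I have not verified that the shared googleapiclient service object is safe to use across threads):

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_and_store(self, file, parent_name):
    # Hypothetical per-file worker: download one Drive file into memory
    # and write it to Firestore, mirroring the sequential loop body above.
    request = self.service.files().get_media(fileId=file['id'])
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    prefab_json = json.loads(fh.getvalue())
    collection_name = next(t for t in self.possible_types if t in parent_name)
    add_file(self.db, collection_name, file['name'], file_content=prefab_json)

# Inside update_files_via_parent_folder, replacing the sequential for-loop:
with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(self.download_and_store, f, parent_name) for f in files]
    for fut in as_completed(futures):
        fut.result()  # re-raises any per-file exception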
Any feedback is welcome :)

Azure Durable Functions : Http Trigger error

As a newbie in Azure,
I am following the Microsoft Azure Functions tutorial page
https://learn.microsoft.com/en-us/azure/azure-functions/durable/durable-functions-cloud-backup?tabs=python
and the GitHub page
https://github.com/Azure/azure-functions-durable-python/tree/master/samples/fan_in_fan_out .
**HttpStart code**
import logging
import json
import azure.functions as func
import azure.durable_functions as df
async def main(req: func.HttpRequest, starter: str) -> func.HttpResponse:
    client = df.DurableOrchestrationClient(starter)
    payload: str = json.loads(req.get_body().decode())  # Load JSON post request data
    instance_id = await client.start_new(req.route_params["functionName"], client_input=payload)
    logging.info(f"Started orchestration with ID = '{instance_id}'.")
    return client.create_check_status_response(req, instance_id)
**E2_BackupSiteContent**
import azure.functions as func
import azure.durable_functions as df
def orchestrator_function(context: df.DurableOrchestrationContext):
    root_directory: str = context.get_input()

    if not root_directory:
        raise Exception("A directory path is required as input")

    files = yield context.call_activity("E2_GetFileList", root_directory)
    tasks = []
    for file in files:
        tasks.append(context.call_activity("E2_CopyFileToBlob", file))

    results = yield context.task_all(tasks)
    total_bytes = sum(results)
    return total_bytes


main = df.Orchestrator.create(orchestrator_function)
**E2_CopyFileToBlob**
import os
import pathlib
from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError
connect_str = os.getenv('AzureWebJobsStorage')
def main(filePath: str) -> str:
    # Create the BlobServiceClient object which will be used to create a container client
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)

    # Create a unique name for the container
    container_name = "backups"

    # Create the container if it does not exist
    try:
        blob_service_client.create_container(container_name)
    except ResourceExistsError:
        pass

    # Create a blob client using the local file name as the name for the blob
    parent_dir, fname = pathlib.Path(filePath).parts[-2:]  # Get last two path components
    blob_name = parent_dir + "_" + fname
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

    # Count bytes in file
    byte_count = os.path.getsize(filePath)

    # Upload the created file
    with open(filePath, "rb") as data:
        blob_client.upload_blob(data)

    return byte_count
**E2_GetFileList**
import os
from os.path import dirname
from typing import List
def main(rootDirectory: str) -> List[str]:
    all_file_paths = []
    # We walk the file system
    for path, _, files in os.walk(rootDirectory):
        # We copy the code for activities and orchestrators
        if "E2_" in path:
            # For each file, we add their full-path to the list
            for name in files:
                if name == "__init__.py" or name == "function.json":
                    file_path = os.path.join(path, name)
                    all_file_paths.append(file_path)

    return all_file_paths
When I trigger the function over HTTP with the Postman app,
POST http://localhost:7071/api/orchestrators/E2_BackupSiteContent?req="D:\Tmp"
I got the following error messages.
[2021-11-12T02:13:42.432Z] Worker process started and initialized.
[2021-11-12T02:13:46.489Z] Host lock lease acquired by instance ID '000000000000000000000000AE48769C'.
[2021-11-12T02:13:52.529Z] Executing 'Functions.HttpStart' (Reason='This function was programmatically called via the host APIs.', Id=748996d0-1f84-4597-86ea-768467eb36e3)
[2021-11-12T02:13:52.560Z] Executed 'Functions.HttpStart' (Failed, Id=748996d0-1f84-4597-86ea-768467eb36e3, Duration=5433ms)
[2021-11-12T02:13:52.562Z] System.Private.CoreLib: Exception while executing function: Functions.HttpStart. Microsoft.Azure.WebJobs.Host: Exception binding parameter 'req'. Microsoft.AspNetCore.Server.Kestrel.Core: Reading the request body timed out due to data arriving too slowly. See MinRequestBodyDataRate.
What should I do to solve this problem?
(I tested with Linux and Windows.)
--Added--
Postman capture
Instead of passing the directory in the query string of the URL, you should pass the path in the HTTP request body in Postman. The Microsoft doc page itself shows how to do it; see the "Run the sample" section.
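For illustration, a minimal Python sketch of such a request (assuming the requests package; the URL matches the local host shown in the question). Passing json=r"D:\Tmp" puts the JSON-encoded path into the body, which is what the json.loads(req.get_body().decode()) line in HttpStart expects:

import requests

# Send the directory path as the JSON request body, not as a query parameter.
resp = requests.post(
    "http://localhost:7071/api/orchestrators/E2_BackupSiteContent",
    json=r"D:\Tmp",
)
print(resp.status_code, resp.text)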

Google Translate API - Reading and Writing to Cloud Storage - Python

I'm using the Google Translation API to translate a CSV file with multiple columns and rows. The target language is English and the file has text in multiple languages.
The code posted below uses local files for testing, but I'd like to import the file from a Cloud Storage bucket and export the translated file to a different Cloud Storage bucket.
I've tried to run the script below with my sample file and got an error message: "FileNotFoundError: [Errno 2] No such file or directory"
I stumbled upon this link for "Reading and Writing to Cloud Storage" but I was not able to implement the suggested solution into the script below. https://cloud.google.com/appengine/docs/standard/python/googlecloudstorageclient/read-write-to-cloud-storage#reading_from_cloud_storage
May I ask for a suggested modification of the script to import (and translate) the file from google cloud bucket and export the translated file to a different google cloud bucket? Thank you!
Script mentioned:
from google.cloud import translate
import csv
def listToString(s):
    """Transform list to string"""
    str1 = " "
    return (str1.join(s))


def detect_language(project_id, content):
    """Detecting the language of a text string."""
    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    response = client.detect_language(
        content=content,
        parent=parent,
        mime_type="text/plain",  # mime types: text/plain, text/html
    )

    for language in response.languages:
        return language.language_code


def translate_text(text, project_id, source_lang):
    """Translating Text."""
    client = translate.TranslationServiceClient()
    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": source_lang,
            "target_language_code": "en-US",
        }
    )

    # Display the translation for each input text provided
    for translation in response.translations:
        print("Translated text: {}".format(translation.translated_text))


def main():
    project_id = "your-project-id"
    csv_files = ["sample1.csv", "sample2.csv"]

    # Perform your content extraction here if you have a different file format #
    for csv_file in csv_files:
        csv_file = open(csv_file)
        read_csv = csv.reader(csv_file)
        content_csv = []
        for row in read_csv:
            content_csv.extend(row)
        content = listToString(content_csv)  # convert list to string
        detect = detect_language(project_id=project_id, content=content)
        translate_text(text=content, project_id=project_id, source_lang=detect)


if __name__ == "__main__":
    main()
You could download the file from GCS, run your logic against the local (downloaded) file, and then upload the result to another GCS bucket. Example:
Download the file from "my-bucket" to /tmp:
from google.cloud import storage
client = storage.Client()
bucket = client.get_bucket("my-bucket")
source_blob = bucket.blob("blob/path/file.csv")
new_file = "/tmp/file.csv"
download_blob = source_blob.download_to_filename(new_file)
After translating/running your code logic, upload to a bucket:
bucket = client.get_bucket('my-other-bucket')
blob = bucket.blob('myfile.csv')
blob.upload_from_filename('myfile.csv')
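Alternatively, if you would rather not touch the local disk at all, recent versions of google-cloud-storage let you read and write the CSV in memory. A minimal sketch, with placeholder bucket and object names and assuming the translated output ends up as a string:

import csv
import io
from google.cloud import storage

client = storage.Client()

# Read the source CSV straight into memory
source_text = client.bucket("my-bucket").blob("blob/path/file.csv").download_as_text()
rows = list(csv.reader(io.StringIO(source_text)))

# ... run detect_language / translate_text on the row contents here ...
translated_text = "placeholder for the translated CSV content"

# Write the result to the destination bucket without a local file
client.bucket("my-other-bucket").blob("myfile.csv").upload_from_string(
    translated_text, content_type="text/csv")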

Cannot upload certain type in Drive using Google Drive API

I am trying to update some files in my Drive using the following piece of code:
from apiclient.discovery import build
from apiclient.http import MediaIoBaseDownload, MediaFileUpload
from oauth2client.service_account import ServiceAccountCredentials
from httplib2 import Http
import io
import pandas as pd
PATH_FOLDER = './'
scopes = ['https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scopes)
http_auth = credentials.authorize(Http())
drive = build('drive', 'v3', http=http_auth)
def upload_file(file_name):
    """ This function purpose is to find
    a file in the drive having the same name as the
    file_name parameter then uploads a new version
    using the local file having the same name

    param:
        file_name(str) : name of the file to update in drive (also
                         name of the local file to use as new version)
    """
    files = drive.files().list().execute()['files']
    for file in files:
        if file['name'] == file_name:
            file_id = file['id']
            break

    file_metadata = {
        'name': file_name,
        'mimeType': file['mimeType']
    }
    media = MediaFileUpload(PATH_FOLDER + file_name, mimetype=file['mimeType'], resumable=True)

    drive.files().update(
        fileId=file_id,
        body=file_metadata,
        media_body=media,
    ).execute()
When I try to upload certain file types (pkl, csv):
upload_file('universe.pkl')
upload_file('list.csv')
the code runs without any error but the files do not change in my drive
whereas when I apply the same function for a different type (xlsx, txt):
upload_file('info.xlsx')
upload_file('test.txt')
it works fine and my files get updated. Did anyone face the same problem, and if so, what was the solution?
Thanks.
In the following line of your upload_file function you have hard-coded the mime-type:
media = MediaFileUpload(PATH_FOLDER + file_name, mimetype='text/csv', resumable=True)
Make sure that the mime-type you specify on file-metadata equals the one in media
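One way to keep the two in sync (a sketch, not from the original code): derive the type from the local file name with the standard-library mimetypes module and use the same value in both places, falling back to a generic type for extensions such as .pkl that it does not recognise:

import mimetypes

# Inside upload_file: derive the mime type from the local file name
mime_type, _ = mimetypes.guess_type(PATH_FOLDER + file_name)
mime_type = mime_type or 'application/octet-stream'  # e.g. .pkl has no registered type

file_metadata = {
    'name': file_name,
    'mimeType': mime_type,
}
media = MediaFileUpload(PATH_FOLDER + file_name, mimetype=mime_type, resumable=True)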

How to upload a bytes image on Google Cloud Storage from a Python script

I want to upload an image on Google Cloud Storage from a python script. This is my code:
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient import discovery

scopes = ['https://www.googleapis.com/auth/devstorage.full_control']
credentials = ServiceAccountCredentials.from_json_keyfile_name('serviceAccount.json', scopes)
service = discovery.build('storage', 'v1', credentials=credentials)

body = {'name': 'my_image.jpg'}
req = service.objects().insert(
    bucket='my_bucket', body=body,
    media_body=googleapiclient.http.MediaIoBaseUpload(
        gcs_image, 'application/octet-stream'))
resp = req.execute()
If gcs_image = open('img.jpg', 'r'), the code works and correctly saves my image on Cloud Storage. How can I directly upload a bytes image? (For example, from an OpenCV/NumPy array: gcs_image = cv2.imread('img.jpg').)
In my case, I wanted to upload a PDF document to Cloud Storage from bytes.
When I tried the below, it created a text file with my byte string in it.
blob.upload_from_string(bytedata)
In order to create an actual PDF file using the byte string I had to do:
blob.upload_from_string(bytedata, content_type='application/pdf')
My byte data was b64-encoded, so I also had to b64decode it first.
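Putting those pieces together, the whole sequence looked roughly like this (a sketch; the bucket and blob names are placeholders and b64_payload stands in for the real base64-encoded data):

import base64
from google.cloud import storage

client = storage.Client()
blob = client.bucket("my-bucket").blob("document.pdf")

b64_payload = base64.b64encode(b"%PDF-1.4 example")  # placeholder for the real incoming payload

pdf_bytes = base64.b64decode(b64_payload)  # decode first, since the payload arrived base64-encoded
blob.upload_from_string(pdf_bytes, content_type='application/pdf')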
If you want to upload your image from a file:
import os
from google.cloud import storage


def upload_file_to_gcs(bucket_name, local_path, local_file_name, target_key):
    try:
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        full_file_path = os.path.join(local_path, local_file_name)
        bucket.blob(target_key).upload_from_filename(full_file_path)
        return bucket.blob(target_key).public_url
    except Exception as e:
        print(e)
        return None
but if you want to upload bytes directly:
import os
from google.cloud import storage


def upload_data_to_gcs(bucket_name, data, target_key):
    try:
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        bucket.blob(target_key).upload_from_string(data)
        return bucket.blob(target_key).public_url
    except Exception as e:
        print(e)
        return None
Note that target_key is the prefix plus the name of the uploaded file.
MediaIoBaseUpload expects an io.Base-like object and raises the following error:
'numpy.ndarray' object has no attribute 'seek'
upon receiving an ndarray object. To solve it I am using TemporaryFile and numpy.ndarray.tofile():
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient import discovery
import googleapiclient
import numpy as np
import cv2
from tempfile import TemporaryFile
scopes = ['https://www.googleapis.com/auth/devstorage.full_control']
credentials = ServiceAccountCredentials.from_json_keyfile_name('serviceAccount.json', scopes)
service = discovery.build('storage','v1',credentials = credentials)
body = {'name':'my_image.jpg'}
with TemporaryFile() as gcs_image:
    cv2.imread('img.jpg').tofile(gcs_image)

    req = service.objects().insert(
        bucket='my_bucket', body=body,
        media_body=googleapiclient.http.MediaIoBaseUpload(
            gcs_image, 'application/octet-stream'))
    resp = req.execute()
Be aware that googleapiclient is non-idiomatic and in maintenance mode only (it is not being developed any more). I would recommend using the idiomatic google-cloud-storage client instead.
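As a rough sketch of what that can look like with the idiomatic google-cloud-storage client (same bucket and object names as above, encoding the ndarray to JPEG in memory instead of going through a temporary file):

import cv2
from google.cloud import storage

img = cv2.imread('img.jpg')            # ndarray, as in the question
ok, buf = cv2.imencode('.jpg', img)    # encode the array to JPEG bytes in memory
if not ok:
    raise RuntimeError("JPEG encoding failed")

client = storage.Client()
blob = client.bucket('my_bucket').blob('my_image.jpg')
blob.upload_from_string(buf.tobytes(), content_type='image/jpeg')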
Here is how to directly upload a PIL Image from memory:
from google.cloud import storage
import io
from PIL import Image
# Define variables
bucket_name = XXXXX
destination_blob_filename = XXXXX

# Configure bucket and blob
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(destination_blob_filename)  # the blob must be created before uploading to it

im = Image.open("test.jpg")
bs = io.BytesIO()
im.save(bs, "jpeg")
blob.upload_from_string(bs.getvalue(), content_type="image/jpeg")
In addition to that, here is how to download blobfiles directly to memory as PIL Images:
blob = bucket.blob(destination_blob_filename)
downloaded_im_data = blob.download_as_bytes()
downloaded_im = Image.open(io.BytesIO(downloaded_im_data))
