How to upload a large CSV file in chunks to the Microsoft Graph API? - python

I'm following this great blog, File Handling in Sharepoint, in order to upload files to SharePoint Online.
I define a method that uploads directly if my file is smaller than 4MB, otherwise it has to upload in chunks. My method works fine for uploading any type of file, but not for CSV or Excel.
Here is my function definition:
def save_pd_as_csv(self, df: pd.DataFrame, root_folder: str, file: str, *args: list, **kwargs: dict) -> None:
    client_id = 'xxxxxxxxxxxxx'
    tenant_id = 'xxxxxxxxxxxxxx'
    scopes = ['Sites.ReadWrite.All', 'Files.ReadWrite.All']
    auth_url = 'https://login.microsoftonline.com/xxxxxxxxxxx/oauth2/v2.0/authorize'
    # MobileApplicationClient is used to get the Implicit Grant
    oauth = OAuth2Session(client=MobileApplicationClient(client_id=client_id), scope=scopes)
    authorization_url, state = oauth.authorization_url(auth_url)

    # Graph API configuration
    CLIENT_ID = client_id
    TENANT_ID = tenant_id
    AUTHORITY_URL = 'https://login.microsoftonline.com/{}'.format(TENANT_ID)
    RESOURCE_URL = 'https://graph.microsoft.com/'
    SHAREPOINT_HOST_NAME = 'xxxxxxxxx'  # URL of the SharePoint host without https://
    API_VERSION = 'v1.0'
    USERNAME = 'xxxxxxxxx'  # Office 365 user's account username
    PASSWORD = 'xxxxxxxxx'
    SCOPES = ['Sites.ReadWrite.All', 'Files.ReadWrite.All']  # Add other scopes/permissions as needed.

    # Create a public client app, acquire an access token for the user and set the header for API calls
    pca = msal.PublicClientApplication(CLIENT_ID, authority=AUTHORITY_URL)
    token = pca.acquire_token_by_username_password(USERNAME, PASSWORD, SCOPES)
    headers = {'Authorization': 'Bearer {}'.format(token['access_token'])}

    # get the site ID of the document SharePoint site, which is index 1 in the list (starting from 0)
    site_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/sites?search=*', headers=headers).json()['value'][0]['id']
    # list drives in the site team documents
    drive_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/sites/{site_id}/drives?search=*', headers=headers).json()['value'][0]['id']
    # id of the folder
    item_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/root:/{root_folder}', headers=headers).json()['id']

    # now create the file content
    output = io.BytesIO()
    df.to_csv(output, *args, **kwargs)
    csv_data = output.getvalue()

    # get the file IDs of the existing children
    file_ids = requests.get(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}/children', headers=headers).json()['value']
    item_dict = {i["name"]: i["id"] for i in file_ids}

    # get size of file
    size = sys.getsizeof(df)

    # push file according to size
    if size <= 4194304:
        if file in item_dict.keys():
            file_id = item_dict[file]
            result = requests.put(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{file_id}/content', headers=headers, data=csv_data)
        else:
            result = requests.put(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}:/{file}:/content', headers=headers, data=csv_data)
        if result.status_code == 200:
            print("File uploaded")
    # now upload by chunks
    else:
        df.to_csv("temp.csv")
        result = requests.post(
            f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}:/{file}:/createUploadSession',
            headers={'Authorization': 'Bearer ' + token['access_token']},
            json={
                '@microsoft.graph.conflictBehavior': 'replace',
                'description': 'Uploading a large file',
                'fileSystemInfo': {'@odata.type': 'microsoft.graph.fileSystemInfo'},
                'name': file
            }
        )
        upload_url = result.json()['uploadUrl']
        CHUNK_SIZE = 10485760
        chunks = int(size / CHUNK_SIZE) + 1 if size % CHUNK_SIZE > 0 else 0
        with open("temp.csv", 'rb') as fd:
            start = 0
            for chunk_num in range(chunks):
                chunk = fd.read(CHUNK_SIZE)
                bytes_read = len(chunk)
                upload_range = f'bytes {start}-{start + bytes_read - 1}/{size}'
                result = requests.put(
                    upload_url,
                    headers={
                        'Content-Length': str(bytes_read),
                        'Content-Range': upload_range
                    },
                    data=chunk
                )
                result.raise_for_status()
                start += bytes_read
                if result.status_code == 200:
                    print("File uploaded to sharepoint")
Then I read a random dataframe larger than 4MB so that it goes into the else branch, like this:
with open('df.pickle', 'rb') as pkl:
    df_list = pickle.load(pkl)

save_pd_as_csv(df_list, root_folder=r"2.PROCESSED", file="history.csv", index=False)
As you can see, I am creating a temporary file temp.csv so that I can read it with open() in binary ('rb') mode.
In the end only 2 chunks of data are read (instead of 8), and I get the following error:
File "C:\Users\Baptiste\PycharmProjects\union_brokerage\venv\lib\site-packages\requests\models.py", line 1021, in raise_for_status
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 400 Client Error: Bad Request for url:
It seems that on the 3rd chunk read, the value is an empty byte string.
Why can't I read a CSV just like a normal binary file, and how can I upload such large files to the Graph API in chunks?
Thank you all!
Baptiste
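For reference, here is a minimal sketch of the chunked upload driven from the in-memory CSV bytes, where the Content-Range total is the actual byte length of the data rather than sys.getsizeof(df). It reuses upload_url, csv_data and the requests import from the function above, and keeps the chunk size a multiple of 320 KiB as the upload session expects:

import math

size = len(csv_data)                   # actual number of CSV bytes to send
CHUNK_SIZE = 10485760                  # 10 MiB, a multiple of 320 KiB
chunks = math.ceil(size / CHUNK_SIZE)  # round up so the last partial chunk is sent

for chunk_num in range(chunks):
    start = chunk_num * CHUNK_SIZE
    chunk = csv_data[start:start + CHUNK_SIZE]
    upload_range = f'bytes {start}-{start + len(chunk) - 1}/{size}'
    result = requests.put(
        upload_url,
        headers={'Content-Length': str(len(chunk)), 'Content-Range': upload_range},
        data=chunk,
    )
    result.raise_for_status()

With the total and the chunk count both derived from len(csv_data), the number of PUT requests always matches the data actually read, which is one place where sys.getsizeof(df) and the real CSV size can disagree.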

Related

Bypass Google automated query security check

I have a list of about 300 Google Drive links to PDF files which I have to download.
What I am trying to do is request the files from the Google server using Python's requests library.
After 30 to 36 downloads Google blocks my requests and returns:
We're sorry...... but your computer or network may be sending automated queries. To protect our users, we can't process your request right now.
I am using the following code:
import requests

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    if response.status_code != 200:
        print(response.status_code)
        return response.status_code
    print('downloading ' + destination)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        i = 0
        for chunk in response.iter_content(CHUNK_SIZE):
            print(str(i) + '%')
            i = i + 1
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    print('downloaded ' + destination)

if __name__ == "__main__":
    file_id = 'file id'
    destination = file_id + '.pdf'
    download_file_from_google_drive(file_id, destination)
I am iterating over my list and calling download_file_from_google_drive for each file.
So, can I bypass the security check?
I tried using a VPN, which changes my IP address, but nothing works.
After about an hour the downloads start working again.
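A common mitigation for this kind of rate limiting is to pace the requests and back off when a block is detected; whether it is enough here is not guaranteed. A rough sketch around the download_file_from_google_drive function above (the file ID list and the delay values are just examples, not documented limits):

import time

file_ids = ['id1', 'id2']  # hypothetical list of the ~300 Drive file IDs
delay = 10                 # seconds to wait between downloads

for file_id in file_ids:
    status = download_file_from_google_drive(file_id, file_id + '.pdf')
    if status is not None and status != 200:
        # Looks blocked: wait much longer, then retry this file once.
        time.sleep(600)
        download_file_from_google_drive(file_id, file_id + '.pdf')
    time.sleep(delay)      # steady pacing between files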

Unable to download large files from google drive using python

I want to download large files from Google Drive using Python, and I did this using the code below:
import pickle
import os.path
import requests
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import os

class DriveAPI:
    global SCOPES
    SCOPES = ['https://www.googleapis.com/auth/drive.file']

    def __init__(self):
        self.creds = None
        if os.path.exists('token.pickle'):
            with open('token.pickle', 'rb') as token:
                self.creds = pickle.load(token)
        if not self.creds or not self.creds.valid:
            if self.creds and self.creds.expired and self.creds.refresh_token:
                self.creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
                self.creds = flow.run_local_server(port=0)
            with open('token.pickle', 'wb') as token:
                pickle.dump(self.creds, token)
        self.service = build('drive', 'v3', credentials=self.creds)
        results = self.service.files().list(pageSize=100, fields='files(id,name,createdTime)').execute()
        items = results.get('files', [])

    def download_file_from_google_drive(self, id, destination):
        def get_confirm_token(response):
            for key, value in response.cookies.items():
                if key.startswith('download_warning'):
                    return value
            return None

        def save_response_content(self, response, destination):
            CHUNK_SIZE = 32768
            with open(destination, "wb") as f:
                for chunk in response.iter_content(CHUNK_SIZE):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)

        URL = "https://docs.google.com/uc?export=download"
        session = requests.Session()
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)
        if token:
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
        save_response_content(response, destination)

if __name__ == "__main__":
    obj = DriveAPI()
    f_id = "File ID"
    file_name = "File Name"
    obj.service.permissions().create(body={"role": "reader", "type": "anyone"}, fileId=f_id).execute()
    obj.FileDownload(f_id, file_name)
Using the above code I was able to download 2 GB files for a certain period of time, about 2 months. But now I'm unable to download large files: if I run this code, only a 2.2 kB file is downloaded, and no errors are printed in the terminal.
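Since the class already builds an authenticated Drive v3 service, one alternative worth sketching is downloading through the official API with MediaIoBaseDownload instead of the docs.google.com confirm-token endpoint. This assumes the file is reachable under the granted scope (drive.file only covers files the app created or opened; a broader scope such as drive.readonly may be needed):

import io
from googleapiclient.http import MediaIoBaseDownload

def download_via_api(service, file_id, destination):
    # Stream the file from Drive in chunks through the v3 API.
    request = service.files().get_media(fileId=file_id)
    with io.FileIO(destination, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request, chunksize=32 * 1024 * 1024)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print("Downloaded {}%".format(int(status.progress() * 100)))

# Hypothetical usage with the DriveAPI class above:
# obj = DriveAPI()
# download_via_api(obj.service, "File ID", "File Name")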

How to add boto3 output into a new file and upload it back to an AWS s3 bucket using one script?

The purpose of the code below is to read a PDF file located in an S3 bucket and list the values of the PDF in the terminal. My end goal is to load those values into a CSV/XLSX file and upload it to the same S3 bucket; in other words, this is a file conversion from PDF to XLSX.
Adding item.to_excel at the end is not loading the data into Excel, any suggestions? The code below only creates an empty XLSX file in the local directory, but I need it to do the following:
save the data listed in the terminal (from reading the PDF located in S3) into an XLSX file
take that XLSX file with the terminal data and upload it back to S3
import boto3
import time
import pandas as pd

# Textract APIs used - "start_document_text_detection", "get_document_text_detection"
def InvokeTextDetectJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def CheckJobComplete(jobId):
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    print("Job status: {}".format(status))
    while(status == "IN_PROGRESS"):
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
    return status

def JobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    while(nextToken):
        response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']
    return pages

# S3 Document Data
s3BucketName = "pdfbucket"
documentName = "pdf"

# Function invokes
jobId = InvokeTextDetectJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))
if(CheckJobComplete(jobId)):
    response = JobResults(jobId)
    for resultPage in response:
        for item in resultPage["Blocks"]:
            if item["BlockType"] == "LINE":
                print(item["Text"])
    with pd.ExcelWriter('output_cp.xlsx') as writer:
        item.to_excel(writer, sheetName='Sheet1')
So after some more research and checking with other people, below is the final product.
The purpose of this code is the following:
Read a PDF file stored in an AWS S3 bucket
After reading the file, save the data as text in a new XLSX file
import boto3
import time
import pandas as pd
from xlsxwriter import Workbook

# Textract APIs used - "start_document_text_detection", "get_document_text_detection"
def InvokeTextDetectJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def CheckJobComplete(jobId):
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    print("Job status: {}".format(status))
    while(status == "IN_PROGRESS"):
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
    return status

def JobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    while(nextToken):
        response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']
    return pages

# S3 Document Data
s3BucketName = "cpaypdf"
documentName = "cp.pdf"

# Function invokes
jobId = InvokeTextDetectJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))
if(CheckJobComplete(jobId)):
    response = JobResults(jobId)
    df = pd.DataFrame(columns=["Text"])
    for resultPage in response:
        for item in resultPage["Blocks"]:
            if item["BlockType"] == "LINE":
                df = df.append({"Text": item['Text']}, ignore_index=True)
    writer = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
    df.to_excel(writer, sheet_name='sheetname', index=False)
    writer.save()
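The final script writes result.xlsx locally but does not yet cover the second goal from the question, pushing the workbook back to the bucket. A minimal sketch of that last step with boto3 (the destination key name is just an example):

s3 = boto3.client('s3')
# Upload the generated workbook back to the same bucket (key name is an example).
s3.upload_file("result.xlsx", s3BucketName, "converted/result.xlsx")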

issue with Cloud Function python

I have a Google Cloud Function which needs to connect to a URL, get data in the form of CSV files, and store them in a bucket. This is what is written in the Python code below.
When I test the function it compiles successfully, but it's not working at all. When I check the log it gives the below mentioned error:
favt_LnT_acn_blackline_data_pull_func43jttmffma0g Invalid constructor input for AccessSecretVersionRequest: 'projects/gcp-favt-acn-rpt-dev/secrets/blackline_api_key/versions/latest'
Please find the code below and suggest.
Thanks,
Vithal
import base64
import logging
import requests
#import pandas as pd
#from pandas import json_normalize
import json
import os
import datetime
from datetime import datetime as dt
import pytz
from google.cloud import storage
from google.cloud import secretmanager


def delete_and_upload_blob(landing_bucket_name,
                           source_file_name,
                           landing_blob_name,
                           retention_bucket_name,
                           file_retention_flag,
                           retn_file_suffix,
                           rpt_last_run_file):
    storage_client = storage.Client()
    bucket = storage_client.bucket(landing_bucket_name)
    blob = bucket.blob(landing_blob_name)
    rpt_last_run_blob = bucket.blob('some.csv')
    retention_bucket = storage_client.bucket(retention_bucket_name)
    if blob.exists(storage_client):
        # Delete the old file
        blob.delete()
        print('File {} is deleted from Cloud Storage before Upload'.format(landing_blob_name))
    else:
        print('No Such File Exists in Storage Bucket to Delete. So, proceeding with Upload')
    # Upload new one
    blob.upload_from_filename(source_file_name)
    print("File {} uploaded to Bucket {} With Name {}.".format(source_file_name, bucket, landing_blob_name))
    if file_retention_flag == 'Y':
        # Copy the last file of the day to retention bucket
        new_file_name = retn_file_suffix + '_' + landing_blob_name
        blob_copy = bucket.copy_blob(blob, retention_bucket, new_file_name)
        print('File {} is copied to Retention Bucket {}'.format(new_file_name, retention_bucket))
    if rpt_last_run_blob.exists(storage_client):
        # Delete the old file
        rpt_last_run_blob.delete()
        print('File {} is deleted from Cloud Storage before Upload'.format(rpt_last_run_blob))
    else:
        print('No Such File Exists in Storage Bucket to Delete. So, proceeding with Upload')
    # Upload new one
    rpt_last_run_blob.upload_from_filename(rpt_last_run_file)
    print("File {} uploaded to Bucket {} With Name {}.".format(rpt_last_run_file, bucket, 'Reports_Latest_Run_time.csv'))


def api_request():
    et = pytz.timezone("US/Eastern")
    current_et_time = dt.now().astimezone(et)
    print('Current ET Time:', current_et_time)
    pt = pytz.timezone("US/Pacific")
    ut = pytz.timezone("UTC")
    blackline_base_url = "https://....com"
    blackline_sts_url = blackline_base_url + "/authorize/connect/token"
    project_id = 'gcp-favt-acn-dev'
    secret_id = '###_api_key'
    secret_client = secretmanager.SecretManagerServiceClient()
    secret_name = secret_client.secret_version_path(project_id, secret_id, 'latest')
    secret_resp = secret_client.access_secret_version(secret_name)
    api_key = secret_resp.payload.data.decode('UTF-8')
    grant_type = 'password'
    scope = '####'
    username = '####'
    payload = 'grant_type=' + grant_type + '&scope=' + scope + '&username=' + username + '&password=' + api_key
    sts_headers = {'Authorization': 'Basic dXBzOk5KXXx2VENsSiEtRw==',
                   'Content-Type': 'application/x-www-form-urlencoded',
                   'Cookie': 'BLSIAPPEN=!bpJj4AOTHPcaqipWtDI6FrozN629M9xYLA/sbM1DWVH+jjuY5fgHVMACha2rIapXRoB7CcqnlaHgBw=='}
    response = requests.request("POST", blackline_sts_url, headers=sts_headers, data=payload)
    if response.ok:
        sts_response = response.json()
        access_token = sts_response['access_token']
        print(access_token)
        blackline_rpt_submit_url = blackline_base_url + '/api/queryruns'
        rpt_payload = ''
        blackline_rpt_api_headers = {'Authorization': 'Bearer {}'.format(access_token), 'Content-Type': 'text/plain'}
        rpt_resp = requests.request("GET", blackline_rpt_submit_url, headers=blackline_rpt_api_headers, data=rpt_payload)
        print(rpt_resp.text)
        jl = json.loads(rpt_resp.text)
        reports_list = []
        rprts_filename = "tmp_rprts.csv"
        rprts_full_path = os.path.join("/tmp", rprts_filename)
        with open(rprts_full_path, 'w') as f:
            f.write('ReportName,ReportLastRunTime' + '\n')
        hrs = -2
        hrs_to_subtract = datetime.timedelta(hours=hrs)
        two_hrs_ago_time = current_et_time + hrs_to_subtract
        #print(two_hrs_ago_time)  #latest_rpt_check_time
        frmtd_curr_time = two_hrs_ago_time.strftime('%Y-%m-%d %H:%M:%S')
        latest_rpt_check_time = dt.strptime(frmtd_curr_time, '%Y-%m-%d %H:%M:%S')
        print("Latest Report Check Time:", latest_rpt_check_time)
        for each in jl:
            strpd_time = dt.strptime(each['endTime'][0:19], '%Y-%m-%dT%H:%M:%S')
            #print(strpd_time)
            pt_localize = pt.localize(strpd_time)
            #print(pt_localize)
            et_time = pt_localize.astimezone(et)
            #print(et_time)
            frmtd_et_time = et_time.strftime('%Y-%m-%d %H:%M:%S')
            #print(frmtd_et_time)
            cnvrted_endTime = dt.strptime(frmtd_et_time, '%Y-%m-%d %H:%M:%S')
            #print("Report LastRun EndTime:", cnvrted_endTime)
            ut_time = pt_localize.astimezone(ut)
            frmtd_ut_time = ut_time.strftime('%Y-%m-%d %H:%M:%S')
            if cnvrted_endTime > latest_rpt_check_time:
                reports_list.append({each['name']: each['exportUrls'][0]["url"]})
                rpt_last_run = each['name'] + ',' + frmtd_ut_time
                print(rpt_last_run)
                with open(rprts_full_path, 'a') as f:
                    f.write(rpt_last_run + '\n')
                retn_file_suffix = each['endTime'][0:10]
                #print(retn_file_suffix)
                rpt_run_hr = cnvrted_endTime.hour
                #print(rpt_run_hr)
        #############
        print(reports_list)
        for report in reports_list:
            for k in report:
                print(report[k])
                report_fetch_url = blackline_base_url + '/' + report[k]
                print('Report Fetch URL: {}'.format(report_fetch_url))
                filename = "temp_file.csv"
                full_path = os.path.join("/tmp", filename)
                rpt_data = requests.request("GET", report_fetch_url, headers=blackline_rpt_api_headers)
                print(rpt_data.text)
                with open(full_path, 'wb') as tmp_file:
                    tmp_file.write(rpt_data.content)
                # Upload it to Cloud Storage
                landing_bucket_name = "####_dev_landing_bkt"  # CHANGE ME
                source_file_name = os.path.join(full_path)
                rpt_last_run_file = os.path.join(rprts_full_path)
                landing_blob_name = '##.csv'  # CHANGE ME
                retention_bucket_name = '####_dev_retention_bkt'
                print('file retention check')
                if (rpt_run_hr >= 22):
                    file_retention_flag = 'Y'
                else:
                    file_retention_flag = 'N'
                print(file_retention_flag)
                delete_and_upload_blob(landing_bucket_name,
                                       source_file_name,
                                       landing_blob_name,
                                       retention_bucket_name,
                                       file_retention_flag,
                                       retn_file_suffix,
                                       rpt_last_run_file)
                # Remove the temp file after it is uploaded to Cloud Storage to avoid OOM issues with the Cloud Function.
                os.remove(full_path)
        # Remove the tmp file after upload
        os.remove(rprts_full_path)


#def pacific_to_eastern_conversion(pacific_time, eastern_time):
def main(event, context):
    try:
        if 'data' in event:
            name = base64.b64decode(event['data']).decode('utf-8')
        else:
            name = 'World'
        print('Hello {}'.format(name))
        api_request()
    except Exception as e:
        logging.error(e)
The approach you are using will work for Cloud Run but won't work for Cloud Functions.
To make use of secrets in Google Cloud Functions, these are the steps:
Make sure the function's runtime service account is granted access to the secret. To use Secret Manager with Cloud Functions, assign the roles/secretmanager.secretAccessor role to the service account associated with your function.
Make the secret accessible to the function. This can be done using either the Google Cloud Console or the gcloud command-line tool.
I exposed the secret as an environment variable (with the name set to "api_key") and accessed it in the code as stated below:
import os
api_key = os.environ.get('api_key')
I hope this answers your question.
Your Cloud Functions service account doesn't have access to Secret Manager. Grant access to your Cloud Functions service account on the secret, or on the project (not recommended).
If you don't set a custom service account on your Cloud Function (which is also not a good practice), the App Engine default service account is used. It follows the pattern <ProjectID>@appspot.gserviceaccount.com
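For completeness, the "Invalid constructor input for AccessSecretVersionRequest" message itself usually comes from newer versions of the google-cloud-secret-manager client, which expect the resource name to be passed as a keyword or wrapped in a request dict rather than positionally. A sketch of that call shape, using the same project and secret IDs as in the question:

from google.cloud import secretmanager

secret_client = secretmanager.SecretManagerServiceClient()
name = secret_client.secret_version_path('gcp-favt-acn-dev', '###_api_key', 'latest')
# Newer client versions: pass the name inside a request dict (or as name=...).
secret_resp = secret_client.access_secret_version(request={"name": name})
api_key = secret_resp.payload.data.decode('UTF-8')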

Cannot upload s3 files to another region (clients bucket) despite successful response

This is my code. I am trying to copy a directory from one bucket to another. Everything looks positive, but the files are not appearing in the client's bucket.
import boto3

ACCESS_KEY = 'access_key'
SECRET_KEY = 'secret_key'
REGION_NAME = 'US_EAST_1'

source_bucket = 'source_bucket'
# Make sure you provide / in the end
source_prefix = 'source_prefix'
target_bucket = 'target-bucket'
target_prefix = 'target-prefix'

client = boto3.client('s3')
session_src = boto3.session.Session()
source_s3_r = session_src.resource('s3')

def get_s3_keys(bucket, prefix):
    keys = []
    response = client.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=100)
    for obj in response['Contents']:
        keys.append(obj['Key'])
    return keys

session_dest = boto3.session.Session(aws_access_key_id=ACCESS_KEY,
                                     aws_secret_access_key=SECRET_KEY)
dest_s3_r = session_dest.resource('s3')

# create a reference to source image
old_obj = source_s3_r.Object(source_bucket, source_prefix)
# create a reference for destination image
new_obj = dest_s3_r.Object(target_bucket, target_prefix)

keys = get_s3_keys(source_bucket, source_prefix)
responses = []

# upload the image to destination S3 object
for filename in keys:
    print("Transferring file {}, {}".format(source_bucket, filename))
    old_obj = source_s3_r.Object(source_bucket, filename)
    response = new_obj.put(Body=old_obj.get()['Body'].read())
    response_code = response['ResponseMetadata']['HTTPStatusCode']
    responses.append(response_code)
    print("File transfer response {}".format(response_code))

distinct_response = list(set(responses))
if len(distinct_response) > 1 or distinct_response[0] != 200:
    print("File could not be transfered to krux bucket. Exiting now")
    exit(1)
else:
    print("File transfer to krux bucket successful")
I am getting a successful response code of 200 but the file is not transferred across.
Srinivas, try this.
I used the S3 Resource object; try the equivalent S3 Client calls if you want...
bucket = s3.Bucket(bucket_name)  # from_bucket
for osi in bucket.objects.all():
    print(osi)
    copy_source = {
        'Bucket': bucket.name,
        'Key': osi.key
    }
    s3.Bucket('to_bucket').copy(copy_source, osi.key)
Hope it helps..
r0ck
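Applied back to the original script, the same idea means writing each source key to its own destination key instead of reusing the single new_obj bound to target_prefix. A sketch using the question's variables (the key-derivation rule is just an example):

for filename in keys:
    # Derive the destination key from the target prefix and the source key's relative path.
    dest_key = target_prefix + filename[len(source_prefix):]
    body = source_s3_r.Object(source_bucket, filename).get()['Body'].read()
    dest_s3_r.Object(target_bucket, dest_key).put(Body=body)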
