I want to write a Python script that backs up my Google Drive files, as a bit of fun / learning, but I am stuck. My script below works, except that every file it downloads ends up with its created date and last modified date set to the time of the backup; the original created / modified dates from Google Drive are not preserved.
Here is my script:
from __future__ import print_function
import sys, httplib2, os, datetime, io
from time import gmtime, strftime
from apiclient import discovery
import oauth2client
from oauth2client import client
from oauth2client import tools
from datetime import date
#########################################################################
# Fixing OSX el capitan bug ->AttributeError: 'Module_six_moves_urllib_parse' object has no attribute 'urlencode'
os.environ["PYTHONPATH"] = "/Library/Python/2.7/site-packages"
#########################################################################
CLIENT_SECRET_FILE = 'client_secrets.json'
TOKEN_FILE="drive_api_token.json"
SCOPES = 'https://www.googleapis.com/auth/drive'
APPLICATION_NAME = 'Drive File API - Python'
OUTPUT_DIR=str(date.today())+"_drive_backup"
try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None

def get_credentials():
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, TOKEN_FILE)
    store = oauth2client.file.Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials

def prepDest():
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        return True
    return False

def downloadFile(file_name, file_id, file_createdDate, mimeType, service):
    request = service.files().get_media(fileId=file_id)
    if "application/vnd.google-apps" in mimeType:
        if "document" in mimeType:
            request = service.files().export_media(fileId=file_id, mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
            file_name = file_name + ".docx"
        else:
            request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            file_name = file_name + ".pdf"
    print("Downloading -- " + file_name)
    response = request.execute()
    with open(os.path.join(OUTPUT_DIR, file_name), "wb") as wer:
        wer.write(response)

def listFiles(service):
    def getPage(pageTok):
        return service.files().list(q="mimeType != 'application/vnd.google-apps.folder'",
                                    pageSize=1000, pageToken=pageTok,
                                    fields="nextPageToken, files(id, name, createdDate, mimeType)").execute()
    pT = ''; files = []
    while pT is not None:
        results = getPage(pT)
        pT = results.get('nextPageToken')
        files = files + results.get('files', [])
    return files

def main():
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)
    for item in listFiles(service):
        downloadFile(item.get('name'), item.get('id'), item.get('createdDate'), item.get('mimeType'), service)

if __name__ == '__main__':
    main()
To try to get the created date, you can see in the script above that I request createdDate, which looks like one of the metadata fields I can grab for a file:
https://developers.google.com/drive/v2/reference/files
But I don't know if I am grabbing that metadata correctly, and if so, how I actually assign it to my downloaded file.
EDIT: Really sorry but I didn't specify an OS - this is for a mac.
File v2 createdDate renamed in v3 to createdTime
The File reference you linked is for v2, but your code connects to the v3 service. When I ran your code, which uses createdDate from the v2 API, an error occurred (createdDate was an invalid metadata field).
I switched to the v3 File API, which lists the creation time as createdTime, and was able to retrieve the time without error.
File creation time changeable in Windows only
Linux/Unix does not allow setting a file's creation time, but it does allow changing a file's modified and access times via os.utime() (which requires both). The Drive API provides createdTime and modifiedTime but nothing for an access time (which probably wouldn't make sense there), so the modification time can serve just as well for the access time.
In Windows, the file creation time could be set with win32file.SetFileTime.
Time conversion
Note that the times passed to the timestamp functions below are in seconds since the epoch. The Drive API returns an ISO 8601 string, which we convert to seconds with:
dt = datetime.datetime.strptime(dateTime, "%Y-%m-%dT%H:%M:%S.%fZ")
secs = int(dt.strftime("%s"))
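One caveat: strftime("%s") is not portable and interprets the naive datetime in local time, while the Drive timestamps are UTC (the trailing Z). A sketch of a timezone-safe alternative using only the standard library:

import calendar
import datetime

def iso8601_to_epoch(date_time):
    """Convert a Drive-style ISO 8601 UTC string to seconds since the epoch."""
    dt = datetime.datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S.%fZ")
    # timegm() treats the struct_time as UTC, avoiding local-timezone skew
    return calendar.timegm(dt.timetuple())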
Modifications
Replace all instances of createdDate with createdTime.
In listFiles() > getPage(), add modifiedTime to metadata fields:
def listFiles(service):
    def getPage(pageTok):
        return service.files().list(q="mimeType != 'application/vnd.google-apps.folder'",
                                    pageSize=1000, pageToken=pageTok,
                                    fields="nextPageToken, files(id, name, createdTime, modifiedTime, mimeType)").execute()
In main()'s for-loop, pass modifiedTime to downloadFile():
downloadFile(item.get('name'), item.get('id'), item.get('createdTime'), item.get('modifiedTime'), item.get('mimeType'), service)
In downloadFile(), add modifiedTime to the parameter list after file_createdTime.
Add these functions to set file timestamps:
def dateToSeconds(dateTime):
    return int(datetime.datetime.strptime(dateTime, "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%s"))

def setFileTimestamps(fname, createdTime, modifiedTime):
    ctime = dateToSeconds(createdTime)
    mtime = dateToSeconds(modifiedTime)
    setFileCreationTime(fname, ctime)
    setFileModificationTime(fname, mtime)

def setFileModificationTime(fname, newtime):
    # Set access time to same value as modified time,
    # since Drive API doesn't provide access time
    os.utime(fname, (newtime, newtime))

def setFileCreationTime(fname, newtime):
    """http://stackoverflow.com/a/4996407/6277151"""
    if os.name != 'nt':
        # file creation time can only be changed in Windows
        return
    import pywintypes, win32file, win32con
    wintime = pywintypes.Time(newtime)
    winfile = win32file.CreateFile(
        fname, win32con.GENERIC_WRITE,
        win32con.FILE_SHARE_READ | win32con.FILE_SHARE_WRITE | win32con.FILE_SHARE_DELETE,
        None, win32con.OPEN_EXISTING,
        win32con.FILE_ATTRIBUTE_NORMAL, None)
    win32file.SetFileTime(winfile, wintime, None, None)
    winfile.close()
In downloadFile(), call setFileTimestamps() right after writing the file (as the last line of the function):
def downloadFile(file_name, file_id, file_createdTime, modifiedTime, mimeType, service):
    request = service.files().get_media(fileId=file_id)
    if "application/vnd.google-apps" in mimeType:
        if "document" in mimeType:
            request = service.files().export_media(fileId=file_id, mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
            file_name = file_name + ".docx"
        else:
            request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            file_name = file_name + ".pdf"
    print("Downloading -- " + file_name)
    response = request.execute()
    prepDest()
    fname = os.path.join(OUTPUT_DIR, file_name)
    with open(fname, "wb") as wer:
        wer.write(response)
    setFileTimestamps(fname, file_createdTime, modifiedTime)
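Since the question is about macOS: the functions above will set the modified and access times there but leave the creation (birth) time unchanged. If preserving it matters, one possibility, offered only as an untested sketch, is shelling out to the SetFile utility from Xcode's command line tools; the date format passed to -d is an assumption to verify locally:

import subprocess
import time

def setFileCreationTimeMac(fname, newtime):
    # Assumption: SetFile (Xcode command line tools) is installed and accepts
    # -d with an "MM/DD/YYYY HH:MM:SS" timestamp; verify before relying on it.
    stamp = time.strftime("%m/%d/%Y %H:%M:%S", time.localtime(newtime))
    subprocess.call(["SetFile", "-d", stamp, fname])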
GitHub repo
Related
I am trying to get a list of files (names, dates, etc.) into a CSV file from my Google Drive folder. The main folder has around 15k sub-folders (one level below it), each containing roughly 1-10 files, for a total of about 65k files.
I used the Python code below to create my CSV file. It generates the information for all the sub-folders, but only for about 18k of the individual files within those sub-folders (the most recently uploaded ones).
I am not quite sure why my code is not able to list all the files in those sub-folders. Is there a limit I am hitting that stops me from getting the information for all the files?
Note: the folder I am storing the files in is a shared folder, but I don't think that should affect anything.
import httplib2
import os
from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None
import pandas as pd
# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/drive-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'
CLIENT_SECRET_FILE = 'credentials.json'
APPLICATION_NAME = 'Drive API Python Quickstart'
folder_id = '########' # Set to id of the parent folder you want to list (should be the content folder)
folder_list = []
all_folders = []
file_list = []
def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, 'drive-python-quickstart.json')
    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials

def get_root_folder():  # Gets folder list from original root folder
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)
    results = service.files().list(q="mimeType = 'application/vnd.google-apps.folder' and '" + folder_id + "' in parents",
                                   pageSize=1000, fields="nextPageToken, files(id, mimeType)",
                                   supportsAllDrives=True, includeItemsFromAllDrives=True).execute()
    folders = results.get('files', [])
    if not folders:
        print('No folders found.')
    else:
        for folder in folders:
            id = folder.get('id')
            folder_list.append(id)

def get_all_folders(folder_list):  # Creates list of all sub folders under root, keeps going until no folders underneath
    for folder in folder_list:
        additional_folders = []
        credentials = get_credentials()
        http = credentials.authorize(httplib2.Http())
        service = discovery.build('drive', 'v3', http=http)
        results = service.files().list(
            q="mimeType = 'application/vnd.google-apps.folder' and '" + folder + "' in parents",
            pageSize=1000, fields="nextPageToken, files(id, mimeType)",
            supportsAllDrives=True, includeItemsFromAllDrives=True).execute()
        items = results.get('files', [])
        for item in items:
            id = item.get('id')
            additional_folders.append(id)
        if not additional_folders:
            pass
        else:
            all_folders.extend(additional_folders)
            folder_list = additional_folders
            get_all_folders(folder_list)

def merge():  # Merges sub folder list with full list
    global full_list
    full_list = all_folders + folder_list
    full_list.append(folder_id)

def get_file_list():
    # Runs over each folder generating the file list; for folders with over 1000 files,
    # uses nextPageToken to run additional requests; picks up metadata included in the request
    for folder in full_list:
        credentials = get_credentials()
        http = credentials.authorize(httplib2.Http())
        service = discovery.build('drive', 'v3', http=http)
        page_token = None
        while True:
            results = service.files().list(
                q="'" + folder + "' in parents",
                pageSize=1000,
                fields="nextPageToken, files(name, md5Checksum, mimeType, size, createdTime, modifiedTime, id, parents, trashed)",
                pageToken=page_token, supportsAllDrives=True, includeItemsFromAllDrives=True).execute()
            items = results.get('files', [])
            for item in items:
                name = item['name']
                checksum = item.get('md5Checksum')
                size = item.get('size', '-')
                id = item.get('id')
                mimeType = item.get('mimeType', '-')
                createdTime = item.get('createdTime', 'No date')
                modifiedTime = item.get('modifiedTime', 'No date')
                parents = item.get('parents')
                trashed = item.get('trashed')
                file_list.append([name, checksum, mimeType, size, createdTime, modifiedTime, id, parents, trashed])
            page_token = results.get('nextPageToken', None)
            if page_token is None:
                break
    files = pd.DataFrame(file_list, columns=['file_name', 'checksum_md5', 'mimeType', 'size', 'date_created', 'date_last_modified', 'google_id', 'google_parent_id', 'trashed'])
    files.drop(files[files['trashed'] == True].index, inplace=True)  # removes files which have True listed in trashed; these had been moved to the recycle bin
    foldernumbers = files['mimeType'].str.contains('application/vnd.google-apps.folder').sum()
    filenumbers = (~files['mimeType'].str.contains('application/vnd.google-apps.folder')).sum()
    print('Number of folders is: ', foldernumbers)
    print('Number of files is: ', filenumbers)
    files.to_csv('D:/GoogleAPIMetadata.csv', index=False)

if __name__ == '__main__':
    print('Collecting folder id list')
    get_root_folder()
    get_all_folders(folder_list)
    merge()
    print('Generating file metadata list')
    get_file_list()
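As background to the files().list calls in the script above, this is the general Drive v3 pagination pattern, shown as a minimal sketch (the helper name and default fields are illustrative):

def list_all(service, query, fields="nextPageToken, files(id, name, mimeType)"):
    """Collect every result page from files().list for the given query."""
    items = []
    page_token = None
    while True:
        response = service.files().list(
            q=query,
            pageSize=1000,
            fields=fields,
            pageToken=page_token,
            supportsAllDrives=True,
            includeItemsFromAllDrives=True,
        ).execute()
        items.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if page_token is None:
            return items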
I am trying to download the latest version of my software from Google Drive (I don't care about security; I'm the only one running it, and I'm not sharing it), but it says the file is not found when it tries to download it.
Here is my code:
import os
import re
import sys
import functions
from functions import *
import json
from Google import Create_Service
import io
from googleapiclient.http import MediaIoBaseDownload
versionList = []
onlineVersionList = []
version = ""
#Google Api Stuff
CLIENT_SECRET_FILE = 'client_secret_GoogleCloud.json'
API_NAME = 'drive'
API_VERSION = 'v3'
SCOPES = ['https://www.googleapis.com/auth/drive', 'https://www.googleapis.com/auth/drive.install']
service = Create_Service(CLIENT_SECRET_FILE, API_NAME, API_VERSION, SCOPES)
#Searches for the highest version on google drive
page_token = None
while True:
    response = service.files().list(q="mimeType = 'application/vnd.google-apps.folder' and name contains 'Version' "
                                      "and not name contains 'ClockOS'",
                                    fields='nextPageToken, files(id, name)',
                                    pageToken=page_token).execute()
    for file in response.get('files', []):
        # Process change
        onlineVersionList.append(file.get('name'))
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break

for filename in os.listdir('Versions'):
    if filename.startswith('Version'):
        versionList.append(filename)
        print(f"Loaded {filename}")

def major_minor_micro(version):
    major, minor, micro = re.search('(\d+)\.(\d+)\.(\d+)', version).groups()
    return int(major), int(minor), int(micro)

def major_minor(version):
    major, minor, micro = re.search('(\d+)\.(\d+)\.(\d+)', version).groups()
    return int(major), int(minor)

def major(version):
    major, minor, micro = re.search('(\d+)\.(\d+)\.(\d+)', version).groups()
    return int(major)

if versionType() == "stable":
    latest = str(max(versionList, key=major))
    onlineLatest = str(max(onlineVersionList, key=major))
elif versionType() == "standard":
    latest = str(max(versionList, key=major_minor))
    onlineLatest = str(max(onlineVersionList, key=major_minor))
elif versionType() == "beta":
    latest = str(max(versionList, key=major_minor_micro))
    onlineLatest = str(max(onlineVersionList, key=major_minor_micro))
else:
    print("An error has occurred and a wrong version type was picked.")
    sys.exit()

if str(onlineLatest) > str(latest):
    # Gets the id of the highest version
    page_token = None
    while True:
        response = service.files().list(q=f"mimeType = 'application/vnd.google-apps.folder' and name contains '{onlineLatest}'",
                                        fields='nextPageToken, files(id, name)',
                                        pageToken=page_token).execute()
        for file in response.get('files', []):
            # Process change
            print('Found file id: %s (%s)' % (file.get('name'), file.get('id')))
            onlineVersionID = file.get('name')
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break
    request = service.files().get_media(fileId=onlineVersionID)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

print("Ran", latest)
setInfo("version", latest)
os.chdir(("Versions/" + latest))
cwd = os.getcwd()
sys.path.append(cwd)
import main
And I get this error after I log into Google:
googleapiclient.errors.HttpError:
<HttpError 404 when requesting (link to file)?alt=media returned "File not found: Version 0.1.1.". Details: "[
{'domain': 'global', 'reason': 'notFound', 'message': 'File not found: Version 0.1.1.',
'locationType': 'parameter', 'location': 'fileId'}
]">
It clearly finds the file, as it returns the name I gave it, so why won't it download? Does anyone know how to fix this?
I already gave it the scopes it needs, added my account as a tester, and the file is stored in the same Google account as the one I log into.
From "It clearly finds it, as it returns the name I gave it", I thought that you might be able to retrieve the file. Looking at your script, the file name is being used as the file ID: onlineVersionID = file.get('name') is later passed to request = service.files().get_media(fileId=onlineVersionID). So in this case, how about the following modification?
From:
onlineVersionID = file.get('name')
To:
onlineVersionID = file.get('id')
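With that change, the download section of the script above would look roughly like this (a sketch that reuses the service, onlineLatest, io and MediaIoBaseDownload names already present in the question):

onlineVersionID = None
page_token = None
while True:
    response = service.files().list(
        q=f"mimeType = 'application/vnd.google-apps.folder' and name contains '{onlineLatest}'",
        fields='nextPageToken, files(id, name)',
        pageToken=page_token).execute()
    for file in response.get('files', []):
        print('Found file id: %s (%s)' % (file.get('name'), file.get('id')))
        onlineVersionID = file.get('id')  # use the ID, not the name
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break

request = service.files().get_media(fileId=onlineVersionID)
fh = io.BytesIO()
downloader = MediaIoBaseDownload(fh, request)
done = False
while not done:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))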
I have a Google Cloud Function which needs to connect to a URL, get data in the form of CSV files, and store them in a bucket; this is what the Python code below does.
When I test the function it deploys successfully, but it is not working at all. When I checked the log, it gives the below-mentioned error:
favt_LnT_acn_blackline_data_pull_func43jttmffma0g Invalid constructor input for AccessSecretVersionRequest: 'projects/gcp-favt-acn-rpt-dev/secrets/blackline_api_key/versions/latest'
Please see the code below and suggest a fix.
Thanks,
Vithal
import base64
import logging
import requests
#import pandas as pd
#from pandas import json_normalize
import json
import os
import datetime
from datetime import datetime as dt
import pytz
from google.cloud import storage
from google.cloud import secretmanager
def delete_and_upload_blob(landing_bucket_name,
                           source_file_name,
                           landing_blob_name,
                           retention_bucket_name,
                           file_retention_flag,
                           retn_file_suffix,
                           rpt_last_run_file):
    storage_client = storage.Client()
    bucket = storage_client.bucket(landing_bucket_name)
    blob = bucket.blob(landing_blob_name)
    rpt_last_run_blob = bucket.blob('some.csv')
    retention_bucket = storage_client.bucket(retention_bucket_name)

    if blob.exists(storage_client):
        # Delete the old file
        blob.delete()
        print('File {} is deleted from Cloud Storage before Upload'.format(landing_blob_name))
    else:
        print('No Such File Exists in Storage Bucket to Delete. So, proceeding with Upload')

    # Upload new one
    blob.upload_from_filename(source_file_name)
    print("File {} uploaded to Bucket {} With Name {}.".format(source_file_name, bucket, landing_blob_name))

    if file_retention_flag == 'Y':
        # Copy the last file of the day to retention bucket
        new_file_name = retn_file_suffix + '_' + landing_blob_name
        blob_copy = bucket.copy_blob(blob, retention_bucket, new_file_name)
        print('File {} is copied to Retention Bucket {}'.format(new_file_name, retention_bucket))

    if rpt_last_run_blob.exists(storage_client):
        # Delete the old file
        rpt_last_run_blob.delete()
        print('File {} is deleted from Cloud Storage before Upload'.format(rpt_last_run_blob))
    else:
        print('No Such File Exists in Storage Bucket to Delete. So, proceeding with Upload')

    # Upload new one
    rpt_last_run_blob.upload_from_filename(rpt_last_run_file)
    print("File {} uploaded to Bucket {} With Name {}.".format(rpt_last_run_file, bucket, 'Reports_Latest_Run_time.csv'))

def api_request():
    et = pytz.timezone("US/Eastern")
    current_et_time = dt.now().astimezone(et)
    print('Current ET Time:', current_et_time)
    pt = pytz.timezone("US/Pacific")
    ut = pytz.timezone("UTC")

    blackline_base_url = "https://....com"
    blackline_sts_url = blackline_base_url + "/authorize/connect/token"

    project_id = 'gcp-favt-acn-dev'
    secret_id = '###_api_key'
    secret_client = secretmanager.SecretManagerServiceClient()
    secret_name = secret_client.secret_version_path(project_id, secret_id, 'latest')
    secret_resp = secret_client.access_secret_version(secret_name)
    api_key = secret_resp.payload.data.decode('UTF-8')

    grant_type = 'password'
    scope = '####'
    username = '####'
    payload = 'grant_type=' + grant_type + '&scope=' + scope + '&username=' + username + '&password=' + api_key
    sts_headers = {'Authorization': 'Basic dXBzOk5KXXx2VENsSiEtRw==',
                   'Content-Type': 'application/x-www-form-urlencoded',
                   'Cookie': 'BLSIAPPEN=!bpJj4AOTHPcaqipWtDI6FrozN629M9xYLA/sbM1DWVH+jjuY5fgHVMACha2rIapXRoB7CcqnlaHgBw=='}
    response = requests.request("POST", blackline_sts_url, headers=sts_headers, data=payload)

    if response.ok:
        sts_response = response.json()
        access_token = sts_response['access_token']
        print(access_token)
        blackline_rpt_submit_url = blackline_base_url + '/api/queryruns'
        rpt_payload = ''
        blackline_rpt_api_headers = {'Authorization': 'Bearer {}'.format(access_token),
                                     'Content-Type': 'text/plain'}
        rpt_resp = requests.request("GET", blackline_rpt_submit_url, headers=blackline_rpt_api_headers, data=rpt_payload)
        print(rpt_resp.text)
        jl = json.loads(rpt_resp.text)

        reports_list = []
        rprts_filename = "tmp_rprts.csv"
        rprts_full_path = os.path.join("/tmp", rprts_filename)
        with open(rprts_full_path, 'w') as f:
            f.write('ReportName,ReportLastRunTime' + '\n')

        hrs = -2
        hrs_to_subtract = datetime.timedelta(hours=hrs)
        two_hrs_ago_time = current_et_time + hrs_to_subtract
        #print(two_hrs_ago_time)#latest_rpt_check_time)
        frmtd_curr_time = two_hrs_ago_time.strftime('%Y-%m-%d %H:%M:%S')
        latest_rpt_check_time = dt.strptime(frmtd_curr_time, '%Y-%m-%d %H:%M:%S')
        print("Latest Report Check Time:", latest_rpt_check_time)

        for each in jl:
            strpd_time = dt.strptime(each['endTime'][0:19], '%Y-%m-%dT%H:%M:%S')
            #print(strpd_time)
            pt_localize = pt.localize(strpd_time)
            #print(pt_localize)
            et_time = pt_localize.astimezone(et)
            #print(et_time)
            frmtd_et_time = et_time.strftime('%Y-%m-%d %H:%M:%S')
            #print(frmtd_et_time)
            cnvrted_endTime = dt.strptime(frmtd_et_time, '%Y-%m-%d %H:%M:%S')
            #print("Report LastRun EndTime:", cnvrted_endTime)
            ut_time = pt_localize.astimezone(ut)
            frmtd_ut_time = ut_time.strftime('%Y-%m-%d %H:%M:%S')
            if cnvrted_endTime > latest_rpt_check_time:
                reports_list.append({each['name']: each['exportUrls'][0]["url"]})
                rpt_last_run = each['name'] + ',' + frmtd_ut_time
                print(rpt_last_run)
                with open(rprts_full_path, 'a') as f:
                    f.write(rpt_last_run + '\n')
                retn_file_suffix = each['endTime'][0:10]
                #print(retn_file_suffix)
                rpt_run_hr = cnvrted_endTime.hour
                #print(rpt_run_hr)
        #############
        print(reports_list)

        for report in reports_list:
            for k in report:
                print(report[k])
                report_fetch_url = blackline_base_url + '/' + report[k]
                print('Report Fetch URL: {}'.format(report_fetch_url))
                filename = "temp_file.csv"
                full_path = os.path.join("/tmp", filename)
                rpt_data = requests.request("GET", report_fetch_url, headers=blackline_rpt_api_headers)
                print(rpt_data.text)
                with open(full_path, 'wb') as tmp_file:
                    tmp_file.write(rpt_data.content)

                # Upload it to Cloud Storage
                landing_bucket_name = "####_dev_landing_bkt"  # CHANGE ME
                source_file_name = os.path.join(full_path)
                rpt_last_run_file = os.path.join(rprts_full_path)
                landing_blob_name = '##.csv'  # CHANGE ME
                retention_bucket_name = '####_dev_retention_bkt'
                print('file retention check')
                if (rpt_run_hr >= 22):
                    file_retention_flag = 'Y'
                else:
                    file_retention_flag = 'N'
                print(file_retention_flag)
                delete_and_upload_blob(landing_bucket_name,
                                       source_file_name,
                                       landing_blob_name,
                                       retention_bucket_name,
                                       file_retention_flag,
                                       retn_file_suffix,
                                       rpt_last_run_file)
                # Remove the temp file after it is uploaded to Cloud Storage to avoid OOM issues with the Cloud Function.
                os.remove(full_path)
        # Remove the tmp file after upload
        os.remove(rprts_full_path)

#def pacific_to_eastern_conversion(pacific_time, eastern_time):
def main(event, context):
    try:
        if 'data' in event:
            name = base64.b64decode(event['data']).decode('utf-8')
        else:
            name = 'World'
        print('Hello {}'.format(name))
        api_request()
    except Exception as e:
        logging.error(e)
The approach you are using will work for Cloud Run, but it won't work for Cloud Functions.
To make use of secrets in Google Cloud Functions, these are the steps:
Make sure the function's runtime service account has been granted access to the secret: to use Secret Manager with Cloud Functions, assign the roles/secretmanager.secretAccessor role to the service account associated with your function.
Make the secret accessible to the function. This can be done using either the Google Cloud Console or the gcloud command-line tool.
I exposed the secret as an environment variable (named "api_key") and accessed it in the code as shown below:
import os
api_key = os.environ.get('api_key')
I hope this answers your question.
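A minimal sketch of reading that variable inside the function, failing fast if it is missing (the helper name and error handling are illustrative):

import os

def get_api_key():
    # Assumes the secret was exposed to the function as the "api_key" environment variable.
    api_key = os.environ.get('api_key')
    if not api_key:
        raise RuntimeError('The "api_key" environment variable is not set for this function')
    return api_key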
Your Cloud Functions service account doesn't have access to Secret Manager. Grant your Cloud Functions service account the accessor role on the secret, or on the project (not recommended).
If you don't set a custom service account on your Cloud Functions (which is also not a good practice), the App Engine default service account is used. It follows the pattern <ProjectID>@appspot.gserviceaccount.com
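Once the service account has roles/secretmanager.secretAccessor, the Secret Manager call in api_request() can stay; here is a hedged sketch of that call, assuming a 2.x google-cloud-secret-manager client, where the request is passed as a dict rather than a positional string:

from google.cloud import secretmanager

def read_secret(project_id, secret_id):
    """Read the latest version of a secret; pass the project and secret IDs from the question."""
    client = secretmanager.SecretManagerServiceClient()
    name = f"projects/{project_id}/secrets/{secret_id}/versions/latest"
    # 2.x clients expect keyword/request-style arguments (an assumption about the installed version)
    response = client.access_secret_version(request={"name": name})
    return response.payload.data.decode("UTF-8")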
I've already built a working service connection to the Drive API, and I'm creating export URLs to download each sheet in a spreadsheet as a CSV file by sending requests with Google's AuthorizedSession class. For some reason, only a portion of the CSV files come back correct; the others contain broken HTML. When I send a single request, the sheet always comes back correct, but when I loop through the sheets and start sending requests, things start to break. I've identified that there's a problem with how I'm passing the credentials this way, but I'm not sure if I'm using AuthorizedSession correctly. Can anyone help me figure this one out?
from googleapiclient.discovery import build
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
import re
import shutil
import urllib.parse
CLIENT_SECRET_FILE = "client_secret.json"
API_NAME = "sheets"
API_VERSION = "v4"
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
SPREADSHEET_ID = "Spreadsheet ID goes here"
print(CLIENT_SECRET_FILE, API_NAME, API_VERSION, SCOPES, sep="-")
cred = service_account.Credentials.from_service_account_file(
    CLIENT_SECRET_FILE, scopes=SCOPES
)

try:
    service = build(API_NAME, API_VERSION, credentials=cred)
    print(API_NAME, "service created successfully")

    result = service.spreadsheets().get(spreadsheetId=SPREADSHEET_ID).execute()
    export_url = re.sub("\/edit$", "/export", result["spreadsheetUrl"])
    authed_session = AuthorizedSession(cred)

    for sheet in result["sheets"]:
        sheet_name = sheet["properties"]["title"]
        params = {"format": "csv", "gid": sheet["properties"]["sheetId"]}
        query_params = urllib.parse.urlencode(params)
        url = export_url + "?" + query_params
        response = authed_session.get(url)
        file_path = "./Downloads/" + sheet_name + ".csv"
        with open(file_path, "wb") as csv_file:
            csv_file.write(response.content)
        print("Downloaded sheet: " + sheet_name)
    print("Downloads complete")
except Exception as e:
    print("Unable to connect")
    print(e)
This code should get you a Sheets service:
"""Hello sheets."""
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
SCOPES = ['"https://www.googleapis.com/auth/drive.readonly']
KEY_FILE_LOCATION = '<REPLACE_WITH_JSON_FILE>'
VIEW_ID = '<REPLACE_WITH_VIEW_ID>'
def initialize_sheet():
"""Initializes an sheetservice object.
Returns:
An authorized sheetservice object.
"""
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
sheet= build('sheet', 'v4', credentials=credentials)
return sheet
If you use the same Sheets service built by this method, then you shouldn't have any issues looping.
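For instance, a short sketch of reusing that single service object across a loop (the spreadsheet ID and range are placeholders):

service = initialize_sheet()

# Reuse the one service object for every request in the loop.
spreadsheet = service.spreadsheets().get(spreadsheetId='<SPREADSHEET_ID>').execute()
for sheet in spreadsheet.get('sheets', []):
    title = sheet['properties']['title']
    values = service.spreadsheets().values().get(
        spreadsheetId='<SPREADSHEET_ID>', range=title).execute()
    print(title, len(values.get('values', [])), 'rows')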
I think your use of authed_session = AuthorizedSession(cred) and response = authed_session.get(url) is correct. In your situation, however, the number of requests sent in a short time might be large, and this might be the reason for your issue. So, as a simple modification, how about the following?
From:
for sheet in result["sheets"]:
    sheet_name = sheet["properties"]["title"]
    params = {"format": "csv", "gid": sheet["properties"]["sheetId"]}
    query_params = urllib.parse.urlencode(params)
    url = export_url + "?" + query_params
    response = authed_session.get(url)
    file_path = "./Downloads/" + sheet_name + ".csv"
    with open(file_path, "wb") as csv_file:
        csv_file.write(response.content)
    print("Downloaded sheet: " + sheet_name)
To:
for sheet in result["sheets"]:
    sheet_name = sheet["properties"]["title"]
    params = {"format": "csv", "gid": sheet["properties"]["sheetId"]}
    query_params = urllib.parse.urlencode(params)
    url = export_url + "?" + query_params
    response = authed_session.get(url)
    file_path = "./Downloads/" + sheet_name + ".csv"
    with open(file_path, "wb") as csv_file:
        csv_file.write(response.content)
    print("Downloaded sheet: " + sheet_name)
    time.sleep(3)  # <--- Added. Please adjust the value of 3 for your actual situation.
In this case, please use import time.
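If a fixed delay is not enough, a slightly more defensive variant (a sketch only, not part of the original answer) retries a sheet whenever the response does not look like CSV, backing off between attempts; the content-type check is an assumption about what the export endpoint returns:

import time

def fetch_csv_with_retry(authed_session, url, attempts=4, base_delay=2.0):
    """Fetch an export URL, retrying with exponential backoff if HTML comes back."""
    for attempt in range(attempts):
        response = authed_session.get(url)
        content_type = response.headers.get("Content-Type", "")
        if response.status_code == 200 and "text/csv" in content_type:
            return response.content
        time.sleep(base_delay * (2 ** attempt))  # back off before retrying
    raise RuntimeError("Could not download CSV from " + url)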
I'm using a combination of the GCS python SDK and google API client to loop through a version-enabled bucket and download specific objects based on metadata.
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
def downloadepoch_objects():
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()

    for item in response['items']:
        if item['metadata']['epoch'] == restore_epoch:
            print(item['bucket'])
            print(item['name'])
            print(item['metadata']['epoch'])
            print(item['updated'])
            blob = source_bucket.blob(item['name'])
            blob.download_to_filename(
                '/Users/admin/git/data-processing/{}'.format(item))

downloadepoch_objects()
The above function works properly for a blob that is not within a directory (gs://bucketname/test1.txt), as the item that gets passed in is simply test1.txt. The issue I am running into is when trying to download files from a complex directory tree (gs://bucketname/nfs/media/docs/test1.txt); the item that gets passed is nfs/media/docs/test1.txt. Is it possible to have the .download_to_filename() method create directories if they are not present?
Below is the working solution. I ended up stripping the path away from the object name and creating the directory structure on the fly. A better way might be, as @Brandon Yarbrough suggested, using 'prefix + response['prefixes'][0]', but I couldn't quite figure that out. Hope this helps others out.
#!/usr/local/bin/python3
from google.cloud import storage
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json
import os
import pathlib

bucket_name = 'test-bucket'
restore_epoch = '1519189202'
restore_location = '/Users/admin/data/'

credentials = GoogleCredentials.get_application_default()
service = discovery.build('storage', 'v1', credentials=credentials)
storage_client = storage.Client()
source_bucket = storage_client.get_bucket(bucket_name)

def listall_objects():
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()
    print(json.dumps(response, indent=2))

def listname_objects():
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()

    for item in response['items']:
        print(item['name'] + ' Uploaded on: ' + item['updated'] +
              ' Epoch: ' + item['metadata']['epoch'])

def downloadepoch_objects():
    request = service.objects().list(
        bucket=bucket_name,
        versions=True
    )
    response = request.execute()

    try:
        for item in response['items']:
            if item['metadata']['epoch'] == restore_epoch:
                print('Downloading ' + item['name'] + ' from ' +
                      item['bucket'] + '; Epoch= ' + item['metadata']['epoch'])
                print('Saving to: ' + restore_location)
                blob = source_bucket.blob(item['name'])
                path = pathlib.Path(restore_location + r'{}'.format(item['name'])).parent
                if os.path.isdir(path):
                    blob.download_to_filename(restore_location + '{}'.format(item['name']))
                    print('Download complete')
                else:
                    os.mkdir(path)
                    blob.download_to_filename(restore_location + '{}'.format(item['name']))
                    print('Download complete')
    except Exception:
        pass

# listall_objects()
# listname_objects()
downloadepoch_objects()
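As a side note on the directory handling above, a more robust variant (a sketch, not part of the original answer) uses pathlib to create any missing intermediate directories in one call:

import pathlib

def ensure_parent_dir(local_path):
    """Create all missing parent directories for a local file path."""
    pathlib.Path(local_path).parent.mkdir(parents=True, exist_ok=True)

# Usage inside the download loop:
# local_path = restore_location + item['name']
# ensure_parent_dir(local_path)
# blob.download_to_filename(local_path)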
GCS does not have a notion of "directories," although tools like gsutil do a good job of pretending for convenience. If you want all of the objects under the "nfs/media/docs/" path, you can specify that as a prefix, like so:
request = service.objects().list(
    bucket=bucket_name,
    versions=True,
    prefix='nfs/media/docs/',  # Only show objects beginning like this
    delimiter='/'              # Consider this character a directory marker.
)
response = request.execute()
subdirectories = response['prefixes']
objects = response['items']
Because of the prefix parameter, only objects that begin with 'nfs/media/docs' will be returned in response['items']. Because of the delimiter parameter, "subdirectories" will be returned in response['prefixes']. You can get more details in the Python documentation of the objects.list method.
If you were to use the newer google-cloud Python library, which I'd recommended for new code, the same call would look pretty similar:
from google.cloud import storage

client = storage.Client()
bucket = client.bucket(bucket_name)
iterator = bucket.list_blobs(
    versions=True,
    prefix='nfs/media/docs/',
    delimiter='/'
)
subdirectories = iterator.prefixes
objects = list(iterator)
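Building on that listing call, here is a minimal sketch of downloading every object under the prefix while recreating its "directory" structure locally (the destination root is illustrative):

import os
from google.cloud import storage

def download_prefix(bucket_name, prefix, dest_root):
    """Download every blob under `prefix`, mirroring its path under dest_root."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    for blob in bucket.list_blobs(prefix=prefix):
        if blob.name.endswith('/'):
            continue  # skip zero-byte "folder" placeholder objects
        local_path = os.path.join(dest_root, blob.name)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        blob.download_to_filename(local_path)

# Example: download_prefix('bucketname', 'nfs/media/docs/', './restore')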
The following solution worked for me. It recursively downloads all blobs under a path prefix into a model directory at the project root, while maintaining the folder structure.
Multiple blobs are being downloaded concurrently.
GCS client version
google-cloud-storage==1.41.1
import os
import logging
from datetime import datetime
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor

logger = logging.getLogger(__name__)  # used by the timer decorator below

BUCKET_NAME = "ml-model"
ML_MODEL_NAME = "my-model"  # placeholder; the original assumes this constant is defined elsewhere

def timer(func):
    def time_wrapper(*arg, **kwargs):
        start = datetime.now()
        func(*arg, **kwargs)
        diff = datetime.now() - start
        logger.info(f"{func.__name__} took {diff.seconds} s and {diff.microseconds//1000} ms")
    return time_wrapper

def fetch_environment() -> str:
    env = os.environ.get("environment", "staging")
    return env

def create_custom_folder(dir_name: str):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

def fetch_gcs_credential_file_path():
    return os.environ.get("GCS_CREDENTIAL_FILE_PATH")

class GCS:
    def __init__(self):
        cred_file_path = fetch_gcs_credential_file_path()
        self.client = storage.Client.from_service_account_json(cred_file_path)
        self.bucket = self.client.bucket(BUCKET_NAME)

    def download_blob(self, blob):
        filename = blob.name.replace(self.path_prefix, '')
        delimiter_based_splits = filename.split('/')
        if len(delimiter_based_splits) > 1:
            dir_name = "model/" + "/".join(delimiter_based_splits[: len(delimiter_based_splits) - 1])
            create_custom_folder(dir_name)
            blob.download_to_filename(f"{dir_name}/{delimiter_based_splits[-1]}")
        else:
            blob.download_to_filename("model/" + filename)

    @timer
    def download_blobs_multithreaded(self, prefix: str):
        '''
        CREATE FOLDER IF NOT EXISTS
        '''
        create_custom_folder("model")

        blobs = self.bucket.list_blobs(prefix=prefix)
        self.path_prefix = prefix
        with ThreadPoolExecutor() as executor:
            executor.map(self.download_blob, blobs)

def download_model():
    env = fetch_environment()
    folder_path_prefix = f"ml/{env}/{ML_MODEL_NAME}/v1/tf-saved-model/"
    gcs = GCS()
    gcs.download_blobs_multithreaded(folder_path_prefix)

if __name__ == '__main__':
    download_model()