I want to download large files from Google Drive using Python.
And I did this using the code below:
import os
import pickle
import requests
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

SCOPES = ['https://www.googleapis.com/auth/drive.file']

class DriveAPI:

    def __init__(self):
        self.creds = None
        if os.path.exists('token.pickle'):
            with open('token.pickle', 'rb') as token:
                self.creds = pickle.load(token)
        if not self.creds or not self.creds.valid:
            if self.creds and self.creds.expired and self.creds.refresh_token:
                self.creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
                self.creds = flow.run_local_server(port=0)
            with open('token.pickle', 'wb') as token:
                pickle.dump(self.creds, token)
        self.service = build('drive', 'v3', credentials=self.creds)
        results = self.service.files().list(pageSize=100, fields='files(id,name,createdTime)').execute()
        items = results.get('files', [])

    def download_file_from_google_drive(self, id, destination):
        def get_confirm_token(response):
            for key, value in response.cookies.items():
                if key.startswith('download_warning'):
                    return value
            return None

        def save_response_content(response, destination):
            CHUNK_SIZE = 32768
            with open(destination, "wb") as f:
                for chunk in response.iter_content(CHUNK_SIZE):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)

        URL = "https://docs.google.com/uc?export=download"
        session = requests.Session()
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)
        if token:
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
        save_response_content(response, destination)

if __name__ == "__main__":
    obj = DriveAPI()
    f_id = "File ID"
    file_name = "File Name"
    obj.service.permissions().create(body={"role": "reader", "type": "anyone"}, fileId=f_id).execute()
    obj.download_file_from_google_drive(f_id, file_name)
Using the code above, I was able to download files as large as 2 GB, and it worked for a certain period of time, about two months. But now I'm unable to download large files.
If I run this code, only a 2.2 KB file is downloaded.
And no errors are printed in the terminal.
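The 2.2 KB file is most likely an HTML page rather than your data: Google periodically changes the virus-scan confirmation flow on the docs.google.com/uc endpoint, so the download_warning cookie trick can silently stop working. Since the class already builds an authenticated Drive service, a more robust option is to download through the API itself. A minimal sketch, assuming the authorized account can read the file:

import io
from googleapiclient.http import MediaIoBaseDownload

def download_via_api(service, file_id, destination):
    # Stream the file to disk in 32 MB chunks via the Drive v3 API.
    request = service.files().get_media(fileId=file_id)
    with io.FileIO(destination, 'wb') as fh:
        downloader = MediaIoBaseDownload(fh, request, chunksize=32 * 1024 * 1024)
        done = False
        while not done:
            status, done = downloader.next_chunk()
            print("Downloaded {:d}%".format(int(status.progress() * 100)))

Note that the drive.file scope in the code only covers files the app itself created or opened; for arbitrary files you may need a broader scope such as https://www.googleapis.com/auth/drive.readonly.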
Related
I have a list of Google Drive file links, about 300 PDF files, which I have to download.
So what I am trying to do is, using Python's requests library, send requests to the Google server and fetch the files.
After 30 to 36 file downloads, Google blocks my requests and returns:
We're sorry... but your computer or network may be sending automated queries. To protect our users, we can't process your request right now.
I am using the following code:
import requests

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    if response.status_code != 200:
        print(response.status_code)
        return response.status_code
    print('downloading ' + destination)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        i = 0
        for chunk in response.iter_content(CHUNK_SIZE):
            print(str(i) + '%')
            i = i + 1
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    print('downloaded ' + destination)

if __name__ == "__main__":
    file_id = 'file id'
    destination = file_id + '.pdf'
    download_file_from_google_drive(file_id, destination)
I am iterating over my list, calling download_file_from_google_drive for each file.
So, can I bypass this security check?
I tried using a VPN, which changes my IP address, but nothing works.
After about 1 hour, downloading works again.
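There is no supported way to bypass the abuse check, but spacing the requests out usually avoids triggering it. A rough sketch of a throttled loop with exponential backoff; the 20-second base delay and the file_ids list are assumptions to tune for your situation:

import time

def download_all(file_ids, base_delay=20, max_retries=5):
    for file_id in file_ids:
        for attempt in range(max_retries):
            status = download_file_from_google_drive(file_id, file_id + '.pdf')
            if status is None:  # the function only returns a value on failure
                break
            # back off before retrying: 20 s, 40 s, 80 s, ...
            time.sleep(base_delay * 2 ** attempt)
        time.sleep(base_delay)  # pause between files to stay under the rate limit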
I'm following this great blog, File Handling in SharePoint, in order to upload files to SharePoint Online.
I define a method that uploads the file directly if it is smaller than 4 MB; otherwise I have to upload it in chunks. My method works fine for uploading any type of file except CSV and Excel.
Here is my function definition:
import io
import sys
import msal
import requests
import pandas as pd
from oauthlib.oauth2 import MobileApplicationClient
from requests_oauthlib import OAuth2Session

def save_pd_as_csv(self, df: pd.DataFrame, root_folder: str, file: str, *args: list, **kwargs: dict) -> None:
    client_id = 'xxxxxxxxxxxxx'
    tenant_id = 'xxxxxxxxxxxxxx'
    scopes = ['Sites.ReadWrite.All', 'Files.ReadWrite.All']
    auth_url = 'https://login.microsoftonline.com/xxxxxxxxxxx/oauth2/v2.0/authorize'

    # MobileApplicationClient is used to get the Implicit Grant
    oauth = OAuth2Session(client=MobileApplicationClient(client_id=client_id), scope=scopes)
    authorization_url, state = oauth.authorization_url(auth_url)

    # Graph API configuration
    CLIENT_ID = client_id
    TENANT_ID = tenant_id
    AUTHORITY_URL = 'https://login.microsoftonline.com/{}'.format(TENANT_ID)
    RESOURCE_URL = 'https://graph.microsoft.com/'
    SHAREPOINT_HOST_NAME = 'xxxxxxxxx'  # URL of SharePoint host without https://
    API_VERSION = 'v1.0'
    USERNAME = 'xxxxxxxxx'  # Office365 user's account username
    PASSWORD = 'xxxxxxxxx'
    SCOPES = ['Sites.ReadWrite.All', 'Files.ReadWrite.All']  # Add other scopes/permissions as needed.

    # Create a public client app, acquire an access token for the user and set the header for API calls
    pca = msal.PublicClientApplication(CLIENT_ID, authority=AUTHORITY_URL)
    token = pca.acquire_token_by_username_password(USERNAME, PASSWORD, SCOPES)
    headers = {'Authorization': 'Bearer {}'.format(token['access_token'])}

    # Get the site ID of the document SharePoint, which is index 1 in the list (starting from 0)
    site_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/sites?search=*', headers=headers).json()['value'][0]['id']
    # List drives in the site team documents
    drive_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/sites/{site_id}/drives?search=*', headers=headers).json()['value'][0]['id']
    # ID of the folder
    item_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/root:/{root_folder}', headers=headers).json()['id']

    # Now create the file
    output = io.BytesIO()
    df.to_csv(output, *args, **kwargs)
    csv_data = output.getvalue()

    # Get the file IDs of the folder's children
    file_ids = requests.get(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}/children', headers=headers).json()['value']
    item_dict = {i["name"]: i["id"] for i in file_ids}

    # Get size of file
    size = sys.getsizeof(df)

    # Push file according to size
    if size <= 4194304:
        if file in item_dict.keys():
            file_id = item_dict[file]
            result = requests.put(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{file_id}/content', headers=headers, data=csv_data)
        else:
            result = requests.put(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}:/{file}:/content', headers=headers, data=csv_data)
        if result.status_code == 200:
            print("File uploaded")
    # Now upload by chunks
    else:
        df.to_csv("temp.csv")
        result = requests.post(
            f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}:/{file}:/createUploadSession',
            headers={'Authorization': 'Bearer ' + token['access_token']},
            json={
                '@microsoft.graph.conflictBehavior': 'replace',
                'description': 'Uploading a large file',
                'fileSystemInfo': {'@odata.type': 'microsoft.graph.fileSystemInfo'},
                'name': file
            }
        )
        upload_url = result.json()['uploadUrl']

        CHUNK_SIZE = 10485760
        chunks = int(size / CHUNK_SIZE) + 1 if size % CHUNK_SIZE > 0 else 0
        with open("temp.csv", 'rb') as fd:
            start = 0
            for chunk_num in range(chunks):
                chunk = fd.read(CHUNK_SIZE)
                bytes_read = len(chunk)
                upload_range = f'bytes {start}-{start + bytes_read - 1}/{size}'
                result = requests.put(
                    upload_url,
                    headers={
                        'Content-Length': str(bytes_read),
                        'Content-Range': upload_range
                    },
                    data=chunk
                )
                result.raise_for_status()
                start += bytes_read
        if result.status_code == 200:
            print("File uploaded to sharepoint")
Then I load a random dataframe whose size is greater than 4 MB, in order to go into the else branch, with this method:

with open('df.pickle', 'rb') as pkl:
    df_list = pickle.load(pkl)

save_pd_as_csv(df_list, root_folder=r"2.PROCESSED", file="history.csv", index=False)

As you can see, I am creating a temporary file temp.csv in order to read it with open() in binary mode ('rb').
Finally, I am able to read only 2 chunks of data (instead of 8) and get the following error:
File "C:\Users\Baptiste\PycharmProjects\union_brokerage\venv\lib\site-packages\requests\models.py", line 1021, in raise_for_status
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 400 Client Error: Bad Request for url:
It seems that at the 3rd chunk read, the value is an empty string.
Why can't we read a CSV just like a normal binary file, and how can I upload such large files in chunks with the Graph API?
Thank you all!
Baptiste
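One likely culprit, offered as an observation rather than something from the post: sys.getsizeof(df) measures the in-memory size of the DataFrame object, not the size of the CSV bytes actually uploaded, so both the chunk count and the total in each Content-Range header are computed from the wrong number. Once temp.csv is exhausted, fd.read() returns an empty bytes object while the declared range keeps growing, and Graph rejects the mismatch with a 400. A minimal sketch of the fix, sizing the upload from the file on disk:

import math
import os

df.to_csv("temp.csv")
size = os.path.getsize("temp.csv")     # size of the bytes actually sent, not sys.getsizeof(df)

CHUNK_SIZE = 10485760
chunks = math.ceil(size / CHUNK_SIZE)  # also avoids getting 0 chunks when size is an exact multiple

with open("temp.csv", "rb") as fd:
    start = 0
    for _ in range(chunks):
        chunk = fd.read(CHUNK_SIZE)
        upload_range = f"bytes {start}-{start + len(chunk) - 1}/{size}"
        # ... PUT the chunk to upload_url as before ...
        start += len(chunk)

For the small-file branch, len(csv_data) gives the equivalent correct size of the in-memory buffer.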
I have a Google Cloud Function which needs to connect to a URL, get data in the form of CSV files, and store them in a bucket; this is what is written in the Python code below.
When I test the function, it compiles successfully but does not work at all. When I checked the log, it gives the below-mentioned error:

favt_LnT_acn_blackline_data_pull_func43jttmffma0g Invalid constructor input for AccessSecretVersionRequest: 'projects/gcp-favt-acn-rpt-dev/secrets/blackline_api_key/versions/latest'

Please find the code below and suggest a fix.
Thanks,
Vithal
import base64
import logging
import requests
#import pandas as pd
#from pandas import json_normalize
import json
import os
import datetime
from datetime import datetime as dt
import pytz
from google.cloud import storage
from google.cloud import secretmanager

def delete_and_upload_blob(landing_bucket_name,
                           source_file_name,
                           landing_blob_name,
                           retention_bucket_name,
                           file_retention_flag,
                           retn_file_suffix,
                           rpt_last_run_file):
    storage_client = storage.Client()
    bucket = storage_client.bucket(landing_bucket_name)
    blob = bucket.blob(landing_blob_name)
    rpt_last_run_blob = bucket.blob('some.csv')
    retention_bucket = storage_client.bucket(retention_bucket_name)

    if blob.exists(storage_client):
        # Delete the old file
        blob.delete()
        print('File {} is deleted from Cloud Storage before Upload'.format(landing_blob_name))
    else:
        print('No Such File Exists in Storage Bucket to Delete. So, proceeding with Upload')

    # Upload new one
    blob.upload_from_filename(source_file_name)
    print("File {} uploaded to Bucket {} With Name {}.".format(source_file_name, bucket, landing_blob_name))

    if file_retention_flag == 'Y':
        # Copy the last file of the day to the retention bucket
        new_file_name = retn_file_suffix + '_' + landing_blob_name
        blob_copy = bucket.copy_blob(blob, retention_bucket, new_file_name)
        print('File {} is copied to Retention Bucket {}'.format(new_file_name, retention_bucket))

    if rpt_last_run_blob.exists(storage_client):
        # Delete the old file
        rpt_last_run_blob.delete()
        print('File {} is deleted from Cloud Storage before Upload'.format(rpt_last_run_blob))
    else:
        print('No Such File Exists in Storage Bucket to Delete. So, proceeding with Upload')

    # Upload new one
    rpt_last_run_blob.upload_from_filename(rpt_last_run_file)
    print("File {} uploaded to Bucket {} With Name {}.".format(rpt_last_run_file, bucket, 'Reports_Latest_Run_time.csv'))

def api_request():
    et = pytz.timezone("US/Eastern")
    current_et_time = dt.now().astimezone(et)
    print('Current ET Time:', current_et_time)
    pt = pytz.timezone("US/Pacific")
    ut = pytz.timezone("UTC")

    blackline_base_url = "https://....com"
    blackline_sts_url = blackline_base_url + "/authorize/connect/token"

    project_id = 'gcp-favt-acn-dev'
    secret_id = '###_api_key'
    secret_client = secretmanager.SecretManagerServiceClient()
    secret_name = secret_client.secret_version_path(project_id, secret_id, 'latest')
    secret_resp = secret_client.access_secret_version(secret_name)
    api_key = secret_resp.payload.data.decode('UTF-8')

    grant_type = 'password'
    scope = '####'
    username = '####'
    payload = 'grant_type=' + grant_type + '&scope=' + scope + '&username=' + username + '&password=' + api_key
    sts_headers = {'Authorization': 'Basic dXBzOk5KXXx2VENsSiEtRw==',
                   'Content-Type': 'application/x-www-form-urlencoded',
                   'Cookie': 'BLSIAPPEN=!bpJj4AOTHPcaqipWtDI6FrozN629M9xYLA/sbM1DWVH+jjuY5fgHVMACha2rIapXRoB7CcqnlaHgBw=='}
    response = requests.request("POST", blackline_sts_url, headers=sts_headers, data=payload)
    if response.ok:
        sts_response = response.json()
        access_token = sts_response['access_token']
        print(access_token)

        blackline_rpt_submit_url = blackline_base_url + '/api/queryruns'
        rpt_payload = ''
        blackline_rpt_api_headers = {'Authorization': 'Bearer {}'.format(access_token),
                                     'Content-Type': 'text/plain'}
        rpt_resp = requests.request("GET", blackline_rpt_submit_url, headers=blackline_rpt_api_headers, data=rpt_payload)
        print(rpt_resp.text)
        jl = json.loads(rpt_resp.text)

        reports_list = []
        rprts_filename = "tmp_rprts.csv"
        rprts_full_path = os.path.join("/tmp", rprts_filename)
        with open(rprts_full_path, 'w') as f:
            f.write('ReportName,ReportLastRunTime' + '\n')

        hrs = -2
        hrs_to_subtract = datetime.timedelta(hours=hrs)
        two_hrs_ago_time = current_et_time + hrs_to_subtract
        #print(two_hrs_ago_time)
        frmtd_curr_time = two_hrs_ago_time.strftime('%Y-%m-%d %H:%M:%S')
        latest_rpt_check_time = dt.strptime(frmtd_curr_time, '%Y-%m-%d %H:%M:%S')
        print("Latest Report Check Time:", latest_rpt_check_time)

        for each in jl:
            strpd_time = dt.strptime(each['endTime'][0:19], '%Y-%m-%dT%H:%M:%S')
            #print(strpd_time)
            pt_localize = pt.localize(strpd_time)
            #print(pt_localize)
            et_time = pt_localize.astimezone(et)
            #print(et_time)
            frmtd_et_time = et_time.strftime('%Y-%m-%d %H:%M:%S')
            #print(frmtd_et_time)
            cnvrted_endTime = dt.strptime(frmtd_et_time, '%Y-%m-%d %H:%M:%S')
            #print("Report LastRun EndTime:", cnvrted_endTime)
            ut_time = pt_localize.astimezone(ut)
            frmtd_ut_time = ut_time.strftime('%Y-%m-%d %H:%M:%S')
            if cnvrted_endTime > latest_rpt_check_time:
                reports_list.append({each['name']: each['exportUrls'][0]["url"]})
                rpt_last_run = each['name'] + ',' + frmtd_ut_time
                print(rpt_last_run)
                with open(rprts_full_path, 'a') as f:
                    f.write(rpt_last_run + '\n')
                retn_file_suffix = each['endTime'][0:10]
                #print(retn_file_suffix)
                rpt_run_hr = cnvrted_endTime.hour
                #print(rpt_run_hr)

        print(reports_list)
        for report in reports_list:
            for k in report:
                print(report[k])
                report_fetch_url = blackline_base_url + '/' + report[k]
                print('Report Fetch URL: {}'.format(report_fetch_url))
                filename = "temp_file.csv"
                full_path = os.path.join("/tmp", filename)
                rpt_data = requests.request("GET", report_fetch_url, headers=blackline_rpt_api_headers)
                print(rpt_data.text)
                with open(full_path, 'wb') as tmp_file:
                    tmp_file.write(rpt_data.content)

                # Upload it to Cloud Storage
                landing_bucket_name = "####_dev_landing_bkt"  # CHANGE ME
                source_file_name = os.path.join(full_path)
                rpt_last_run_file = os.path.join(rprts_full_path)
                landing_blob_name = '##.csv'  # CHANGE ME
                retention_bucket_name = '####_dev_retention_bkt'

                print('file retention check')
                if rpt_run_hr >= 22:
                    file_retention_flag = 'Y'
                else:
                    file_retention_flag = 'N'
                print(file_retention_flag)

                delete_and_upload_blob(landing_bucket_name,
                                       source_file_name,
                                       landing_blob_name,
                                       retention_bucket_name,
                                       file_retention_flag,
                                       retn_file_suffix,
                                       rpt_last_run_file)

                # Remove the temp file after it is uploaded to Cloud Storage to
                # avoid OOM issues with the Cloud Function.
                os.remove(full_path)

        # Remove the tmp file after upload
        os.remove(rprts_full_path)

#def pacific_to_eastern_conversion(pacific_time, eastern_time):

def main(event, context):
    try:
        if 'data' in event:
            name = base64.b64decode(event['data']).decode('utf-8')
        else:
            name = 'World'
        print('Hello {}'.format(name))
        api_request()
    except Exception as e:
        logging.error(e)
The approach you are using will work for Cloud Run but won't work for Cloud Functions.
To make use of secrets in Google Cloud Functions, these are the steps:
Make sure that the function's runtime service account is granted access to the secret. To use Secret Manager with Cloud Functions, assign the roles/secretmanager.secretAccessor role to the service account associated with your function.
Make the secret accessible to the function. This can be done using either the Google Cloud Console or the gcloud command-line tool.
I exposed the secret as an environment variable (with the name set to "api_key") and accessed it in the code as stated below:
import os
api_key = os.environ.get('api_key')
I hope this answers your question.
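For reference, a sketch of those two steps with the gcloud CLI; the function name and the App Engine default service account are placeholders to adapt:

# grant the function's runtime service account access to the secret
gcloud secrets add-iam-policy-binding blackline_api_key \
    --member='serviceAccount:PROJECT_ID@appspot.gserviceaccount.com' \
    --role='roles/secretmanager.secretAccessor'

# redeploy the function with the secret exposed as the env var "api_key"
gcloud functions deploy YOUR_FUNCTION \
    --set-secrets='api_key=blackline_api_key:latest'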
Your Cloud Functions service account doesn't have access to Secret Manager. Grant your Cloud Functions service account access on the secret, or on the project (not recommended).
If you don't set a custom service account on your Cloud Function (which is also not a good practice), the App Engine default service account is used. Here the pattern is <ProjectID>@appspot.gserviceaccount.com.
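Separately from the permission question, the message "Invalid constructor input for AccessSecretVersionRequest" is what the 2.x google-cloud-secret-manager client raises when the resource name is passed as a positional string. If you keep calling Secret Manager from the code, a minimal sketch of the call shape the 2.x client expects, reusing the project and secret IDs from the question:

from google.cloud import secretmanager

client = secretmanager.SecretManagerServiceClient()
name = client.secret_version_path('gcp-favt-acn-rpt-dev', 'blackline_api_key', 'latest')

# In google-cloud-secret-manager >= 2.0 the name must be passed as a
# keyword argument (or inside a request dict), not positionally.
response = client.access_secret_version(name=name)
api_key = response.payload.data.decode('UTF-8')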
I've built a successful service connection to the Drive API already, and I’m creating export URLs to download each sheet in a Spreadsheet as a CSV file by sending requests with Google’s AuthorizedSession class. For some reason, only a portion of the CSV files come back correct, with the others containing broken HTML. When I send a single request, the sheet always comes back correct, but when I loop through the sheets and start sending requests things start to break. I've identified there's a problem with how I'm passing the credentials this way, but I'm not sure if I'm using AuthorizedSession correctly. Can anyone help me figure this one out?
from googleapiclient.discovery import build
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
import re
import shutil
import urllib.parse

CLIENT_SECRET_FILE = "client_secret.json"
API_NAME = "sheets"
API_VERSION = "v4"
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
SPREADSHEET_ID = "Spreadsheet ID goes here"

print(CLIENT_SECRET_FILE, API_NAME, API_VERSION, SCOPES, sep="-")

cred = service_account.Credentials.from_service_account_file(
    CLIENT_SECRET_FILE, scopes=SCOPES
)

try:
    service = build(API_NAME, API_VERSION, credentials=cred)
    print(API_NAME, "service created successfully")
    result = service.spreadsheets().get(spreadsheetId=SPREADSHEET_ID).execute()
    export_url = re.sub(r"/edit$", "/export", result["spreadsheetUrl"])
    authed_session = AuthorizedSession(cred)
    for sheet in result["sheets"]:
        sheet_name = sheet["properties"]["title"]
        params = {"format": "csv", "gid": sheet["properties"]["sheetId"]}
        query_params = urllib.parse.urlencode(params)
        url = export_url + "?" + query_params
        response = authed_session.get(url)
        file_path = "./Downloads/" + sheet_name + ".csv"
        with open(file_path, "wb") as csv_file:
            csv_file.write(response.content)
        print("Downloaded sheet: " + sheet_name)
    print("Downloads complete")
except Exception as e:
    print("Unable to connect")
    print(e)
This code should get you a Sheets service:
"""Hello sheets."""
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
SCOPES = ['"https://www.googleapis.com/auth/drive.readonly']
KEY_FILE_LOCATION = '<REPLACE_WITH_JSON_FILE>'
VIEW_ID = '<REPLACE_WITH_VIEW_ID>'
def initialize_sheet():
"""Initializes an sheetservice object.
Returns:
An authorized sheetservice object.
"""
credentials = ServiceAccountCredentials.from_json_keyfile_name(
KEY_FILE_LOCATION, SCOPES)
# Build the service object.
sheet= build('sheet', 'v4', credentials=credentials)
return sheet
If you use the same sheet service built by this method, then you shouldn't have any issues looping.
I think that your script's authed_session = AuthorizedSession(cred) and response = authed_session.get(url) are correct. I thought that in your situation, the number of requests in a short time might be large, and this might be the reason for your issue. So, as a simple modification, how about the following?
From:
for sheet in result["sheets"]:
    sheet_name = sheet["properties"]["title"]
    params = {"format": "csv", "gid": sheet["properties"]["sheetId"]}
    query_params = urllib.parse.urlencode(params)
    url = export_url + "?" + query_params
    response = authed_session.get(url)
    file_path = "./Downloads/" + sheet_name + ".csv"
    with open(file_path, "wb") as csv_file:
        csv_file.write(response.content)
    print("Downloaded sheet: " + sheet_name)
To:
for sheet in result["sheets"]:
    sheet_name = sheet["properties"]["title"]
    params = {"format": "csv", "gid": sheet["properties"]["sheetId"]}
    query_params = urllib.parse.urlencode(params)
    url = export_url + "?" + query_params
    response = authed_session.get(url)
    file_path = "./Downloads/" + sheet_name + ".csv"
    with open(file_path, "wb") as csv_file:
        csv_file.write(response.content)
    print("Downloaded sheet: " + sheet_name)
    time.sleep(3)  # <--- Added. Please adjust the value of 3 for your actual situation.
In this case, please also add import time.
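If a fixed delay is not reliable enough, one further variant (my suggestion, not part of the answer above) is to retry each export with exponential backoff whenever the response comes back as HTML instead of CSV:

import time

def fetch_csv_with_retry(authed_session, url, max_retries=5):
    # Retry an export with exponential backoff while Google returns an error page.
    for attempt in range(max_retries):
        response = authed_session.get(url)
        # A successful export is CSV; a rate-limit or error page comes back as HTML.
        if response.ok and not response.content.lstrip().startswith(b"<"):
            return response.content
        time.sleep(2 ** attempt)  # 1 s, 2 s, 4 s, ...
    raise RuntimeError("Export kept failing for " + url)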
I want to create a Python script to back up Google Drive files as a bit of fun / learning, but I am stuck. My script below did work, but it set the last-modified and created dates of all the backed-up files on my local drive to the date they were backed up, and didn't preserve the original created / modified dates as they were on Google Drive.
Here is my script:
from __future__ import print_function
import sys, httplib2, os, datetime, io
from time import gmtime, strftime
from apiclient import discovery
import oauth2client
from oauth2client import client
from oauth2client import tools
from datetime import date

#########################################################################
# Fixing OSX El Capitan bug -> AttributeError: 'Module_six_moves_urllib_parse' object has no attribute 'urlencode'
os.environ["PYTHONPATH"] = "/Library/Python/2.7/site-packages"
#########################################################################

CLIENT_SECRET_FILE = 'client_secrets.json'
TOKEN_FILE = "drive_api_token.json"
SCOPES = 'https://www.googleapis.com/auth/drive'
APPLICATION_NAME = 'Drive File API - Python'
OUTPUT_DIR = str(date.today()) + "_drive_backup"

try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None

def get_credentials():
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, TOKEN_FILE)

    store = oauth2client.file.Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials

def prepDest():
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        return True
    return False

def downloadFile(file_name, file_id, file_createdDate, mimeType, service):
    request = service.files().get_media(fileId=file_id)
    if "application/vnd.google-apps" in mimeType:
        if "document" in mimeType:
            request = service.files().export_media(fileId=file_id,
                mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
            file_name = file_name + ".docx"
        else:
            request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            file_name = file_name + ".pdf"
    print("Downloading -- " + file_name)
    response = request.execute()
    with open(os.path.join(OUTPUT_DIR, file_name), "wb") as wer:
        wer.write(response)

def listFiles(service):
    def getPage(pageTok):
        return service.files().list(q="mimeType != 'application/vnd.google-apps.folder'",
            pageSize=1000, pageToken=pageTok,
            fields="nextPageToken, files(id, name, createdDate, mimeType)").execute()
    pT = ''; files = []
    while pT is not None:
        results = getPage(pT)
        pT = results.get('nextPageToken')
        files = files + results.get('files', [])
    return files

def main():
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)
    for item in listFiles(service):
        downloadFile(item.get('name'), item.get('id'), item.get('createdDate'), item.get('mimeType'), service)

if __name__ == '__main__':
    main()
To try to get the created date, you can see in the script above that I added createdDate, which looks like some of the metadata I can grab from the file:
https://developers.google.com/drive/v2/reference/files
But I don't know if I am grabbing that metadata correctly, and if so, how to actually assign it to my downloaded file.
EDIT: Really sorry, but I didn't specify an OS. This is for a Mac.
File v2 createdDate renamed in v3 to createdTime
The File reference you linked is for v2, but your code connects to the v3 service. When I ran your code, which uses createdDate from the v2 API, an error occurred (createdDate was an invalid metadata field).
I switched to the v3 File API, which lists the creation time as createdTime, and was able to retrieve the time without error.
File creation time changeable in Windows only
Linux/Unix does not allow setting a file's creation time, but it does allow changing the file's modified and access times via os.utime() (which requires both). The Drive API provides createdTime and modifiedTime but nothing for access time (which probably wouldn't make sense there), although the modification time can serve just as well for the access time.
In Windows, the file creation time could be set with win32file.SetFileTime.
Time conversion
Note that the times that are passed to the timestamp functions above are in seconds since epoch. The Drive API returns an ISO 8601 string that we convert to seconds with:
dt = datetime.datetime.strptime(dateTime, "%Y-%m-%dT%H:%M:%S.%fZ")
secs = int(dt.strftime("%s"))
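One caveat worth adding (my note, not part of the original answer): the "%s" format code is a platform extension that works on macOS and Linux but not on Windows. Since the answer also covers Windows, a portable conversion might look like this, treating the Drive timestamp as UTC:

import calendar
import datetime

def date_to_seconds_portable(date_time):
    # Convert a Drive ISO 8601 timestamp such as '2016-01-01T12:00:00.000Z' to epoch seconds.
    dt = datetime.datetime.strptime(date_time, "%Y-%m-%dT%H:%M:%S.%fZ")
    return calendar.timegm(dt.timetuple())  # timegm interprets the tuple as UTC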
Modifications
Replace all instances of createdDate with createdTime.
In listFiles() > getPage(), add modifiedTime to metadata fields:
def listFiles(service):
    def getPage(pageTok):
        return service.files().list(q="mimeType != 'application/vnd.google-apps.folder'",
            pageSize=1000, pageToken=pageTok,
            fields="nextPageToken, files(id, name, createdTime, modifiedTime, mimeType)").execute()
In main()'s for-loop, pass modifiedTime to downloadFile():
downloadFile(item.get('name'), item.get('id'), item.get('createdTime'), item.get('modifiedTime'), item.get('mimeType'), service)
In downloadFile(), add modifiedTime to the parameter list after file_createdTime.
Add these functions to set file timestamps:
def dateToSeconds(dateTime):
    return int(datetime.datetime.strptime(dateTime, "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%s"))

def setFileTimestamps(fname, createdTime, modifiedTime):
    ctime = dateToSeconds(createdTime)
    mtime = dateToSeconds(modifiedTime)
    setFileCreationTime(fname, ctime)
    setFileModificationTime(fname, mtime)

def setFileModificationTime(fname, newtime):
    # Set access time to same value as modified time,
    # since Drive API doesn't provide access time
    os.utime(fname, (newtime, newtime))

def setFileCreationTime(fname, newtime):
    """http://stackoverflow.com/a/4996407/6277151"""
    if os.name != 'nt':
        # file creation time can only be changed in Windows
        return
    import pywintypes, win32file, win32con
    wintime = pywintypes.Time(newtime)
    winfile = win32file.CreateFile(
        fname, win32con.GENERIC_WRITE,
        win32con.FILE_SHARE_READ | win32con.FILE_SHARE_WRITE | win32con.FILE_SHARE_DELETE,
        None, win32con.OPEN_EXISTING,
        win32con.FILE_ATTRIBUTE_NORMAL, None)
    win32file.SetFileTime(winfile, wintime, None, None)
    winfile.close()
In downloadFile(), call setFileTimestamps() right after writing the file (as the last line of the function):
def downloadFile(file_name, file_id, file_createdTime, modifiedTime, mimeType, service):
    request = service.files().get_media(fileId=file_id)
    if "application/vnd.google-apps" in mimeType:
        if "document" in mimeType:
            request = service.files().export_media(fileId=file_id,
                mimeType='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
            file_name = file_name + ".docx"
        else:
            request = service.files().export_media(fileId=file_id, mimeType='application/pdf')
            file_name = file_name + ".pdf"
    print("Downloading -- " + file_name)
    response = request.execute()
    prepDest()
    fname = os.path.join(OUTPUT_DIR, file_name)
    with open(fname, "wb") as wer:
        wer.write(response)
    setFileTimestamps(fname, file_createdTime, modifiedTime)
GitHub repo