I'm attempting to pull email attachments from Outlook and store them in an S3 bucket in AWS. This is one of my first Python projects, and it's proving to be very difficult for me; the code is probably very messy.
# see https://github.com/AzureAD/microsoft-authentication-library-for-python/blob/dev/sample/device_flow_sample.py
# This authenticates for first time login.
# As long as you call acquire_token_silent before you invoke any graph APIs, the tokens will stay up to date.
# The refresh token is good for 90 days, and automatically updates. Once you login, the tokens will
# be updated and stored in the cache (and persisted to a file), and will stay alive more-or-less indefinitely
# (there are some things that can invalidate it on the server side).
from __future__ import with_statement
import io
import sys
import json
import logging
import os
import tarfile
import atexit
from wsgiref import headers
import requests
import msal
import boto3
import base64
from botocore.exceptions import ClientError
import codecs
print('Starting...')
# logging
logging.basicConfig(level=logging.DEBUG) # Enable DEBUG log for entire script
logging.getLogger("msal").setLevel(logging.INFO) # Optionally disable MSAL DEBUG logs
client=boto3.client('secretsmanager')
# config
config = dict(
    authority = "https://login.microsoftonline.com/common",
    client_id = '123456',
    scope = ["Mail.ReadWrite"],
    username = 'username',
    cache_file = client.get_secret_value(SecretId="demo-ms-graph")['SecretBinary'],
    endpoint = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$expand=attachments&$search="hasAttachments:true"'
)
# cache
cache = msal.SerializableTokenCache()
if os.path.exists(config["cache_file"]):
    with tarfile.open('token.cache.tar.gz', "w:gz") as tar:
        tar.add(config["cache_file"])
bts = open('token.cache.tar.gz', 'rb').read()
print("Length before", len(bts))
#sec = client.update_secret(SecretId="demo-ms-graph", SecretBinary=bts)
sec = client.get_secret_value(SecretId="demo-ms-graph")['SecretBinary']
print("Length after", len(sec))
with tarfile.open(fileobj=io.BytesIO(sec), mode='r:gz') as t:
    d = t.extractfile('token.cache')
    # print file content
    print("File content", str(d.read()))
with tarfile.open(fileobj=io.BytesIO(sec), mode='r:gz') as t:
    d = t.extractfile('token.cache')
# app
app = msal.PublicClientApplication(
config["client_id"], authority=config["authority"],
token_cache=cache)
print('Connecting to app..')
# exists?
result = None
accounts = app.get_accounts()
if accounts:
    logging.info("found accounts in the app")
    for a in accounts:
        print(a)
        if a["username"] == config["username"]:
            result = app.acquire_token_silent(config["scope"], account=a)
            break
if result and "access_token" in result:
    # Calling graph using the access token
    graph_data = requests.get(  # Use token to call downstream service
        config["endpoint"],
        headers={'Authorization': 'Bearer ' + result['access_token']},).json()
    #print("Graph API call result: %s" % json.dumps(graph_data, indent=2))
else:
    print(result.get("error"))
    print(result.get("error_description"))
    print(result.get("correlation_id"))  # You may need this when reporting a bug
main = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$expand=attachments&$search="hasAttachments:true"'
response = requests.get(main, headers={'Authorization': 'Bearer ' + result['access_token']})
if response.status_code != 200:
    raise Exception(response.json())
response_json = response.json()
print('Starting upload...')
emails = response_json['value']
s3 = boto3.client('s3')
bucket ='demo-email-app'
for email in emails:
    email_id = email['id']
    subject = email['subject']
    if email['hasAttachments']:
        print(subject)
        attachments = email['attachments']
        for attachment in attachments:
            name = attachment['name']
            fileContent = json.dumps(email, indent=2)
            s3.put_object(Bucket=bucket, Key=name.replace('.', '_') + '.json', Body=fileContent.encode('UTF-8'))
            print('Upload Complete')
            #download_email_attachments(email_id, headers)
print('All uploads complete')
My secret is stored as binary in Secrets Manager, and the script is able to pull the secret and print it fine. I'm running into an error in the # exists? section: "AttributeError: 'NoneType' object has no attribute 'get'".
Am I approaching this the wrong way? I have the token stored in Secrets Manager and am trying to retrieve it for MSAL to use to authenticate my user to the Microsoft Graph API, so I can pull attachments from the Outlook account and store them in an S3 bucket.
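One thing that stands out: the serialized cache pulled from Secrets Manager is never loaded into the MSAL cache with cache.deserialize(), so acquire_token_silent() returns None, and the else branch then calls result.get(...) on None, which is exactly the AttributeError reported. Below is a minimal sketch of the flow I would expect, assuming the secret simply stores the serialized cache as a string (no tar.gz round-trip) and reusing the placeholder client_id, username, and secret name from the question; the device-flow fallback follows the MSAL sample linked at the top of the script.

import atexit
import boto3
import msal

secrets = boto3.client('secretsmanager')
SECRET_ID = "demo-ms-graph"  # secret name taken from the question

# Load the serialized token cache (if any) from Secrets Manager.
cache = msal.SerializableTokenCache()
secret = secrets.get_secret_value(SecretId=SECRET_ID)
if 'SecretString' in secret:
    cache.deserialize(secret['SecretString'])

app = msal.PublicClientApplication(
    '123456',  # client_id placeholder from the question
    authority="https://login.microsoftonline.com/common",
    token_cache=cache)

result = None
accounts = app.get_accounts(username='username')
if accounts:
    result = app.acquire_token_silent(["Mail.ReadWrite"], account=accounts[0])

if not result:
    # No usable cached token: fall back to an interactive device-code login.
    flow = app.initiate_device_flow(scopes=["Mail.ReadWrite"])
    print(flow["message"])  # tells the user where to enter the code
    result = app.acquire_token_by_device_flow(flow)

# Write the (possibly refreshed) cache back to Secrets Manager on exit.
atexit.register(lambda: secrets.update_secret(
    SecretId=SECRET_ID, SecretString=cache.serialize())
    if cache.has_state_changed else None)

if "access_token" not in result:
    raise RuntimeError(result.get("error_description"))

The key points are that the cache has to be deserialized before the PublicClientApplication is built, and that result must be checked for None before calling .get() on it.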
I am having a problem with the Google Cloud Speech API; every time I run the script I get the error
six.raise_from(exceptions.from_grpc_error(exc), exc)
File "<string>", line 3, in raise_from
google.api_core.exceptions.InvalidArgument: 400 RecognitionAudio not set.
It doesn't seem to recognize the RecognitionAudio for some reason. I already checked the API documentation but couldn't solve the problem.
I don't understand the reason for the error, so I'll leave my code here in case anyone knows and can help me. Thanks.
import telebot
import requests
from pydub import AudioSegment
import os
import io
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./chatbot.json"
token = "1233361335"
bot = telebot.TeleBot(token)
downloadAudio = "https://api.telegram.org/file/bot{token}/".format(token = token)
@bot.message_handler(commands=['start'])
def send_welcome(message):
    bot.reply_to(message, "welcome")

@bot.message_handler(content_types=['voice'])
def handlerAudio(message):
    # get audio from telegram
    messageVoice = message.voice
    # get download link
    audioPath = bot.get_file(messageVoice.file_id).file_path
    audioLink = downloadAudio + audioPath
    # download file
    audioFile = requests.get(audioLink)
    audioName = "audio.ogg"
    # save locally
    open(audioName, 'wb').write(audioFile.content)
    # convert format to .WAV
    AudioSegment.from_file(audioName).export("audio.wav", format="wav")
    sound = AudioSegment.from_wav("audio.wav")
    sound = sound.set_channels(1)  # convert to mono
    sound.export("audio.wav", format="wav")
    client = speech.SpeechClient()
    with io.open("audio.wav", 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='pt-BR')
    response = client.recognize(config, audio)
    for result in response.results:
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        #bot.reply_to(message, result.alternatives[0].transcript)
bot.polling()
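"RecognitionAudio not set" means the request reached the API with an empty audio payload, so the first thing I would check is that audio.wav actually contains data at the point where it is read. Here is a debugging sketch that isolates the Speech call from the bot, using the file name and settings from the question and the same enums/types imports (which imply a pre-2.0 google-cloud-speech); passing config and audio as keyword arguments is just a precaution so nothing is silently dropped.

import io

from google.cloud import speech
from google.cloud.speech import enums, types

client = speech.SpeechClient()

with io.open("audio.wav", "rb") as audio_file:
    content = audio_file.read()

# If this prints 0, the problem is the download/conversion step, not the API call.
print("audio bytes:", len(content))

audio = types.RecognitionAudio(content=content)
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=48000,
    language_code="pt-BR")

response = client.recognize(config=config, audio=audio)
for result in response.results:
    print(result.alternatives[0].transcript)

If this standalone call works but the bot still fails, the empty audio is most likely coming from the Telegram download or the pydub conversion rather than from the Speech client.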
I'm trying to send an email with an attachment (ideally multiple attachments) larger than 10 MB and smaller than the 25 MB total limit. The reason I mention 10 MB is that it seems to be the lower bound at which the normal way of attaching files stops working and you get Error 10053.
I've read in the documentation that the best way to do this is the resumable upload method, but I haven't been able to get it to work, nor have I been able to find any good examples in Python. Most of the SO questions on this simply link back to the documentation, which doesn't have a Python example, or their code resulted in other errors.
I'm looking for an explanation in Python because I want to make sure I understand it correctly.
Questions I've looked through:
Attaching a file using Resumable upload w/ Gmail API
Gmail Api resumable upload Rest( attachment larger than 5MB)
using /upload urls with Google API client
How to upload large messages to Gmail
Error 10053 When Sending Large Attachments using Gmail API
Sending email via gmail & python
MIMEMultipart, MIMEText, MIMEBase, and payloads for sending email with file attachment in Python
Code:
import base64
import json
import os
from email import utils, encoders
from email.message import EmailMessage
from email.mime import application, multipart, text, base, image, audio
import mimetypes
from apiclient import errors
from googleapiclient import discovery, http
from google.oauth2 import service_account
def send_email(email_subject, email_body, email_sender='my_service_account@gmail.com', email_to='', email_cc='', email_bcc='', files=None):
    # Getting credentials
    with open(os.environ.get('SERVICE_KEY_PASSWORD')) as f:
        service_account_info = json.loads(f.read())
    # Define which scopes we're trying to access
    SCOPES = ['https://www.googleapis.com/auth/gmail.send']
    # Setting up credentials using the gmail api
    credentials = service_account.Credentials.from_service_account_info(service_account_info, scopes=SCOPES)
    # This allows us to assign an alias account to the message so that the messages aren't coming from 'ServiceDriod-8328balh blah blah'
    delegated_credentials = credentials.with_subject(email_sender)
    # 'Building' the service instance using the credentials we've passed
    service = discovery.build(serviceName='gmail', version='v1', credentials=delegated_credentials)
    # Building out the email
    message = multipart.MIMEMultipart()
    message['to'] = email_to
    message['from'] = email_sender
    message['date'] = utils.formatdate(localtime=True)
    message['subject'] = email_subject
    message['cc'] = email_cc
    message['bcc'] = email_bcc
    message.attach(text.MIMEText(email_body, 'html'))
    for f in files or []:
        mimetype, encoding = mimetypes.guess_type(f)
        # If the extension is not recognized it will return: (None, None)
        # If it's an .mp3, it will return: (audio/mp3, None) (None is for the encoding)
        # For an unrecognized extension we set mimetype to 'application/octet-stream' so it won't return None again.
        if mimetype is None or encoding is not None:
            mimetype = 'application/octet-stream'
        main_type, sub_type = mimetype.split('/', 1)
        # Creating the attachement:
        # This part is used to tell how the file should be read and stored (r, or rb, etc.)
        if main_type == 'text':
            print('text')
            with open(f, 'rb') as outfile:
                attachement = text.MIMEText(outfile.read(), _subtype=sub_type)
        elif main_type == 'image':
            print('image')
            with open(f, 'rb') as outfile:
                attachement = image.MIMEImage(outfile.read(), _subtype=sub_type)
        elif main_type == 'audio':
            print('audio')
            with open(f, 'rb') as outfile:
                attachement = audio.MIMEAudio(outfile.read(), _subtype=sub_type)
        elif main_type == 'application' and sub_type == 'pdf':
            with open(f, 'rb') as outfile:
                attachement = application.MIMEApplication(outfile.read(), _subtype=sub_type)
        else:
            attachement = base.MIMEBase(main_type, sub_type)
            with open(f, 'rb') as outfile:
                attachement.set_payload(outfile.read())
            encoders.encode_base64(attachement)
        attachement.add_header('Content-Disposition', 'attachment', filename=os.path.basename(f))
        message.attach(attachement)
    media_body = http.MediaFileUpload(files[0], chunksize=500, resumable=True)
    print('Uploading large file...')
    body = {'raw': base64.urlsafe_b64encode(message.as_bytes()).decode()}
    message = (service.users().messages().send(userId='me', body=body, media_body=media_body).execute())
Note: Right now, in the MediaFileUpload I'm using files[0] because I'm only using one file for testing and I just wanted to attach one file for now until it works.
Error:
Exception has occurred: ResumableUploadError
<HttpError 400 "Bad Request">
File "C:\Users\CON01599\AppData\Local\Continuum\anaconda3\Lib\site-packages\googleapiclient\http.py", line 927, in next_chunk
raise ResumableUploadError(resp, content)
File "C:\Users\CON01599\AppData\Local\Continuum\anaconda3\Lib\site-packages\googleapiclient\_helpers.py", line 130, in positional_wrapper
return wrapped(*args, **kwargs)
File "C:\Users\CON01599\AppData\Local\Continuum\anaconda3\Lib\site-packages\googleapiclient\http.py", line 822, in execute
_, body = self.next_chunk(http=http, num_retries=num_retries)
File "C:\Users\CON01599\AppData\Local\Continuum\anaconda3\Lib\site-packages\googleapiclient\_helpers.py", line 130, in positional_wrapper
return wrapped(*args, **kwargs)
File "C:\Users\CON01599\Documents\GitHub\pipelines\components\email\send_email.py", line 105, in send_email
message = (service.users().messages().send(userId='me', body=body, media_body=media_body).execute())
Answer:
import base64
import io
import json
import os
from email import utils, encoders
from email.message import EmailMessage
from email.mime import application, multipart, text, base, image, audio
import mimetypes
from apiclient import errors
from googleapiclient import discovery, http
from google.oauth2 import service_account
def get_environment_variables():
    """ Retrieves the environment variables and returns them in
    a dictionary object.
    """
    env_var_dict = {
        'to': os.environ.get('TO'),
        'subject': os.environ.get('SUBJECT'),
        'body': os.environ.get('BODY'),
        'file': os.environ.get('FILE')
    }
    return env_var_dict
def send_email(email_subject, email_body, email_sender='my_service_account@gmail.com', email_to='', email_cc='', email_bcc='', files=None):
    # Pulling in the string value of the service key from the parameter
    with open(os.environ.get('SERVICE_KEY_PASSWORD')) as f:
        service_account_info = json.loads(f.read())
    # Define which scopes we're trying to access
    SCOPES = ['https://www.googleapis.com/auth/gmail.send']
    # Setting up credentials using the gmail api
    credentials = service_account.Credentials.from_service_account_info(service_account_info, scopes=SCOPES)
    # This allows us to assign an alias account to the message so that the messages aren't coming from 'ServiceDriod-8328balh blah blah'
    delegated_credentials = credentials.with_subject(email_sender)
    # 'Building' the service instance using the credentials we've passed
    service = discovery.build(serviceName='gmail', version='v1', credentials=delegated_credentials)
    # Building out the email
    message = multipart.MIMEMultipart()
    message['to'] = email_to
    message['from'] = email_sender
    message['date'] = utils.formatdate(localtime=True)
    message['subject'] = email_subject
    message['cc'] = email_cc
    message['bcc'] = email_bcc
    message.attach(text.MIMEText(email_body, 'html'))
    for f in files or []:
        f = f.strip(' ')
        mimetype, encoding = mimetypes.guess_type(f)
        # If the extension is not recognized it will return: (None, None)
        # If it's an .mp3, it will return: (audio/mp3, None) (None is for the encoding)
        # For an unrecognized extension we set mimetype to 'application/octet-stream' so it won't return None again.
        if mimetype is None or encoding is not None:
            mimetype = 'application/octet-stream'
        main_type, sub_type = mimetype.split('/', 1)
        # Creating the attachement:
        # This part is used to tell how the file should be read and stored (r, or rb, etc.)
        if main_type == 'text':
            print('text')
            with open(f, 'rb') as outfile:
                attachement = text.MIMEText(outfile.read(), _subtype=sub_type)
        elif main_type == 'image':
            print('image')
            with open(f, 'rb') as outfile:
                attachement = image.MIMEImage(outfile.read(), _subtype=sub_type)
        elif main_type == 'audio':
            print('audio')
            with open(f, 'rb') as outfile:
                attachement = audio.MIMEAudio(outfile.read(), _subtype=sub_type)
        elif main_type == 'application' and sub_type == 'pdf':
            with open(f, 'rb') as outfile:
                attachement = application.MIMEApplication(outfile.read(), _subtype=sub_type)
        else:
            attachement = base.MIMEBase(main_type, sub_type)
            with open(f, 'rb') as outfile:
                attachement.set_payload(outfile.read())
            encoders.encode_base64(attachement)
        attachement.add_header('Content-Disposition', 'attachment', filename=os.path.basename(f))
        message.attach(attachement)
    media_body = http.MediaIoBaseUpload(io.BytesIO(message.as_bytes()), mimetype='message/rfc822', resumable=True)
    body_metadata = {}  # no thread, no labels in this example
    try:
        print('Uploading file...')
        response = service.users().messages().send(userId='me', body=body_metadata, media_body=media_body).execute()
        print(response)
    except errors.HttpError as error:
        print('An error occurred when sending the email:\n{}'.format(error))
if __name__ == '__main__':
    env_var_dict = get_environment_variables()
    print("Sending email...")
    send_email(email_subject=env_var_dict['subject'],
               email_body=env_var_dict['body'],
               email_to=env_var_dict['to'],
               files=env_var_dict['file'].split(','))
    print("Email sent!")
The issue you're having here is that your MediaUpload is a single attachment.
Instead of uploading a single attachment as a resumable MediaUpload, you need to upload the entire RFC822 message as a resumable MediaUpload.
In other words:
import ...
...
from io import BytesIO
from googleapiclient.http import MediaIoBaseUpload
SCOPES = [ 'scopes' ]
creds = get_credentials_somehow()
gmail = get_authed_service_somehow()
msg = create_rfc822_message(headers, email_body)
to_attach = get_attachment_paths_from_dir('../reports/tps/memos/2019/04')
add_attachments(msg, to_attach)
media = MediaIoBaseUpload(BytesIO(msg.as_bytes()), mimetype='message/rfc822', resumable=True)
body_metadata = {} # no thread, no labels in this example
resp = gmail.users().messages().send(userId='me', body=body_metadata, media_body=media).execute()
print(resp)
# { "id": "some new id", "threadId": "some new thread id", "labelIds": ["SENT"]}
I pieced this together from your provided code, reviewing this GitHub issue and Google's Inbox-to-Gmail email importer, specifically this bit.
When sending replies to existing messages, you will almost certainly have some sort of metadata that you should provide to help Gmail keep track of your new response and the original conversation. Namely, instead of an empty body parameter, you would pass informative metadata such as
body_metadata = {
    'labelIds': [
        "your label id here",
        "another label id"
    ],
    'threadId': "some thread id you took from the message you're replying to"
}
Other good refs:
API Client's Gmail PyDoc
Actual code used
You mention the attachment being larger than 10 MB, but you don't mention it being smaller than 25 MB: Gmail has a limit that attachments can't be larger than 25 MB in total, so if that is your case, there's simply no way to get this done, as it is beyond Gmail's limits.
The explanation can be found here.
Can you confirm that your attachment is not too large?
I'm trying to download all the media that is sent to my Twilio account and cannot for the life of me figure out how to access the actual images.
from twilio.rest import Client
import requests
from operator import itemgetter
import json
ACCOUNT_SID = "xxxxxxx"
AUTH_TOKEN = "xxxxxxxx"
client = Client(ACCOUNT_SID, AUTH_TOKEN)
# builds a list of messages and media uris
messages = client.messages.list(from_="+19999999999")
msgs = []
for m in messages:
    line = [m.from_, m.to, m.body, m.sid, m.subresource_uris['media']]
    line = [str(x) for x in line]
    msgs.append(line)

# with list of all messages:
msgs = sorted(msgs, key=itemgetter(0))
for m in msgs:
    # get media list for each message that has one, else catch exception
    try:
        medias = client.messages(m[3]).media.list()
        # returns Twilio.Api.V2010.MediaInstance and i'm stuck
        for med in medias:
            print(client.messages(m[3]).media(med.sid).fetch())
    except Exception as e:
        pass
I am just lost and can't find any concrete examples in the documentation. I really can't even tell if I'm close, or waaaaaaaaaaay off. Thanks in advance!
SOLUTION Thanks to philnash
from twilio.rest import Client
import requests
import json
# Find these values at https://twilio.com/user/account
ACCOUNT_SID = "xxxxx"
AUTH_TOKEN = "xxxxxx"
BASE_URL = "https://%s:%s@api.twilio.com" % (ACCOUNT_SID, AUTH_TOKEN)
client = Client(ACCOUNT_SID, AUTH_TOKEN)
# with list of all messages:
messages = client.messages.list(from_="+1999999999")
for m in messages:
    sid = m.sid
    # get media list for each message that has one, else catch exception
    try:
        message = client.messages(sid).fetch()
        print(message.body)
        medias = message.media.list()
        # returns Twilio.Api.V2010.MediaInstance
        for media in medias:
            media_instance = client.messages(sid).media(media.sid).fetch()
            uri = requests.get(BASE_URL + media_instance.uri).json()
            uri2 = requests.get(BASE_URL + uri['uri'].replace('.json', ''))
            with open(media_instance.uri.split("/")[-1].replace(".json", ".png"), "wb") as f:
                f.write(uri2.content)
    except Exception as e:
        print(e)
Twilio developer evangelist here.
When you get the Media URI from the helper library, it is the json representation of the resource and ends in .json. To get the raw resource you need only to strip the .json extension. You can use that URL to download the image.
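For example, a minimal sketch based on the code above (the environment variable names are just placeholders, and the .jpg extension is a guess; check the response's Content-Type header for the real media type):

import os
import requests
from twilio.rest import Client

ACCOUNT_SID = os.environ["TWILIO_ACCOUNT_SID"]
AUTH_TOKEN = os.environ["TWILIO_AUTH_TOKEN"]
client = Client(ACCOUNT_SID, AUTH_TOKEN)

for message in client.messages.list(from_="+19999999999"):
    for media in client.messages(message.sid).media.list():
        # media.uri ends in ".json"; stripping it gives the raw media URL.
        raw_url = "https://api.twilio.com" + media.uri.replace(".json", "")
        response = requests.get(raw_url, auth=(ACCOUNT_SID, AUTH_TOKEN))
        with open(media.sid + ".jpg", "wb") as f:
            f.write(response.content)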
I am trying to download files from google drive and all I have is the drive's URL.
I have read about the Google Drive API, which mentions some drive_service and MediaIO and also requires credentials (mainly a JSON file / OAuth), but I can't figure out how it works.
Also tried urllib2.urlretrieve, but my case is to get files from Drive. Tried wget too, but no use.
Tried PyDrive library. It has good upload functions to drive but no download options.
Any help will be appreciated.
Thanks.
If by "drive's url" you mean the shareable link of a file on Google Drive, then the following might help:
import requests
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

if __name__ == "__main__":
    file_id = 'TAKE ID FROM SHAREABLE LINK'
    destination = 'DESTINATION FILE ON YOUR DISK'
    download_file_from_google_drive(file_id, destination)
The snippet does not use pydrive, nor the Google Drive SDK, though. It uses the requests module (which is, in a way, an alternative to urllib2).
When downloading large files from Google Drive, a single GET request is not sufficient. A second one is needed - see wget/curl large file from google drive.
I recommend gdown package.
pip install gdown
Take your share link
https://drive.google.com/file/d/0B9P1L--7Wd2vNm9zMTJWOGxobkU/view?usp=sharing
and grab the id - e.g. 1TLNdIufzwesDbyr_nVTR7Zrx9oRHLM_N - by pressing the download button (look for it in the link), and swap it in after the id= below.
import gdown
url = 'https://drive.google.com/uc?id=0B9P1L--7Wd2vNm9zMTJWOGxobkU'
output = '20150428_collected_images.tgz'
gdown.download(url, output, quiet=False)
Having had similar needs many times, I made an extra-simple class GoogleDriveDownloader, starting from the snippet from @user115202 above. You can find the source code here.
You can also install it through pip:
pip install googledrivedownloader
Then usage is as simple as:
from google_drive_downloader import GoogleDriveDownloader as gdd
gdd.download_file_from_google_drive(file_id='1iytA1n2z4go3uVCwE__vIKouTKyIDjEq',
dest_path='./data/mnist.zip',
unzip=True)
This snippet will download an archive shared in Google Drive. In this case 1iytA1n2z4go3uVCwE__vIKouTKyIDjEq is the ID taken from the shareable link in Google Drive.
Here's an easy way to do it with no third-party libraries and a service account.
pip install google-api-core and google-api-python-client
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.oauth2 import service_account
import io
credz = {}  # put JSON credentials here from a service account or the like
# More info: https://cloud.google.com/docs/authentication
credentials = service_account.Credentials.from_service_account_info(credz)
drive_service = build('drive', 'v3', credentials=credentials)
file_id = '0BwwA4oUTeiV1UVNwOHItT0xfa2M'
request = drive_service.files().get_media(fileId=file_id)
#fh = io.BytesIO() # this can be used to keep in memory
fh = io.FileIO('file.tar.gz', 'wb') # this can be used to write to disk
downloader = MediaIoBaseDownload(fh, request)
done = False
while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))
PyDrive allows you to download a file with the function GetContentFile(). You can find the function's documentation here.
See example below:
# Initialize GoogleDriveFile instance with file id.
file_obj = drive.CreateFile({'id': '<your file ID here>'})
file_obj.GetContentFile('cats.png') # Download file as 'cats.png'.
This code assumes that you have an authenticated drive object, the docs on this can be found here and here.
In the general case this is done like so:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

gauth = GoogleAuth()
# Create local webserver which automatically handles authentication.
gauth.LocalWebserverAuth()
# Create GoogleDrive instance with authenticated GoogleAuth instance.
drive = GoogleDrive(gauth)
Info on silent authentication on a server can be found here and involves writing a settings.yaml (example: here) in which you save the authentication details.
In the docs there's a function that downloads a file when we provide the ID of the file to download:
from __future__ import print_function
import io
import google.auth
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaIoBaseDownload
def download_file(real_file_id):
    """Downloads a file
    Args:
        real_file_id: ID of the file to download
    Returns : IO object with location.

    Load pre-authorized user credentials from the environment.
    TODO(developer) - See https://developers.google.com/identity
    for guides on implementing OAuth2 for the application.
    """
    creds, _ = google.auth.default()
    try:
        # create drive api client
        service = build('drive', 'v3', credentials=creds)
        file_id = real_file_id
        # pylint: disable=maybe-no-member
        request = service.files().get_media(fileId=file_id)
        file = io.BytesIO()
        downloader = MediaIoBaseDownload(file, request)
        done = False
        while done is False:
            status, done = downloader.next_chunk()
            print(F'Download {int(status.progress() * 100)}.')
    except HttpError as error:
        print(F'An error occurred: {error}')
        file = None
    return file.getvalue()

if __name__ == '__main__':
    download_file(real_file_id='1KuPmvGq8yoYgbfW74OENMCB5H0n_2Jm9')
This raises the question:
How do we get the file ID to download the file?
Generally speaking, a URL from a shared file from Google Drive looks like this
https://drive.google.com/file/d/1HV6vf8pB-EYnjcJcH65eGZVMa2v2tcMh/view?usp=sharing
where 1HV6vf8pB-EYnjcJcH65eGZVMa2v2tcMh corresponds to fileID.
You can simply copy it from the URL or, if you prefer, it's also possible to create a function to get the fileID from the URL.
For instance, given the following url = https://drive.google.com/file/d/1HV6vf8pB-EYnjcJcH65eGZVMa2v2tcMh/view?usp=sharing,
def url_to_id(url):
    x = url.split("/")
    return x[5]
Printing x will give
['https:', '', 'drive.google.com', 'file', 'd', '1HV6vf8pB-EYnjcJcH65eGZVMa2v2tcMh', 'view?usp=sharing']
And so, as we want to return the 6th array value, we use x[5].
This has also been described above,
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)
This creates its own server to do the dirty work of authenticating
file_obj = drive.CreateFile({'id': '<Put the file ID here>'})
file_obj.GetContentFile('Demo.txt')
This downloads the file
import requests
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id, 'confirm': 1}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

if __name__ == "__main__":
    file_id = 'TAKE ID FROM SHAREABLE LINK'
    destination = 'DESTINATION FILE ON YOUR DISK'
    download_file_from_google_drive(file_id, destination)
Just repeating the accepted answer, but adding the confirm=1 parameter so it always downloads, even if the file is too big.
# Importing PyDrive OAuth
import zipfile
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

def download_tracking_file_by_id(file_id, download_dir):
    gauth = GoogleAuth(settings_file='../settings.yaml')
    # Try to load saved client credentials
    gauth.LoadCredentialsFile("../credentials.json")
    if gauth.credentials is None:
        # Authenticate if they're not there
        gauth.LocalWebserverAuth()
    elif gauth.access_token_expired:
        # Refresh them if expired
        gauth.Refresh()
    else:
        # Initialize the saved creds
        gauth.Authorize()
    # Save the current credentials to a file
    gauth.SaveCredentialsFile("../credentials.json")
    drive = GoogleDrive(gauth)
    logger.debug("Trying to download file_id " + str(file_id))
    file6 = drive.CreateFile({'id': file_id})
    file6.GetContentFile(download_dir + 'mapmob.zip')
    zipfile.ZipFile(download_dir + 'test.zip').extractall(UNZIP_DIR)
    tracking_data_location = download_dir + 'test.json'
    return tracking_data_location
The above function downloads the file with the given file_id to the specified download folder. Now the question remains: how do you get the file_id? Simply split the URL on id= to get the file_id.
file_id = url.split("id=")[1]
I tried using google Colaboratory: https://colab.research.google.com/
Suppose your sharable link is https://docs.google.com/spreadsheets/d/12hiI0NK7M0KEfscMfyBaLT9gxcZMleeu/edit?usp=sharing&ouid=102608702203033509854&rtpof=true&sd=true
all you need is id that is 12hiI0NK7M0KEfscMfyBaLT9gxcZMleeu
command in cell
!gdown 12hiI0NK7M0KEfscMfyBaLT9gxcZMleeu
run the cell and you will see that the file is downloaded to /content/Amazon_Reviews.xlsx
Note: one should know how to use Google Colab
This example is similar to RayB's, but keeps the file in memory and is a little simpler; you can paste it into Colab and it works.
import googleapiclient.discovery
import oauth2client.client
from google.colab import auth
auth.authenticate_user()
def download_gdrive(id):
    creds = oauth2client.client.GoogleCredentials.get_application_default()
    service = googleapiclient.discovery.build('drive', 'v3', credentials=creds)
    return service.files().get_media(fileId=id).execute()
a = download_gdrive("1F-yaQB8fdsfsdafm2l8WFjhEiYSHZrCcr")
You can install https://pypi.org/project/googleDriveFileDownloader/
pip install googleDriveFileDownloader
And download the file; here is the sample code to download it:
from googleDriveFileDownloader import googleDriveFileDownloader
a = googleDriveFileDownloader()
a.downloadFile("https://drive.google.com/uc?id=1O4x8rwGJAh8gRo8sjm0kuKFf6vCEm93G&export=download")