I have a list of Google Drive file links, about 300 PDF files, which I have to download.
What I am trying to do is use Python's requests library to request the files from Google's servers.
After 30 to 36 files are downloaded, Google blocks my requests and returns:
We're sorry... but your computer or network may be sending automated queries. To protect our users, we can't process your request right now.
I am using the following code:
import requests

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    if response.status_code != 200:
        print(response.status_code)
        return response.status_code
    print('downloading ' + destination)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        i = 0
        for chunk in response.iter_content(CHUNK_SIZE):
            print(str(i) + '%')  # note: i is just a chunk counter, not an actual percentage
            i = i + 1
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    print('downloaded ' + destination)

if __name__ == "__main__":
    file_id = 'file id'
    destination = file_id + '.pdf'
    download_file_from_google_drive(file_id, destination)
I am iterating over my list and calling download_file_from_google_drive for each file ID (a throttled version of that loop is sketched below).
Is there any way to get past this security check?
I tried using a VPN, which changes my IP address, but nothing works.
After about an hour, downloading works again.
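For reference, a minimal sketch of the driver loop with a randomized pause between downloads and a back-off when a non-200 response comes back. The 20-60 second pause and the 10-minute back-off are guesses, not documented Google limits, so requests may still get blocked:

import random
import time

# Hypothetical list of Drive file IDs; replace with the real ~300 IDs.
file_ids = ['FILE_ID_1', 'FILE_ID_2', 'FILE_ID_3']

for file_id in file_ids:
    status = download_file_from_google_drive(file_id, file_id + '.pdf')
    if status is not None:
        # Non-200 response: back off for a while before moving to the next file.
        print('blocked or failed with status', status, '- sleeping 10 minutes')
        time.sleep(600)
        continue
    # Pause between successful downloads; the 20-60 s range is an assumption,
    # not a documented threshold.
    time.sleep(random.uniform(20, 60))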
I'm following this great blog, File Handling in SharePoint, in order to upload files to SharePoint Online.
I define a method that uploads directly if my file is smaller than 4 MB; otherwise I have to upload in chunks. My method works fine for any type of file except CSV and Excel.
Here is my function definition:
def save_pd_as_csv(self, df: pd.DataFrame, root_folder: str, file: str, *args: list, **kwargs: dict) -> None:
    client_id = 'xxxxxxxxxxxxx'
    tenant_id = 'xxxxxxxxxxxxxx'
    scopes = ['Sites.ReadWrite.All', 'Files.ReadWrite.All']
    auth_url = 'https://login.microsoftonline.com/xxxxxxxxxxx/oauth2/v2.0/authorize'
    # MobileApplicationClient is used to get the Implicit Grant
    oauth = OAuth2Session(client=MobileApplicationClient(client_id=client_id), scope=scopes)
    authorization_url, state = oauth.authorization_url(auth_url)

    # Graph API configuration
    CLIENT_ID = client_id
    TENANT_ID = tenant_id
    AUTHORITY_URL = 'https://login.microsoftonline.com/{}'.format(TENANT_ID)
    RESOURCE_URL = 'https://graph.microsoft.com/'
    SHAREPOINT_HOST_NAME = 'xxxxxxxxx'  # URL of the SharePoint host without https://
    API_VERSION = 'v1.0'
    USERNAME = 'xxxxxxxxx'  # Office 365 user's account username
    PASSWORD = 'xxxxxxxxx'
    SCOPES = ['Sites.ReadWrite.All', 'Files.ReadWrite.All']  # Add other scopes/permissions as needed.

    # Create a public client app, acquire an access token for the user and set the header for API calls
    pca = msal.PublicClientApplication(CLIENT_ID, authority=AUTHORITY_URL)
    token = pca.acquire_token_by_username_password(USERNAME, PASSWORD, SCOPES)
    headers = {'Authorization': 'Bearer {}'.format(token['access_token'])}

    # Get the site ID of the documents SharePoint site
    site_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/sites?search=*', headers=headers).json()['value'][0]['id']
    # List the drives in the site's team documents
    drive_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/sites/{site_id}/drives?search=*', headers=headers).json()['value'][0]['id']
    # ID of the folder
    item_id = requests.get(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/root:/{root_folder}', headers=headers).json()['id']

    # Now create the file content
    output = io.BytesIO()
    df.to_csv(output, *args, **kwargs)
    csv_data = output.getvalue()

    # Get the IDs of the files already in the folder
    file_ids = requests.get(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}/children', headers=headers).json()['value']
    item_dict = {i["name"]: i["id"] for i in file_ids}

    # Get the size of the file
    size = sys.getsizeof(df)

    # Push the file according to its size
    if size <= 4194304:
        if file in item_dict.keys():
            file_id = item_dict[file]
            result = requests.put(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{file_id}/content', headers=headers, data=csv_data)
        else:
            result = requests.put(f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}:/{file}:/content', headers=headers, data=csv_data)
        if result.status_code == 200:
            print("File uploaded")
    # Now upload by chunks
    else:
        df.to_csv("temp.csv")
        result = requests.post(
            f'{RESOURCE_URL}{API_VERSION}/drives/{drive_id}/items/{item_id}:/{file}:/createUploadSession',
            headers={'Authorization': 'Bearer ' + token['access_token']},
            json={
                '@microsoft.graph.conflictBehavior': 'replace',
                'description': 'Uploading a large file',
                'fileSystemInfo': {'@odata.type': 'microsoft.graph.fileSystemInfo'},
                'name': file
            }
        )
        upload_url = result.json()['uploadUrl']

        CHUNK_SIZE = 10485760
        chunks = int(size / CHUNK_SIZE) + 1 if size % CHUNK_SIZE > 0 else 0
        with open("temp.csv", 'rb') as fd:
            start = 0
            for chunk_num in range(chunks):
                chunk = fd.read(CHUNK_SIZE)
                bytes_read = len(chunk)
                upload_range = f'bytes {start}-{start + bytes_read - 1}/{size}'
                result = requests.put(
                    upload_url,
                    headers={
                        'Content-Length': str(bytes_read),
                        'Content-Range': upload_range
                    },
                    data=chunk
                )
                result.raise_for_status()
                start += bytes_read
        if result.status_code == 200:
            print("File uploaded to sharepoint")
Then I load a dataframe whose size is greater than 4 MB, in order to hit the else branch, like this:

with open('df.pickle', 'rb') as pkl:
    df_list = pickle.load(pkl)

save_pd_as_csv(df_list, root_folder=r"2.PROCESSED", file="history.csv", index=False)

As you can see, I am creating a temporary file temp.csv so that I can read it with open() in binary ('rb') mode.
In the end I am only able to read 2 chunks of data (instead of 8) and I get the following error:
File "C:\Users\Baptiste\PycharmProjects\union_brokerage\venv\lib\site-packages\requests\models.py", line 1021, in raise_for_status
raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 400 Client Error: Bad Request for url:
It seems that on the 3rd chunk, the value read is an empty string.
Why can't I read a CSV just like any other binary file, and how can I upload such large files in chunks with the Graph API?
Thank you all!
Baptiste
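I can't confirm this is the cause, but one thing that stands out in the code above: the Content-Range total (and the chunk count) is computed from sys.getsizeof(df), the in-memory size of the DataFrame object, while the chunks themselves are read from temp.csv, so the declared total and the actual file length will generally not match, which could explain the 400 on a later chunk. A minimal sketch of the chunked-upload loop that takes both the loop condition and the Content-Range total from the file on disk instead, assuming upload_url comes from the createUploadSession call shown above and the data was written to "temp.csv":

import os
import requests

CHUNK_SIZE = 10 * 1024 * 1024             # a multiple of 320 KiB, as the Graph docs recommend
file_size = os.path.getsize("temp.csv")   # byte size of the actual file, not sys.getsizeof(df)

with open("temp.csv", "rb") as fd:
    start = 0
    while start < file_size:
        chunk = fd.read(CHUNK_SIZE)
        end = start + len(chunk) - 1
        result = requests.put(
            upload_url,
            headers={
                "Content-Length": str(len(chunk)),
                "Content-Range": f"bytes {start}-{end}/{file_size}",
            },
            data=chunk,
        )
        result.raise_for_status()          # 202 while in progress, 200/201 on the final chunk
        start = end + 1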
The purpose of the code below is to read a PDF file located in an S3 bucket and list the values of the PDF in the terminal. My end goal is to load those values into a CSV/XLSX file and upload it to the same S3 bucket. In other words, this is a file conversion from PDF to XLSX.
Adding item.to_excel at the end is not loading the data into Excel; any suggestions? The code below only creates an empty XLSX file in the local directory, but I need it to do the following:
save the data listed in the terminal (from reading the PDF located in S3) to an XLSX file
take that XLSX file containing the terminal data and upload it back to S3
import boto3
import time
import pandas as pd

# Textract APIs used - "start_document_text_detection", "get_document_text_detection"

def InvokeTextDetectJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def CheckJobComplete(jobId):
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    print("Job status: {}".format(status))
    while status == "IN_PROGRESS":
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
    return status

def JobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if 'NextToken' in response:
        nextToken = response['NextToken']
    while nextToken:
        response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if 'NextToken' in response:
            nextToken = response['NextToken']
    return pages

# S3 document data
s3BucketName = "pdfbucket"
documentName = "pdf"

# Function invocations
jobId = InvokeTextDetectJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))
if CheckJobComplete(jobId):
    response = JobResults(jobId)
    for resultPage in response:
        for item in resultPage["Blocks"]:
            if item["BlockType"] == "LINE":
                print(item["Text"])

with pd.ExcelWriter('output_cp.xlsx') as writer:
    item.to_excel(writer, sheetName='Sheet1')
So after some more research and checking with other people, below is the final product.
The purpose of this code is the following:
Read a PDF file stored in an AWS S3 bucket
After reading the file, save the extracted text in a new XLSX file
import boto3
import time
import pandas as pd
from xlsxwriter import Workbook

# Textract APIs used - "start_document_text_detection", "get_document_text_detection"

def InvokeTextDetectJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def CheckJobComplete(jobId):
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    print("Job status: {}".format(status))
    while status == "IN_PROGRESS":
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
    return status

def JobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if 'NextToken' in response:
        nextToken = response['NextToken']
    while nextToken:
        response = client.get_document_text_detection(
            JobId=jobId, NextToken=nextToken)
        pages.append(response)
        print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if 'NextToken' in response:
            nextToken = response['NextToken']
    return pages

# S3 document data
s3BucketName = "cpaypdf"
documentName = "cp.pdf"

# Function invocations
jobId = InvokeTextDetectJob(s3BucketName, documentName)
print("Started job with id: {}".format(jobId))
if CheckJobComplete(jobId):
    response = JobResults(jobId)
    df = pd.DataFrame(columns=["Text"])
    for resultPage in response:
        for item in resultPage["Blocks"]:
            if item["BlockType"] == "LINE":
                df = df.append({"Text": item['Text']}, ignore_index=True)
    writer = pd.ExcelWriter("result.xlsx", engine='xlsxwriter')
    df.to_excel(writer, sheet_name='sheetname', index=False)
    writer.save()
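One caveat if you run this on a recent pandas: DataFrame.append was removed in pandas 2.0 and ExcelWriter.save() has been replaced by close(). A sketch of the same write-out step that collects rows in a plain list instead, assuming response is the list of pages returned by JobResults above:

rows = []
for resultPage in response:
    for item in resultPage["Blocks"]:
        if item["BlockType"] == "LINE":
            rows.append({"Text": item["Text"]})

df = pd.DataFrame(rows, columns=["Text"])
# The context manager closes (and saves) the workbook automatically.
with pd.ExcelWriter("result.xlsx", engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name="sheetname", index=False)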
I'm using a Python script to send an API request to get the attachments of an email. The email I'm testing with has 4 attachments (plus pictures in the signature of the email). The Python request only retrieves 1 attachment along with the pictures from the signature. When using Postman with the exact same information, it retrieves all the attachments along with the pictures.
Any ideas on how I can get the other attachments?
import requests

url = 'https://graph.microsoft.com/v1.0/users/{{users email}}/messages/{{messageID}}/attachments'
body = None
head = {"Content-Type": "application/json;charset=UTF-8", "Authorization": "Bearer " + accessToken}
response1 = requests.get(url, data=body, headers=head)
response = response1.text
The response from the Python script contains only 7 items, while the Postman response contains 10 items.
The code below retrieves multiple attachments
(files being an array of attachment names):
def execute(accessToken, messageID, files, noAttachments):
    import os
    from os import path
    import requests
    import base64
    import json

    if noAttachments == "False":
        url = 'https://graph.microsoft.com/v1.0/users/{{users email}}/messages/{{messageID}}/attachments'
        body = {}
        head = {"Authorization": "Bearer " + accessToken}
        responseCode = requests.request("GET", url, headers=head, data=body)
        response = responseCode.text
        test = json.loads(responseCode.text.encode('utf8'))
        x, contentBytes = response.split('"contentBytes":"', 1)

        if len(files) == 1:
            imgdata = base64.b64decode(str(contentBytes))
            filename = "C:/Temp/SAPCareAttachments/" + files[0]
            with open(filename, 'wb') as f:
                f.write(imgdata)
        else:
            for file in test["value"]:
                imgdata = base64.b64decode(file["contentBytes"])
                if file["name"] in files:
                    filename = "C:/Temp/" + file["name"]
                    with open(filename, 'wb') as f:
                        f.write(imgdata)
        print(responseCode)
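Not a confirmed explanation for the Postman/script mismatch, but two things that can bite with this endpoint: attachments that carry no contentBytes (item and reference attachments, as opposed to fileAttachment), and results split across pages via @odata.nextLink. A hedged sketch that always iterates the parsed value list and follows the next link if Graph returns one (the URL placeholders and the files list are the same as above):

import base64
import requests

def download_attachments(accessToken, url, files, out_dir="C:/Temp/"):
    # url is the .../messages/{messageID}/attachments endpoint used above
    head = {"Authorization": "Bearer " + accessToken}
    while url:
        data = requests.get(url, headers=head).json()
        for att in data.get("value", []):
            # Only fileAttachment entries carry contentBytes; item/reference
            # attachments would need a different call.
            if att.get("name") in files and "contentBytes" in att:
                with open(out_dir + att["name"], "wb") as f:
                    f.write(base64.b64decode(att["contentBytes"]))
        url = data.get("@odata.nextLink")  # follow paging if the results are split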
I have a list with three hashes, and the script should create an HTTP request for each hash in the list and then save the corresponding file.
For some reason, the script generates only one HTTP request, so I only manage to download one file instead of three.
all_hashes = ['07a3355f81f0dbd9f5a9a', 'e0f1d8jj3d613ad5ebda6d', 'dsafhghhhffdsfdd']

for hash in all_hashes:
    params = {'apikey': 'xxxxxxxxxxxxx', 'hash': hash}
    response = requests.get('https://www.test.com/file/download', params=params)

downloaded_file = response.content
name = response.headers['x-goog-generation']

if response.status_code == 200:
    with open('%s.bin' % name, 'wb') as f:
        f.write(response.content)
Your response checking and saving code should also be inside the loop, e.g.:

all_hashes = ['07a3355f81f0dbd9f5a9a', 'e0f1d8jj3d613ad5ebda6d', 'dsafhghhhffdsfdd']

for hash in all_hashes:
    params = {'apikey': 'xxxxxxxxxxxxx', 'hash': hash}
    response = requests.get('https://www.test.com/file/download', params=params)
    downloaded_file = response.content
    name = response.headers['x-goog-generation']
    if response.status_code == 200:
        with open('%s.bin' % name, 'wb') as f:
            f.write(response.content)

Currently your response only holds the last request, since the saving code is executed after the loop has finished.
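One small extra suggestion, not required for the fix: the x-goog-generation header may be absent when the request fails, so it is safer to read it only after checking the status code. A sketch of the same loop with that check first (same placeholder API key and URL as above):

for hash in all_hashes:
    params = {'apikey': 'xxxxxxxxxxxxx', 'hash': hash}
    response = requests.get('https://www.test.com/file/download', params=params)
    if response.status_code == 200:
        # Only read the header once we know the download succeeded;
        # a failed response may not include x-goog-generation at all.
        name = response.headers['x-goog-generation']
        with open('%s.bin' % name, 'wb') as f:
            f.write(response.content)
    else:
        print('download failed for', hash, 'with status', response.status_code)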
I'm trying to write a script to port my existing database into Firebase. My data is stored as JSON, so I thought I could just pull the JSON and send it as the body of a POST to my Firebase database.
def Post_And_Recieve_JSON(url, data, headers):
    print("Compiling query...")
    Post_And_Recieve_JSON.url = url
    Post_And_Recieve_JSON.headers = headers
    Post_And_Recieve_JSON.data = data
    print("Sending request...")
    request = urllib.request.Request(url=url, data=data, headers=headers)
    print("Receiving response...")
    response = urllib.request.urlopen(request)
    print("Reading response...")
    response_data = response.read()
    print("Converting into usable format...")
    response_data_JSON = json.loads(response_data.decode(encoding='UTF-8'))
    return response_data_JSON

for all_users in existing_database:
    full_data.append(Post_And_Recieve_JSON(...))

for item in full_data:
    url = 'firebaseurlhere ' + item['profileId'] + '.json'
    data = json.dumps(item).encode('ascii')
    Post_And_Recieve_JSON(url, data, headers)
Here full_data is a list of JSON objects I've properly pulled from the existing database.
I'm getting "http.client.BadStatusLine: ''".
I've solved this using the firebase Python library found here: http://ozgur.github.io/python-firebase/
I used pip3 to install it. I just wish I could have done it the same way I do other REST calls instead of requiring a new library.
full_data = []
from firebase import *

firebase = firebase.FirebaseApplication('https://secret_url.firebaseio.com/', None)

for id in id_list:
    print(str(id))
    url = 'from_url'
    try:
        result = firebase.put('/users/ ' + str(id) + '/', data=Post_And_Recieve_JSON(url, None, headers)["payload"], name='Data')
    except:
        print('Skipping a user')
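For reference, since the wish was to keep using plain REST calls: the Realtime Database also accepts writes over HTTPS, so the same put can be expressed with requests against the .json endpoint. A sketch under the assumption that the database rules allow the write (or that an auth token is supplied as a query parameter); secret_url, id_list, headers and the payload lookup are the same placeholders as above:

import json
import requests

BASE_URL = 'https://secret_url.firebaseio.com'  # same placeholder database as above

for id in id_list:
    payload = Post_And_Recieve_JSON('from_url', None, headers)["payload"]
    # PUT writes the JSON document at /users/<id>/Data.json, mirroring
    # firebase.put('/users/<id>/', data=..., name='Data') above.
    resp = requests.put(
        f'{BASE_URL}/users/{id}/Data.json',
        data=json.dumps(payload),
        # params={'auth': '<database secret or ID token>'},  # uncomment if auth is required
    )
    resp.raise_for_status()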