I am attempting to open, read, and use a macro in an Excel file on SharePoint, and then re-save it, using Python. Using the Office365-REST-Python-Client I can open and read the file, but I am struggling to see how to do the rest.
Would appreciate any help, thanks!
ctx_auth = AuthenticationContext(url)
if ctx_auth.acquire_token_for_user(username, password):
    ctx = ClientContext(url, ctx_auth)
    web = ctx.web
    ctx.load(web)
    ctx.execute_query()

response = File.open_binary(ctx, relative_url)

#save data to BytesIO stream
bio = io.BytesIO()
bio.write(response.content)
bio.seek(0) #set file object to start

#read file into pandas dataframe
df = pd.read_excel(bio, sheet_name="Overview")
print(df)
df.at[0, "Unnamed: 1"] = "description"

#write the modified dataframe to a second BytesIO stream
bio2 = io.BytesIO()
#pip install xlsxwriter
writer = pd.ExcelWriter(bio2)
df.to_excel(writer, sheet_name="Overview")
writer.save()
bio2.seek(0)

#re-read the result and upload it back to SharePoint
df = pd.read_excel(bio2, sheet_name="Overview")
workbook = bio2.read()
response2 = File.save_binary(ctx, relative_url, workbook)
print(response2)
You can refer to the following Python script to read Excel files on SharePoint.
#import all the libraries
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.sharepoint.files.file import File
import io
import pandas as pd

#target url taken from SharePoint and credentials
url = 'https://company.sharepoint.com/user/folder'
path = '/user/folder/Documents/Target_Excel_File_v4.xlsx'
username = 'Dumby_account@company.com'
password = 'Password!'

ctx_auth = AuthenticationContext(url)
if ctx_auth.acquire_token_for_user(username, password):
    ctx = ClientContext(url, ctx_auth)
    web = ctx.web
    ctx.load(web)
    ctx.execute_query()
    print("Authentication successful")

response = File.open_binary(ctx, path)

#save data to BytesIO stream
bytes_file_obj = io.BytesIO()
bytes_file_obj.write(response.content)
bytes_file_obj.seek(0) #set file object to start

#read excel file and each sheet into pandas dataframe
df = pd.read_excel(bytes_file_obj, sheet_name=None)
print(df)
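The script above covers the read side. For writing a modified workbook back while keeping its macros, here is a minimal sketch, assuming the target file is macro-enabled (.xlsm) and that openpyxl's keep_vba option fits the case (openpyxl can preserve the VBA project across a round trip, but it cannot run macros):

from io import BytesIO
from openpyxl import load_workbook

#reload the downloaded bytes with keep_vba=True so the VBA project survives the round trip
bytes_file_obj.seek(0)
wb = load_workbook(bytes_file_obj, keep_vba=True)
wb["Overview"]["B1"] = "description" #hypothetical edit; adjust the sheet, cell and value

#save to an in-memory buffer and upload it over the original file
out = BytesIO()
wb.save(out)
out.seek(0)
File.save_binary(ctx, path, out.read())

This avoids the pandas write path, which produces a fresh workbook without the original's macros.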
There is a similar SO thread here.
I'm trying this with the REST API and this method:
from office365.runtime.auth.client_credential import ClientCredential
from office365.sharepoint.fields.lookup_value import FieldLookupValue
from office365.sharepoint.client_context import ClientContext
from datetime import datetime, timedelta
import pandas as pd

def dfSpLibrary(relative_url):
    ctx = ClientContext(site_url).with_credentials(ClientCredential(client_id, client_secret))
    libraryRoot = ctx.web.get_folder_by_server_relative_path(relative_url)
    ctx.load(libraryRoot)
    ctx.execute_query()

    #if you want to get the folders within <sub_folder>
    folders = libraryRoot.folders
    ctx.load(folders)
    ctx.execute_query()

    files = libraryRoot.files
    ctx.load(files)
    ctx.execute_query()

    #create a dataframe of the important file properties for each file in the folder
    df_files = pd.DataFrame(columns=['Name', 'ServerRelativeUrl', 'TimeLastModified', 'ModTime'])
    for myfile in files:
        mod_time = datetime.strptime(myfile.properties['TimeLastModified'], '%Y-%m-%dT%H:%M:%SZ')
        df_dictionary = pd.DataFrame([{'Name': myfile.properties['Name'],
                                       'ServerRelativeUrl': myfile.properties['ServerRelativeUrl'],
                                       'TimeLastModified': myfile.properties['TimeLastModified'],
                                       'ModTime': mod_time}])
        df_files = pd.concat([df_files, df_dictionary], ignore_index=True)
    return df_files
This returns a df with all the files in the library, but I can't find the properties that appear in the library's columns, like Title or Currency. Is there a way to achieve this?
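If the columns you are after (a custom Title, Currency, and so on) are library columns, they live on the list item behind each file rather than on the File object itself. A minimal sketch building on the loop above, assuming 'Title' and 'Currency' are the internal field names (check the library's list settings for the real internal names):

for myfile in files:
    item = myfile.listItemAllFields #the list item behind the file carries the library columns
    ctx.load(item)
    ctx.execute_query()
    print(item.properties.get("Title"), item.properties.get("Currency"))

Note this costs one round trip per file, which is fine for a sketch but worth batching for large libraries.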
I am using the Google Drive API to get file revisions, and then I am interested in reading (not downloading) the contents of each file revision into a pandas dataframe. I have been able to get the revision IDs, but reading the contents is what is problematic. I have tried the following code and am getting a googleapiclient.errors.HttpError. If I change it from get_media to just get (which returns alt=json), I get the revision metadata rather than the file contents. I will appreciate help here:
import io
import pandas as pd
from google_apis import create_service
from googleapiclient.http import MediaIoBaseDownload
import urllib
import csv
import requests

CLIENT_FILE = 'client_secret.json'
API_NAME = 'drive'
API_VERSION = 'v3'
SCOPES = ['https://www.googleapis.com/auth/drive']

# Retrieve file revision history
service = create_service(CLIENT_FILE, API_NAME, API_VERSION, SCOPES)

def get_file_revision_history(file_id):
    response = service.revisions().list(
        fileId=file_id,
        fields='*',
        pageSize=1000
    ).execute()
    revisions = response.get('revisions')
    nextPageToken = response.get('nextPageToken')
    while nextPageToken:
        response = service.revisions().list(
            fileId=file_id,
            fields='*',
            pageSize=1000,
            pageToken=nextPageToken
        ).execute()
        revisions.extend(response.get('revisions'))  # accumulate pages instead of overwriting
        nextPageToken = response.get('nextPageToken')
    return revisions

file_id = '1E8Wbd80CbFlFSHYZQkApXMM9EQOz1lQRl4m3rfq-vdY'
revision_history = get_file_revision_history(file_id)
print(revision_history)
df = pd.json_normalize(revision_history)
#df.to_csv('revision history ({0}).csv'.format(file_id), index=False)

#read the file contents
revision_history_id = '104'
res = service.revisions().get_media(
    fileId=file_id,
    revisionId=revision_history_id
).execute()
#uri = res.uri
print(res)
When I saw your file ID and your revision ID of '104', I thought that the file might be Google Docs (Document, Spreadsheet, Slides, and so on). In this case, unfortunately, the get_media method cannot be used; the export method is required. But, unfortunately, it seems that revisions() has no export method. So, in this case, the export link retrieved from service.revisions().get() is used for downloading the data.
And, your goal is to put the exported values into pandas. From this, I guessed that the file behind your file ID might be a Google Spreadsheet. If my understanding is correct, when this is reflected in a python script, how about the following sample script?
Sample script:
service = build("drive", "v3", credentials=creds)  # Please use your script.

file_id = '1E8Wbd80CbFlFSHYZQkApXMM9EQOz1lQRl4m3rfq-vdY'  # This is from your script. From your question, I guessed that this might be a Google Spreadsheet.
mimeType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"  # In this case, the Google Spreadsheet is exported in XLSX format.
sheet = "Sheet1"  # Please set the sheet name you want to retrieve the values from.
revision_history_id = '104'  # Please set the revision ID.

res = service.revisions().get(fileId=file_id, revisionId=revision_history_id, fields="*").execute()
link = res.get("exportLinks").get(mimeType)
if link:
    data = requests.get(link, headers={"Authorization": "Bearer " + creds.token})
    values = pd.read_excel(BytesIO(data.content), usecols=None, sheet_name=sheet)
    print(values)
Here, the access token is retrieved from creds of service = build("drive", "v3", credentials=creds). Please be careful about this.
In this case, from io import BytesIO, import pandas as pd, and import requests are also used.
When this script is run, the Google Spreadsheet is exported with the revision ID as XLSX format. And, the values from the expected sheet are put to pandas.
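If creds.token is empty or expired at that point (for example, right after loading stored credentials), it may need refreshing before the request; a small sketch using google-auth, assuming creds is a google.auth credentials object:

from google.auth.transport.requests import Request

#refresh so that creds.token holds a valid access token for the Authorization header
if not creds.valid:
    creds.refresh(Request())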
Reference:
Revisions: get
After giving it another thought, I finally got it right like this:
import pandas as pd
from google_apis import create_service

CLIENT_FILE = 'client_secret.json'
API_NAME = 'drive'
API_VERSION = 'v3'
SCOPES = ['https://www.googleapis.com/auth/drive']

# Retrieve file revision history
service = create_service(CLIENT_FILE, API_NAME, API_VERSION, SCOPES)

def get_file_revision_history(file_id):
    response = service.revisions().list(
        fileId=file_id,
        fields='*',
        pageSize=1000
    ).execute()
    revisions = response.get('revisions')
    nextPageToken = response.get('nextPageToken')
    while nextPageToken:
        response = service.revisions().list(
            fileId=file_id,
            fields='*',
            pageSize=1000,
            pageToken=nextPageToken
        ).execute()
        revisions.extend(response.get('revisions'))  # accumulate pages instead of overwriting
        nextPageToken = response.get('nextPageToken')
    return revisions

file_id = '1E8Wbd80CbFlFSHYZQkApXMM9EQOz1lQRl4m3rfq-vdY'
revision_history = get_file_revision_history(file_id)

# Pull the CSV export link and modified time of each revision
exportlinks = list(map(lambda x: x["exportLinks"], revision_history))
csv_urls = list(map(lambda y: y["text/csv"], exportlinks))
modified_time = list(map(lambda z: z["modifiedTime"], revision_history))

appended_data = []
for i, j in zip(csv_urls, modified_time):
    res = service._http.request(i)  # reuse the service's authorized http to fetch the export link
    rn_string_data = list(res)[1]   # (response, content) tuple; take the content bytes
    data = list(map(lambda x: x.split(','), rn_string_data.decode('utf-8').split("\r\n")))
    df = pd.DataFrame(data[1:], columns=['employee_name', 'hourly_rate', 'currency'])
    df['ModifiedTime'] = j
    appended_data.append(df)

appended_data = pd.concat(appended_data)
print(appended_data)
This is the code that I have so far:
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
import getpass
import csv

def auth_api():
    # Authenticate SharePoint
    tenant_url = "https://[company_name].sharepoint.com"
    username = "[username]"
    password = getpass.getpass()
    ctx_auth = AuthenticationContext(tenant_url)
    ctx_auth.acquire_token_for_user(username, password)
    ctx = ClientContext(tenant_url, ctx_auth)

    # Create a CSV file to save the data
    csv_file = open("sp.csv", "w")
    csv_writer = csv.writer(csv_file, delimiter=",")
    csv_writer.writerow(["Title", "Modified"])

    # Get the files' metadata
    files = ctx.web.lists.get_by_title("Accounting").root_folder.files
    ctx.load(files)
    ctx.execute_query()
    for file in files:
        item = file.listItemAllFields
        ctx.load(item)
        ctx.execute_query()
        row = [item.properties["Title"], item.properties["Modified"]]
        csv_writer.writerow(row)

    # Get the folders' metadata
    folders = ctx.web.lists.get_by_title("Accounting").root_folder.folders
    ctx.load(folders)
    ctx.execute_query()
    for folder in folders:
        item = folder.list_item_all_fields
        ctx.load(item)
        ctx.execute_query()
        row = [item.properties]
        csv_writer.writerow(row)

    csv_file.close()

if __name__ == "__main__":
    auth_api()
The code works, except the properties of the files and folders are not retrieving all of the data that I need. The title comes up as an empty string for each file/folder and none of the properties show the files/folders' sizes. Should I use a different Python library?
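The same library should be able to do this. The empty Title is expected, since the Title column is rarely populated for documents, and the size is not among the list item's default properties. One option, sketched below as a drop-in replacement for the files loop in auth_api, assumes the office365 File object exposes Name, TimeLastModified, and Length (size in bytes):

files = ctx.web.lists.get_by_title("Accounting").root_folder.files
ctx.load(files)
ctx.execute_query()
for file in files:
    props = file.properties
    #Name and TimeLastModified are standard file properties; Length is the file size in bytes
    csv_writer.writerow([props.get("Name"), props.get("TimeLastModified"), props.get("Length")])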
I wanted to read an Excel file in S3 from Glue.
Here's what I've done so far.
import pandas as pd
import awswrangler as wr
import io

ad_request_path = 's3://bucketname/key.xlsx'
df = wr.s3.read_excel(ad_request_path)
OR
import boto3
import io
import pandas as pd
from xlrd.book import open_workbook_xls

bucket_name = 'bucketname'
object_key = 'key.xlsx'
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()
workbook = open_workbook_xls(file_contents=data)
df = pd.read_excel(io.BytesIO(data))
print(df)
I got this error message:
XLRDError: Excel xlsx file; not supported
Managed to make it work. Just add engine='openpyxl':
import awswrangler as wr
import openpyxl
ad_request_path = 's3://bucketname/key.xlsx'
df = wr.s3.read_excel(ad_request_path, engine='openpyxl')
I have a Python script that reads files and converts them to a dataframe using Python and Streamlit. Then I want to create a function that allows the user to download this dataframe as an Excel file with the extension .xls.
So I tried to read the dataframe and convert it to an Excel file using these two functions:
pd.ExcelWriter
df.to_excel
But when I try to download the file using a link, the file doesn't download and this error is displayed:
Failed-Network error
Code:
import pandas as pd
import streamlit as st
writer = pd.ExcelWriter('update2.xlsx')
df.to_excel(writer, index=False, header=True, encoding='utf-8')
with open(writer, 'rb') as f:
    b64 = base64.b64encode(f.read())
href = f'<a href="data:application/octet-stream;base64,{b64.decode()}" download="update2.xlsx">Download {extension}</a>'
st.write(href, unsafe_allow_html=True)
With the latest Streamlit release (above 1.0.0):
Use st.download_button, which displays a download button widget.
This is useful when you would like to provide a way for your users to download a file directly from your app.
Note that the data to be downloaded is stored in memory while the user is connected, so it's a good idea to keep file sizes under a couple of hundred megabytes to conserve memory.
Here is a sample code from the discussion that can be helpful for downloading Excel files:
import pandas as pd
from io import BytesIO
from pyxlsb import open_workbook as open_xlsb
import streamlit as st

def to_excel(df):
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, index=False, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    format1 = workbook.add_format({'num_format': '0.00'})
    worksheet.set_column('A:A', None, format1)
    writer.save()
    processed_data = output.getvalue()
    return processed_data

df_xlsx = to_excel(df)
st.download_button(label='📥 Download Current Result',
                   data=df_xlsx,
                   file_name='df_test.xlsx')
This worked for me
import pandas as pd
from io import BytesIO
import streamlit as st

def to_excel(df: pd.DataFrame):
    in_memory_fp = BytesIO()
    df.to_excel(in_memory_fp)
    # Rewind the in-memory file to the start before reading it back out.
    in_memory_fp.seek(0, 0)
    return in_memory_fp.read()

cols = ["col1", "col2"]
df = pd.DataFrame.from_records([{k: 0.0 for k in cols} for _ in range(25)])

excel_data = to_excel(df)
file_name = "excel.xlsx"
st.download_button(
    f"Click to download {file_name}",
    excel_data,
    file_name,
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",  # proper MIME type for .xlsx
    key=file_name
)
Line 5 can't be executed since you haven't assigned anything to the DataFrame df.
try something like this in your code:
df = pd.read_excel('update2.xlsx')
I hope this helped. Take care.
import base64
import streamlit as st

def get_binary_file_downloader_html(bin_file, file_label='File'):
    with open(bin_file, 'rb') as f:
        data = f.read()
    bin_str = base64.b64encode(data).decode()
    href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{bin_file}">Download {file_label}</a>'
    return href

st.markdown(get_binary_file_downloader_html('Wip_QRY.xlsx', 'Excel'), unsafe_allow_html=True)