How to read specific sheets from My XLS file in Python - python

As of now i can read EXCEL file's all sheet.
e.msgbox("select Excel File")
updated_deleted_xls = e.fileopenbox()
book = xlrd.open_workbook(updated_deleted_xls, formatting_info=True)
openfile = e.fileopenbox()
for sheet in book.sheets():
for row in range(sheet.nrows):
for col in range(sheet.ncols):
thecell = sheet.cell(row, 0)
xfx = sheet.cell_xf_index(row, 0)
xf = book.xf_list[xfx]

If you open your editor from the desktop or command line, you would have to specify the file path while trying to read the file:
import pandas as pd
df = pd.read_excel(r'File path', sheet_name='Sheet name')
Alternatively, if you open your editor in the file's directory, then you could read directly using the panda library
import pandas as pd
df = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='Title Sheet')
df1 = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx',sheet_name='Transactions')
df2 = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='NewCustomerList')
df3 = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='CustomerDemographic')
df4 = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='CustomerAddress')

Maybe Pandaswould be helpful ( the go-to package for data) :
import pandas as pd
df = pd.read_excel('filname.xls', sheet = 0)
Edit: Since a lot of time has passed and pandas matured the arguemnts have change. So for pandas >1.0.0
import pandas as pd
df = pd.read_excel('filname.xls', sheet_name = 0)

You can use book.sheet_by_name() to read specific sheets by their name from xls file.
for name, sheet_name in zip(filename, sheetnumber):
book = xlrd.open_workbook(name)
sheet = book.sheet_by_name(sheet_name)
for row in range(sheet.nrows):
for column in range(sheet.ncols):
thecell = sheet.cell(row, 0)
xfx = sheet.cell_xf_index(row, 0)
xf = book.xf_list[xfx]
filename is the path to your xls file. Specify the sheet number you need to read in sheetnumber.
Alternatively, you could use book.sheet_by_index() and pass argument to return a specific sheet.
From docs:
sheet_by_index(sheetx)
Parameters: sheetx – Sheet index in range(nsheets)
For example:
first_sheet = book.sheet_by_index(0) # returns the first sheet.

You can use either book.sheet_by_name() or book.get_sheet()
Example using get_sheet()
book = xlrd.open_workbook(updated_deleted_xls, formatting_info=True)
sheet = book.get_sheet(0) #Gets the first sheet.
Example using sheet_by_name()
book = xlrd.open_workbook(updated_deleted_xls, formatting_info=True)
sheet_names = book.sheet_names()
xl_sheet = xl_workbook.sheet_by_name(sheet_names[0])
MoreInfo on getting sheet by sheet_by_name

Related

updating and saving excel file using openpyxl and then reading it gives none values for formula based column

I am trying to update an excel sheet using openpyxl. When reading a updated formula based cell I am getting None output. The updates are not getting saved even though I have used openpyxl save command.
import openpyxl
# data_only=False to upadate excel file
def write_cell(data_only):
wb_obj = openpyxl.load_workbook("mydata.xlsx", data_only=data_only)
sheet_obj = wb_obj["Sheet1"]
sheet_obj = wb_obj.active
sheet_obj.cell(row = 1, column = 1).value = 8
wb_obj.save(filename="mydata.xlsx")
# data_only=True to read excel file"
def read_cell(data_only):
wb_obj = openpyxl.load_workbook("mydata.xlsx", data_only=data_only)
sheet = wb_obj["Sheet1"]
# Formula at column 2 : =A1*5
val = sheet.cell(row = 1, column = 2).value
return val
write_cell(False)
print(read_cell(True))
Actual Output -> None
Expected output -> 40
There are two solutions to this:
If you refer the documentation, it is mentioned that you can either have the formula or the value from formula. If you modify a file with formulae then you must pass it through some kind of application such as Excel and save it again which will now update the value of the formula. You won't get the none as the output now if you try to read the value of the cell containing formula.
Another solution is to open the excel file and save it from the script itself after saving it using openpyxl:
from win32com.client import Dispatch
import openpyxl
def write_cell(data_only):
wb_obj = openpyxl.load_workbook("mydata.xlsx", data_only=data_only)
sheet_obj = wb_obj["Sheet1"]
sheet_obj = wb_obj.active
sheet_obj.cell(row = 1, column = 1).value = 8
wb_obj.save(filename="mydata.xlsx")
open_save("mydata.xlsx")
def open_save(filename):
"""Function to open and save the excel file"""
xlApp = Dispatch("Excel.Application")
xlApp.Visible = False
xlBook = xlApp.Workbooks.Open(filename)
xlBook.Save()
xlBook.Close()

ValueError: Sheet 'Sheet1' already exists and if_sheet_exists is set to 'error'

I am trying to create an excel file of 3 columns: System Date, Time, Value on a webpage at that time.
Intention is to create a dataframe of the 3 values, every time the code runs, and append the dataframe to existing excel workbook (with one existing sheet).
I am able to create dataframe every time code runs, but when I try to append it to an excel file, it throws error:
ValueError: Sheet 'Sheet1' already exists and if_sheet_exists is set to 'error'
Can you please suggest, where am I going wrong.
# Importing Libraries
from datetime import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup
import openpyxl
#getting today's date amd formatting it
now = datetime.now()
Date = now.strftime ("%d/%m/%Y")
Time = now.strftime ("%H:%M")
# GET request to scrape. 'Page' variable to assign contents
page = requests.get("https://www.traderscockpit.com/?pageView=live-nse-advance-decline-ratio-chart")
# Create BeautifulSoup object to parse content
soup = BeautifulSoup(page.content, 'html.parser')
adv = soup.select_one('a:-soup-contains("Advanced:")').next_sibling.strip()
dec = soup.select_one('a:-soup-contains("Declined:")').next_sibling.strip()
ADratio = round(int(adv)/int(dec), 2)
df = pd.DataFrame({tuple([Date, Time, ADratio])})
#Load workbook and read last used row
path = r'C:\Users\kashk\OneDrive\Documents\ADratios.xlsx'
writer = pd.ExcelWriter (path, engine='openpyxl', mode = 'a')
wb = openpyxl.load_workbook(path)
startrow = writer.sheets['Sheet1'].max_row
#Append data frame to existing table in existing sheet
df.to_excel (writer, sheet_name = 'Sheet1', index = False, header = False, startrow = startrow)
writer.save()
writer.close()
A fast and easy solution would be upgrading your pandas > 1.4.0 since it provides a if_sheet_exists = 'overlay' Source
pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists='overlay')
If you don't want to upgrade your pandas, there is a way to work around by removing and re-write the sheet into the excel file. (Not recommended if you have a lot of records since it will be slow).
path, sheet_name = 'ADratios.xlsx' , 'Sheet 1'
df.columns = ['Date','Time','ADratio']
with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
book = openpyxl.load_workbook(path, 'r')
df_bak = pd.read_excel(path)
writer.book = openpyxl.load_workbook(path)
writer.book.remove(writer.book.worksheets[writer.book.sheetnames.index(sheet_name)])
writer.sheets = {ws.title:ws for ws in writer.book.worksheets}
pd.concat([df_bak, df], axis=0).to_excel(writer, sheet_name=sheet_name, index = False)

Getting wrong value when reading an xlsx file using openpyxl

I'm trying to read values from an xlsx file containing formulas using openpyxl; however, I noticed that for some cells, I'm getting a wrong value.
Here's the XLSX example:
Here's the result I get:
The code:
wb = openpyxl.load_workbook(excel_file, data_only=True)
# getting all sheets
sheets = wb.sheetnames
print(sheets)
# getting a particular sheet
worksheet = wb["Feuil1"]
print(worksheet)
# getting active sheet
active_sheet = wb.active
print(active_sheet)
# reading a cell
print(worksheet["A1"].value)
excel_data = list()
# iterating over the rows and
# getting value from each cell in row
for row in worksheet.iter_rows():
row_data = list()
for cell in row:
#cell.number_format='0.0########'
print(cell.number_format)
row_data.append(str(cell.value))
print(cell.value)
excel_data.append(row_data)
return render(request, 'myapp/index.html', {"excel_data":excel_data})
Hey What you want from an open excel file means which type of format do you gate
data,
This My answer for get data from excel file with xlrd.
import xlrd
from xlrd import open_workbook
fp = tempfile.NamedTemporaryFile(delete= False, suffix=filetype)
fp.write(binascii.a2b_base64(selected file))
workbook = xlrd.open_workbook(file name)
sheet = workbook.sheet_by_name(sheet name)
row = [c or '' for c in sheet.row_values(header_row)]
first_row = []
for col in range(sheet.ncols):
first_row.append(sheet.cell_value(0,col) )
archive_lines = []
for row in range(1, sheet.nrows):
elm = {}
for col in range(sheet.ncols):
elm[first_row[col]]=sheet.cell_value(row,col)
archive_lines.append(elm)

Writing resutls into 2 different sheets in the same Excel file

can you teach me whether Python can write into a same Excel file, but 2 different spreadsheets (tabs)?
Just for example, I want to pick and write the titles of below 4 websites, and write them into the same file title.xls but respectively in its Sheet1 and Sheet 2.
www.dailynews.com
www.dailynews.co.zw
www.gulf-daily-news.com
www.dailynews.gov.bw
I do them in 2 scripts, each for 2 websites:
from bs4 import BeautifulSoup
import urllib2
import xlwt
line_in_list = ['www.dailynews.com','www.dailynews.co.zw']
# line_in_list = [www.gulf-daily-news.com','www.dailynews.gov.bw']
book = xlwt.Workbook(encoding='utf-8', style_compression = 0)
sheet = book.add_sheet('Sheet1', cell_overwrite_ok = True)
# sheet = book.add_sheet('Sheet2', cell_overwrite_ok = True)
for cor,websites in enumerate(line_in_list):
url = "http://" + websites
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
site_title = soup.find_all("title")
print site_title
sheet.write (cor, 0, site_title[0].text)
book.save("title.xls")
however, the script is overwriting the sheets. I can only have either Sheet1 or Sheet2 but never both.
any helps? thanks.
You can also do it using pandas.
import pandas as pd
# Add your data in list, which may contain a dictionary with the name of the
# columns as the key
df1 = pd.DataFrame({'website': ['www.dailynews.com', 'www.dailynews.co.zw']})
df2 = pd.DataFrame({'website': ['www.gulf-daily-news.com', 'www.dailynews.gov.bw']})
# Create a new excel workbook
writer = pd.ExcelWriter('title.xlsx', engine='xlsxwriter')
# Write each dataframe to a different worksheet.
df1.to_excel(writer, sheet_name='Sheet1')
df2.to_excel(writer, sheet_name='Sheet2')
# Save workbook
writer.close()
If I correctly understood what you need. Sorry, can't comment to make it more clear.
sheet1 = book.add_sheet('Sheet1', cell_overwrite_ok = True)
sheet2 = book.add_sheet('Sheet2', cell_overwrite_ok = True)
sheet1.write (cor, 0, site_title[0].text)
sheet2.write (cor, 0, site_title[0].text)
import numpy as np
import pandas as pd
# Create a Dataframe
df1 = pd.DataFrame(np.random.rand(100).reshape(50,2),columns=['a','b'])
df2 = pd.DataFrame(np.random.rand(100).reshape(50,2),columns=['a','b'])
# Excel path
excelpath = 'path_to_your_excel.xlsx'
# Write your dataframes to difference sheets
with pd.ExcelWriter(excelpath) as writer:
df1.to_excel(writer,sheet_name='Sheet1')
df2.to_excel(writer,sheet_name = 'Sheet2')
""" I noticed that the above script overwrite all existing columns of in
the excel. In case you want to keep some columns and sheet untouched,
you might consider doing it the following way"""
import pandas as pd
import numpy as np
from openpyxl import load_workbook
book = load_workbook(excelpath)
writer = pandas.ExcelWriter(excelpath, engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
df1.to_excel(writer, "Sheet1", columns=['a', 'b']) # only columns 'a' and 'b' will be populated
df2.to_excel(writer,"Sheet2",columns=['a','b']) # only columns 'a' and 'b' will be populated
writer.save()
--Append Excel Data Sheet to Spreadsheet
import pandas as pd
#import os
#from pandasql import sqldf
#pysqldf = lambda q: sqldf(q, globals())
df1 = pd.read_csv('MyData1.csv')
df2 = pd.read_csv('MyData2.csv')
print(df1)
print(df2)
Differences_df = df1.merge(df2, indicator=True, how='outer')
#Differences_df[merged['_merge'] == 'right_only']
print(Differences_df)
with pd.ExcelWriter('MyInputData.xlsx', mode='a') as writer:
Differences_df.to_excel(writer, sheet_name='Diff')
print("Spreadsheets Processed")

From password-protected Excel file to pandas DataFrame

I can open a password-protected Excel file with this:
import sys
import win32com.client
xlApp = win32com.client.Dispatch("Excel.Application")
print "Excel library version:", xlApp.Version
filename, password = sys.argv[1:3]
xlwb = xlApp.Workbooks.Open(filename, Password=password)
# xlwb = xlApp.Workbooks.Open(filename)
xlws = xlwb.Sheets(1) # counts from 1, not from 0
print xlws.Name
print xlws.Cells(1, 1) # that's A1
I'm not sure though how to transfer the information to a pandas dataframe. Do I need to read cells one by one and all, or is there a convenient method for this to happen?
Simple solution
import io
import pandas as pd
import msoffcrypto
passwd = 'xyz'
decrypted_workbook = io.BytesIO()
with open(i, 'rb') as file:
office_file = msoffcrypto.OfficeFile(file)
office_file.load_key(password=passwd)
office_file.decrypt(decrypted_workbook)
df = pd.read_excel(decrypted_workbook, sheet_name='abc')
pip install --user msoffcrypto-tool
Exporting all sheets of each excel from directories and sub-directories to seperate csv files
from glob import glob
PATH = "Active Cons data"
# Scaning all the excel files from directories and sub-directories
excel_files = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.xlsx'))]
for i in excel_files:
print(str(i))
decrypted_workbook = io.BytesIO()
with open(i, 'rb') as file:
office_file = msoffcrypto.OfficeFile(file)
office_file.load_key(password=passwd)
office_file.decrypt(decrypted_workbook)
df = pd.read_excel(decrypted_workbook, sheet_name=None)
sheets_count = len(df.keys())
sheet_l = list(df.keys()) # list of sheet names
print(sheet_l)
for i in range(sheets_count):
sheet = sheet_l[i]
df = pd.read_excel(decrypted_workbook, sheet_name=sheet)
new_file = f"D:\\all_csv\\{sheet}.csv"
df.to_csv(new_file, index=False)
Assuming the starting cell is given as (StartRow, StartCol) and the ending cell is given as (EndRow, EndCol), I found the following worked for me:
# Get the content in the rectangular selection region
# content is a tuple of tuples
content = xlws.Range(xlws.Cells(StartRow, StartCol), xlws.Cells(EndRow, EndCol)).Value
# Transfer content to pandas dataframe
dataframe = pandas.DataFrame(list(content))
Note: Excel Cell B5 is given as row 5, col 2 in win32com. Also, we need list(...) to convert from tuple of tuples to list of tuples, since there is no pandas.DataFrame constructor for a tuple of tuples.
from David Hamann's site (all credits go to him)
https://davidhamann.de/2018/02/21/read-password-protected-excel-files-into-pandas-dataframe/
Use xlwings, opening the file will first launch the Excel application so you can enter the password.
import pandas as pd
import xlwings as xw
PATH = '/Users/me/Desktop/xlwings_sample.xlsx'
wb = xw.Book(PATH)
sheet = wb.sheets['sample']
df = sheet['A1:C4'].options(pd.DataFrame, index=False, header=True).value
df
Assuming that you can save the encrypted file back to disk using the win32com API (which I realize might defeat the purpose) you could then immediately call the top-level pandas function read_excel. You'll need to install some combination of xlrd (for Excel 2003), xlwt (also for 2003), and openpyxl (for Excel 2007) first though. Here is the documentation for reading in Excel files. Currently pandas does not provide support for using the win32com API to read Excel files. You're welcome to open up a GitHub issue if you'd like.
Based on the suggestion provided by #ikeoddy, this should put the pieces together:
How to open a password protected excel file using python?
# Import modules
import pandas as pd
import win32com.client
import os
import getpass
# Name file variables
file_path = r'your_file_path'
file_name = r'your_file_name.extension'
full_name = os.path.join(file_path, file_name)
# print(full_name)
Getting command-line password input in Python
# You are prompted to provide the password to open the file
xl_app = win32com.client.Dispatch('Excel.Application')
pwd = getpass.getpass('Enter file password: ')
Workbooks.Open Method (Excel)
xl_wb = xl_app.Workbooks.Open(full_name, False, True, None, pwd)
xl_app.Visible = False
xl_sh = xl_wb.Worksheets('your_sheet_name')
# Get last_row
row_num = 0
cell_val = ''
while cell_val != None:
row_num += 1
cell_val = xl_sh.Cells(row_num, 1).Value
# print(row_num, '|', cell_val, type(cell_val))
last_row = row_num - 1
# print(last_row)
# Get last_column
col_num = 0
cell_val = ''
while cell_val != None:
col_num += 1
cell_val = xl_sh.Cells(1, col_num).Value
# print(col_num, '|', cell_val, type(cell_val))
last_col = col_num - 1
# print(last_col)
ikeoddy's answer:
content = xl_sh.Range(xl_sh.Cells(1, 1), xl_sh.Cells(last_row, last_col)).Value
# list(content)
df = pd.DataFrame(list(content[1:]), columns=content[0])
df.head()
python win32 COM closing excel workbook
xl_wb.Close(False)
Adding to #Maurice answer to get all the cells in the sheet without having to specify the range
wb = xw.Book(PATH, password='somestring')
sheet = wb.sheets[0] #get first sheet
#sheet.used_range.address returns string of used range
df = sheet[sheet.used_range.address].options(pd.DataFrame, index=False, header=True).value

Categories