Export/import dataframe as Excel sheet - python

I have an example from the pandas documentation site and can't get it to run. Exporting to an Excel file works well, but the subsequent import does not:
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randn(1000, 4), columns=list("ABCD"))
df = df.cumsum()
df.to_excel("/tmp/foo.xlsx", sheet_name="Sheet1")
print("Reading data back from an excel file")
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"])
#print(df2)
my error message:
python3 /tmp/downloads/tmp_358/main.py
Reading data back from an excel file
Traceback (most recent call last):
File "/tmp/downloads/tmp_358/main.py", line 10, in <module>
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"])
File "/usr/local/lib/python3.6/dist-packages/pandas/util/_decorators.py", line 296, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 304, in read_excel
io = ExcelFile(io, engine=engine)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 867, in __init__
self._reader = self._engines[engine](self._io)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_xlrd.py", line 22, in __init__
super().__init__(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 353, in __init__
self.book = self.load_workbook(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_xlrd.py", line 37, in load_workbook
return open_workbook(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/xlrd/__init__.py", line 170, in open_workbook
raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
xlrd.biffh.XLRDError: Excel xlsx file; not supported
Python 3.6.9 and pandas==1.1.4 on Ubuntu 18.04

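xlrd 2.0 dropped support for .xlsx files, and this pandas version still uses xlrd by default for reading, so pass the openpyxl engine explicitly (openpyxl must be installed). Can you try this: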
df.to_excel("/tmp/foo.xlsx", sheet_name="Sheet1", engine='openpyxl')
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"], engine='openpyxl')

Related

Not able to open a .xlsx file using the load_workbook by openpyxl?

I have a .xlsx sheet with a very long filename like this: child_id_survey_918_choose_touching_results_1541687696.xlsx. When I try to use load_workbook() to load this file, it gives me a weird error.
My python file and the excel sheet both lie in the same folder.
CODE:
from openpyxl import load_workbook
workbook = load_workbook('child_id_survey_918_choose_touching_results_1541687696.xlsx')
Error:
Traceback (most recent call last):
File "test.py", line 2, in <module>
workbook = load_workbook('child_id_survey_918_choose_touching_results_1541687696.xlsx')
File "C:\Python38\lib\site-packages\openpyxl\reader\excel.py", line 315, in load_workbook
reader.read()
File "C:\Python38\lib\site-packages\openpyxl\reader\excel.py", line 280, in read
self.read_worksheets()
File "C:\Python38\lib\site-packages\openpyxl\reader\excel.py", line 228, in read_worksheets
ws_parser.bind_all()
File "C:\Python38\lib\site-packages\openpyxl\worksheet\_reader.py", line 434, in bind_all
self.bind_cells()
File "C:\Python38\lib\site-packages\openpyxl\worksheet\_reader.py", line 337, in bind_cells
for idx, row in self.parser.parse():
File "C:\Python38\lib\site-packages\openpyxl\worksheet\_reader.py", line 153, in parse
row = self.parse_row(element)
File "C:\Python38\lib\site-packages\openpyxl\worksheet\_reader.py", line 274, in parse_row
cells = [self.parse_cell(el) for el in row]
File "C:\Python38\lib\site-packages\openpyxl\worksheet\_reader.py", line 274, in <listcomp>
cells = [self.parse_cell(el) for el in row]
File "C:\Python38\lib\site-packages\openpyxl\worksheet\_reader.py", line 181, in parse_cell
style_id = int(style_id)
ValueError: invalid literal for int() with base 10: 'string'
I have also tried using this: workbook = load_workbook(filename = 'child_id_survey_918_choose_touching_results_1541687696.xlsx')
Why am I getting this error and how can I solve it?

Pandas: Read excel file for specific columns, remove empty rows, create CSV

I am trying to read a folder of .xlsm files, take columns A:J, remove any empty rows, and then combine each excel file into a single CSV. The code below seems to work when I use just one specific excel file but has an error when I loop. Any help would be appreciated.
import pandas as pd
import os
import glob
# defines the folder to pull from and to save into
source = r"C:\Users\bwendt\QAR"
#defines list of files as dir and changes directory to source
os.chdir(source)
files = glob.glob(source + "/*.xlsm")
MultiRents = []
#loops through list of file paths, reads the file, removes blank cells, and adds to data frame
for f in files:
data = pd.read_excel(f,"Page2",usecols = "A:J")
#data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
MultiRents.append(data)
#create pandas DF
df = pd.DataFrame.from_records(MultiRents)
#Exports dataframe to a csv file
export_csv = df.to_csv("Multifamily_Rents.csv")
Traceback: Traceback (most recent call last):
File "", line 1, in
runfile('C:/Users/bwendt/.spyder-py3/Print_rents.py', wdir='C:/Users/bwendt/.spyder-py3')
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py",
line 827, in runfile
execfile(filename, namespace)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py",
line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/bwendt/.spyder-py3/Print_rents.py", line 21, in
data = pd.read_excel(f,"Page2",usecols = "A:J")
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\util_decorators.py",
line 188, in wrapper
return func(*args, **kwargs)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\util_decorators.py",
line 188, in wrapper
return func(*args, **kwargs)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\excel.py",
line 350, in read_excel
io = ExcelFile(io, engine=engine)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\excel.py",
line 653, in init
self._reader = self._enginesengine
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\io\excel.py",
line 424, in init
self.book = xlrd.open_workbook(filepath_or_buffer)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\xlrd__init__.py",
line 157, in open_workbook
ragged_rows=ragged_rows,
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\xlrd\book.py",
line 92, in open_workbook_xls
biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\xlrd\book.py",
line 1278, in getbof
bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8])
File
"C:\Users\bwendt\AppData\Local\Continuum\anaconda3\lib\site-packages\xlrd\book.py",
line 1272, in bof_error
raise XLRDError('Unsupported format, or corrupt file: ' + msg)
XLRDError: Unsupported format, or corrupt file: Expected BOF record;
found b'\x0eWendt, '

how to extract excel data in python

I want to compute statistics on data from Excel in Python, but right now I cannot even extract the data.
my code
import pandas as pd
import xlrd
file = 'Book1.xlsx'
x1 = pd.ExcelFile(file)
Print(x1.sheet_names)
results:
C:\Users\ldanl\PycharmProjects\1011\venv\Scripts\python.exe
C:/Users/ldanl/PycharmProjects/1011/draft.py
Traceback (most recent call last):
File "C:/Users/ldanl/PycharmProjects/1011/draft.py", line 82, in <module> x1 = pd.ExcelFile(file)
File"C:\Users\ldanl\PycharmProjects\1011\venv\lib\sitepackages\pandas\io\excel.py", line 394, in __init__ self.book = xlrd.open_workbook(self._io)
File "C:\Users\ldanl\PycharmProjects\1011\venv\lib\site-packages\xlrd\__init__.py", line 141, in open_workbook
ragged_rows=ragged_rows,
File "C:\Users\ldanl\PycharmProjects\1011\venv\lib\site-packages\xlrd\xlsx.py", line 808, in open_workbook_2007_xml
x12book.process_stream(zflo, 'Workbook')
File "C:\Users\ldanl\PycharmProjects\1011\venv\lib\site-packages\xlrd\xlsx.py", line 265, in process_stream
meth(self, elem)
File "C:\Users\ldanl\PycharmProjects\1011\venv\lib\site-packages\xlrd\xlsx.py", line 392, in do_sheet
sheet = Sheet(bk, position=None, name=name, number=sheetx)
File "C:\Users\ldanl\PycharmProjects\1011\venv\lib\site-packages\xlrd\sheet.py", line 326, in __init__
self.extract_formulas = book.extract_formulas
AttributeError: 'Book' object has no attribute 'extract_formulas'
Process finished with exit code 1
For Pandas --> Try pd.read_excel()
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html#pandas-read-excel
For Xlrd --> Try xlrd.open_workbook()
https://xlrd.readthedocs.io/en/latest/api.html#xlrd.open_workbook

error opening xlsx files in python

I am trying to open an xlsx file that is created by another system (and this is the format in which the data always comes, and is not in my control). I tried both openpyxl (v2.3.2) and xlrd (v1.0.0) (as well as pandas (v0.20.1) read_excel and pd.ExcelFile(), both of which are using xlrd, and so may be moot), and I am running into errors; plus not finding answers from my searches. Any help is appreciated.
xlrd code:
import xlrd
workbook = xlrd.open_workbook(r'C:/Temp/Data.xlsx')
Error:
Traceback (most recent call last):
File "<ipython-input-3-9e5d87f720d0>", line 2, in <module>
workbook = xlrd.open_workbook(r'C:/Temp/Data.xlsx')
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\__init__.py", line 422, in open_workbook
ragged_rows=ragged_rows,
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 833, in open_workbook_2007_xml
x12sheet.process_stream(zflo, heading)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 548, in own_process_stream
self_do_row(elem)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 685, in do_row
self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index)
ValueError: could not convert string to float:
openpyxl code:
import openpyxl
wb = openpyxl.load_workbook(r'C:/Temp/Data.xlsx')
Error:
Traceback (most recent call last):
File "<ipython-input-2-6083ad2bc875>", line 1, in <module>
wb = openpyxl.load_workbook(r'C:/Temp/Data.xlsx')
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\reader\excel.py", line 234, in load_workbook
parser.parse()
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\reader\worksheet.py", line 106, in parse
dispatcher[tag_name](element)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\reader\worksheet.py", line 243, in parse_row_dimensions
self.parse_cell(cell)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\reader\worksheet.py", line 188, in parse_cell
value = _cast_number(value)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\cell\read_only.py", line 23, in _cast_number
return long(value)
ValueError: invalid literal for int() with base 10: ' '
pandas code:
import pandas as pd
df = pd.read_excel(r'C:/Temp/Data.xlsx', sheetname='Sheet1')
Error:
Traceback (most recent call last):
File "<ipython-input-5-b86ec98a4e9e>", line 2, in <module>
df = pd.read_excel(r'C:/Temp/Data.xlsx', sheetname='Sheet1')
File "C:\Program Files\Anaconda3\lib\site-packages\pandas\io\excel.py", line 200, in read_excel
io = ExcelFile(io, engine=engine)
File "C:\Program Files\Anaconda3\lib\site-packages\pandas\io\excel.py", line 257, in __init__
self.book = xlrd.open_workbook(io)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\__init__.py", line 422, in open_workbook
ragged_rows=ragged_rows,
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 833, in open_workbook_2007_xml
x12sheet.process_stream(zflo, heading)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 548, in own_process_stream
self_do_row(elem)
File "C:\Program Files\Anaconda3\lib\site-packages\xlrd\xlsx.py", line 685, in do_row
self.sheet.put_cell(rowx, colx, None, float(tvalue), xf_index)
ValueError: could not convert string to float:
For what it's worth, here is an example snippet of the input file:
I am guessing that the errors are coming from the first row having blanks beyond the first column, because the errors vanish when I delete the first two rows. I cannot skip the first two rows, because I want to extract the value in cell A1. I would also like to force the values read to be string type, and will later convert them to float with error checking. Thanks!
===========
Update(Aug 9 10AM EDT): Using Charlie's suggestion, was able to open excel file in read only mode; and was able to read most of the contents - but still running into an error somewhere.
new code (sorry it is not very pythonic - still a newbie):
wb = openpyxl.load_workbook(r'C:/Temp/Data.xlsx', read_only=True)
ws = wb['Sheet1']
ws.max_row = ws.max_column = None
i=1
for row in ws.rows:
for cell in row:
if i<2000:
i += 1
try:
print(i, cell.value)
except:
print("error")
Error:
Traceback (most recent call last):
File "<ipython-input-65-2e8f3cf2294a>", line 2, in <module>
for row in ws.rows:
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\worksheet\read_only.py", line 125, in get_squared_range
yield tuple(self._get_row(element, min_col, max_col))
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\worksheet\read_only.py", line 165, in _get_row
value, data_type, style_id)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\cell\read_only.py", line 36, in __init__
self.value = value
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\cell\read_only.py", line 132, in value
value = _cast_number(value)
File "C:\Program Files\Anaconda3\lib\site-packages\openpyxl\cell\read_only.py", line 23, in _cast_number
return long(value)
ValueError: invalid literal for int() with base 10: ' '
=========
Update2 (10:35AM): When I read the file without ws.max_row and ws.max_column set to None, the code was reading just one column, without errors. The value in cell A66 is "Generated from:". But when I read the file with ws.max_row and ws.max_column set to None, this particular cell causes trouble. I can read all the other cells before it, though, and that will work fine for me right now. Thanks, @Charlie.
Sounds like the source file is probably corrupt and contains cells with empty strings that are typed as numbers. You might be able to use openpyxl's read-only mode to skip the first two rows.
If your program works after you delete the first two rows, then let's skip them. Try using skiprows to ignore the first 2 rows that are blank or are headers. You can use the parse method of a pandas ExcelFile object.
xls = pd.ExcelFile('C:/Temp/Data.xlsx')
df = xls.parse('Sheet1', skiprows=2) #assuming your data is on sheet1.

"pandas.io.common.EmptyDataError: No columns to parse from file" after moving to mac

In Windows 8, the script works fine. After I moved script and data.csv to work in my mac, I keep getting error: "pandas.io.common.EmptyDataError: No columns to parse from file."
The script and data are in the same folder as
"/Users/myname/Downloads/test/testimport.py"
"/Users/myname/Downloads/test/test2.csv"
I've tried many file locations to read the csv but nothing works.
file_loc = "../test/test2.csv"
# as well as "../test2.csv", "/test2.csv", "/Users/myname/Downloads/test/test2.csv"
import pandas as pd
df = pd.read_csv(file_loc)
exp_mat = df.as_matrix()
print exp_mat
How can I read the CSV here? Is the file location wrong, or is the CSV file type not compatible on a Mac?
Here is OS X El Capitan. Full error is
h143% python testimport.py
Traceback (most recent call last):
File "test_importexcel.py", line 24, in <module>
df = pd.read_csv(file_loc)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 389, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 730, in __init__
self._make_engine(self.engine)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 923, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/Users/myname/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py", line 1390, in __init__
self._reader = _parser.TextReader(src, **kwds)
File "pandas/parser.pyx", line 538, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:6171)
pandas.io.common.EmptyDataError: No columns to parse from file
Data (copied from Numbers) looks like this:
x time value
445.1207 0.003626 21935450
445.1203 0.011099 36700932
445.1203 0.017235 35722172
445.1203 0.022958 33623668
445.1203 0.028689 33500360
352.3396 37.180567 307886720
352.3396 37.185836 303264100
352.3396 37.191101 292523810

Categories