Concat excel files and worksheets into one using python - python

I have many excel files in a directory, all of them has the same header row. Some of these excel files has multiple worksheets which again have the same headers. I'm trying to loop through the excel files in the directory and for each one check if there are multiple worksheets to concat them as well as the rest of the excel files.
This is what I tried:
# Goal: concatenate every worksheet of every .xlsx file in the script's
# directory into one output file.
import pandas as pd
import os
import ntpath
import glob
# Work from the directory containing this script.
dir_path = os.path.dirname(os.path.realpath(__file__))
os.chdir(dir_path)
for excel_names in glob.glob('*.xlsx'):
# read them in
i=0
# BUG: excel_names is a single filename string here, not a list, so
# excel_names[i] is just the first character of the filename ('G' in the
# traceback below).
df = pd.read_excel(excel_names[i], sheet_name=None, ignore_index=True)
cdf = pd.concat(df.values())
# NOTE(review): writing inside the loop overwrites c.xlsx on every pass.
cdf.to_excel("c.xlsx", header=False, index=False)
# NOTE(review): iterating a string yields characters, so this also treats
# each character of the filename as a workbook path.
excels = [pd.ExcelFile(name) for name in excel_names]
# turn them into dataframes
frames = [x.parse(x.sheet_names[0], header=None,index_col=None) for x in excels]
# delete the first row for all frames except the first
# i.e. remove the header row -- assumes it's the first
frames[1:] = [df[1:] for df in frames[1:]]
# concatenate them..
combined = pd.concat(frames)
# write it out
combined.to_excel("c.xlsx", header=False, index=False)
i+=1
But then I get the below error. Any advice?
"concat excel.py", line 12, in <module>
df = pd.read_excel(excel_names[i], sheet_name=None, ignore_index=True)
File "/usr/local/lib/python2.7/site-packages/pandas/util/_decorators.py", line 188, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python2.7/site-packages/pandas/util/_decorators.py", line 188, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python2.7/site-packages/pandas/io/excel.py", line 350, in read_excel
io = ExcelFile(io, engine=engine)
File "/usr/local/lib/python2.7/site-packages/pandas/io/excel.py", line 653, in __init__
self._reader = self._engines[engine](self._io)
File "/usr/local/lib/python2.7/site-packages/pandas/io/excel.py", line 424, in __init__
self.book = xlrd.open_workbook(filepath_or_buffer)
File "/usr/local/lib/python2.7/site-packages/xlrd/__init__.py", line 111, in open_workbook
with open(filename, "rb") as f:
IOError: [Errno 2] No such file or directory: 'G'

Your for statement is setting excel_names to each filename in turn (so a better variable name would be excel_name):
for excel_names in glob.glob('*.xlsx'):
But inside the loop your code does
df = pd.read_excel(excel_names[i], sheet_name=None, ignore_index=True)
where you are clearly expecting excel_names to be a list from which you are extracting one element. But it isn't a list, it's a string. So you are getting the first character of the first filename.

Related

Iterating over folder names in Python

I have two folders, 1 and 2. I want to go to each folder which has the file Test.xlsx. I tried to iterate on file_loc using i in range(1,3) but there's an error. The code works if I mention 1 or 2 on file_loc.
# Goal: read Test.xlsx from subfolders 1 and 2 and average each row's values.
import pandas as pd
import numpy as np
for i in range(1,3):
# BUG: this is a plain string, so "i" is the literal folder name "\i\";
# the loop variable is never interpolated (see the FileNotFoundError below).
file_loc = "C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\i\\Test.xlsx"
df = pd.read_excel(file_loc, index_col=None, na_values=['NA'], usecols="A,C:AA")
# Column "N" as a numpy array, then drop NaN entries.
A=df["N"].to_numpy()
print([A])
A = [x for x in A if str(x) != 'nan']
print(A)
# NOTE(review): eval() on spreadsheet cell contents — only safe for trusted files.
A = [eval(e) for e in A]
print(A)
A=np.array(A)
print([A])
A_mean=[]
# NOTE(review): this inner loop reuses the name i, shadowing the outer loop variable.
for i in range(0,len(A)):
A_mean.append(np.mean(A[i]))
print(*A_mean, sep='\n')
The error is
Traceback (most recent call last):
File "C:\Users\USER\OneDrive - Technion\Research_Technion\Python_PNM\Sept12_2022\Test.py", line 12, in <module>
df = pd.read_excel(file_loc, index_col=None, na_values=['NA'], usecols="A,C:AA")
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\util\_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 364, in read_excel
io = ExcelFile(io, storage_options=storage_options, engine=engine)
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 1191, in __init__
ext = inspect_excel_format(
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\excel\_base.py", line 1070, in inspect_excel_format
with get_handle(
File "C:\Users\USER\anaconda3\lib\site-packages\pandas\io\common.py", line 711, in get_handle
handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\i\\Test.xlsx'
for i in range(1,3):
# f-string: {i} interpolates the loop variable into the path on each pass.
file_loc = f"C:\\Users\\USER\\OneDrive - Technion\\Research_Technion\\Python_PNM\\Sept12_2022\\{i}\\Test.xlsx"
...
Make sure you entered the correct path.

string.split() giving memory error in pandas dataframe

I am trying to split a string but I am getting a memory error. Is there any way to solve this, or is there an alternative solution?
I am getting the error in the code below:
# Split the raw text into lines, load into a DataFrame, then split each
# line on commas into separate columns.
content_str = str(content_str).split('\n')
df1 = pd.DataFrame(content_str)
# MemoryError is raised here: expand=True materialises one column per
# comma-separated field across every row (see traceback below).
df1 = df1[0].str.split(',', expand=True)
Error-
Traceback (most recent call last):
File "ravi_sir.py", line 47, in <module>
df1 = df1[0].str.split(',', expand=True)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2001, in wrapper
return func(self, *args, **kwargs)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2690, in split
return self._wrap_result(result, expand=expand, returns_string=expand)
File "/app/python3/lib/python3.6/site-packages/pandas/core/strings.py", line 2272, in _wrap_result
result = cons(result, columns=name, index=index, dtype=dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/frame.py", line 520, in __init__
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/construction.py", line 93, in arrays_to_mgr
return create_block_manager_from_arrays(arrays, arr_names, axes)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1650, in create_block_manager_from_arrays
blocks = form_blocks(arrays, names, axes)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1739, in form_blocks
object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1784, in _simple_blockify
values, placement = _stack_arrays(tuples, dtype)
File "/app/python3/lib/python3.6/site-packages/pandas/core/internals/managers.py", line 1830, in _stack_arrays
stacked = np.empty(shape, dtype=dtype)
MemoryError
I am trying to read zip file from s3 bucket and saving the content into dataframe to get the total count of files inside that zip file. Creating the dataframe. My full code is given below-
# For each table file under the S3 prefix: download the gzipped object,
# decode it, and count its data rows via a DataFrame.
list_table = []
for table in d:
dict_table = OrderedDict()
s_time = datetime.datetime.now().strftime("%H:%M:%S")
print("start_time--->>",s_time)
print("tablename--->>", table)
s3 = boto3.resource('s3')
key='raw/vs-1/load-1619/data' +'/'+ table
obj = s3.Object('********',key)
n = obj.get()['Body'].read()
# Decompress the gzipped body entirely in memory.
gzipfile = BytesIO(n)
gzipfile = gzip.GzipFile(fileobj=gzipfile)
content = gzipfile.read()
#print(content)
content_str = content.decode('utf-8')
content_str = str(content_str).split('\n')
df1 = pd.DataFrame(content_str)
# NOTE(review): only len(df1) is used below, so this expand=True split
# (the MemoryError site) is unnecessary work.
df1 = df1[0].str.split(',', expand=True)
#df1 = pd.DataFrame([x.split(',') for x in str(content_str).split('\n')])
#print(df1)
#count = os.popen('aws s3 cp s3://itx-agu-lake/raw/vs-1/load-1619/data/{0} - | wc -l'.format(table)).read()
# NOTE(review): the -2 offset presumably discounts a header and a trailing
# blank line — confirm against the file format.
count = int(len(df1)) - 2
del(df1)
e_time = datetime.datetime.now().strftime("%H:%M:%S")
print("End_time---->>",e_time)
print(count)
dict_table['Table_Name'] = str(table)
dict_table['Count'] = count
list_table.append(dict_table)
Since you are splitting a huge string using a df column, then deleting the df, looks like you only need the count of commas for each row. So get the count, which is simple, rather than splitting the df -- which could generate a huge amount of columns and therefore cause your memory error.
# Demo: count the commas per row directly instead of splitting the column,
# avoiding the wide expand=True frame that triggers the MemoryError.
sample_rows = ['1,2,3,4', '5,6', '7,8,9']
df = pd.DataFrame({'col': sample_rows})
df['count_commas'] = df['col'].str.count(',')
print(df)
# expected output:
#        col  count_commas
# 0  1,2,3,4             3
# 1      5,6             1
# 2    7,8,9             2

Export/import dataframe as Excel sheet

I have an example from the pandas documentation site and can't get it to run. Exporting as an Excel file works well, but the following import does not:
import numpy as np
import pandas as pd
# Random-walk demo frame, exported to Excel and read back.
df = pd.DataFrame(np.random.randn(1000, 4), columns=list("ABCD"))
df = df.cumsum()
df.to_excel("/tmp/foo.xlsx", sheet_name="Sheet1")
print("Reading data back from an excel file")
# Fails here: no engine= is given and the default xlrd reader rejects the
# .xlsx file ("Excel xlsx file; not supported" in the traceback below).
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"])
#print(df2)
my error message:
python3 /tmp/downloads/tmp_358/main.py
Reading data back from an excel file
Traceback (most recent call last):
File "/tmp/downloads/tmp_358/main.py", line 10, in <module>
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"])
File "/usr/local/lib/python3.6/dist-packages/pandas/util/_decorators.py", line 296, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 304, in read_excel
io = ExcelFile(io, engine=engine)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 867, in __init__
self._reader = self._engines[engine](self._io)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_xlrd.py", line 22, in __init__
super().__init__(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_base.py", line 353, in __init__
self.book = self.load_workbook(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/pandas/io/excel/_xlrd.py", line 37, in load_workbook
return open_workbook(filepath_or_buffer)
File "/usr/local/lib/python3.6/dist-packages/xlrd/__init__.py", line 170, in open_workbook
raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
xlrd.biffh.XLRDError: Excel xlsx file; not supported
Python 3.6.9 and pandas==1.1.4 on Ubuntu 18.04
Can you try this:
# Select the openpyxl engine explicitly for both writing and reading,
# since the xlrd reader rejects .xlsx files (see the error above).
df.to_excel("/tmp/foo.xlsx", sheet_name="Sheet1", engine='openpyxl')
df2=pd.read_excel("/tmp/foo.xlsx", "Sheet1", index_col=None, na_values=["NA"], engine='openpyxl')

Python - Trying to extract files from one location to another

I am trying to pull a set of files from a server and store them in one of the folders on my local machine. The below code works well for this task. However, if any of the files is empty, it stops at that point and does not continue further.
# Read each file into a DataFrame and keep a running concatenation.
list_ = []
for file_ in allFiles:
try:
df = pd.read_csv(file_, index_col=None, delim_whitespace=True)
list_.append(df)
temp = pd.concat(list_)
except EmptyDataError:
df = pd.DataFrame()
# BUG: return exits the enclosing function on the first empty file,
# abandoning the rest of the loop.
return df
Could anyone advise as to how I could bypass these empty files and continue to extract the other files from the server? Thanks
Update:
Given below is the function I am trying to perform
# Second attempt: skip empty files with continue instead of returning.
list_ = []
for file_ in allFiles:
try:
df = pd.read_csv(file_, index_col=None, header=None, delim_whitespace=True)
list_.append(df)
temp = pd.concat(list_)
except pd.errors.EmptyDataError:
continue
# NOTE(review): this comprehension re-reads every file WITHOUT the
# EmptyDataError guard above, so an empty file still raises here.
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles] <<-- Error thrown on this line as per trackback
df = pd.concat(df_v1, ignore_index=True, sort=False)
Traceback:
Traceback (most recent call last):
File "/Users/PycharmProjects/venv/try.py", line 102, in <module>
s3_func("stores","store_a", "2018-10-03", "2018-10-05")
File "/Users/PycharmProjects/venv/try.py", line 86, in s3_func
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles]
File "/Users/PycharmProjects/venv/try.py", line 86, in <listcomp>
df_v1 = [pd.read_csv(fp, delim_whitespace=True).assign(FileName=os.path.basename(fp)) for fp in allFiles]
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 678, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 440, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 787, in __init__
self._make_engine(self.engine)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 1014, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/Users/PycharmProjects/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 1708, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 542, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
Your loop is exiting upon reaching the return statement.
If you want to continue the iteration if exception occurs you can do the following:
# Same loop, but continue lets the iteration move past empty files.
list_ = []
for file_ in allFiles:
try:
df = pd.read_csv(file_, index_col=None, delim_whitespace=True)
list_.append(df)
temp = pd.concat(list_)
except EmptyDataError:
df = pd.DataFrame()
continue # Changed return with continue, since return breaks the loop.
Also I see that you are creating an empty data frame on exception. What do you do with that empty data frame? Do you need it for future usage?
If you would need the empty data frames in future, consider appending them to the list as well
except EmptyDataError:
df = pd.DataFrame()
list_.append(df) # Appending empty dataframes to the list
continue

How to read the returned value (from previous function) into pandas, python? Getting error messages

In the following program
I want to access/pipe the data from one function in the downstream function.
With the python code something like below:
# Pipeline: read_file() accumulates three tab-separated data strings,
# main() hands them to do_calc() for loading into DataFrames.
def main():
data1, data2, data3 = read_file()
do_calc(data1, data2, data3)
def read_file():
data1 = ""
data2 = ""
data3 = ""
file1 = open('file1.txt', 'r+').read()
for line in file1
do something....
data1 += calculated_values
file2 = open('file2.txt', 'r+').read()
for line in file1
do something...
data2 += calculated_values
# NOTE(review): file1.txt is re-read here for data3; presumably a third
# file was intended — confirm.
file1 = open('file1.txt', 'r+').read()
for line in file1
do something...
data3 += calculated_values
return data1, data2, data3
def do_calc(data1, data2, data3):
# BUG: read_table expects a path or file-like object; passing the data
# string itself makes pandas treat the CONTENTS as a filename, hence the
# FileNotFoundError quoting the data in the traceback below.
d1_frame = pd.read_table(data1, sep='\t')
d2_frame = pd.read_table(data2, sep='\t')
d3_frame = pd.read_table(data3, sep='\t')
all_data = [d1_frame, d2_frame, d3_frame]
main()
What is wrong with the given code? It looks like pandas isn't able to read the input files properly, but it is printing the values from data1, 2 and 3 to the screen.
read_hdf seems to read the file but not properly. Is there a way to read the data returned from function directly into pandas (without writing/reading into a file).
Error message:
Traceback (most recent call last):
File "calc.py", line 757, in <module>
main()
File "calc.py", line 137, in main
merge_tables(pop1_freq_table, pop2_freq_table, f1_freq_table)
File "calc.py", line 373, in merge_tables
df1 = pd.read_table(pop1_freq_table, sep='\t')
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 645, in parser_f
return _read(filepath_or_buffer, kwds)
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 388, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 729, in __init__
self._make_engine(self.engine)
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 922, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/home/everestial007/.local/lib/python3.5/site-packages/pandas/io/parsers.py", line 1389, in __init__
self._reader = _parser.TextReader(src, **kwds)
File "pandas/parser.pyx", line 373, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4019)
File "pandas/parser.pyx", line 665, in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:7967)
FileNotFoundError: File b'0.667,0.333\n2\t15800126\tT\tT,A\t0.667,0.333\n2\t15800193\tC\tC,T\t0.667,0.333\n2\t15800244\tT\tT,C\......
I would appreciate any explanation.
read_table is expecting a file as input, but you pass a string of data instead of a string with the file location. You could write your data to a file and then read from that file. Assuming the string is already properly formatted:
# Workaround: persist the string to disk, then point read_table at the path.
filename = 'tab_separated_file_1.dat'
with open(filename, 'w') as f:
f.write(data1)
df1 = pd.read_table(filename, sep='\t')
As other answers have said, read_table expects a file for input--or, more accurately, a "file-like object". You can use a StringIO object to wrap the data1, data2, and data3 strings in an object that will "behave" like a file when fed to pandas with a few tweaks to your code:
# Import StringIO so the accumulated data behaves like a file for pandas.
# from StringIO import StringIO   # python 2
from io import StringIO          # python 3


def main():
    data1, data2, data3 = read_file()
    do_calc(data1, data2, data3)


def read_file():
    """Accumulate three tab-separated datasets into in-memory buffers.

    Returns the three StringIO objects, rewound to position 0 so a reader
    sees the data from the start.
    """
    # use StringIO objects instead of strings...
    data1 = StringIO()
    data2 = StringIO()
    data3 = StringIO()

    file1 = open('file1.txt', 'r+').read()
    for line in file1:
        # do something....
        # note that " += " became ".write()"
        data1.write(calculated_values)

    file2 = open('file2.txt', 'r+').read()
    for line in file1:
        # do something...
        data2.write(calculated_values)

    file1 = open('file1.txt', 'r+').read()
    for line in file1:
        # do something...
        data3.write(calculated_values)

    # FIX: after writing, each buffer's position sits at the end of the
    # stream; without rewinding, pd.read_table would read nothing and raise
    # EmptyDataError. seek(0) restores the read position to the start.
    for buf in (data1, data2, data3):
        buf.seek(0)
    return data1, data2, data3


def do_calc(data1, data2, data3):
    """Load each tab-separated buffer into a DataFrame."""
    d1_frame = pd.read_table(data1, sep='\t')
    d2_frame = pd.read_table(data2, sep='\t')
    d3_frame = pd.read_table(data3, sep='\t')
    all_data = [d1_frame, d2_frame, d3_frame]


main()

Categories