Dynamically append dataframes in pandas - python

I want to load files from a list, calculate the mean, median, and standard deviation for each row of each file, and then create a dataframe listing all the newly calculated fields.
I have the following code:
import pandas as pd

# list of files to load
file_names = ["file_1", "file_2", ...]
# empty df
data = pd.DataFrame()
for filename in file_names:
    df = pd.read_csv(filename, index_col=False, header=0)
    mean = df.mean(axis=1)
    median = df.median(axis=1)
    std = df.std(axis=1)
    df = pd.concat([mean, median, std], axis=1, ignore_index=True)
    data = pd.concat(df, axis=1)
I'm getting an error:
TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"
The individual dfs created in the for loop look exactly how I want them, but I can't concatenate them all together.

As it is, you're overwriting data every time through the loop, and pd.concat is being handed a single DataFrame rather than the iterable of pandas objects it expects, which is exactly what the TypeError says. Instead, collect the DataFrames in a list, then concatenate that list once after the loop:
df_list = []
for filename in file_names:
    df = pd.read_csv(filename, index_col=False, header=0)
    mean = df.mean(axis=1)
    median = df.median(axis=1)
    std = df.std(axis=1)
    df = pd.concat([mean, median, std], axis=1, ignore_index=True)
    df_list.append(df)
data = pd.concat(df_list, axis=1)
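Since ignore_index=True leaves the three columns labeled 0, 1, and 2, you may want named statistics instead; a small variation (a sketch using pd.concat's keys parameter):

# label the three statistics instead of getting integer column names
df = pd.concat([mean, median, std], axis=1, keys=["mean", "median", "std"])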

Alternatively, change this line
data = pd.concat(df, axis=1)
to
data = pd.concat([data, df], axis=1)
and it should work: pd.concat then receives the iterable it expects, and the result accumulates across iterations.
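For completeness, the loop with that one-line fix (a sketch; note that growing data inside the loop copies it on every iteration, so the list-based approach above scales better for many files):

import pandas as pd

data = pd.DataFrame()
for filename in file_names:
    df = pd.read_csv(filename, index_col=False, header=0)
    stats = pd.concat([df.mean(axis=1), df.median(axis=1), df.std(axis=1)],
                      axis=1, ignore_index=True)
    # accumulate the per-file statistics column-wise
    data = pd.concat([data, stats], axis=1)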

Related

How to read and manipulate multiple CSV files using pandas and for-loop?

I want to read a list of CSV files, for example exon_kipan.00001.csv, exon_kipan.00002.csv, exon_kipan.00003.csv, and exon_kipan.00004.csv (24 files in total), and then perform a series of operations using pandas before concatenating the dataframes.
For a single file, I would do:
df = pd.read_csv("exon_kipan.csv", sep="\t", index_col=0, low_memory=False)
df = df[df.columns[::3]]
df = df.T
del df[df.columns[0]]
df.index = df.index.str.upper()
df = df.sort_index()
df.index = ['-'.join(s.split('-')[:4]) for s in df.index.tolist()]
df.rename_axis(None, axis=1, inplace=True)
However, now I want to read, manipulate, and concatenate multiple files.
filename = '/work/exon_kipan.{}.csv'
df_dict = {}
exon_clin_list = []
for i in range(1, 25):
    df_dict[i] = pd.read_csv(filename, sep="\t", index_col=0, low_memory=False)
    df_dict[i] = df_dict[i][df_dict[i].columns[::3]]
    df_dict[i] = df_dict[i].T
    del df_dict[i][df_dict[i].columns[0]]
    df_dict[i].index = df_dict[i].index.str.upper()
    df_dict[i] = df_dict[i].sort_index()
    df_dict[i].index = ['-'.join(s.split('-')[:4]) for s in df_dict[i].index.tolist()]
    df_dict[i].rename_axis(None, axis=1, inplace=True)
    exon_clin_list.append(df_dict[i])
exon_clin = pd.concat(df_list)
My code raised:
FileNotFoundError: [Errno 2] No such file or directory: '/work/exon_kipan.{}.csv'
You have to use the format method of str, zero-padding the counter to five digits so it matches the file names:
filename = '/work/exon_kipan.{:05}.csv' # <- don't forget to modify here
...
for i in range(1, 25):
    df_dict[i] = pd.read_csv(filename.format(i), ...)
Test:
filename = '/work/exon_kipan.{:05}.csv'
for i in range(1, 25):
    print(filename.format(i))
# Output
/work/exon_kipan.00001.csv
/work/exon_kipan.00002.csv
/work/exon_kipan.00003.csv
/work/exon_kipan.00004.csv
/work/exon_kipan.00005.csv
/work/exon_kipan.00006.csv
/work/exon_kipan.00007.csv
/work/exon_kipan.00008.csv
/work/exon_kipan.00009.csv
/work/exon_kipan.00010.csv
/work/exon_kipan.00011.csv
/work/exon_kipan.00012.csv
/work/exon_kipan.00013.csv
/work/exon_kipan.00014.csv
/work/exon_kipan.00015.csv
/work/exon_kipan.00016.csv
/work/exon_kipan.00017.csv
/work/exon_kipan.00018.csv
/work/exon_kipan.00019.csv
/work/exon_kipan.00020.csv
/work/exon_kipan.00021.csv
/work/exon_kipan.00022.csv
/work/exon_kipan.00023.csv
/work/exon_kipan.00024.csv
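The {:05} spec zero-pads the integer to width 5, which is what produces the 00001 through 00024 names. Equivalently, with an f-string:

filename = f'/work/exon_kipan.{i:05}.csv'  # same result as '/work/exon_kipan.{:05}.csv'.format(i)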
Maybe something like this will work:

import glob
import os

import pandas as pd

# read one file, do some processing, and return a dataframe
def read_file_and_do_some_actions(filename):
    df = pd.read_csv(filename, index_col=None, header=0)
    #############################
    # do some processing
    #############################
    return df

path = r'/home/tester/inputdata/exon_kipan'
all_files = glob.glob(os.path.join(path, "exon_kipan.*.csv"))
# for each file in all_files, call read_file_and_do_some_actions, then concatenate all the dataframes into one
df = pd.concat((read_file_and_do_some_actions(f) for f in all_files), ignore_index=True)
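One caveat: glob returns file names in arbitrary order, so if the row order of the combined dataframe matters, sort the list first:

# keep the 00001..00024 order stable across platforms
all_files = sorted(glob.glob(os.path.join(path, "exon_kipan.*.csv")))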

Append Values to CSV and retain the old data

I have to append data to a CSV. The problem I am facing is that instead of appending I am overwriting the data and not retaining the old rows. Example:
finalDf = pd.DataFrame(columns=['sourcez', 'tergetz', 'TMP'])
df = pd.DataFrame()
df["sourcez"] = ["str(source_Path)"]
df["tergetz"] = ["str(target_path)"]
df["TMP"] = ["total_matching_points"]
finalDf = finalDf.append(df)
finalDf.to_csv('Testing.csv', index=False)
Now if I add a new value:
finalDf = pd.DataFrame(columns=['sourcez', 'tergetz', 'TMP'])
df = pd.DataFrame()
df["sourcez"] = ["str(source_Path)_New"]
df["tergetz"] = ["str(target_path)_New"]
df["TMP"] = ["total_matching_points_New"]
finalDf = finalDf.append(df)
finalDf.to_csv('Testing.csv', index=False)
It keeps only the latest data in the csv; instead I want both rows in the csv. Any idea?
I have tried to create a new csv with a pandas dataframe, and I want to append the values instead of overwriting.
I have tried:
finalDf = pd.DataFrame(columns=['sourcez', 'tergetz', 'TMP'])
df = pd.DataFrame()
df["sourcez"] = ["str(source_Path)"]
df["tergetz"] = ["str(target_path)"]
df["TMP"] = ["total_matching_points"]
finalDf = finalDf.append(df)
finalDf.to_csv('Testing.csv', index=False, mode='a+')
But the problem is that the heading is repeated in the csv:
sourcez,tergetz,TMP
str(source_Path),str(target_path),total_matching_points
sourcez,tergetz,TMP
str(source_Path)_New,str(target_path)_New,total_matching_points_New
How do I remove the repeated heading sourcez,tergetz,TMP?
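One way to do this (a sketch, assuming the output file is Testing.csv as above) is to append with mode='a' and write the header only when the file does not exist yet:

import os
import pandas as pd

df = pd.DataFrame()
df["sourcez"] = ["str(source_Path)_New"]
df["tergetz"] = ["str(target_path)_New"]
df["TMP"] = ["total_matching_points_New"]
# write the header only on the first write, while Testing.csv does not yet exist
df.to_csv('Testing.csv', mode='a', header=not os.path.exists('Testing.csv'), index=False)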

Pandas dataframe concat after reading large number of txt files using glob takes never ending time

There are some 50k txt files which I am trying to read into a pandas dataframe with the code below.
But the process has been running for 2 hrs and is still not done. Is there any better way to speed this up?
import glob

import pandas as pd

folder_path = '/drive/My Drive/dataset/train'
file_list = glob.glob(folder_path + "/*.txt")

def read_clean_df(file_name) -> pd.DataFrame:
    df = pd.read_fwf(file_name, header=None)
    df = df.drop(df.index[19])
    df = df.T
    df.columns = df.iloc[0]
    df = df[1:]
    df.reset_index(drop=True, inplace=True)
    return df

train_df = read_clean_df(file_list[0])
for file_name in file_list[1:len(file_list)]:
    df = read_clean_df(file_name)
    train_df = pd.concat([train_df, df], axis=0)
train_df.reset_index(drop=True, inplace=True)
print(train_df.head(30))
Yeah, repeatedly calling concat is slow; this is the reason DataFrame.append was deprecated.
Instead, do:
dfs = []
for file_name in file_list:
    df = read_clean_df(file_name)
    dfs.append(df)
train_df = pd.concat(dfs)
Do the concatenation once, at the end:
dfs = []
for file_name in file_list:
    df = read_clean_df(file_name)
    dfs.append(df)
train_df = pd.concat(dfs, axis=0)
If that is not fast enough, use datatable, which can do multithreaded IO when reading csv files:
import datatable as dt

df = dt.rbind(list(dt.iread(file_list))).to_pandas()
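If you would rather stay in pandas, the per-file parsing can also be spread across processes; a sketch using only the standard library, assuming the read_clean_df function defined in the question:

from concurrent.futures import ProcessPoolExecutor

import pandas as pd

# parse the files in parallel worker processes, then concatenate once
with ProcessPoolExecutor() as executor:
    dfs = list(executor.map(read_clean_df, file_list))
train_df = pd.concat(dfs, ignore_index=True)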

Subsetting in chunks in pandas

Here is my code:
path = 'C:\\Users\\Daniil\\Desktop\\dw_payments'
# list of all files
all_files = glob.glob(path + '/*.csv')
all_payments_data = pd.DataFrame()
dfs = []
for file in all_files:
    df = pd.read_csv(file, index_col=None, chunksize=200000)
    df_f = df[df['CUSTOMER_NO'] == 20069675]
    df_f = pd.concat(df_f, ignore_index=True)
    dfs.append(df_f)
all_payments_data = pd.concat(dfs)
As you can see in the line df_f = df[df['CUSTOMER_NO'] == 20069675], I want to select a specific customer within one chunk and then merge it into the empty data frame, and I want to repeat that process many times (there are a lot of files).
But it throws me an error:
TypeError: 'TextFileReader' object is not subscriptable
How can I fix it?
I think you need to iterate over the TextFileReader, filter each chunk, and append it to df_s; then concat only once at the end.
Note: the structure of all the files has to be the same (same column names in the same order).
df_s = []
for file in all_files:
    txt = pd.read_csv(file, index_col=None, chunksize=200000)
    for df in txt:
        df_s.append(df[df['CUSTOMER_NO'] == 20069675])
df_f = pd.concat(df_s, ignore_index=True)
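The same idea fits in a single generator expression fed to pd.concat, so no intermediate list is kept by hand (a sketch):

# filter every chunk of every file and concatenate the matches once
df_f = pd.concat(
    (chunk[chunk['CUSTOMER_NO'] == 20069675]
     for file in all_files
     for chunk in pd.read_csv(file, index_col=None, chunksize=200000)),
    ignore_index=True,
)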

How to append data in one dataframe from different files?

I have used the following code to read the data from the files. I am trying to build time series data in one data frame, but I am missing something.
import glob

import numpy as np
import pandas as pd

files = glob.glob('*.txt')
files.sort()
for infile in files:
    year, formatt = infile.split('.')
    year = year.split('_')[1]
    ws = [4, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
    df = pd.read_fwf(infile, widths=ws, header=9, nrows=31, keep_default_na=False)
    df = df.drop('Day', 1)
    df = np.array(df.T)
    df = df[df != '']
    data = pd.DataFrame([])
    data['Discharge'] = df
    data = data.set_index(pd.date_range(year, periods=len(data), freq='D'),
                          drop=True, append=False, inplace=False, verify_integrity=False)
    new = pd.DataFrame([])
    all_ = new.append(data)
print(all_)
Can anyone help me figure out my problem?
My sample data is in this link: https://drive.google.com/open?id=0B2rkXkOkG7ExSWQ5djloNkpNWmc
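A likely culprit: new and all_ are recreated on every pass through the loop, so all_ ends up holding only the last file. Collecting each year's frame in a list and concatenating once after the loop should give the full time series; a sketch built on the question's own parsing (assuming the widths and header settings above match the files):

import glob

import numpy as np
import pandas as pd

frames = []
for infile in sorted(glob.glob('*.txt')):
    year = infile.split('.')[0].split('_')[1]
    ws = [4, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
    df = pd.read_fwf(infile, widths=ws, header=9, nrows=31, keep_default_na=False)
    values = np.array(df.drop('Day', axis=1).T)
    values = values[values != '']
    data = pd.DataFrame({'Discharge': values},
                        index=pd.date_range(year, periods=len(values), freq='D'))
    frames.append(data)  # collect each year instead of overwriting
all_ = pd.concat(frames)
print(all_)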
