Organising columns in pandas DataFrame - python

I have 2 sets of 96 columns. At the moment, all 96 columns are in one row. Is it possible to reorganise this such that the 96 columns are split into sets of 12, essentially giving 8 sets of 12 columns (one underneath the other)? My code:
import glob
import pandas as pd
import os
os.chdir('C:/Users/peaches9/Desktop/')
Result = []
def FID_extract(filepath):
    """Gather the FID readings from every CSV file matching a glob pattern.

    For each matching file, rows labelled 12 through 17 of the column
    'Unnamed: 3' are taken and stored as one column named 'Run <n>'.
    The combined DataFrame is appended to the module-level ``Result`` list.

    Parameters
    ----------
    filepath : str
        Glob pattern selecting the input CSV files.
    """
    # Sort the matches so that the 'Run <n>' numbering is reproducible;
    # glob.glob() makes no promise about ordering.
    files = sorted(glob.glob(filepath))
    # pd.DataFrame.from_csv was removed from pandas; read_csv replaces it.
    dataframes = [pd.read_csv(f, index_col=None) for f in files]
    new_dfa = pd.DataFrame()
    for i, df in enumerate(dataframes):
        colname = 'Run {}'.format(i+1)
        # .ix was removed from pandas; .loc slices by label and, like the
        # original .ix call on an integer index, includes both end points.
        selected_data = df['Unnamed: 3'].loc[12:17]
        new_dfa[colname] = selected_data
    #print new_dfa
    #new_dfa.to_csv('FID_11169_Liquid.csv')
    Result.append(new_dfa)
def TCD_extract(filepath):
    """Gather the TCD readings from every CSV file matching a glob pattern.

    For each matching file, rows labelled 12 through 15 of the column
    'Unnamed: 3' are taken and stored as one column named 'Run <n>'.
    The combined DataFrame is appended to the module-level ``Result`` list.

    Parameters
    ----------
    filepath : str
        Glob pattern selecting the input CSV files.
    """
    # Sort the matches so that the 'Run <n>' numbering is reproducible;
    # glob.glob() makes no promise about ordering.
    files = sorted(glob.glob(filepath))
    # pd.DataFrame.from_csv was removed from pandas; read_csv replaces it.
    dataframes = [pd.read_csv(f, index_col=None) for f in files]
    new_dfb = pd.DataFrame()
    for i, df in enumerate(dataframes):
        colname = 'Run {}'.format(i+1)
        # .ix was removed from pandas; .loc slices by label and, like the
        # original .ix call on an integer index, includes both end points.
        selected_data = df['Unnamed: 3'].loc[12:15]
        new_dfb[colname] = selected_data
    #print new_dfb
    #new_dfb.to_csv('TCD_11169_liquid.csv')
    Result.append(new_dfb)
FID_extract('C:/Users/peaches9/Desktop/Cryostat Verification/GC results/11169_Cryo_1bar/FID_0*') #files directory
TCD_extract('C:/Users/peaches9/Desktop/Cryostat Verification/GC results/11169_Cryo_1bar/TCD_0*')
dfc = pd.concat(Result)
Out:
Run 1..... Run 95 Run 96
12 5193791.85 5193915.21 5194343.34
13 1460874.04 1460929.33 1461072.84
14 192701.82 192729.55 192743.99
15 156836.4 156876.97 156889.26
16 98342.84 98346.7 98374.95
17 NaN NaN NaN
12 3982.69 3982.16 4017.66
13 2913008.04 2913627.33 2914075.7
14 226963.37 226956.1 227106.71
15 25208.2 25173.89 25197.88
I want all 96 columns split into 8 X 12 columns all underneath each other. Many thanks in advance.
EDIT:
I have managed to separate the dataframe into 8 sets of 12 columns... but I can't get the dataframes to go beneath each other. They always concat to the right!
dfc = pd.concat(Result)
df1 = dfc.ix[:,0:12]
df2 = dfc.ix[:,12:24]
df3 = dfc.ix[:,24:36]
df4 = dfc.ix[:,36:48]
df5 = dfc.ix[:,48:60]
df6 = dfc.ix[:,60:72]
df7 = dfc.ix[:,72:84]
df8 = dfc.ix[:,84:96]
pieces = [df1,df2,df3,df4,df5,df6,df7,df8]
df_final = pd.concat([df1, df2], levels = 1, axis = 3)

Assuming you are trying to take your 96 columns, and create a single 2-dimensional DataFrame with 12 columns and 8 times as many rows, then you want:
df_final = pd.concat( pieces, axis=0, ignore_index=True )
If you are trying to make a 3-dimensional DataFrame, with your new dimension having 8 values, you aren't trying to make a DataFrame, but a Panel.

Related

Pandas: How to sort rows based on particular suffix values?

My Pandas data frame contains the following data reading from a csv file:
id,values
1001-MAC, 10
1034-WIN, 20
2001-WIN, 15
3001-MAC, 45
4001-LINUX, 12
4001-MAC, 67
df = pd.read_csv('example.csv')
df.set_index('id', inplace=True)
I have to sort this data frame based on the id column order by given suffix list = ["WIN", "MAC", "LINUX"]. Thus, I would like to get the following output:
id,values
1034-WIN, 20
2001-WIN, 15
1001-MAC, 10
3001-MAC, 45
4001-MAC, 67
4001-LINUX, 12
How can I do that?
Here is one way to do that:
import pandas as pd
df = pd.read_csv('example.csv')
idx = df.id.str.split('-').str[1].sort_values(ascending=False).index
df = df.loc[idx]
df.set_index('id', inplace=True)
print(df)
Try:
df = df.sort_values(
by=["id"], key=lambda x: x.str.split("-").str[1], ascending=False
)
print(df)
Prints:
id values
1 1034-WIN 20
2 2001-WIN 15
0 1001-MAC 10
3 3001-MAC 45
5 4001-MAC 67
4 4001-LINUX 12
Add a column to the dataframe that contains only the suffixes (use the str.split() function for that) and sort the whole df based on that new column.
import pandas as pd
df = pd.DataFrame({
"id":["1001-MAC", "1034-WIN", "2001-WIN", "3001-MAC", "4001-LINUX", "4001-MAC"],
"values":[10, 20, 15, 45, 12, 67]
})
df["id_postfix"] = df["id"].apply(lambda x: x.split("-")[1])
df = df.sort_values("id_postfix", ascending=False)
df = df[["id", "values"]]
print(df)
Please be sure to answer the question. Provide details and share your research!

how to sum up columns from different dataframes into a single dataframe in pandas

Sample data
import pandas as pd
df1 = pd.DataFrame()
df1["Col1"] = [0,2,4,6,2]
df1["Col2"] = [5,1,3,4,0]
df1["Col3"] = [8,0,5,1,7]
df1["Col4"] = [1,4,6,0,8]
#df1_new = df1.iloc[:, 1:3]
df2 = pd.DataFrame()
df2["Col1"] = [8,2,4,6,2,3,5]
df2["Col2"] = [3,7,3,4,0,6,8]
df2["Col3"] = [5,0,5,1,7,9,1]
df2["Col4"] = [0,4,6,0,8,6,0]
#df2_new = df1.iloc[:, 1:3]
dataframes = [df1, df2]
for df in dataframes:
df_new=df.iloc[:, 1:3]
print(df_new.sum(axis=0))
result from above looks like this:
Col2 13
Col3 21
dtype: int64
Col2 31
Col3 28
dtype: int64
But how can I sum up both dataframes and put it into a single one?
Result should look like this:
Real example looks like this:
xlsx_files = glob.glob(os.path.join(path, "*.xlsx"))
#print(csv_files)
# loop over the list of csv files
for f in xlsx_files:
# create df from each excel file
dfs = pd.read_excel(f)
# grab file name to user it in summarized df
file_name = f.split("\\")[-1]
new_df = pd.concat([dfs]).iloc[:,13:28].sum()
You can either sum the dataframes separately and then add the results, or sum the concatenated dataframes:
df1.iloc[:,1:3].sum() + df2.iloc[:,1:3].sum()
pd.concat([df1,df2]).iloc[:,1:3].sum()
In both cases the result is
Col2 44
Col3 49
dtype: int64
You can convert the result from a series to a DataFrame and transpose using
.to_frame().T
to get this output:
Col2 Col3
0 44 49
For the code in your updated question, you probably want something like this:
xlsx_files = glob.glob(os.path.join(path, "*.xlsx"))
#print(csv_files)
# loop over the list of csv files
new_df = pd.DataFrame()
for f in xlsx_files:
# create df from each excel file
dfs = pd.read_excel(f)
# grab file name to user it in summarized df
file_name = f.split("\\")[-1]
new_df = pd.concat([new_df, dfs])
result = new_df.iloc[:,13:28].sum()
here is another way about it
Sum each DataFrame individually, add the two results, convert the resulting Series to a DataFrame, transpose it, and then select Col2 and Col3:
(df1.sum() + df2.sum()).to_frame().T[['Col2','Col3']]
Col2 Col3
0 44 49
Get the columnwise sums of both dataframes, take the middle two columns of each, and add them together. Then, transpose the result to turn the rows into columns:
pd.DataFrame((df1.iloc[:, 1:3].sum() + df2.iloc[:, 1:3].sum())).T
This outputs:
Col2 Col3
0 44 49
Here is one way:
long, short = (df1, df2) if len(df1.index) > len(df2.index) else (df2, df1)
print((short[["Col2", "Col3"]].reindex(long.index, fill_value=0) + long[["Col2", "Col3"]]).sum().to_frame().T)
Or, if you need to use iloc for the columns, here is another way:
long, short = (df1, df2) if len(df1.index) > len(df2.index) else (df2, df1)
print((short.iloc[:, 1:3].reindex(long.index, fill_value=0) + long.iloc[:, 1:3]).sum().to_frame().T)
Output (same for both):
Col2 Col3
0 44 49

Merging DataFrames with Different Columns

Suppose I have two dataframes df1 and df2 as shown by the first two dataframes in the image below. I want to combine them to get df_desired as shown by the final dataframe in the image. My current attempts result in the third dataframe in the image; as you can see it is ignoring the fact that it has already seen a row with name a
My code:
df1 = pd.DataFrame({'name':['a','b'], 'data1':[3,4]})
df2 = pd.DataFrame({'name':['a','c'], 'data2':[1,5]})
def collect_results(target_list, df_list):
df = pd.DataFrame(columns = ['name','data1','data2'])
for i in range(2):
target = target_list[i]
df_target = df_list[i]
smiles = list(df_target['name'])
pxc50 = list(df_target[target])
target_col_names = ['name', target]
df_target_info = pd.DataFrame(columns=target_col_names)
df_target_info['name'] = smiles
df_target_info[target] = pxc50
try:
df = pd.merge(df,df_target_info, how="outer", on=["name",target])
except IndexError:
df = df.reindex_axis(df.columns.union(df_target_info.columns), axis=1)
return df
How can I get the desired behaviour?
You can merge on name with outer join using .merge()
df_desired = df1.merge(df2, on='name', how='outer')
Result:
print(df_desired)
name data1 data2
0 a 3.0 1.0
1 b 4.0 NaN
2 c NaN 5.0

Pandas: modify multiple dataframes (in a loop)

I have multiple data frames and I want to apply the same operation to each of them, so I need to iterate over my data frames.
# read text files
df1 = pd.read_csv("df1.txt", sep="\t", error_bad_lines=False, index_col =None)
df2 = pd.read_csv("df2.txt", sep="\t", error_bad_lines=False, index_col =None)
df3 = pd.read_csv("df3.txt", sep="\t", error_bad_lines=False, index_col =None)
I have used the following code; however, it is not working (that is, all the dataframes remain unchanged and the changes do not affect them):
for df in [df1 , df2 , df3]:
df = df[df["Time"]>= 600.0].reset_index(drop=True)
df.head()
How can I iterate over them, and how can I overwrite the dataframes?
The problem is that you're not changing the data frames in place, but rather creating new ones. Here's a piece of code that changes things in-place. I don't have your data, so I create fake data for the sake of this example:
df1 = pd.DataFrame(range(10))
df2 = pd.DataFrame(range(20))
df3 = pd.DataFrame(range(30))
df_list = [df1, df2, df3]
for df in df_list:
# use whatever condition you need in the following line
# for example, df.drop(df[df["Time"] < 600].index, inplace=True)
# in your case.
df.drop(df[df[0] % 2 == 0].index, inplace=True)
df.reset_index(inplace = True)
print(df2) # for example
The result for df2 is:
index 0
0 1 1
1 3 3
2 5 5
3 7 7
4 9 9
5 11 11
6 13 13
7 15 15
8 17 17
9 19 19
This might work:
df_list = [df1, df2, df3]
for i, df in enumerate(df_list):
    # Rebinding the loop variable alone would leave the list untouched, so
    # the filtered frame is written back into the list by position.
    df_list[i] = df[df["Time"] >= 600.0].reset_index(drop=True)
If you just store the new df in another list or in the same list, you are all good.
newdf_list = [] # create new list to store df
for df in [df1 , df2 , df3]:
df = df[df["Time"]>= 600.0].reset_index(drop=True)
df.head()
newdf_list.append(df) # append changed df to new list

Concat dataframes on different columns

I have 3 different csv files and I'm looking to concat their values. The only condition is that the first csv dataframe must be in column A of the new csv, the second csv dataframe in column B and the third csv dataframe in column C. The number of rows is the same for all csv files.
Also I need to change the three headers to ['año_pasado','mes_pasado','este_mes']
import pandas as pd
df = pd.read_csv('año_pasado_subastas2.csv', sep=',')
df1 = pd.read_csv('mes_pasado_subastas2.csv', sep=',')
df2 = pd.read_csv('este_mes_subastas2.csv', sep=',')
df1
>>>
Subastas
166665859
237944547
260106086
276599496
251813654
223790056
179340698
177500866
239884764
234813107
df2
>>>
Subastas
212003586
161813617
172179313
209185016
203804433
198207783
179410798
156375658
130228140
124964988
df3
>>>
Subastas
142552750
227514418
222635042
216263925
196209965
140984000
139712089
215588302
229478041
222211457
The output that I need is:
año_pasado,mes_pasado,este_mes
166665859,212003586,142552750
237944547,161813617,227514418
260106086,172179313,222635042
276599496,209185016,216263925
251813654,203804433,196209965
223790056,198207783,140984000
179340698,179410798,139712089
177500866,156375658,215588302
239884764,130228140,229478041
234813107,124964988,222211457
I think you need concat of Series, created either by squeeze=True (if the file has only one column of data) or by selecting the columns; for the new column names use the keys parameter:
df = pd.read_csv('año_pasado_subastas2.csv', squeeze=True)
df1 = pd.read_csv('mes_pasado_subastas2.csv', squeeze=True)
df2 = pd.read_csv('este_mes_subastas2.csv', squeeze=True)
cols = ['año_pasado','mes_pasado','este_mes']
df = pd.concat([df, df1, df2], keys = cols, axis=1)
Or:
df = pd.read_csv('año_pasado_subastas2.csv')
df1 = pd.read_csv('mes_pasado_subastas2.csv')
df2 = pd.read_csv('este_mes_subastas2.csv')
cols = ['año_pasado','mes_pasado','este_mes']
df = pd.concat([df['Subastas'], df1['Subastas'], df2['Subastas']], keys = cols, axis=1)
print (df)
año_pasado mes_pasado este_mes
0 166665859 212003586 142552750
1 237944547 161813617 227514418
2 260106086 172179313 222635042
3 276599496 209185016 216263925
4 251813654 203804433 196209965
5 223790056 198207783 140984000
6 179340698 179410798 139712089
7 177500866 156375658 215588302
8 239884764 130228140 229478041
9 234813107 124964988 222211457

Categories