Newbie Python learner here!
I have 20 participant CSV files (P01.csv to P20.csv) containing Stroop test data. The important columns in each are the condition column (a random mix of incongruent and congruent trials), the reaction time (rt) column, and the correct column indicating whether the response was correct (True or False).
Here is an example of the data for P01 (I'm not sure if this counts as a code snippet?):
trialnum,colourtext,colourname,condition,response,rt,correct
1,blue,red,incongruent,red,0.767041,True
2,yellow,yellow,congruent,yellow,0.647259,True
3,green,blue,incongruent,blue,0.990185,True
4,green,green,congruent,green,0.720116,True
5,yellow,yellow,congruent,yellow,0.562909,True
6,yellow,yellow,congruent,yellow,0.538918,True
7,green,yellow,incongruent,yellow,0.693017,True
8,yellow,red,incongruent,red,0.679368,True
9,yellow,blue,incongruent,blue,0.951432,True
10,blue,blue,congruent,blue,0.633367,True
11,blue,green,incongruent,green,1.289047,True
12,green,green,congruent,green,0.668142,True
13,blue,red,incongruent,red,0.647722,True
14,red,blue,incongruent,blue,0.858307,True
15,red,red,congruent,red,1.820112,True
16,blue,green,incongruent,green,1.118404,True
17,red,red,congruent,red,0.798532,True
18,red,red,congruent,red,0.470939,True
19,red,blue,incongruent,blue,1.142712,True
20,red,red,congruent,red,0.656328,True
21,red,yellow,incongruent,yellow,0.978830,True
22,green,red,incongruent,red,1.316182,True
23,yellow,yellow,congruent,green,0.964292,False
24,green,green,congruent,green,0.683949,True
25,yellow,green,incongruent,green,0.583939,True
26,green,blue,incongruent,blue,1.474140,True
27,green,blue,incongruent,blue,0.569109,True
28,green,green,congruent,blue,1.196470,False
29,red,red,congruent,red,4.027546,True
30,blue,blue,congruent,blue,0.833177,True
31,red,red,congruent,red,1.019672,True
32,green,blue,incongruent,blue,0.879507,True
33,red,red,congruent,red,0.579254,True
34,red,blue,incongruent,blue,1.070518,True
35,blue,yellow,incongruent,yellow,0.723852,True
36,yellow,green,incongruent,green,0.978838,True
37,blue,blue,congruent,blue,1.038232,True
38,yellow,green,incongruent,yellow,1.366425,False
39,green,red,incongruent,red,1.066038,True
40,blue,red,incongruent,red,0.693698,True
41,red,blue,incongruent,blue,1.751062,True
42,blue,blue,congruent,blue,0.449651,True
43,green,red,incongruent,red,1.082267,True
44,blue,blue,congruent,blue,0.551023,True
45,red,blue,incongruent,blue,1.012258,True
46,yellow,green,incongruent,yellow,0.801443,False
47,blue,blue,congruent,blue,0.664119,True
48,red,green,incongruent,yellow,0.716189,False
49,green,green,congruent,yellow,0.630552,False
50,green,yellow,incongruent,yellow,0.721917,True
51,red,red,congruent,red,1.153943,True
52,blue,red,incongruent,red,0.571019,True
53,yellow,yellow,congruent,yellow,0.651611,True
54,blue,blue,congruent,blue,1.321344,True
55,green,green,congruent,green,1.159240,True
56,blue,blue,congruent,blue,0.861646,True
57,yellow,red,incongruent,red,0.793069,True
58,yellow,yellow,congruent,yellow,0.673190,True
59,yellow,red,incongruent,red,1.049320,True
60,red,yellow,incongruent,yellow,0.773447,True
61,red,yellow,incongruent,yellow,0.693554,True
62,red,red,congruent,red,0.933901,True
63,blue,blue,congruent,blue,0.726794,True
64,green,green,congruent,green,1.046116,True
65,blue,blue,congruent,blue,0.713565,True
66,blue,blue,congruent,blue,0.494177,True
67,green,green,congruent,green,0.626399,True
68,blue,blue,congruent,blue,0.711896,True
69,blue,blue,congruent,blue,0.460420,True
70,green,green,congruent,yellow,1.711978,False
71,blue,blue,congruent,blue,0.634218,True
72,yellow,blue,incongruent,yellow,0.632482,False
73,yellow,yellow,congruent,yellow,0.653813,True
74,green,green,congruent,green,0.808987,True
75,blue,blue,congruent,blue,0.647117,True
76,green,red,incongruent,red,1.791693,True
77,red,yellow,incongruent,yellow,1.482570,True
78,red,red,congruent,red,0.693132,True
79,red,yellow,incongruent,yellow,0.815830,True
80,green,green,congruent,green,0.614441,True
81,yellow,red,incongruent,red,1.080385,True
82,red,green,incongruent,green,1.198548,True
83,blue,green,incongruent,green,0.845769,True
84,yellow,blue,incongruent,blue,1.007089,True
85,green,blue,incongruent,blue,0.488701,True
86,green,green,congruent,yellow,1.858272,False
87,yellow,yellow,congruent,yellow,0.893149,True
88,yellow,yellow,congruent,yellow,0.569597,True
89,yellow,yellow,congruent,yellow,0.483542,True
90,yellow,red,incongruent,red,1.669842,True
91,blue,green,incongruent,green,1.158416,True
92,blue,red,incongruent,red,1.853055,True
93,green,yellow,incongruent,yellow,1.023785,True
94,yellow,blue,incongruent,blue,0.955395,True
95,yellow,yellow,congruent,yellow,1.303260,True
96,blue,yellow,incongruent,yellow,0.737741,True
97,yellow,green,incongruent,green,0.730972,True
98,green,red,incongruent,red,1.564596,True
99,yellow,yellow,congruent,yellow,0.978911,True
100,blue,yellow,incongruent,yellow,0.508151,True
101,red,green,incongruent,green,1.821969,True
102,red,red,congruent,red,0.818726,True
103,yellow,yellow,congruent,yellow,1.268222,True
104,yellow,yellow,congruent,yellow,0.585495,True
105,green,green,congruent,green,0.673404,True
106,blue,yellow,incongruent,yellow,1.407036,True
107,red,red,congruent,red,0.701050,True
108,red,green,incongruent,red,0.402334,False
109,red,green,incongruent,green,1.537681,True
110,green,yellow,incongruent,yellow,0.675118,True
111,green,green,congruent,green,1.004550,True
112,yellow,blue,incongruent,blue,0.627439,True
113,yellow,yellow,congruent,yellow,1.150248,True
114,blue,yellow,incongruent,yellow,0.774452,True
115,red,red,congruent,red,0.860966,True
116,red,red,congruent,red,0.499595,True
117,green,green,congruent,green,1.059725,True
118,red,red,congruent,red,0.593180,True
119,green,yellow,incongruent,yellow,0.855915,True
120,blue,green,incongruent,green,1.335018,True
However, I am only interested in the 'condition', 'rt', and 'correct' columns.
I need to create a table giving the mean reaction time and the percentage correct for the congruent and incongruent conditions, combined into one overall table of results for every participant. I am aiming for something like this as the output:
Participant   Stimulus Type   Mean Reaction Time   Percentage Correct
01            Congruent       0.560966             80
01            Incongruent     0.890556             64
02            Congruent       0.460576             89
02            Incongruent     0.956556             55
And so on for all 20 participants. This is just an example of my ideal output, because later I'd like to plot the means for each condition across participants. But if anyone thinks that table doesn't make sense or is inefficient, I'm open to any advice!
I want to use pandas, but I don't know where to begin finding the rt means for each condition when the two conditions are mixed in the same column of each dataframe. I'm also assuming I need some kind of loop that can run over each participant's CSV file and then concatenate the results into one table for all participants.
Initially, after struggling to figure out the loop I would need and searching the web, I ran the code below. It worked to concatenate all of the participants' dataframes, and I hoped it would let me do the same analysis on all of them at once. The problem is that it doesn't identify which rows came from which participant's CSV file (there are 120 rows per participant, like the example above):
import os
import glob
import pandas as pd
#set working directory
os.chdir('data')
#find all csv files in the folder
#use glob pattern matching -> extension = 'csv'
#save result in list -> all_filenames
extension = 'csv'
all_filenames = glob.glob('*.{}'.format(extension))
#print(all_filenames)
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')
Perhaps I could add a participant column to identify each participant's data in the concatenated table, and then perform the mean and percentage correct analysis on the two conditions for each participant within that one big table?
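Something like this rough sketch is what I have in mind for that, though I'm not sure it's right; the two-character slice and the 'participant' column name are just my guesses based on the P01.csv naming pattern:
import glob
import pandas as pd

frames = []
for f in glob.glob('*.csv'):
    df = pd.read_csv(f)
    # assumes file names like 'P01.csv', so characters 1-2 give the ID '01'
    df['participant'] = f[1:3]
    frames.append(df)

# one big table with a participant label on every row
combined = pd.concat(frames, ignore_index=True)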
Or would it be better to do the analysis first and then loop it over each individual participant's CSV file?
I'm sorry if this is a really obvious process; I'm new to Python and trying to learn to analyse my data more efficiently. I've been scouring the internet and pandas tutorials but I'm stuck, so any help is welcome! I've also never used Stack Overflow before, so sorry if I haven't formatted things correctly here, and thanks for the feedback about including examples of the input data, the code I've tried, and the desired output data. I really appreciate the help.
Try this:
import pandas as pd
from pathlib import Path

# Use the Path class to represent a path. It offers more
# functionality when performing operations on paths
path = Path("./data").resolve()

# Create a dictionary whose keys are the participant IDs
# (the `01` in `P01.csv`, etc.), and whose values are
# the data frames read from the CSV files
data = {
    p.stem[1:]: pd.read_csv(p) for p in path.glob("*.csv")
}

# Create a master data frame by combining the individual
# data frames; the dict keys become the "participant" index level
df = pd.concat(data, keys=data.keys(), names=["participant", None])

# Calculate the statistics per participant and condition
result = (
    df.groupby(["participant", "condition"])
      .agg(**{
          "Mean Reaction Time": ("rt", "mean"),
          "correct": ("correct", "sum"),
          "size": ("trialnum", "size"),
      })
      .assign(**{
          # multiply by 100 so the column is a true percentage
          "Percentage Correct": lambda x: 100 * x["correct"] / x["size"]
      })
      .drop(columns=["correct", "size"])
      .reset_index()
)
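If you also want to save that summary and plot the condition means across participants, as mentioned in the question, a minimal sketch along these lines should work. It assumes the result frame above and that matplotlib is installed; the file name stroop_summary.csv is just an example:
import matplotlib.pyplot as plt

# save the per-participant summary table (example file name)
result.to_csv("stroop_summary.csv", index=False)

# pivot so each condition becomes a column of mean reaction times,
# which makes it easy to plot the means across participants
means = result.pivot(index="participant",
                     columns="condition",
                     values="Mean Reaction Time")
means.plot(kind="bar")
plt.ylabel("Mean Reaction Time (s)")
plt.show()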
I have a two-column dataframe with the following values. Unfortunately, since the list of values is very long, it gets chopped off with ", ...]" when saving to a CSV with to_csv. How can I retain the entire list while saving?
attribute_name list_of_values attribute1 [1320698, 1320699, 1096323,
1320690, 1839190, 1091359, 1325750, 1569072, 1829679, 142100, 1320163,
1829673, 588914, 418137, 757085, 588910, 1321158, 1073897, 1823533,
1823535, 1091363, 1383908, 1834826, 36191, 1829641, 767536, 1829597,
1829591, 1326727, 1834700, 1317721, 1317802, 1834838, 52799, 1383915,
1320042, 1829654, 1829655, 1829658, 647089, 1829581, 1829586, 1829587,
1321116, 1829585, 1829588, 1839799, 1588509, 1834471, 1793632,
1327850, 1793599, 1456968, 1315869, 1793605, 1321236, 1829579,
1829577, 1793609, 1829571, 1829570, 1320139, 777057, 1829671, 1829566,
1831047, 1829567, 588927, 60484, 1793596, 1829634, 1839580, 1829569,
1793615, 1323529, 1793619, 1834758, 1612974, 1320007, 1839780,
1291475, 1834835, 1834453, 1823663, 418112, 1092106, 1829689, 1829688,
1793606, 647050, 1834742, 1839551, 1839553, 1834746, 1839556, 1834745,
1575978, 1834749, 1320711, 1317910, ...]
df.to_csv(loc, index=False, header=False, sep='\t', mode='a', encoding='utf8')
I tried the display options listed here, http://pandas.pydata.org/pandas-docs/dev/options.html, with pd.set_option('max_colwidth', 20000000000), but I think because that only affects display mode, not dumping to a CSV, it does not work.
What else can I set to retain the contents of the entire list?
Edit: Try creating a dataframe with this original data; once you save it, it will give you the truncated data shown above.
import pandas as pd
pd.options.display.multi_sparse = False
pd.set_option('max_colwidth',2000000000000000000)
headers=["attribute_name", "list_of_values"]
file_name='/home/ekta/abcd.csv'
data = ['attribute1', ['1320698', '1320699', '1096323', '1320690', '1839190', '1091359', '1325750', '1569072', '1829679', '142100', '1320163', '1829673', '588914', '418137', '757085', '588910', '1321158', '1073897', '1823533', '1823535', '1091363', '1383908', '1834826', '36191', '1829641', '767536', '1829597', '1829591', '1326727', '1834700', '1317721', '1317802', '1834838', '52799', '1383915', '1320042', '1829654', '1829655', '1829658', '647089', '1829581', '1829586', '1829587', '1321116', '1829585', '1829588', '1839799', '1588509', '1834471', '1793632', '1327850', '1793599', '1456968', '1315869', '1793605', '1321236', '1829579', '1829577', '1793609', '1829571', '1829570', '1320139', '777057', '1829671', '1829566', '1831047', '1829567', '588927', '60484', '1793596', '1829634', '1839580', '1829569', '1793615', '1323529', '1793619', '1834758', '1612974', '1320007', '1839780', '1291475', '1834835', '1834453', '1823663', '418112', '1092106', '1829689', '1829688', '1793606', '647050', '1834742', '1839551', '1839553', '1834746', '1839556', '1834745', '1575978', '1834749', '1320711', '1317910', '1829700', '1839791', '1839796', '1320019', '1829494', '437131', '1829696', '1839576', '721318', '1829699', '1838874', '1315822', '647049', '1325775', '1320708', '133913', '835588', '1839564', '1320700', '1320707', '1839563', '1834737', '1834736', '1834734', '1823669', '1321159', '1320577', '1839768', '1823665', '1838602', '1823667', '1321099', '1753590', '1753593', '1320688', '1839583', '1326633', '1320681', '1793646', '1323683', '1091348', '982081', '1793648', '1478516', '1317650', '1829663', '1829667', '1829666', '1793640', '1839577', '1315855', '1317796', '1839775', '1321163', '1793642']]
def write_file(data, flag, headers, file_name):
    # open a df & write recursively
    print " \n \n data", data
    df = pd.DataFrame(data).T
    print "df \n", df
    # write to the csv; only write the headers on the first call
    loc = file_name
    #loc = "%s%s_%s" % (args.base_path, args.merchant_domain, file_name)
    if flag == True:
        df.to_csv(loc, index=False, header=headers, sep='\t', mode='a', encoding='utf8')
        flag = False
    elif flag == False:
        df.to_csv(loc, index=False, header=False, sep='\t', mode='a', encoding='utf8')
    return loc

# I call the function above with this data & headers. I pass flag as True the
# first time around, after which I write recursively with flag=False.
write_file(data, True, headers, file_name)
Debug: the original list has 155 entries, but the truncated list saved with to_csv has only 100 data points.
Purpose of loc & flag: loc is the location of the file, and flag indicates whether I am writing the first row or a later one; I don't need to write the headers again once the first row has been written.
Here's how I solved it
The main diagnosis was that I couldn't store the entire list I was passing, even when I treated it as a dict object; maybe it is because of the way pandas handles column widths, but that is only a guess.
I got the whole list back by writing the file not with pandas' to_csv but as a plain text file, and then reading it back with pandas; that way the entire list is preserved.
import pandas as pd

# Note that I changed the headers from the initial list format to a single tab-separated string
headers = "attribute_name\tlist_of_values"
loc = '/home/ekta/abcd.csv'  # same output path as above; `loc` is used below
data = ['attribute1',['1320698', '1320699', '1096323', '1320690', '1839190', '1091359', '1325750', '1569072', '1829679', '142100', '1320163', '1829673', '588914', '418137', '757085', '588910', '1321158', '1073897', '1823533', '1823535', '1091363', '1383908', '1834826', '36191', '1829641', '767536', '1829597', '1829591', '1326727', '1834700', '1317721', '1317802', '1834838', '52799', '1383915', '1320042', '1829654', '1829655', '1829658', '647089', '1829581', '1829586', '1829587', '1321116', '1829585', '1829588', '1839799', '1588509', '1834471', '1793632', '1327850', '1793599', '1456968', '1315869', '1793605', '1321236', '1829579', '1829577', '1793609', '1829571', '1829570', '1320139', '777057', '1829671', '1829566', '1831047', '1829567', '588927', '60484', '1793596', '1829634', '1839580', '1829569', '1793615', '1323529', '1793619', '1834758', '1612974', '1320007', '1839780', '1291475', '1834835', '1834453', '1823663', '418112', '1092106', '1829689', '1829688', '1793606', '647050', '1834742', '1839551', '1839553', '1834746', '1839556', '1834745', '1575978', '1834749', '1320711', '1317910', '1829700', '1839791', '1839796', '1320019', '1829494', '437131', '1829696', '1839576', '721318', '1829699', '1838874', '1315822', '647049', '1325775', '1320708', '133913', '835588', '1839564', '1320700', '1320707', '1839563', '1834737', '1834736', '1834734', '1823669', '1321159', '1320577', '1839768', '1823665', '1838602', '1823667', '1321099', '1753590', '1753593', '1320688', '1839583', '1326633', '1320681', '1793646', '1323683', '1091348', '982081', '1793648', '1478516', '1317650', '1829663', '1829667', '1829666', '1793640', '1839577', '1315855', '1317796', '1839775', '1321163', '1793642']]
flag = True

# write to a plain file instead of using to_csv
with open(loc, 'a') as f:
    if flag:
        f.write(headers + "\n")
        flag = False
    # explicitly write a tab-separated line
    f.write(str(data[0]) + "\t" + str(data[1]) + "\n")

# read the file back & confirm
df = pd.read_csv(loc, sep='\t', header='infer')
print df['list_of_values'].ix[0]
print len(df['list_of_values'].ix[0])
#Yah !! 155
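As a small follow-up (not part of the fix above, just a sketch): the value read back is a string, so if I ever need it as an actual Python list again, something like ast.literal_eval can parse it:
import ast

# the cell comes back from read_csv as the string "['1320698', '1320699', ...]";
# literal_eval turns that string back into a real list
values = ast.literal_eval(df['list_of_values'].iloc[0])
print(len(values))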
Thanks to @paul, who diagnosed the problem and pointed me in this direction.