I have the following dataframe:
The column numeroLote takes values in the range 5 to 25.
I want to export a separate CSV file for each value of numeroLote, so I do the following:
for i in range(5,26):
print(i)
a = racimitos[racimitos['numeroLote']==i][['peso','fecha','numeroLote']]
a.to_csv('racimitos{}.csv'.format(i), sep=',', header=True, index=True)
And then, I get datasets similar to:
An additional column is generated like the one enclosed in the red box above …
I tried to remove this column in the following way:
for i in range(5,26):
print(i)
a = racimitos[racimitos['numeroLote']==i][['peso','fecha','numeroLote']]
a.to_csv('racimitos{}.csv'.format(i), sep=',', header=True, index=True)
a.drop(columns=[' '], axis=1,)
But I get this error:
KeyError Traceback (most recent call last)
<ipython-input-18-e3ad718d5396> in <module>()
9 a = racimitos[racimitos['numeroLote']==i][['peso','fecha','numeroLote']]
10 a.to_csv('racimitos{}.csv'.format(i), sep=',', header=True, index=True)
---> 11 a.drop(columns=[' '], axis=1,)
~/anaconda3/envs/sioma/lib/python3.6/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
4385 if errors != 'ignore':
4386 raise KeyError(
-> 4387 'labels %s not contained in axis' % labels[mask])
4388 indexer = indexer[~mask]
4389 return self.delete(indexer)
KeyError: "labels [' '] not contained in axis"
How can I remove this unnamed index column, which is generated when I export with to_csv?
You instead want index=False, like so:
# Export one CSV file per lot number. index=False stops pandas from writing
# the row index as an extra, unnamed first column.
for i in range(5, 26):
    # A single .loc call selects the matching rows and the wanted columns.
    a = racimitos.loc[racimitos['numeroLote'] == i, ['peso', 'fecha', 'numeroLote']]
    a.to_csv('racimitos{}.csv'.format(i), sep=',', header=True, index=False)
As an aside, I don't think it's necessary to include the numeroLote column when writing the .csv file, simply because you capture its value in the filename.
Here is a much more efficient solution IMO using groupby():
# Write one CSV file per lot. A plain loop is used because the writes are side
# effects, and the group yielded by the iteration is used directly instead of
# being looked up a second time with get_group().
for key, group in racimitos.groupby('numeroLote'):
    # The lot number is already in the file name, so only the data columns
    # are written.
    group[['peso', 'fecha']].to_csv('racimitos{}.csv'.format(key), index=False)
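Instead of trying to drop that unnamed column, you could select all columns starting from index 1. Note that the unnamed column exists only in the exported CSV file, so this applies to a DataFrame read back from that file (e.g. a = pd.read_csv('racimitos5.csv')), not to the in-memory a, where it would remove the peso column instead.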
a = a.iloc[:, 1:]
Related
So I have this script
import pandas as pd
import numpy as np
PRIMARY_TUMOR_PATIENT_ID_REGEX = '^.{4}-.{2}-.{4}-01.*'
SHORTEN_PATIENT_REGEX = '^(.{4}-.{2}-.{4}).*'
def mutations_for_gene(df):
    """Flag every patient that appears in the rows of one gene.

    Args:
        df: Rows belonging to a single gene (one group of the
            ``Hugo_Symbol`` groupby); must contain an ``identifier`` column.

    Returns:
        A DataFrame with a single ``mutated`` column of 1.0 values, indexed
        by the unique, non-missing identifiers. The index is named
        ``patient`` so that ``reset_index()`` on the grouped result produces
        a ``patient`` column.
    """
    # Missing identifiers are dropped so they do not become a NaN "patient".
    mutated_patients = df['identifier'].dropna().unique()
    patient_index = pd.Index(mutated_patients, name='patient')
    return pd.DataFrame({'mutated': np.ones(len(patient_index))}, index=patient_index)
def prep_data(mutation_path):
df = pd.read_csv(mutation_path, low_memory=True, dtype=str, header = 0)#Line 24 reads in a line memory csv file from the given path and parses it based on '\t' delimators, and casts the data to str
df = df[~df['Hugo_Symbol'].str.contains('Hugo_Symbol')] #analyzes the 'Hugo_Symbol' heading within the data and makes a new dataframe where any row that contains 'Hugo_Symbol' is dropped
df['Hugo_Symbol'] = '\'' + df['Hugo_Symbol'].astype(str) # Appends ''\'' to all the data remaining in that column
df['Tumor_Sample_Barcode'] = df['Tumor_Sample_Barcode'].str.strip() #strips away whitespace from the data within this heading
non_silent = df.where(df['Variant_Classification'] != 'Silent') #creates a new dataframe where the data within the column 'Variant_Classification' is not equal to 'Silent'
df = non_silent.dropna(subset=['Variant_Classification']) #Drops all the rows that are missing at least one element
non_01_barcodes = df[~df['Tumor_Sample_Barcode'].str.contains(PRIMARY_TUMOR_PATIENT_ID_REGEX)]
#TODO: Double check that the extra ['Tumor_Sample_Barcode'] serves no purpose
df = df.drop(non_01_barcodes.index)
print(df)
shortened_patients = df['Tumor_Sample_Barcode'].str.extract(SHORTEN_PATIENT_REGEX, expand=False)
df['identifier'] = shortened_patients
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
gene_mutation_df.columns = gene_mutation_df.columns.str.strip()
gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True)
gene_mutation_df = gene_mutation_df.reset_index()
gene_patient_mutations = gene_mutation_df.pivot(index='Hugo_Symbol', columns='patient', values='mutated')
return gene_patient_mutations.transpose().fillna(0)
This is the csv file that the script reads in:
identifier,Hugo_Symbol,Tumor_Sample_Barcode,Variant_Classification,patient
1,patient,a,Silent,6
22,mutated,d,e,7
1,Hugo_Symbol,f,g,88
The script gives this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-60-3f9c00f320bc> in <module>
----> 1 prep_data('test.csv')
<ipython-input-59-2a67d5c44e5a> in prep_data(mutation_path)
21 display(gene_mutation_df)
22 gene_mutation_df.columns = gene_mutation_df.columns.str.strip()
---> 23 gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True)
24 gene_mutation_df = gene_mutation_df.reset_index()
25 gene_patient_mutations = gene_mutation_df.pivot(index='Hugo_Symbol', columns='patient', values='mutated')
e:\Anaconda3\lib\site-packages\pandas\core\frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
4546
4547 if missing:
-> 4548 raise KeyError(f"None of {missing} are in the columns")
4549
4550 if inplace:
KeyError: "None of ['Hugo_Symbol', 'patient'] are in the columns"
Previously, I had this as that line:
gene_mutation_df.index.set_names(['Hugo_Symbol', 'patient'], inplace=True)
But that also gave an error that the set_name length expects one argument but got two
Any help would be much appreciated
I would really prefer if the csv data was changed instead of the script and somehow the script could work with set_names instead of set_index
The issue is:
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
'Hugo_Symbol' is used for the groupby, so it is now in the index, not a column.
In the case of the sample data, an empty dataframe, with no columns, has been created.
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
print(gene_mutation_df) # print the dataframe to see what it looks like
print(gene_mutation_df.info()) # print the information for the dataframe
gene_mutation_df.columns = gene_mutation_df.columns.str.strip()
gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True)
# output
Empty DataFrame
Columns: [identifier, Hugo_Symbol, Tumor_Sample_Barcode, Variant_Classification, patient]
Index: []
Empty DataFrame
Columns: []
Index: []
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrameNone
reset the index
Resetting the index, will make Hugo_Symbol a column again
As long as the dataframe is not empty, the KeyError should be resolved.
gene_mutation_df = gene_mutation_df.reset_index() # try adding this line
gene_mutation_df.set_index(['Hugo_Symbol', 'patient'], inplace=True)
Additional Notes
There are a number of lines of code, that may be resulting in an empty dataframe
non_01_barcodes = df[~df['Tumor_Sample_Barcode'].str.contains(PRIMARY_TUMOR_PATIENT_ID_REGEX)]
shortened_patients = df['Tumor_Sample_Barcode'].str.extract(SHORTEN_PATIENT_REGEX, expand=False)
gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
Test if the dataframe is empty
Use .empty to determine if a dataframe is empty
def prep_data(mutation_path):
    """Build a patient-by-gene mutation matrix from a mutation CSV file.

    Args:
        mutation_path: Path to a comma-separated file containing at least
            the columns ``Hugo_Symbol``, ``Tumor_Sample_Barcode`` and
            ``Variant_Classification``.

    Returns:
        A DataFrame indexed by shortened patient identifier with one column
        per gene, holding 1.0 where the patient has a non-silent mutation in
        that gene and 0 otherwise. Returns ``None`` (after printing a
        message) when no rows survive the filtering.
    """
    # Every column is read as text; read_csv's default ',' delimiter applies.
    df = pd.read_csv(mutation_path, low_memory=True, dtype=str, header=0)
    df.columns = df.columns.str.strip()  # tolerate padded header names

    # Drop rows whose gene symbol is missing or contains the header text
    # itself. .copy() makes the column assignments below act on an
    # independent frame rather than on a slice of the frame that was read.
    symbol = df['Hugo_Symbol']
    is_header_row = symbol.str.contains('Hugo_Symbol', regex=False, na=False)
    df = df[symbol.notna() & ~is_header_row].copy()

    # Prefix every remaining gene symbol with an apostrophe.
    df['Hugo_Symbol'] = '\'' + df['Hugo_Symbol'].astype(str)
    df['Tumor_Sample_Barcode'] = df['Tumor_Sample_Barcode'].str.strip()

    # Keep only rows with a known, non-silent variant classification.
    classification = df['Variant_Classification']
    df = df[classification.notna() & (classification != 'Silent')]

    # Keep only barcodes matching the primary-tumor pattern; na=False treats
    # a missing barcode as a non-match instead of raising on negation.
    is_primary = df['Tumor_Sample_Barcode'].str.contains(
        PRIMARY_TUMOR_PATIENT_ID_REGEX, na=False)
    df = df[is_primary].copy()
    print(df)  # diagnostic output kept from the original

    df['identifier'] = df['Tumor_Sample_Barcode'].str.extract(
        SHORTEN_PATIENT_REGEX, expand=False)

    gene_mutation_df = df.groupby(['Hugo_Symbol']).apply(mutations_for_gene)
    print(gene_mutation_df)  # diagnostic output kept from the original
    if gene_mutation_df.empty:
        print('The dataframe is empty')
        return None

    # apply() leaves the gene and the patient as index levels. Name both
    # levels and turn them into columns so that pivot() can refer to them.
    gene_mutation_df.index = gene_mutation_df.index.set_names(['Hugo_Symbol', 'patient'])
    gene_mutation_df = gene_mutation_df.reset_index()
    gene_patient_mutations = gene_mutation_df.pivot(
        index='Hugo_Symbol', columns='patient', values='mutated')
    return gene_patient_mutations.transpose().fillna(0)
I'm in the initial stages of doing some 'machine learning'.
I'm trying to create a new data frame and one of the columns doesn't appear to be recognised..?
I've loaded an Excel file with 2 columns (removed the index). All fine.
Code:
df = pd.read_excel('scores.xlsx',index=False)
df=df.rename(columns=dict(zip(df.columns,['Date','Amount'])))
df.index=df['Date']
df=df[['Amount']]
#creating dataframe
data = df.sort_index(ascending=True, axis=0)
new_data = pd.DataFrame(index=range(0,len(df)),columns=['Date','Amount'])
for i in range(0,len(data)):
new_data['Date'][i] = data['Date'][i]
new_data['Amount'][i] = data['Amount'][i]
The error:
KeyError: 'Date'
Not really sure what's the problem here.
Any help greatly appreciated
I think in line 4 you reduce your dataframe to just one column "Amount"
To add to #Grzegorz Skibinski's answer, the problem is after line 4, there is no longer a 'Date' column. The Date column was assigned to the index and removed, and while the index has a name "Date", you can't use 'Date' as a key to get the index - you have to use data.index[i] instead of data['Date'][i].
It seems that you have an error in the formatting of your Date column.
To check that you don't have an error on the name of the columns you can print the columns names:
import pandas as pd

# Sample data; note the deliberate trailing space in the "Fruit " key.
data_dict = {
    'Fruit ': ['Apple', 'Orange'],
    'Price': [1.5, 3.24],
}

# Build the frame column-wise from the mapping.
df = pd.DataFrame(data_dict)

# Show the raw column labels, which exposes the trailing space.
column_labels = df.columns.values
print(column_labels)

# Select the column by its exact label, trailing space included.
fruit_column = df['Fruit ']
print(fruit_column)
This code outputs:
['Fruit ' 'Price']
0 Apple
1 Orange
Name: Fruit , dtype: object
We clearly see that the "Fruit " column has a trailing space. This is an easy mistake to make, especially when using Excel.
If you try to call "Fruit" instead of "Fruit " you obtain the error you have:
KeyError: 'Fruit'
I have a data frame that I want to remove duplicates on column named "sample" and the add string information in gene and status columns to new column as shown in the attached pics.
Thank you so much in advance
below is the modified version of data frame.where gene in rows are replaced by actual gene names
Here, df is your Pandas DataFrame.
def new_1(g):
    """Return the values of the ``gene`` column of ``g`` joined by commas.

    Args:
        g: DataFrame (one ``sample`` group) with a string ``gene`` column.
    """
    # Bracket access cannot be shadowed by a DataFrame attribute or method.
    return ','.join(g['gene'])
def new_2(g):
    """Return the "gene-status" pair of every row of ``g``, joined by commas.

    Args:
        g: DataFrame (one ``sample`` group) with string ``gene`` and
            ``status`` columns.
    """
    # Bracket access cannot be shadowed by a DataFrame attribute or method.
    pairs = g['gene'] + '-' + g['status']
    return ','.join(pairs)
# Collapse each sample's genes, and its gene-status pairs, into one string each.
by_sample = df.groupby("sample")
new_1_data = by_sample.apply(new_1).to_frame(name="new_1")
new_2_data = by_sample.apply(new_2).to_frame(name="new_2")

# Put both summaries side by side, keyed on the sample name.
new_data = pd.merge(new_1_data, new_2_data, on="sample")

# Attach the summaries to the original rows, then keep a single row per sample.
with_summaries = pd.merge(df, new_data, on="sample")
new_df = with_summaries.drop_duplicates("sample")
If you wish to have "sample" as a column instead of an index, then add
new_df = new_df.reset_index(drop=True)
Lastly, as you did not specify which of the original rows of duplicates to retain, I simply use the default behavior of Pandas and drop all but the first occurrence.
Edit
I converted your example to the following CSV file (delimited by ',') which I will call "data.csv".
sample,gene,status
ppar,p53,gain
ppar,gata,gain
ppar,nb,loss
srty,nf1,gain
srty,cat,gain
srty,cd23,gain
tygd,brac1,loss
tygd,brac2,gain
tygd,ras,loss
I load this data as
# Default delimiter is ','. Pass `sep` argument to specify delimiter.
df = pd.read_csv("data.csv")
Running the code above and printing the dataframe produces the output
sample gene status new_1 new_2
0 ppar p53 gain p53,gata,nb p53-gain,gata-gain,nb-loss
3 srty nf1 gain nf1,cat,cd23 nf1-gain,cat-gain,cd23-gain
6 tygd brac1 loss brac1,brac2,ras brac1-loss,brac2-gain,ras-loss
This is exactly the expected output given in your example.
Note that the left-most column of numbers (0, 3, 6) are the remnants of the index of the original dataframes produced after the merges. When you write this dataframe to file you can exclude it by setting index=False for df.to_csv(...).
Edit 2
I checked the CSV file you emailed me. You have a space after the word "gene" in the header of your CSV file.
Change the first line of your CSV file from
sample,gene ,status
to
sample,gene,status
Also, there are spaces in your entries. If you wish to remove them, you can
# Strip surrounding spaces from every string entry. Non-string entries
# (numbers, NaN) are passed through unchanged instead of raising
# AttributeError.
# NOTE: on pandas >= 2.1, DataFrame.map is the non-deprecated name of applymap.
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
Might not be the most efficient solution but this should get you there:
# Build one row per sample: its genes joined by ',' and its "gene-status"
# pairs joined by ','.
samples = []
genes = []
statuses = []
# sort=False keeps the samples in order of first appearance; iterating over
# set(df["sample"]) gave an arbitrary order.
for s, rows in df.groupby("sample", sort=False):
    samples.append(s)
    # concatenate the genes of this sample
    genes.append(rows["gene"].str.cat(sep=","))
    # Pair each gene with the status on the same row. Joining with ',' avoids
    # the trailing comma left by appending ',' after every pair, and no
    # per-gene lookup is needed (which also broke on repeated gene names).
    statuses.append((rows["gene"] + "-" + rows["status"]).str.cat(sep=","))
# create new df
new_df = pd.DataFrame({'sample': samples,
                       'new': genes,
                       'new1': statuses})
I use pandas's astype function to parse a string into data in datetime64[ns] format, but because there are some outliers in the original data, it causes the program to go wrong.
I want to get the wrong data index from the ValueError exception and delete the index data,rather than interrupt the program because of ValueError.Or is there any other way to achieve my goal?
When parsing the datetime with astype, I got the following error. I want to get the index of the bad data from the ValueError exception and delete that row:
File "/home/xiaopeng/anaconda3/envs/tensorflow/lib/python3.5/site-packages/pandas/core/dtypes/cast.py", line 636, in astype_nansafe
return arr.astype(dtype)
ValueError: Error parsing datetime string "2017-06-01VERSION=1.0" at position 10
the code as follows, the main function of this function is to read data from the text file, and to parse the data:
def file_to_df(file):
print('converting file:%r(%r MB)' %(file,(os.path.getsize(file)/(1024*1024))))
df = pd.read_csv(file, sep='\t', header=None, names=columns)
for k in df.columns:
_, df[k] = df[k].astype(str).str.split('=',1).str
df = df[columns_use]
# startswith() ,delete the wrong data when startswith is not '20'
df = df[df['PASSTIME'].astype(str).str.startswith("20")]
print('Log: Get %r number of data' % len(df))
df['PASSTIME'] = df['PASSTIME'].astype(str).str.replace(' ', '?', n=1)
df['PASSTIME'] = df['PASSTIME'].astype(str).str.replace(' ', '.', n=1)
df['PASSTIME'] = df['PASSTIME'].astype(str).str.replace('?', ' ', n=1)
df['PASSTIME'] = df['PASSTIME'].astype('datetime64[ns]')
return df
and the Parsing error data as follows:
VERSION=1.0 PASSTIME=2017-06-01 11:01:46 625 CARSTATE=1 ...
VERSION=1.0 PASSTIME=2017-06-01VERSION=1.0 PASSTIME=2017-06-01 11:04:02 618 CARSTATE=1 ...
VERSION=1.0 PASSTIME=2017-06-01 11:04:49 595 CARSTATE=1 ...
I think you need to_datetime + dropna to remove the NaT rows:
# Unparseable timestamps become NaT instead of raising ValueError.
df['PASSTIME'] = pd.to_datetime(df['PASSTIME'], errors='coerce')
# Remove the rows whose timestamp could not be parsed. The column has to be
# passed as subset=; the first positional argument of dropna() is the axis.
df = df.dropna(subset=['PASSTIME'])
df.columns = df.columns.str.strip() ##found a fix for leading whitespaces
arrest_only_Y= df.loc[df['ARREST'] == 'Y']
arrest_only_Y_two_col=arrest_only_Y[["ARREST",'LOCATION DESCRIPTION','CASE#']]##running fine here
arrest_only_Y_two_col.reset_index()
arrest_only_Y_two_col_groupby = arrest_only_Y_two_col.groupby('LOCATION DESCRIPTION').count() ##and here as well ## arrest_only_Y_two_col_groupby_desc=arrest_only_Y_two_col_groupby.sort_values(['ARREST'],ascending = False).head()
arrest_only_Y_two_col_groupby_desc.reset_index(drop = True)
arrest_only_Y_two_col_groupby_desc
In the output, LOCATION DESCRIPTION becomes the index, and I can't use it as a column to run this code:
locdesc_list = arrest_only_Y_two_col_groupby_desc['LOCATION
DESCRIPTION'].tolist()
I get: Key Error : 'LOCATION DESCRIPTION'
Replace your line:
arrest_only_Y_two_col_groupby_desc.reset_index(drop=True)
With:
arrest_only_Y_two_col_groupby_desc.reset_index(inplace=True)
You can just try this
# The keyword is `columns` (plural); `column=` is rejected by the constructor
# with a TypeError.
df = pd.DataFrame(df, index=index, columns=['A', 'B'])