How to drop multiple columns in pandas? - python

How do I drop multiple columns in pandas with Python?
import pandas as pd

df = pd.DataFrame({
    "source_number":
        [11199, 11328, 11287, 32345, 12342, 1232, 13456, 123244, 13456],
    "location":
        ["loc2", "loc1-loc3", "loc3", "loc1", "loc2-loc1", "loc2", "loc3-loc2", "loc2", "loc1"],
    "category":
        ["cat1", "cat2", "cat1", "cat3", "cat3", "cat3", "cat2", "cat3", "cat2"],
})

def remove_columns(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

for col in df.columns:
    df = remove_columns(df, col)

df.head()
In the code above the task is done and the columns are dropped.
But when I tried this code in Streamlit, where the user selects the multiple columns they want to remove from the dataframe, the problem is that the system only takes the first element, not all the items in the list.
For example, if the user selects location and source_number, the col variable contains just location and the following error is displayed:
KeyError: 'location'
Traceback:
File "f:\aienv\lib\site-packages\streamlit\script_runner.py", line 333, in _run_script
exec(code, module.__dict__)
File "F:\AIenv\streamlit\app.py", line 373, in <module>
sidebars[y]=st.sidebar.multiselect('Filter '+y, df[y].unique(),key="1")
File "f:\aienv\lib\site-packages\pandas\core\frame.py", line 2902, in __getitem__
indexer = self.columns.get_loc(key)
File "f:\aienv\lib\site-packages\pandas\core\indexes\base.py", line 2893, in get_loc
raise KeyError(key) from err
Streamlit code:
import numpy as np
import pandas as pd
import streamlit as st

# function to drop unwanted columns
def remove_columns(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

df = pd.DataFrame({
    "source_number":
        [11199, 11328, 11287, 32345, 12342, 1232, 13456, 123244, 13456],
    "location":
        ["loc2", "loc1-loc3", "loc3", "loc1", "loc2-loc1", "loc2", "loc3-loc2", "loc2", "loc1"],
    "category":
        ["cat1", "cat2", "cat1", "cat3", "cat3", "cat3", "cat2", "cat3", "cat2"],
})

drop_button = st.sidebar.button("Remove")
columns = st.sidebar.multiselect("Select column/s", df.columns)
sidebars = {}
for y in columns:
    ucolumns = list(df[y].unique())
    st.write(y)
    if drop_button:
        df_drop = df.drop(y, axis=1, inplace=True)
        print(y)
st.table(df)

Use DataFrame.drop:
def remove_columns(dataset, cols):
    return dataset.drop(cols, axis=1)
Then call the function once, without a loop - you can pass either a scalar or a list:
df = remove_columns(df, 'location')
df = remove_columns(df, ['location', 'category'])
EDIT:
If you need to remove the columns selected in the multiselect, use:
drop_button = st.sidebar.button("Remove")

# the columns variable holds the selected values
columns = st.sidebar.multiselect("Select column/s", df.columns)
print(columns)

# so when the button is pressed, drop the columns held in the columns variable
if drop_button:
    df.drop(columns, axis=1, inplace=True)

st.table(df)

Pandas already implements this inside the drop function.
You can use DataFrame.drop with the parameter columns=[columns that you want to drop], like this:
df.drop(columns=["source_number", "location"])
I hope this is what you are looking for.
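A minimal, self-contained sketch of that call on the question's data (shortened here just for illustration); note that drop returns a new DataFrame, so reassign the result or pass inplace=True to keep it:
import pandas as pd

df = pd.DataFrame({
    "source_number": [11199, 11328, 11287],
    "location": ["loc2", "loc1", "loc3"],
    "category": ["cat1", "cat2", "cat1"],
})

# drop returns a copy; reassign it to keep the result
df = df.drop(columns=["source_number", "location"])
print(df.columns.tolist())  # ['category']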

Related

Why does a pandas dataframe give a KeyError on 'Date'?

Still new to Python, so pardon me for asking a noob question. I am using the mftool library, which helps in downloading NAV data for mutual funds. It gives a KeyError: 'Date'.
I would be really grateful for help identifying and fixing the error.
Input:
from mftool import Mftool
import pandas as pd

mf = Mftool()
scheme_codes = mf.get_scheme_codes()
scheme_code_list = [x for x in scheme_codes.keys()]

def HistoricalNav(scheme_code_list, start_date, end_date):
    assert (isinstance(scheme_code_list, list) is True), "Argument scheme_code_list should be a list"
    assert (isinstance(start_date, str) is True), "start_date must be a str in %d-%m-%Y format"  # checks whether start date is present and in the correct format
    assert (isinstance(end_date, str) is True), "end_date must be a str in %d-%m-%Y format"  # checks whether end date is present and in the correct format

    main_df = pd.DataFrame()  # empty dataframe

    for schemes in scheme_code_list:
        data = mf.get_scheme_historical_nav_for_dates(schemes, start_date, end_date)  # requesting NAV data from the api
        df = pd.DataFrame(data['data'])
        df['scheme_code'] = pd.Series([data['scheme_code'] for x in range(len(df.index))])  # adding scheme_code as a column
        df['scheme_name'] = pd.Series([data['scheme_name'] for x in range(len(df.index))])  # adding scheme_name as a column
        df = df.sort_values(by='date')  # sorting the values of every scheme code based on date
        main_df = main_df.append(df)  # appending the data to the main_df dataframe

    main_df = main_df[['scheme_code', 'scheme_name', 'date', 'nav']]  # ordering the dataframe columns
    main_df.reset_index(drop=True, inplace=True)
    return main_df  # returning the required dataframe

values_df = HistoricalNav(scheme_code_list=scheme_code_list[0:5], start_date='01-05-2021', end_date='01-05-2021')
values_df
Error:
Traceback (most recent call last):
File "C:/Users/am364971/Desktop/Python/Working/amfi.py", line 31, in
values_df = HistoricalNav(scheme_code_list = scheme_code_list[0:5], start_date= '01-05-2021', end_date= '01-05-2021')
File "C:/Users/am364971/Desktop/Python/Working/amfi.py", line 22, in HistoricalNav
df = df.sort_values(by = 'date') # sorting the values of every Scheme code based on Date
File "C:\Users\am364971\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\frame.py", line 5455, in sort_values
k = self._get_label_or_level_values(by, axis=axis)
File "C:\Users\am364971\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\generic.py", line 1684, in _get_label_or_level_values
raise KeyError(key)
KeyError: 'date'
I can see that date is not a column in the given dataframe. You could check with df.columns whether a date column exists.
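A minimal sketch of one way to guard against this, assuming the KeyError happens because the API returns no rows for a particular scheme on the requested dates, so the per-scheme frame has no date column to sort on (this diagnosis is an assumption, not confirmed by the question):
for schemes in scheme_code_list:
    data = mf.get_scheme_historical_nav_for_dates(schemes, start_date, end_date)
    df = pd.DataFrame(data['data'])
    if df.empty or 'date' not in df.columns:
        # skip schemes that returned no NAV rows for the requested dates
        continue
    df['scheme_code'] = data['scheme_code']
    df['scheme_name'] = data['scheme_name']
    df = df.sort_values(by='date')
    main_df = main_df.append(df)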

How to replace values in a dataframe using pandas and streamlit in python?

I have a Python script that reads a dataframe using pandas and displays its content using Streamlit.
What I want is to replace a current value with a new value based on user input.
The user selects the required column, enters the current value in one text field and the new value in a second text field; when the Replace button is pressed, the old value is replaced by the new value and the new dataframe is displayed.
The problem is that when the dataframe is displayed, nothing has changed.
code:
import pandas as pd
import streamlit as st

df = pd.DataFrame({
    "source_number":
        [11199, 11328, 11287, 32345, 12342, 1232, 13456, 123244, 13456],
    "location":
        ["loc2", "loc1", "loc3", "loc1", "loc2", "loc2", "loc3", "loc2", "loc1"],
    "category":
        ["cat1", "cat2", "cat1", "cat3", "cat3", "cat3", "cat2", "cat3", "cat2"],
})

columns = st.selectbox("Select column", df.columns)
old_values = st.multiselect("Current Values", list(df[columns].unique()), list(df[columns].unique()))

col1, col2 = st.beta_columns(2)
with col1:
    old_val = st.text_input("old value")
with col2:
    new_val = st.text_input("new value")

if st.button("Replace"):
    df[columns] = df[columns].replace({old_val: new_val})
    st.dataframe(df)
There is a small error in your code:
df[columns] = df[columns].replace({old_val: new_val})
If you look at the examples in the pandas docs, there is code like
s.replace({'a': None}) - it replaces the value 'a' with None.
Looking at your code, this means you are trying to replace a value that is a list with another list, but it does not work like that, because your column does not have lists as elements, so it can't be changed that way.
When I ran your code in Jupyter I got an error that a list is unhashable:
--------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-41a02888936d> in <module>
30 oldVal = [11199,11328,11287,32345]
31
---> 32 df["source_number"] = df["source_number"].replace({oldVal:newVal})
33 df
TypeError: unhashable type: 'list'
And this is the reason why it doesn't change anything for you.
If you want to replace all the values using lists, you have to write it like this:
df[column] = df[column].replace(old_values, new_values)
This code works just fine.
I hope I was clear enough and it will work for you.
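A small self-contained sketch of that list-based replace, with made-up replacement values purely for illustration:
import pandas as pd

df = pd.DataFrame({"source_number": [11199, 11328, 11287]})

old_values = [11199, 11328]
new_values = [21199, 21328]

# each element of old_values is replaced by the element at the same position in new_values
df["source_number"] = df["source_number"].replace(old_values, new_values)
print(df["source_number"].tolist())  # [21199, 21328, 11287]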
Your code works for text columns (location and category). It doesn't work for the numeric source_number column as you're trying to replace one string by another.
For numeric columns you'll need to use number_input instead of text_input:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import streamlit as st

df = pd.DataFrame({
    "source_number":
        [11199, 11328, 11287, 32345, 12342, 1232, 13456, 123244, 13456],
    "location":
        ["loc2", "loc1", "loc3", "loc1", "loc2", "loc2", "loc3", "loc2", "loc1"],
    "category":
        ["cat1", "cat2", "cat1", "cat3", "cat3", "cat3", "cat2", "cat3", "cat2"],
})

columns = st.selectbox("Select column", df.columns)
old_values = st.multiselect("Current Values", list(df[columns].unique()), list(df[columns].unique()))

col1, col2 = st.beta_columns(2)
st_input = st.number_input if is_numeric_dtype(df[columns]) else st.text_input
with col1:
    old_val = st_input("old value")
with col2:
    new_val = st_input("new value")

if st.button("Replace"):
    df[columns] = df[columns].replace({old_val: new_val})
    st.dataframe(df)
Update as per comment: you could cache the df to prevent re-initialization upon each widget interaction (you'll have to manually clear the cache to start over):
@st.cache(allow_output_mutation=True)
def get_df():
    return pd.DataFrame({
        "source_number":
            [11199, 11328, 11287, 32345, 12342, 1232, 13456, 123244, 13456],
        "location":
            ["loc2", "loc1", "loc3", "loc1", "loc2", "loc2", "loc3", "loc2", "loc1"],
        "category":
            ["cat1", "cat2", "cat1", "cat3", "cat3", "cat3", "cat2", "cat3", "cat2"],
    })

df = get_df()

Receiving list index out of range error while looping through multiple words in PyTrends?

I am having some trouble iterating through Google Trends data using the pseudo Google Trends API PyTrends. The Google Trends website is fine with me using multiple words together, so long as I put the "+" symbol between them. For example: "a-kasse+akasse+arbejdsformidling+arbejdsformidlinger+dagpenge+dagpengeperiode". When I use a single word in my code, the program works as expected. But when I use multiple words concatenated, it breaks.
When I use the following function, it throws a list index out of range error:
def my_funct(Keyword, Dates, Country, Col_name):
    KEYWORDS = [Keyword]
    KEYWORDS_CODES = [pytrend.suggestions(keyword=i)[0] for i in KEYWORDS]
    df_CODES = pd.DataFrame(KEYWORDS_CODES)
    EXACT_KEYWORDS = df_CODES['mid'].to_list()
    DATE_INTERVAL = Dates
    COUNTRY = [Country]  # use this link for iso country code
    CATEGORY = 0  # use this link to select categories
    SEARCH_TYPE = ''  # default is 'web searches'; others include 'images', 'news', 'youtube', 'froogle' (google shopping)

    Individual_EXACT_KEYWORD = list(zip(*[iter(EXACT_KEYWORDS)] * 1))
    Individual_EXACT_KEYWORD = [list(x) for x in Individual_EXACT_KEYWORD]

    dicti = {}
    i = 1
    for Country in COUNTRY:
        for keyword in Individual_EXACT_KEYWORD:
            try:
                pytrend.build_payload(kw_list=keyword,
                                      timeframe=DATE_INTERVAL,
                                      geo=Country,
                                      cat=CATEGORY,
                                      gprop=SEARCH_TYPE)
                dicti[i] = pytrend.interest_over_time()
                i += 1
                time.sleep(9)
                print(dicti)
            except requests.exceptions.Timeout:
                print("Timeout occured")

    df_trends = pd.concat(dicti, axis=1)
    df_trends.columns = df_trends.columns.droplevel(0)  # drop outside header
    df_trends = df_trends.drop('isPartial', axis=1)  # drop "isPartial"
    df_trends.reset_index(level=0, inplace=True)  # reset index
    df_trends.columns = ['date', Col_name]  # change column names
    return df_trends
I execute that function through another function as follows:
df_merged3 = excelConcatFunct("a-kasse+akasse+arbejdsformidling+arbejdsformidlinger+dagpenge+dagpengeperiode", 'DK', 'DANISH search terms')
And here is how that function works:
def excelConcatFunct(Word, Country_code, Col_name):
    # generic plug-n-chug data
    x1 = my_funct(Word, '2004-01-04 2009-01-04', Country_code, Col_name)
    x2 = my_funct(Word, '2009-01-05 2014-01-05', Country_code, Col_name)
    x3 = my_funct(Word, '2014-01-06 2019-01-06', Country_code, Col_name)
    x4 = my_funct(Word, '2019-01-07 {0}'.format(Today), Country_code, Col_name)

    # generic plug-n-chug data
    df1 = pd.DataFrame(x1)
    df2 = pd.DataFrame(x2)
    df3 = pd.DataFrame(x3)
    df4 = pd.DataFrame(x4)

    # creates an empty dataframe to add to the merged df for column spacing
    df0 = pd.DataFrame()
    df0[''] = ''

    # this concats the dfs horizontally
    df_merged = pd.concat([df1, df0, df2, df0, df3, df0, df4], axis=1)
    df_merged.reset_index(inplace=True)

    # this removes the dangling last column named "Unnamed: 0"
    df_merged = df_merged.loc[:, ~df_merged.columns.str.contains('^Unnamed')]

    # this removes the dangling first index column
    df_merged = df_merged.loc[:, ~df_merged.columns.str.contains('^index')]

    # returns the merged dataframe
    return df_merged
And here is the error message I am getting:
File "C:\Users\JohnReese\Desktop\G_Trends\G_Trender.py", line 111, in <module>
df_merged3 = excelConcatFunct("a-kasse+akasse+arbejdsformidling+arbejdsformidlinger+dagpenge+dagpengeperiode", 'DK', 'DANISH search terms')
File "C:\Users\JohnReese\Desktop\G_Trends\G_Trender.py", line 83, in excelConcatFunct
x1 = my_funct(Word, '2004-01-04 2009-01-04', Country_code, Col_name)
File "C:\Users\JohnReese\Desktop\G_Trends\G_Trender.py", line 31, in my_funct
KEYWORDS_CODES=[pytrend.suggestions(keyword=i)[0] for i in KEYWORDS]
File "C:\Users\JohnReese\Desktop\G_Trends\G_Trender.py", line 31, in <listcomp>
KEYWORDS_CODES=[pytrend.suggestions(keyword=i)[0] for i in KEYWORDS]
IndexError: list index out of range
Please help. Any and all help is welcomed.
Thank you!
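A hedged sketch of one way to make the keyword lookup more defensive, assuming the IndexError comes from pytrend.suggestions() returning an empty list for the long "+"-concatenated query (that assumption is not confirmed here); falling back to the raw search string is only an illustration:
KEYWORDS = [Keyword]
KEYWORDS_CODES = []
for i in KEYWORDS:
    suggestions = pytrend.suggestions(keyword=i)
    if suggestions:
        # use the topic code ('mid') when Google Trends recognises the term
        KEYWORDS_CODES.append(suggestions[0])
    else:
        # fall back to the plain search string so build_payload still receives a keyword
        KEYWORDS_CODES.append({'mid': i, 'title': i, 'type': 'search term'})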

Dataframe with empty column in the data

I have a list of lists with a header row and then the value rows.
In some cases the last "column" has an empty value for all the rows (if even one row has a value it works fine), but DataFrame is not happy about that, as the number of columns differs from the header.
I'm thinking of adding a None value to the rows that are missing one before creating the DataFrame, but I'm wondering if there is a better way to handle this case?
import pandas

data = [
    ["data1", "data2", "data3"],
    ["value11", "value12"],
    ["value21", "value22"],
    ["value31", "value32"]]
headers = data.pop(0)
dataframe = pandas.DataFrame(data, columns=headers)
You could do this:
import pandas as pd

data = [
    ["data1", "data2", "data3"],
    ["value11", "value12"],
    ["value21", "value22"],
    ["value31", "value32"]
]

# create dataframe
df = pd.DataFrame(data)

# set new column names
# this will use ["data1", "data2", "data3"] as the new columns, because they are in the first row
df.columns = df.iloc[0].tolist()

# now that you have the right column names, just skip the first row
df = df.iloc[1:].reset_index(drop=True)
df

     data1    data2  data3
0  value11  value12   None
1  value21  value22   None
2  value31  value32   None

Is this what you want?
You can use the DataFrame.reindex method to add the missing columns. You can do something like this:
import pandas as pd

df = pd.DataFrame(data)

# use only as many header names as the dataframe has columns, to prevent an exception
df.columns = headers[:df.shape[1]]

# reindex then adds any remaining header name as an all-NaN column
df = df.reindex(headers, axis=1)
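A quick end-to-end check of that approach on the question's data, assuming the header row has already been popped into headers:
import pandas as pd

data = [
    ["value11", "value12"],
    ["value21", "value22"],
    ["value31", "value32"],
]
headers = ["data1", "data2", "data3"]

df = pd.DataFrame(data)
df.columns = headers[:df.shape[1]]
df = df.reindex(headers, axis=1)
print(df)
#      data1    data2  data3
# 0  value11  value12    NaN
# 1  value21  value22    NaN
# 2  value31  value32    NaN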

How do I reorder rows in a CSV file by referring to a single column?

I would like to write code that sorts all the rows of Test1.csv (everything after the header row) according to the order of the Entry column in Test2.csv.
I would appreciate your advice. Thank you for your cooperation.
This is a simplified version of the data (the real files have more than 1000 lines).
import pandas as pd

input_path1 = "Test1.csv"
input_path2 = "Test2.csv"
output_path = "output.csv"

df1 = pd.read_csv(filepath_or_buffer=input_path1, encoding="utf-8")
df2 = pd.read_csv(filepath_or_buffer=input_path2, encoding="utf-8")

(df1.merge(df2, how='left', on='Entry')
    .set_index('Entry')
    .drop('Number_x', axis='columns')
    .rename({'Number_y': 'Number'}, axis='columns')
    .to_csv(output_path))
Error message
Traceback (most recent call last):
File "narabekae.py", line 28, in <module>
.drop('Number_x', axis='columns')
File "/Users/macuser/downloads/yes/lib/python3.7/site-packages/pandas/core/frame.py", line 4102, in drop
errors=errors,
File "/Users/macuser/downloads/yes/lib/python3.7/site-packages/pandas/core/generic.py", line 3914, in drop
obj = obj._drop_axis(labels, axis, level=level, errors=errors)
File "/Users/macuser/downloads/yes/lib/python3.7/site-packages/pandas/core/generic.py", line 3946, in _drop_axis
new_axis = axis.drop(labels, errors=errors)
File "/Users/macuser/downloads/yes/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 5340, in drop
raise KeyError("{} not found in axis".format(labels[mask]))
KeyError: "['Number_x'] not found in axis"
The output I want
,V1,V2,>sp,Entry,details,PepPI
1,OS=Ha,MTNKG,>sp,A4G4K7,HFQ_HERAR,7.028864399
2,OS=Sh,MAKGQ,>sp,B4TFA6,HFQ_SALHS,7.158609631
3,OS=Oi,MAQSV,>sp,Q8EQQ9,HFQ_OCEIH,9.229953074
4,OS=Bc,MAERS,>sp,A9M5C4,HFQ_BRUC2,8.154348935
5,OS=Re,MAERS,>sp,Q2K8U6,HFQ_RHIEC,8.154348935
Test1.csv
,V1,V2,>sp,Entry,details,PepPI
1,OS=Re,MAERS,>sp,Q2K8U6,HFQ_RHIEC,8.154348935
2,OS=Sh,MAKGQ,>sp,B4TFA6,HFQ_SALHS,7.158609631
3,OS=Ha,MTNKG,>sp,A4G4K7,HFQ_HERAR,7.028864399
4,OS=Bc,MAERS,>sp,A9M5C4,HFQ_BRUC2,8.154348935
5,OS=Oi,MAQSV,>sp,Q8EQQ9,HFQ_OCEIH,9.229953074
Test2.csv
pI,Molecular weight (average),Entry,Entry name,Organism
6.82,8763.13,A4G4K7,HFQ_HERAR,Rat
6.97,11119.33,B4TFA6,HFQ_SALHS,Pig
9.22,8438.69,Q8EQQ9,HFQ_OCEIH,Bacteria
7.95,8854.28,A9M5C4,HFQ_BRUC2,Mouse
7.95,9044.5,Q2K8U6,HFQ_RHIEC,Human
Additional information
macOS10.15.4 Python3.7.3 Atom
To reorder the columns, you just define a list of the columns in the order that you want, and use df[columns]:
In [17]: columns = ["V1", "V2", ">sp", "Entry", "details", "PepPI"]

In [18]: df = df1.merge(df2, how='left', on='Entry')

In [19]: df[columns]
Out[19]:
      V1     V2  >sp   Entry    details     PepPI
0  OS=Re  MAERS  >sp  Q2K8U6  HFQ_RHIEC  8.154349
1  OS=Sh  MAKGQ  >sp  B4TFA6  HFQ_SALHS  7.158610
2  OS=Ha  MTNKG  >sp  A4G4K7  HFQ_HERAR  7.028864
3  OS=Bc  MAERS  >sp  A9M5C4  HFQ_BRUC2  8.154349
4  OS=Oi  MAQSV  >sp  Q8EQQ9  HFQ_OCEIH  9.229953
Naturally, you can save it with the to_csv() method:
df[columns].to_csv(output_path)
Notes
The error is not reproducible with the data given, since there is no Number column in the dataframes df1 and df2.
You should not set_index("Entry") if you want Entry saved in the middle of the .csv (in "The output I want" above you have a simple integer-based index).
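If the goal is specifically to reorder the rows of Test1.csv to follow the order of the Entry column in Test2.csv, one hedged sketch (assuming every Entry in Test1.csv also appears in Test2.csv) is to turn Entry into an ordered categorical built from Test2's order and sort on it:
import pandas as pd

df1 = pd.read_csv("Test1.csv")
df2 = pd.read_csv("Test2.csv")

# order the Entry values by their position in Test2.csv, then sort Test1.csv by that order
df1["Entry"] = pd.Categorical(df1["Entry"], categories=df2["Entry"], ordered=True)
df1 = df1.sort_values("Entry").reset_index(drop=True)

df1.to_csv("output.csv", index=False)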
