This may be an easy process, but it has been confusing me. Let's say I have the following df:
import pandas as pd
# creating a DataFrame
df = pd.DataFrame({'col1' :['A', 'B', 'B', 'C'],
'col2' :['B', 'A', 'B', 'H'],
'col3' :['', '', '', ''],
'val' :['','','','']})
print(df)
  col1 col2 col3 val
0    A    B
1    B    A
2    B    B
3    C    H
Now I want to identify empty columns and drop them. Here is what I am doing:
empty_cols = [col for col in df if df[col].isnull().all()]
print(empty_cols)
[]
False
which returns [] and False. Am I making a mistake somewhere here? Thanks!
You could try this:
empty = []
for col in df.columns:
    if df[col].sum() == "":
        empty.append(col)
Alternatively, if you want a one-liner:
empty = [col for col in df if df[col].sum() == ""]
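Either way, dropping the detected columns is one more step (empty is the list built above):
df = df.drop(columns=empty)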
If by "empty" you mean and string with no charatchers like '', then you can chek for that instead of null values
empty_cols = [col for col in df if df[col].str.contains('').all()]
I guess you need to check for '' instead of isnull(), because null is None in Python, but you have just empty strings, so they are not None.
P.S. My code to detect and drop:
empty = [col for col in df if not df[col].all()]  # '' is falsy, so all() is False for a column containing empty strings
df = df.drop(empty, axis=1)
Related
Does anyone know how to concatenate multiple columns excluding duplicated values?
I'm a student of Python; this is my first project and I have a problem.
I have a dataset like this one:
df = {'col1': ['a','b','a','c'], 'col2': ["","",'a',''], 'col3': ['','a','','b'], 'col4': ['a','','b',''], 'col5': ['b','c','c','']}
I need an output like this:
new column
a-b
a-b-c
a-c-b
b-c
I need the data sorted, concatenated, and with unique values.
I was able to do this in excel using transpose, sort and unique, like this:
=TEXTJOIN("-";;TRANSPOSE(UNIQUE(SORT(TRANSPOSE(A1:E1)))))
But I couldn't figure out how to do it in pandas. Can anyone help me, please?
Example
You can build the example by passing a dict, using None in place of the empty strings so the blanks are real nulls:
data = {'col1': ['a','b','a','c'], 'col2': [None,None,'a',None], 'col3': [None,'a',None,'b'], 'col4': ['a',None,'b',None], 'col5': ['b','c','c',None]}
df = pd.DataFrame(data)
df
col1 col2 col3 col4 col5
0 a None None a b
1 b None a None c
2 a a None b c
3 c None b None None
Code
df.apply(lambda x: '-'.join(sorted(x.dropna().unique())), axis=1)
Output:
0 a-b
1 a-b-c
2 a-b-c
3 b-c
dtype: object
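If your real frame holds empty strings rather than None (as in your original df), a small pre-step can convert them first; this is a minimal sketch assuming every '' should count as missing:
import numpy as np
df = df.replace('', np.nan)
df['new column'] = df.apply(lambda x: '-'.join(sorted(x.dropna().unique())), axis=1)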
Assuming your DataFrame's identifier is just 'df', you could try something like this:
# List for output
newList = []
# Iterate over the DataFrame rows
for i, row in df.iterrows():
    # Remove duplicates
    row.drop_duplicates(inplace=True)
    # Remove NaNs
    row.dropna(inplace=True)
    # Sort alphabetically
    row.sort_values(inplace=True)
    # Add adjusted row to the list
    newList.append(row.to_list())
# Output
print(newList)
If your DataFrame isn't named 'df', just replace 'df' with your DataFrame's name.
This gives the output:
[['a', 'b'], ['a', 'b', 'c'], ['a', 'b', 'c'], ['b', 'c']]
If you really need the output formatted like you said (a-b, a-b-c, etc.), you can iterate over 'newList' to concatenate each entry and add the hyphens, as shown below.
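For example, a minimal way to build those strings (newList comes from the loop above):
df['new column'] = ['-'.join(row) for row in newList]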
unique id  col1  col2  col3  New Col
        1  Yes   No    Yes   col1, col3
        2  No    Yes   No    col2
        3  Yes   Yes   No    col1, col2
        4  No    No    No
I was wondering how I can collect the names of the columns containing "Yes" into a new column called "New Col".
You could apply a list comprehension to each row to return the column names where the value in the column is 'Yes'.
import pandas as pd

data = {'unique id': [1, 2, 3, 4],
        'col1': ['Yes', 'No', 'Yes', 'No'],
        'col2': ['No', 'Yes', 'Yes', 'No'],
        'col3': ['Yes', 'No', 'No', 'No']}
df = pd.DataFrame(data)

df['new_col'] = df.apply(lambda row: ','.join([col for col in df.columns[1:] if row[col] == 'Yes']), axis=1)
print(df)
""" OUTPUT
unique id col1 col2 col3 new_col
0 1 Yes No Yes col1,col3
1 2 No Yes No col2
2 3 Yes Yes No col1,col2
3 4 No No No
"""
I would use pandas.DataFrame.iterrows to solve the problem. It returns each row as a Series, which allows this kind of comparison.
import pandas as pd

data = {
    'unique id': [1, 2, 3, 4],
    'col1': ['Yes', 'No', 'Yes', 'No'],
    'col2': ['No', 'Yes', 'Yes', 'No'],
    'col3': ['Yes', 'No', 'No', 'No']
}
df = pd.DataFrame(data)

new_col = []
# We only need the Series part of the tuples returned by df.iterrows
for _, row in df.iterrows():
    # Get the indices (column names) that match the desired condition
    match_columns = row[row == 'Yes'].index.tolist()
    new_col.append(match_columns)
# Assign the result back to the table (DataFrame) as a new column
df['New Col'] = new_col
Now df is what you want. I would suggest not using a space in a column name (just as a coding convention), so df['new_col'] = new_col may be better; I used 'New Col' only to match your original request.
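If you prefer the comma-separated strings shown in your table over lists, a one-line change inside the loop above does it:
new_col.append(', '.join(match_columns))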
This is one solution, although I'm sure it's not ideal. Please ask if you need any details.
# import pandas and make fake data
import pandas as pd

data = {'unique id': [1, 2, 3, 4],
        'col1': ['Yes', 'No', 'Yes', 'No'],
        'col2': ['No', 'Yes', 'Yes', 'No'],
        'col3': ['Yes', 'No', 'No', 'No']}
df = pd.DataFrame(data)

# now find all locations where each column contains 'Yes'
mask1 = df['col1'] == 'Yes'
mask2 = df['col2'] == 'Yes'
mask3 = df['col3'] == 'Yes'

# now build the new desired output column using string manipulation
output = []
for m1, m2, m3 in zip(mask1, mask2, mask3):  # for each row of the dataframe
    contains = []  # collect the components of the string in the new column
    if m1:
        contains.append('col1')
    if m2:
        contains.append('col2')
    if m3:
        contains.append('col3')
    output.append(', '.join(contains))  # build the string

# and add the new column of data to the dataframe
df['New Col'] = output
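As an aside, the same column can be built without an explicit loop. This is a sketch using DataFrame.dot on the boolean masks; it relies on the fact that multiplying a Python bool by a string keeps the string (True) or yields '' (False):
cols = ['col1', 'col2', 'col3']
df['New Col'] = df[cols].eq('Yes').dot([c + ', ' for c in cols]).str.rstrip(', ')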
Basic idea:
Define a function get_filtered_colnames that we can apply row by row. Let this function return the names of the columns where the value in that row is 'Yes'.
Apply get_filtered_colnames to every row in df.
import numpy as np
import pandas as pd

# Creating the dataframe.
df = pd.DataFrame(np.zeros((4, 3)), columns=['col1', 'col2', 'col3'])
df['col1'] = ['Yes', 'No', 'Yes', 'No']
df['col2'] = ['No', 'Yes', 'Yes', 'No']
df['col3'] = ['Yes', 'No', 'No', 'No']

# Defining a function that can be applied row by row to the dataframe.
def get_filtered_colnames(row, colnames):
    # Extracting the column indices where the row contains 'Yes'.
    filtered_idxs = np.where(row == 'Yes')[0]
    # Column names where the row contains 'Yes'.
    filtered_colnames = [colnames[filtered_idx] for filtered_idx in filtered_idxs]
    return filtered_colnames

# Applying the above function row by row (hence, axis=1). Passing col names as a parameter.
df['New Col'] = df.apply(get_filtered_colnames, colnames=df.columns.tolist(), axis=1)
print(df)
This gives the following desired output:
  col1 col2 col3           New Col
0  Yes   No  Yes  ['col1', 'col3']
1   No  Yes   No          ['col2']
2  Yes  Yes   No  ['col1', 'col2']
3   No   No   No                []
Edit: You can follow @kevinkayaks' comment if you want the output as str instead of list.
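For completeness, a minimal sketch of that conversion after the fact, using Series.str.join, which joins the list held in each cell:
df['New Col'] = df['New Col'].str.join(', ')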
I have a CSV with a list of URLs that I need to check for in other columns. The following code loops through each row of the column called "URL" and checks whether that value exists in another specific column. If it does, I need to add a string to a specific column for that row. I have it functioning, but I'm not sure how to update the column for the row. I've been reading through the docs and I think I might be overthinking this a bit.
import pandas as pd

# Import CSV
data = pd.read_csv(r'URL_export.csv')

df = pd.DataFrame(data, columns=['URL', 'Exists'])
v = pd.DataFrame(data, columns=['Check'])

# Looping through the URLs of this column
for row in df.itertuples():
    # Checking if the URL exists in the other column
    if row.URL in v.Check.values:
        print(row)
        # Add string "Yes" under column name "Exists" for this row
import pandas as pd

df = pd.DataFrame({
    'URL': ['a', 'b', 'c', 'd', 'e', 'f'],
    'Exists': ['', '', '', '', '', '']
})
v = pd.DataFrame({
    'Check': ['a', 'c', 'e']
})

df['Exists'] = df['URL'].apply(lambda x: 'Yes' if x in v['Check'].values else 'No')
Output:
  URL Exists
0   a    Yes
1   b     No
2   c    Yes
3   d     No
4   e    Yes
5   f     No
If you only need to assign "Yes" (without "No"):
df['Exists'] = df['URL'].apply(lambda x: 'Yes' if x in v['Check'].values else '')
If column "Exists" already contains a value and you need to append "Yes" to it:
df['Exists'] = df['Exists'] + ' ' + df['URL'].apply(lambda x: 'Yes' if x in v['Check'].values else '')
It's probably better to use booleans, instead of the strings 'Yes' and 'No'.
This also helps simplify the code:
import pandas as pd
df_1 = pd.DataFrame({'URL': ['a', 'b', 'd', 'c', 'e', 'f']})
print(df_1, end='\n\n')
df_2 = pd.DataFrame({'Check': ['a', 'c', 'e']})
print(df_2, end='\n\n')
df_1['Exists'] = df_1['URL'].isin(df_2['Check'])
print(df_1)
Output:
URL
0 a
1 b
2 d
3 c
4 e
5 f
Check
0 a
1 c
2 e
URL Exists
0 a True
1 b False
2 d False
3 c True
4 e True
5 f False
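If you do end up needing the literal strings afterwards, mapping the booleans back is one line:
df_1['Exists'] = df_1['Exists'].map({True: 'Yes', False: 'No'})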
I have the following dataframe:
import numpy as np
import pandas as pd

dt = pd.DataFrame({'col': ['A', 'A_B']})
I would like the rows where col == 'A' to get the value 'all' in a new column (col2), and the rest to get the result of a str.split operation.
The final df I would like to look like this:
dt = pd.DataFrame({'col': ['A', 'A_B'],
'col2': ['all', 'B']})
I tried:
dt['col2'] = np.where(dt.col == 'A', 'all',
                      dt.col.apply(lambda x: x.split('_')[1]))
but I get this error: IndexError: list index out of range
The pandas text functions work correctly in your situation - they return a missing value where the second list element does not exist. Your version fails because np.where evaluates the apply branch for every row, so x.split('_')[1] raises IndexError on 'A':
print(dt.col.str.split('_').str[1])
0 NaN
1 B
Name: col, dtype: object
dt['col2'] = np.where(dt.col == 'A', 'all', dt.col.str.split('_').str[1])
print(dt)
col col2
0 A all
1 A_B B
Or use [-1] to select the last element of each list after splitting:
dt['col2'] = np.where(dt.col == 'A', 'all',
                      dt.col.apply(lambda x: x.split('_')[-1]))
Or it is possible to filter the values with an inverted mask:
m = dt.col == 'A'
dt['col2'] = np.where(m, 'all',
                      dt.loc[~m, 'col'].apply(lambda x: x.split('_')[1]))
You could do:
dt['col2'] = dt.col.str.split('_', expand=True).fillna('all')[1]
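For reference, the intermediate frame from str.split('_', expand=True) looks like this; column 1 is NaN for the lone 'A', which is exactly the cell fillna('all') fills before [1] selects that column:
print(dt.col.str.split('_', expand=True))
   0    1
0  A  NaN
1  A    B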
I am wondering if there is a fast way to merge two pandas tables by regular expression in Python.
For example:
table A
col1  col2
1     apple_3dollars_5
2     apple_2dollar_4
1     orange_5dollar_3
1     apple_1dollar_3
table B
col1  col2
good  (apple|oragne)_\dollars_5
bad   .*_1dollar_.*
ok    oragne_\ddollar_\d
Output:
col1  col2              col3
1     apple_3dollars_5  good
1     orange_5dollar_3  ok
1     apple_1dollar_3   bad
This is just an example; instead of merging on one column that matches exactly, I want to join by regular expression. Thank you!
First of all, fix the RegExes in the B DataFrame:
In [222]: B
Out[222]:
   col1                        col2
0  good  (apple|oragne)_\ddollars_5
1   bad               .*_1dollar_.*
2    ok          orange_\ddollar_\d
Now we can prepare the following variables:
In [223]: to_repl = B.col2.values.tolist()
In [224]: vals = B.col1.values.tolist()
In [225]: to_repl
Out[225]: ['(apple|oragne)_\\ddollars_5', '.*_1dollar_.*', 'orange_\\ddollar_\\d']
In [226]: vals
Out[226]: ['good', 'bad', 'ok']
Finally we can use them in the replace function:
In [227]: A['col3'] = A['col2'].replace(to_repl, vals, regex=True)
In [228]: A
Out[228]:
   col1              col2              col3
0     1  apple_3dollars_5              good
1     2   apple_2dollar_4   apple_2dollar_4
2     1  orange_5dollar_3                ok
3     1   apple_1dollar_3               bad
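Note in Out[228] that a row matching none of the patterns (apple_2dollar_4) keeps its original col2 value in col3. If you would rather have a missing value there, one follow-up (assuming numpy is imported as np) is:
In [229]: A.loc[A['col3'] == A['col2'], 'col3'] = np.nan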
I took the idea from https://python.tutorialink.com/can-i-perform-a-left-join-merge-between-two-dataframes-using-regular-expressions-with-pandas/ and improved it a little: the original data can now have more than one column, and we can make a real left join (merge) with regex!
import pandas as pd

d = {'extra_colum1': ['x', 'y', 'z', 'w'], 'field': ['ab', 'a', 'cd', 'e'], 'extra_colum2': ['x', 'y', 'z', 'w']}
df = pd.DataFrame(data=d)

df_dict = pd.DataFrame(['a', 'b', 'c', 'd'], columns=['destination'])
df_dict['field'] = '.*' + df_dict['destination'] + '.*'
These two frames (the data and the regex "dict") are what the merge below uses.
def merge_regex(df, df_dict, how, field):
    import re
    df_dict = df_dict.drop_duplicates()
    # Pair every dictionary pattern with every data row whose field it matches
    idx = [(i, j) for i, r in enumerate(df_dict[f'{field}']) for j, v in enumerate(df[f'{field}']) if re.match(r, v)]
    df_dict_idx, df_idx = zip(*idx)
    # Build a translation table: destination value next to the matched field value
    t = df_dict.iloc[list(df_dict_idx), 0].reset_index(drop=True)
    t1 = df.iloc[list(df_idx), df.columns.get_loc(f'{field}')].reset_index(drop=True)
    df_dict_translated = pd.concat([t, t1], axis=1)
    # Now an ordinary merge on the field column performs the regex "join"
    data = pd.merge(
        df,
        df_dict_translated,
        how=f'{how}',
        left_on=f'{field}',
        right_on=f'{field}'
    )
    data = data.drop_duplicates()
    return data
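With the example frames above, a call would look like this (how='left' and field='field' match the helper's signature):
result = merge_regex(df, df_dict, 'left', 'field')
print(result)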
Similar to @MaxU, I use .replace, but I replace the column of values that you want to merge on with the regex strings that they match. A small warning: this can cause issues like a non-unique index if your normal text matches more than one regex pattern. So, using your dataframe A and @MaxU's fixed regexes for dataframe B:
A['joinCol'] = A.col2.replace(B.col2, B.col2, regex=True)
B = B.rename(columns={'col2': 'joinCol'})  # the join columns should have the same name
C = A.join(B.set_index('joinCol'), on='joinCol')
If you want, you can then drop that join column:
C = C.drop('joinCol', axis=1)