I want to compare two dataframes that have similar (but not identical) columns and print a new dataframe that shows the missing rows of df1 compared to df2, and a second dataframe that this time shows the missing rows of df2 compared to df1, based on given columns.
Here the "key columns" are named key_column1 and key_column2.
import pandas as pd
data1 = {'first_column':  ['4', '2', '7', '2', '2'],
         'second_column': ['1', '2', '2', '2', '2'],
         'key_column1':   ['1', '3', '2', '6', '4'],
         'key_column2':   ['1', '2', '2', '1', '1'],
         'fourth_column': ['1', '2', '2', '2', '2'],
         'other':         ['1', '2', '3', '2', '2'],
         }
df1 = pd.DataFrame(data1)
data2 = {'first':         ['1', '2', '2', '2', '2'],
         'second_column': ['1', '2', '2', '2', '2'],
         'key_column1':   ['1', '3', '2', '6', '4'],
         'key_column2':   ['1', '5', '2', '2', '2'],
         'fourth_column': ['1', '2', '2', '2', '2'],
         'other2':        ['1', '4', '3', '2', '2'],
         'other3':        ['6', '8', '1', '4', '2'],
         }
df2 = pd.DataFrame(data2)
I have modified the data1 and data2 dictionaries so that the resulting dataframes have only the same columns, to demonstrate that the solution in the answer by Emi OB relies on the existence of a column in one dataframe which is not in the other one (if a common column is used instead, the code fails with a KeyError on the column chosen to collect NaNs). Below is an improved version which does not suffer from that limitation, because it creates its own columns for the purpose of collecting NaNs:
df1['df1_NaNs'] = '' # create additional column to collect NaNs
df2['df2_NaNs'] = '' # create additional column to collect NaNs
df1_s = df1.merge(df2[['key_column1', 'key_column2', 'df2_NaNs']], on=['key_column1', 'key_column2'], how='outer')
df2 = df2.drop(columns=["df2_NaNs"]) # clean up df2
df1_s = df1_s.loc[df1_s['df2_NaNs'].isna(), df1.columns]
df1_s = df1_s.drop(columns=["df1_NaNs"]) # clean up df1_s
print(df1_s)
print('--------------------------------------------')
df2_s = df2.merge(df1[['key_column1', 'key_column2', 'df1_NaNs']], on=['key_column1', 'key_column2'], how='outer')
df1 = df1.drop(columns=["df1_NaNs"]) # clean up df1
df2_s = df2_s.loc[df2_s['df1_NaNs'].isna(), df2.columns]
df2_s = df2_s.drop(columns=["df2_NaNs"]) # clean up df2_s
print(df2_s)
gives:
  first second_column key_column1 key_column2 fourth_column
1     2             2           3           2             2
3     2             2           6           1             2
4     2             2           4           1             2
--------------------------------------------
  first second_column key_column1 key_column2 fourth_column
1     2             2           3           5             3
3     2             2           6           2             5
4     2             2           4           2             6
The code below also works in case the columns of both dataframes are the same, and in addition it saves memory and computation time by not creating the temporary full-sized dataframes required by the merge-based approach:
""" I want to compare two dataframes that have similar columns(not all)
and print a new dataframe that shows the missing rows of df1 compare to
df2 and a second dataframe that shows this time the missing values of
df2 compare to df1 based on given columns. Here the "key_columns"
"""
import pandas as pd
#data1 = {'first_column': ['4', '2', '7', '2', '2'],
data1 = {'first':         ['4', '2', '7', '2', '2'],
         'second_column': ['1', '2', '2', '2', '2'],
         'key_column1':   ['1', '3', '2', '6', '4'],
         'key_column2':   ['1', '2', '2', '1', '1'],
         'fourth_column': ['1', '2', '2', '2', '2'],
#        'other':         ['1', '2', '3', '2', '2'],
         }
df1 = pd.DataFrame(data1)
#print(df1)
data2 = {'first':         ['1', '2', '2', '2', '2'],
         'second_column': ['1', '2', '2', '2', '2'],
         'key_column1':   ['1', '3', '2', '6', '4'],
         'key_column2':   ['1', '5', '2', '2', '2'],
#        'fourth_column': ['1', '2', '2', '2', '2'],
         'fourth_column': ['2', '3', '4', '5', '6'],
#        'other2':        ['1', '4', '3', '2', '2'],
#        'other3':        ['6', '8', '1', '4', '2'],
         }
df2 = pd.DataFrame(data2)
#print(df2)
data1_key_cols = dict.fromkeys( zip(data1['key_column1'], data1['key_column2']) )
data2_key_cols = dict.fromkeys( zip(data2['key_column1'], data2['key_column2']) )
# for Python versions < 3.7 (dictionaries are not ordered):
#data1_key_cols = list(zip(data1['key_column1'], data1['key_column2']))
#data2_key_cols = list(zip(data2['key_column1'], data2['key_column2']))
from collections import defaultdict
missing_data2_in_data1 = defaultdict(list)
missing_data1_in_data2 = defaultdict(list)
for indx, val in enumerate(data1_key_cols.keys()):
#for indx, val in enumerate(data1_key_cols):  # for Python versions < 3.7
    if val not in data2_key_cols:
        for key in data1:  # collect the entire row from data1
            missing_data1_in_data2[key].append(data1[key][indx])
for indx, val in enumerate(data2_key_cols.keys()):
#for indx, val in enumerate(data2_key_cols):  # for Python versions < 3.7
    if val not in data1_key_cols:
        for key in data2:  # collect the entire row from data2
            missing_data2_in_data1[key].append(data2[key][indx])
df1_s = pd.DataFrame(missing_data1_in_data2)
df2_s = pd.DataFrame(missing_data2_in_data1)
print(df1_s)
print('--------------------------------------------')
print(df2_s)
prints:
  first second_column key_column1 key_column2 fourth_column
0     2             2           3           2             2
1     2             2           6           1             2
2     2             2           4           1             2
--------------------------------------------
  first second_column key_column1 key_column2 fourth_column
0     2             2           3           5             3
1     2             2           6           2             5
2     2             2           4           2             6
If you outer merge on the two key columns, with an additional column that is unique to the second dataframe, that unique column will show NaN where the row is in the first dataframe but not the second. For example:
df2.merge(df1[['key_column1', 'key_column2', 'first_column']], on=['key_column1', 'key_column2'], how='outer')
gives:
  first second_column key_column1  ... other2 other3 first_column
0     1             1           1  ...      1      6            4
1     2             2           3  ...      4      8          NaN
2     2             2           2  ...      3      1            7
3     2             2           6  ...      2      4          NaN
4     2             2           4  ...      2      2          NaN
5   NaN           NaN           3  ...    NaN    NaN            2
6   NaN           NaN           6  ...    NaN    NaN            2
7   NaN           NaN           4  ...    NaN    NaN            2
Here the NaNs in 'first_column' correspond to the rows in df2 that are not in df1. You can then use this fact with .loc[] to filter on those NaN rows, keeping only the columns of df2, like so:
df2_outer.loc[df2_outer['first_column'].isna(), df2.columns]
Output:
  first second_column key_column1 key_column2 fourth_column other2 other3
1     2             2           3           5             2      4      8
3     2             2           6           2             2      2      4
4     2             2           4           2             2      2      2
Full code for both tables is:
df2_outer = df2.merge(df1[['key_column1', 'key_column2', 'first_column']], on=['key_column1', 'key_column2'], how='outer')
print('missing values of df1 compared to df2')
df2_output = df2_outer.loc[df2_outer['first_column'].isna(), df2.columns]
print(df2_output)
df1_outer = df1.merge(df2[['key_column1', 'key_column2', 'first']], on=['key_column1', 'key_column2'], how='outer')
print('missing values of df2 compared to df1')
df1_output = df1_outer.loc[df1_outer['first'].isna(), df1.columns]
print(df1_output)
Which outputs:
missing values of df1 compared to df2
  first second_column key_column1 key_column2 fourth_column other2 other3
1     2             2           3           5             2      4      8
3     2             2           6           2             2      2      4
4     2             2           4           2             2      2      2
missing values of df2 compared to df1
  first_column second_column key_column1 key_column2 fourth_column other
1            2             2           3           2             2     2
3            2             2           6           1             2     2
4            2             2           4           1             2     2
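As a side note for future readers (my addition, not part of either answer above): on any reasonably recent pandas, the same anti-join can be written with the indicator flag of merge, without any helper NaN columns. A minimal sketch:
import pandas as pd

keys = ['key_column1', 'key_column2']

# Left-merge each frame against just the key columns of the other one;
# indicator=True adds a '_merge' column marking 'both' or 'left_only'.
marked = df1.merge(df2[keys].drop_duplicates(), on=keys, how='left', indicator=True)
df1_only = df1[(marked['_merge'] == 'left_only').to_numpy()]

marked = df2.merge(df1[keys].drop_duplicates(), on=keys, how='left', indicator=True)
df2_only = df2[(marked['_merge'] == 'left_only').to_numpy()]

print(df1_only)
print(df2_only)
Deduplicating the key columns first keeps the merge many-to-one, so the boolean mask lines up row-for-row with the left frame.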
I want to get one-hot encoded data based on the elements of each inner list when using the sklearn transform.
Code:
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain

x = [['1234', '5678', '910', 'baba'], ['8', '1'],
     [], ['9', '3'], [], ['7', '6'], [], []]
vector = CountVectorizer(token_pattern=r".+", min_df=1, max_df=1.0,
                         lowercase=False, max_features=None)
vec = [xxx for xx in x for xxx in xx]
vector.fit(chain.from_iterable([vec]))
print(vector.get_feature_names())

new = []
for xx in x:
    new.append(vector.transform(xx))
for x in new:
    for xx in x.toarray():
        print(xx)
Current output:
['1', '1234', '3', '5678', '6', '7', '8', '9', '910', 'baba']
[0 1 0 0 0 0 0 0 0 0]
[0 0 0 1 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0]
[0 0 0 0 0 0 0 0 0 1]
[0 0 0 0 0 0 1 0 0 0]
[1 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0]
[0 0 1 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0]
[0 0 0 0 1 0 0 0 0 0]
My expected output:
['1', '1234', '3', '5678', '6', '7', '8', '9', '910', 'baba']
[0 1 0 1 0 0 0 0 1 1]
[1 0 0 0 0 0 1 0 0 0]
[0 0 1 0 0 0 0 1 0 0]
[0 0 0 0 1 1 0 0 0 0]
Is there a way to do it using my code? I have tried changing it many times, but with no luck. Somehow my brain has stopped processing anything by now.
You shouldn't need explicit for loops for this task. You can use MultiLabelBinarizer instead, also from the sklearn library. It doesn't handle empty lists, so just filter those out first.
Here's an example with Pandas:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

L = [['1234', '5678', '910', 'baba'], ['8', '1'],
     [], ['9', '3'], [], ['7', '6'], [], []]

s = pd.Series(list(filter(None, L)))

mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(s),
                   columns=mlb.classes_,
                   index=s.index)
print(res)
   1  1234  3  5678  6  7  8  9  910  baba
0  0     1  0     1  0  0  0  0    1     1
1  1     0  0     0  0  0  1  0    0     0
2  0     0  1     0  0  0  0  1    0     0
3  0     0  0     0  1  1  0  0    0     0
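If you also need all-zero rows at the positions of the empty lists, a small follow-up sketch (my addition, continuing from the snippet above) keeps the original positions when filtering and reindexes them back afterwards:
# Keep the original integer positions of the non-empty lists so the
# all-zero rows for the empty lists land in the right places.
s2 = pd.Series(L)
s2 = s2[s2.str.len() > 0]          # drop empty lists, keep original index

mlb = MultiLabelBinarizer()
res_full = pd.DataFrame(mlb.fit_transform(s2),
                        columns=mlb.classes_,
                        index=s2.index).reindex(range(len(L)), fill_value=0)
print(res_full)                    # rows 2, 4, 6 and 7 are all zeros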
You can try using set intersection and np.isin.
The intersection gives the elements common to both lists, and isin turns that into a boolean mask over the full vocabulary:
import numpy as np

mask = ['1', '1234', '3', '5678', '6', '7', '8', '9', '910', 'baba']
for xx in x:
    if len(xx) > 1:
        print(np.isin(mask, np.array(list(set(xx).intersection(set(mask))))).astype(int))
Out:
[0 1 0 1 0 0 0 0 1 1]
[1 0 0 0 0 0 1 0 0 0]
[0 0 1 0 0 0 0 1 0 0]
[0 0 0 0 1 1 0 0 0 0]
Flattening the lists
# if you have big lists of elements you can flatten them by
sum(x, [])
Out:
['1234', '5678', '910', 'baba', '8', '1', '9', '3', '7', '6']
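Note that sum(x, []) re-copies the accumulated list on every step, so it scales quadratically with the input size; for large inputs itertools.chain is the usual alternative (a side note, not from the answer above):
from itertools import chain

# Concatenate all sub-lists in a single linear pass.
flat = list(chain.from_iterable(x))
# ['1234', '5678', '910', 'baba', '8', '1', '9', '3', '7', '6']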
For future readers:
I somehow solved it in a SUPER NAIVE way.
Here is the code:
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain

x = [['1234', '5678', '910', 'baba'], ['8', '1'],
     [], ['9', '3'], [], ['7', '6'], [], []]
vector = CountVectorizer(token_pattern=r"\S*\d+\S*", min_df=1, max_df=1.0,
                         lowercase=False, max_features=None)
vec = [xxx for xx in x for xxx in xx]
vector.fit(chain.from_iterable([vec]))
print(vector.get_feature_names())

new = []
for xx in x:
    new.append(" ".join(xx))   # one space-joined document per sub-list
neww = vector.transform(new)
print(neww.toarray())
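One caveat worth flagging (my observation, not the poster's): the pattern r"\S*\d+\S*" only matches tokens that contain at least one digit, so a purely alphabetic token like 'baba' is silently dropped from the vocabulary. If you want to keep every whitespace-separated token, something like this should work:
# r"\S+" matches any run of non-whitespace characters, so alphabetic
# tokens such as 'baba' survive tokenization as well.
vector = CountVectorizer(token_pattern=r"\S+", lowercase=False)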
This has been bugging me for a while now. How can I achieve =INDEX(A:A,MATCH(E1&F1,B:B&C:C,0)) in Python? In Excel this returns an error if nothing is found.
So I started playing with pd.merge_asof, but whichever way I try it, it only returns errors.
df_3 = pd.merge_asof(df_1, df_2, on=['x', 'y'], allow_exact_matches=False)
Would give the error:
pandas.tools.merge.MergeError: can only asof on a key for left
Edit:
import pandas as pd

df_1 = pd.DataFrame({'x': ['1', '1', '2', '2', '3', '3', '4', '5', '5', '5'],
                     'y': ['smth1', 'smth2', 'smth1', 'smth2', 'smth1',
                           'smth2', 'smth1', 'smth1', 'smth2', 'smth3']})
df_2 = pd.DataFrame({'x': ['1', '2', '2', '3', '4', '5', '5'],
                     'y': ['smth1', 'smth1', 'smth2', 'smth3', 'smth1', 'smth1', 'smth3'],
                     'z': ['other1', 'other1', 'other2', 'other3', 'other1', 'other1', 'other3']})
So that's a sample where I could simply use the above formula in Excel and get something like this:
x  y      z
1  smth1  other1
1  smth2  #NA
2  smth1  other1
2  smth2  other2
3  smth1  #NA
3  smth2  #NA
4  smth1  other1
5  smth1  other1
5  smth2  #NA
5  smth3  other3
So, is there an easy way to achieve the INDEX MATCH formula in excel in pandas?
Let's try merge with how='left':
df_1.merge(df_2, on=['x','y'], how='left')
Output:
   x      y       z
0  1  smth1  other1
1  1  smth2     NaN
2  2  smth1  other1
3  2  smth2  other2
4  3  smth1     NaN
5  3  smth2     NaN
6  4  smth1  other1
7  5  smth1  other1
8  5  smth2     NaN
9  5  smth3  other3
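If you also want Excel's behaviour where an unmatched lookup shows an error, a small sketch (my addition, not part of the answer) replaces the NaNs with an explicit marker, or fails loudly instead:
result = df_1.merge(df_2, on=['x', 'y'], how='left')

# Excel's formula shows #N/A where there is no match; here those rows
# hold NaN, so replace them with an explicit marker...
result['z'] = result['z'].fillna('#NA')
print(result)

# ...or raise instead, mimicking the Excel error:
# if result['z'].eq('#NA').any():
#     raise KeyError('some (x, y) keys were not found in df_2')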
I am a beginner with pandas at best and I couldn't find a solution to this problem anywhere.
Let's say I have two variables: variable1, variable2.
They can have the following predefined values:
variable1 = ['1', '4', '9', '15', '20']
variable2 = ['2', '5', '6']
However, the current data set only has some of those values:
df = pd.DataFrame({'variable1': ['1', '9', '20'],
                   'variable2': ['2', '2', '6']})
When crossing the variables:
pd.crosstab(df.variable1, df.variable2)
I get:
variable2  2  6
variable1
1          1  0
20         0  1
9          1  0
Is there a way to put all the possible categorical values in both the columns and the rows even if the current data set does not have all of them? The goal is to have a table of the same size when running the script with an updated data set which may have the values that were not present in the previous data set.
Use DataFrame.reindex:
variable1 = ['1', '4', '9', '15', '20']
variable2 = ['2', '5', '6']
df = pd.DataFrame({'variable1': ['1', '9', '20'],
                   'variable2': ['2', '2', '6']})
print (df)
  variable1 variable2
0         1         2
1         9         2
2        20         6
df = pd.crosstab(df.variable1, df.variable2)
df = df.reindex(index=variable1, columns=variable2, fill_value=0)
print (df)
variable2  2  5  6
variable1
1          1  0  0
4          0  0  0
9          1  0  0
15         0  0  0
20         0  0  1
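An alternative sketch (my addition, assuming pandas 0.21+ for CategoricalDtype): declare the allowed values as categorical dtypes up front; pd.crosstab with dropna=False then keeps the unused categories automatically:
cat1 = pd.CategoricalDtype(categories=variable1)
cat2 = pd.CategoricalDtype(categories=variable2)

# With categorical inputs, dropna=False keeps unused categories, so the
# table shape no longer depends on the current data set.
ct = pd.crosstab(df.variable1.astype(cat1),
                 df.variable2.astype(cat2),
                 dropna=False)
print(ct)   # same 5x3 table, including the all-zero rows and columns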
from collections import OrderedDict

valuelabels = OrderedDict([('S8', [['1', 'Medical oncology'],
                                   ['2', 'Hematology'],
                                   ['3', 'Hematology/Oncology'],
                                   ['4', 'Other']]),
                           ('S9', [['1', 'Academic / Teaching Hospital'],
                                   ['2', 'Community-Based Solo Private Practice'],
                                   ['3', 'Community-Based Group Private Practice (record practice size )'],
                                   ['4', 'Community Non-Teaching Hospital'],
                                   ['5', 'Comprehensive Cancer Center'],
                                   ['6', 'Other (specify)']])])
#print (valuelabels)
df = pd.DataFrame({'variable1': ['1', '2', '4'],
                   'variable2': ['2', '3', '1']})
table = pd.crosstab(df.variable1, df.variable2)
print (table)
variable2  1  2  3
variable1
1          0  1  0
2          0  0  1
4          1  0  0
d1 = dict(valuelabels['S8'])
print (d1)
{'4': 'Other', '1': 'Medical oncology', '2': 'Hematology', '3': 'Hematology/Oncology'}
d2 = dict(valuelabels['S9'])
print (d2)
{'1': 'Academic / Teaching Hospital',
'3': 'Community-Based Group Private Practice (record practice size )',
'4': 'Community Non-Teaching Hospital',
'6': 'Other (specify)',
'2': 'Community-Based Solo Private Practice',
'5': 'Comprehensive Cancer Center'}
table = table.reindex(index=[a[0] for a in valuelabels['S8']],
                      columns=[a[0] for a in valuelabels['S9']], fill_value=0)
print (table)
variable2  1  2  3  4  5  6
variable1
1          0  1  0  0  0  0
2          0  0  1  0  0  0
3          0  0  0  0  0  0
4          1  0  0  0  0  0
table.index = table.index.to_series().map(d1).values
table.columns = table.columns.to_series().map(d2).values
print (table)
Academic / Teaching Hospital \
Medical oncology 0
Hematology 0
Hematology/Oncology 0
Other 1
Community-Based Solo Private Practice \
Medical oncology 1
Hematology 0
Hematology/Oncology 0
Other 0
Community-Based Group Private Practice (record practice size ) \
Medical oncology 0
Hematology 1
Hematology/Oncology 0
Other 0
Community Non-Teaching Hospital \
Medical oncology 0
Hematology 0
Hematology/Oncology 0
Other 0
Comprehensive Cancer Center Other (specify)
Medical oncology 0 0
Hematology 0 0
Hematology/Oncology 0 0
Other 0 0
You can use reindex:
ct = pd.crosstab(df.variable1, df.variable2)
ct.reindex(index=variable1, columns=variable2).fillna(0).astype('int')
Out:
variable2  2  5  6
variable1
1          1  0  0
4          0  0  0
9          1  0  0
15         0  0  0
20         0  0  1
import pandas

def TargetPercentByNominal(
        targetVar,    # target variable
        predictor):   # nominal predictor
    countTable = pandas.crosstab(index=predictor, columns=targetVar,
                                 margins=True, dropna=True)
    x = countTable.drop('All', axis=1)
    percentTable = countTable.div(x.sum(axis=1), axis='index') * 100
    print("Frequency Table: \n")
    print(countTable)
    print()
    print("Percent Table: \n")
    print(percentTable)
    return
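A minimal usage sketch (toy data of my own, not from the original post):
import pandas

df = pandas.DataFrame({'target':    ['yes', 'no', 'yes', 'no', 'yes'],
                       'predictor': ['a', 'a', 'b', 'b', 'b']})

# Prints a frequency crosstab (with margins) and the corresponding
# row-percentage table for 'predictor' against 'target'.
TargetPercentByNominal(df['target'], df['predictor'])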
I have some data in Microsoft Excel that I save as a CSV file for ease of use. The data structure is like this:
MS Excel format:
L1
0 1 0 0 0 1 1
0 0 1 0 0 1 0
0 0 0 1 0 0 1
0 0 0 0 1 0 0
1 1 1 1 1 1 1
1 1 1 1 1 1 1
1 1 1 1 1 1 1
1 1 1 1 1 1 1
CSV format
L1,,,,,,,,,,,,,,
0,1,0,0,0,1,1,
0,0,1,0,0,1,0,
0,0,0,1,0,0,1,
0,0,0,0,1,0,0,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
As you can see, only the first column has a label. Now I want to read the CSV file (or, if it's easier, the Excel file) to get each column and do some bit-manipulation operations on it. How can I achieve this? I have read a bit about pandas, but I can't find anything useful for fetching each column.
Given the .csv file temp.csv
L1x,,,,,,,
0,1,0,0,0,1,1,
0,0,1,0,0,1,0,
0,0,0,1,0,0,1,
0,0,0,0,1,0,0,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
read it in as follows:
import pandas
a = pandas.read_csv('temp.csv', names = ["c%d" % i for i in range(8)], skiprows = 1)
a
Output:
   c0  c1  c2  c3  c4  c5  c6  c7
0   0   1   0   0   0   1   1 NaN
1   0   0   1   0   0   1   0 NaN
2   0   0   0   1   0   0   1 NaN
3   0   0   0   0   1   0   0 NaN
4   1   1   1   1   1   1   1 NaN
5   1   1   1   1   1   1   1 NaN
6   1   1   1   1   1   1   1 NaN
7   1   1   1   1   1   1   1 NaN
The NaNs in the last column come from the pesky trailing commas. The 8 in the range needs to match the number of columns. To access the columns of a, use either
a.c3
or
a['c3']
both of which result in
0    0
1    0
2    1
3    0
4    1
5    1
6    1
7    1
Name: c3
The cool thing about pandas is that if you want to XOR two columns, you can do it very simply.
a.c0^a.c2
Output
0    0
1    1
2    0
3    0
4    0
5    0
6    0
7    0
Name: c0
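If the trailing-comma column bothers you, one way to get rid of it (my addition) is to drop columns that are entirely NaN after reading:
# Drop any column whose entries are all NaN; here that removes the c7
# column created by the trailing commas.
a = a.dropna(axis=1, how='all')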
Assume you have data which you can save into a CSV file that looks like so:
L1,,,
L2,0,10,20
L3,1,11,21
L4,2,12,22
L5,3,13,23
L6,4,14,24
L7,5,15,25
L8,6,16,26
L9,7,17,27
L10,8,18,28
To get any one column, use the csv reader and transpose with zip:
import csv

with open('test.csv', newline='') as fin:
    reader = csv.reader(fin)
    data = list(reader)
print('data:', data)
# data: [['L1', '', '', ''], ['L2', '0', '10', '20'], ['L3', '1', '11', '21'], ['L4', '2', '12', '22'], ['L5', '3', '13', '23'], ['L6', '4', '14', '24'], ['L7', '5', '15', '25'], ['L8', '6', '16', '26'], ['L9', '7', '17', '27'], ['L10', '8', '18', '28']]
Notice the data is a list of rows. You can transpose that List of Lists using zip to get a list of columns:
trans = list(zip(*data))
print('trans:', trans)
# trans: [('L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10'), ('', '0', '1', '2', '3', '4', '5', '6', '7', '8'), ('', '10', '11', '12', '13', '14', '15', '16', '17', '18'), ('', '20', '21', '22', '23', '24', '25', '26', '27', '28')]
Then just index to get a specific column:
print(trans[0])
# ('L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10')
Of course if you want to do arithmetic on the cells, you will need to convert the string to ints or floats as appropriate.
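For example, a tiny sketch (my addition) converting the second column to integers, skipping the empty header cell:
# trans[1] is ('', '0', '1', ..., '8'); skip the empty header cell and
# convert the remaining strings to integers for arithmetic.
col1 = [int(v) for v in trans[1][1:]]
print(col1)   # [0, 1, 2, 3, 4, 5, 6, 7, 8]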
import pandas as pd
pd.read_excel("foo.xls", "Sheet 1",
names=["c%d" % i for i in range(7)])
Output:
   c0  c1  c2  c3  c4  c5  c6
0   0   1   0   0   0   1   1
1   0   0   1   0   0   1   0
2   0   0   0   1   0   0   1
3   0   0   0   0   1   0   0
4   1   1   1   1   1   1   1
5   1   1   1   1   1   1   1
6   1   1   1   1   1   1   1
7   1   1   1   1   1   1   1
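If your sheet also has a trailing empty column like the CSV shown above, usecols can restrict what gets read (my addition, assuming a pandas version where read_excel accepts usecols):
import pandas as pd

# Read only the first seven columns of the sheet.
pd.read_excel("foo.xls", "Sheet 1",
              usecols=range(7),
              names=["c%d" % i for i in range(7)])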
Sample code that returns a column as an array:
input = """L1,,,,,,,,,,,,,,
0,1,0,0,0,1,1,
0,0,1,0,0,1,0,
0,0,0,1,0,0,1,
0,0,0,0,1,0,0,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
1,1,1,1,1,1,1,
"""
def getColumn(data,column_number):
dump_array=[]
lines=data.split("\n")
for line in lines:
tmp_cell = line.split(",")
dump_array.append(tmp_cell[3])
return dump_array
#for ex. get column 3
getColumn(3,input)
This may give you an idea of how to manipulate your grid...
Note: I don't have an interpreter for testing the code right now, so sorry if there are typos...