How to aggregate and restructure dataframe data in pyspark (column wise) - python

I am trying to aggregate data in a PySpark DataFrame based on a particular criterion. I want to align the accounts based on the switchOUT and switchIN amounts, so that an account with money switching out becomes the from_acct and the other accounts become the to_accts.
Data I am getting in the dataframe to begin with
+--------+------+-----------+----------+----------+-----------+
| person | acct | close_amt | open_amt | switchIN | switchOUT |
+--------+------+-----------+----------+----------+-----------+
| A | 1 | 125 | 50 | 75 | 0 |
+--------+------+-----------+----------+----------+-----------+
| A | 2 | 100 | 75 | 25 | 0 |
+--------+------+-----------+----------+----------+-----------+
| A | 3 | 200 | 300 | 0 | 100 |
+--------+------+-----------+----------+----------+-----------+
To this table
+--------+--------+-----------+----------+----------+
| person | from_acct| to_acct | switchIN | switchOUT|
+--------+----------+--------+----------+-----------+
| A | 3 | 1 | 75 | 100 |
+--------+----------+--------+----------+-----------+
| A | 3 | 2 | 25 | 100 |
+--------+----------+--------+----------+-----------+
And also how can I do it so that it works for N number of rows (not just 3 accounts)
So far I have used this code
# define udf
def sorter(l):
    """Return the first field of every entry, ordered by the entry's second field.

    Each element of ``l`` is indexed at positions 0 and 1. The sort is
    stable, so entries with equal second fields keep their input order.
    """
    ordered = list(l)
    ordered.sort(key=lambda entry: entry[1])
    return [entry[0] for entry in ordered]
def list_to_string(l):
    """Build the ``from_fund_<first>_to_fund_<second>`` label from ``l[0]`` and ``l[1]``."""
    source, target = l[0], l[1]
    return f'from_fund_{source!s}_to_fund_{target!s}'
def listfirstAcc(l):
    """Return the element at position 0 of ``l`` converted to ``str``."""
    return str(l[0])
def listSecAcc(l):
    """Return the element at position 1 of ``l`` converted to ``str``."""
    return str(l[1])
# Wrap the plain-Python helpers as Spark UDFs.
# NOTE(review): F.udf is called without a returnType; PySpark documents
# StringType as the default, so "move" is presumably a string (not an array)
# by the time the later UDFs index into it -- confirm.
sort_udf, list_str, extractFirstFund, extractSecondFund = (
    F.udf(helper)
    for helper in (sorter, list_to_string, listfirstAcc, listSecAcc)
)

# Add the derived columns: ordered account list, its label, and the two
# account columns taken from the first and second list positions.
df = (
    df.withColumn("move", sort_udf("list_col").alias("sorted_list"))
    .withColumn("move_string", list_str("move"))
    .withColumn("From_Acct", extractFirstFund("move"))
    .withColumn("To_Acct", extractSecondFund("move"))
)
Current outcome I am getting:
+--------+--------+-----------+----------+----------+
| person | from_acct| to_acct | switchIN | switchOUT|
+--------+----------+--------+----------+-----------+
| A | 3 | 1,2 | 75 | 100 |
+--------+----------+--------+----------+-----------+

Related

Python Pandas Extracting specific data from column

I have one column as an object contains multiple data separated by ( | )
I would like to extract only the customer order number which is start with
( 44 ) sometimes the order number in the beginning, sometimes in the middle, sometimes in the end
And sometimes is duplicated
44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 |
87186978110202440 | 44019119315 | 87186978110202440 | 44019119315
87186978110326832 | 44019453624 | 87186978110326832 | 44019453624
44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|
44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 |
Wishing result
44019541285
44019119315
44019453624
44019406029
44019480564
My code:
import pandas as pd
from io import StringIO
# Sample input: a header line followed by '|'-separated order numbers.
data = '''
Order_Numbers
44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 | 0652569032
87186978110202440 | 44019119315 | 87186978110202440 | 44019119315
87186978110326832 | 44019453624 | 87186978110326832 | 44019453624
44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|
44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 | 630552498
'''
# Drop every space so each row is one compact '|'-delimited string, then
# parse the text as a single-column CSV.
df = pd.read_csv(StringIO(''.join(data.split(' '))))
df
'''
Order_Numbers
0 44019541285_P_002|0317209757|87186978110350851...
1 87186978110202440|44019119315|8718697811020244...
2 87186978110326832|44019453624|8718697811032683...
3 44019406029|0317196878|87186978110313085|38718...
4 44019480564|0317202711|87186978110335810|38718...
'''
Final code:
# For every row, take the first '|'-separated token that starts with '44'
# and strip any '_...' suffix from it.
(
    df.Order_Numbers.str.split('|', expand=True)
    .astype(str)
    # Column-wise vectorised prefix test. This replaces the element-wise
    # DataFrame.applymap, which is deprecated since pandas 2.1; na=False
    # makes missing tokens count as "no match".
    .where(lambda x: x.apply(lambda col: col.str.startswith('44', na=False)))
    # Non-matching cells are now NaN; back-fill along the row so column 0
    # holds the first matching token.
    .bfill(axis=1)
    [0]
    .str.split('_').str.get(0)
)
0 44019541285
1 44019119315
2 44019453624
3 44019406029
4 44019480564
Name: 0, dtype: object
import pandas as pd
# One raw '|'-separated order string per row, in a single 'order_number' column.
df = pd.Series(
    [
        '44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 | 0652569032',
        '87186978110202440 | 44019119315 | 87186978110202440 | 44019119315',
        '87186978110326832 | 44019453624 | 87186978110326832 | 44019453624',
        '44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|',
        '44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 | 630552498',
    ],
    name='order_number',
).to_frame()
def extract_customer_order(order_number):
    """Return the distinct '|'-separated tokens of ``order_number`` starting with '44'.

    All spaces are removed before splitting, e.g.
    '44019541285_P_002 | 0317209757 ' -> ['44019541285_P_002', '0317209757'].
    Tokens keep their first-seen order and the result is a list (empty when
    nothing matches).
    """
    tokens = order_number.replace(' ', '').split('|')
    # dict.fromkeys drops repeated order numbers while keeping first-seen order
    unique_orders = dict.fromkeys(tok for tok in tokens if tok.startswith('44'))
    # For a single '|'-separated string instead of a list, join the keys with '|'.
    return list(unique_orders)
# Run the extractor on every 'order_number' value; each 'customer_order' cell
# holds whatever extract_customer_order returns for that row.
df['customer_order'] = df['order_number'].apply(extract_customer_order)

Check Multiple condition for same row

I have to compare 2 different sources and identify all the mismatches for all IDs
Source_excel table
+-----+-------------+------+----------+
| id | name | City | flag |
+-----+-------------+------+----------+
| 101 | Plate | NY | Ready |
| 102 | Back washer | NY | Sold |
| 103 | Ring | MC | Planning |
| 104 | Glass | NMC | Ready |
| 107 | Cover | PR | Ready |
+-----+-------------+------+----------+
Source_dw table
+-----+----------+------+----------+
| id | name | City | flag |
+-----+----------+------+----------+
| 101 | Plate | NY | Planning |
| 102 | Nut | TN | Expired |
| 103 | Ring | MC | Planning |
| 104 | Top Wire | NY | Ready |
| 105 | Bolt | MC | Expired |
+-----+----------+------+----------+
Expected result
+-----+-------------+----------+------------+----------+------------+---------+------------------+
| ID | excel_name | dw_name | excel_flag | dw_flag | excel_city | dw_city | RESULT |
+-----+-------------+----------+------------+----------+------------+---------+------------------+
| 101 | Plate | Plate | Ready | Planning | NY | NY | FLAG_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | NAME_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | FLAG_MISMATCH |
| 102 | Back washer | Nut | Sold | Expired | NY | TN | CITY_MISMATCH |
| 103 | Ring | Ring | Planning | Planning | MC | MC | ALL_MATCH |
| 104 | Glass | Top Wire | Ready | Ready | NMC | NY | NAME_MISMATCH |
| 104 | Glass | Top Wire | Ready | Ready | NMC | NY | CITY_MISMATCH |
| 107 | Cover | | Ready | | PR | | MISSING IN DW |
| 105 | | Bolt | | Expired | | MC | MISSING IN EXCEL |
+-----+-------------+----------+------------+----------+------------+---------+------------------+
I'm new to Python and I have tried the code below, but it is not giving the expected result.
import pandas as pd
# Load both sources from hard-coded local CSV paths.
source_excel = pd.read_csv('C:/Mypython/Newyork/excel.csv',encoding = "ISO-8859-1")
source_dw = pd.read_csv('C:/Mypython/Newyork/dw.csv',encoding = "ISO-8859-1")
# Outer merge on 'ID'; indicator=True adds the '_merge' column tested below.
comparison_result = pd.merge(source_excel,source_dw,on='ID',how='outer',indicator=True)
# NOTE(review): name_x / name_y, city_x / city_y and flag_x / flag_y are bare
# names that this snippet never defines; presumably comparison_result['name_x']
# etc. were intended -- confirm.
comparison_result.loc[(comparison_result['_merge'] == 'both') & (name_x != name_y), 'Result'] = 'NAME_MISMATCH'
comparison_result.loc[(comparison_result['_merge'] == 'both') & (city_x != city_y), 'Result'] = 'CITY_MISMATCH'
comparison_result.loc[(comparison_result['_merge'] == 'both') & (flag_x != flag_y), 'Result'] = 'FLAG_MISMATCH'
# Rows present in only one of the two sources.
comparison_result.loc[comparison_result['_merge'] == 'left_only', 'Result'] = 'Missing in dw'
comparison_result.loc[comparison_result['_merge'] == 'right_only', 'Result'] = 'Missing in excel'
# NOTE(review): this mask selects every row found in both sources, so it
# overwrites any *_MISMATCH label assigned above.
comparison_result.loc[comparison_result['_merge'] == 'both', 'Result'] = 'ALL_Match'
# Keep only the columns to report and print them.
csv_column = comparison_result[['ID','name_x','name_y','city_x','city_y','flag_x','flag_y','Result']]
print(csv_column)
Is there any other way I can check all the conditions and report each mismatch in a separate row? If separate rows are not possible, at least I need all the mismatches in the same column, separated by commas, something like FLAG_MISMATCH,CITY_MISMATCH
You could do:
# Outer join so that ids present in only one source are kept; a left join
# would drop ids that are missing from the Excel side (such as 105).
# Overlapping DW columns get the '_dw' suffix; Excel columns keep their names.
df = pd.merge(Source_excel, Source_dw, on='ID', how='outer', suffixes=(None, '_dw'))
This will create a new dataframe like the one you want, although you'll have to reorder the columns as you want. Note that the '_dw' is a suffix and not a prefix in this case.
You can reorder the columns as you like by using this code:
#Complement with the order you want
# NOTE(review): 'excel_name' is not a column a merge with
# suffixes=(None, '_dw') would produce; presumably 'name' / 'name_dw' are
# meant -- confirm against the real CSV headers.
df = df[['ID', 'excel_name']]
For the result column I think you'll have to create a column for each condition you're trying to check (at least that's the way I know how to). Here's an example:
# 1 where the Excel flag equals the DW flag, 0 otherwise. With
# suffixes=(None, '_dw') the Excel column keeps the plain name 'flag'.
# The comparison is vectorised instead of a row-wise apply; missing values
# never compare equal, so rows absent from either source get 0.
df['result_flag'] = (df['flag'] == df['flag_dw']).astype(int)
Here is a way to do the scoring:
# Score every row: each failed check adds its own weight, so the total
# identifies which combination of checks failed (1 = flag, 10 = name).
df['result'] = 0
for excel_col, dw_col, weight in (
    ('excel_flag', 'dw_flag', 1),   # was 'df_flag', a typo for the DW flag column
    ('excel_name', 'dw_name', 10),
):
    df.loc[df[excel_col] != df[dw_col], 'result'] += weight
# Translate the numeric score into a readable label.
df['result'] = df['result'].map({
    0: 'all match',
    1: 'flag mismatch',
    10: 'name mismatch',
    11: 'all mismatch',
})

Python: decrease running time of for loops

I want to calculate APRU for several countries.
# NOTE(review): leading indentation was lost in this listing; the nesting of
# the loops below has to be inferred.
# Country codes to process.
country_list = ['us','gb','ca','id']
# Working objects per country, stored under '<code>' and '<code>_reverse'.
count = {}
for i in country_list:
# Boolean-mask selection of the cells of df_day_country equal to code i.
count[i] = df_day_country[df_day_country.isin([i])]
# The same selection with the row order reversed.
count[i+'_reverse'] = count[i].iloc[::-1]
for j in range(1,len(count[i+'_reverse'])):
# Element j of 'count' becomes the sum of the slice [j-1:j+1]; since j-1 was
# rewritten on the previous pass this looks like a running total -- TODO
# confirm the chained assignment actually writes back into the frame.
count[i+'_reverse']['count'].iloc[j] = count[i+'_reverse']['count'][j-1:j+1].sum()
for k in range(1,len(count[i])):
# NOTE(review): revenue_sum is a bare name here, not the string
# 'revenue_sum'; it must be defined outside this snippet.
count[i][revenue_sum].iloc[k] = count[i][revenue_sum][k-1:k+1].sum()
# APRU = revenue column divided by the 'count' entry labelled 0, then by 100.
count[i]['APRU'] = count[i][revenue_sum] / count[i]['count'][0]/100
After that, I will create 4 dataframes: df_us, df_gb, df_ca, df_id that show each country's APRU.
But the dataset is large, and the running time becomes extremely slow as the country list grows. Is there a way to decrease the running time?
Consider using numba
Your code thus becomes
from numba import njit
country_list = ['us','gb','ca','id']
# NOTE(review): the line below is only a comment, so no JIT decorator is
# applied and the function runs as ordinary Python. Numba's nopython mode
# presumably cannot compile the pandas calls used here -- confirm before
# turning it back into a decorator.
#njit
def count(country_list):
# Build, for every country code in country_list, the working objects stored
# under '<code>' and '<code>_reverse', and return the dict holding them.
# The local dict shares its name with the function and shadows it inside the body.
# NOTE(review): leading indentation was lost in this listing.
count = {}
for i in country_list:
# Boolean-mask selection of the cells of df_day_country equal to code i.
count[i] = df_day_country[df_day_country.isin([i])]
# The same selection with the row order reversed.
count[i+'_reverse'] = count[i].iloc[::-1]
for j in range(1,len(count[i+'_reverse'])):
# Element j of 'count' becomes the sum of the slice [j-1:j+1].
count[i+'_reverse']['count'].iloc[j] = count[i+'_reverse']['count'][j-1:j+1].sum()
for k in range(1,len(count[i])):
# NOTE(review): revenue_sum is a bare name, not the string 'revenue_sum'.
count[i][revenue_sum].iloc[k] = count[i][revenue_sum][k-1:k+1].sum()
# APRU = revenue column divided by the 'count' entry labelled 0, then by 100.
count[i]['APRU'] = count[i][revenue_sum] / count[i]['count'][0]/100
return count
Numba makes Python loops a lot faster and is in the process of being integrated into the more heavy-duty Python libraries like SciPy. Definitely give this a look.
IIUC, from your code and variable names, it looks like you are trying to compute average:
# toy data set: 10 random rows spread over two countries (seeded, so repeatable)
country_list = ['us', 'gb']
np.random.seed(1)
datalen = 10
df_day_country = pd.DataFrame({'country': np.random.choice(country_list, datalen),
                               'count': np.random.randint(0, 100, datalen),
                               'revenue_sum': np.random.uniform(0, 100, datalen)})
# Each row's revenue divided by the total count of its country.
# transform('sum') broadcasts the per-country total back onto the original
# index, so the division is fully vectorised and needs no per-group Python
# lambda (groupby.apply).
df_day_country['APRU'] = (
    df_day_country['revenue_sum']
    / df_day_country.groupby('country')['count'].transform('sum')
)
Output:
+----------+--------+--------------+------------+----------+
| country | count | revenue_sum | APRU | |
+----------+--------+--------------+------------+----------+
| 0 | gb | 16 | 20.445225 | 0.150333 |
| 1 | gb | 1 | 87.811744 | 0.645675 |
| 2 | us | 76 | 2.738759 | 0.011856 |
| 3 | us | 71 | 67.046751 | 0.290246 |
| 4 | gb | 6 | 41.730480 | 0.306842 |
| 5 | gb | 25 | 55.868983 | 0.410801 |
| 6 | gb | 50 | 14.038694 | 0.103226 |
| 7 | gb | 20 | 19.810149 | 0.145663 |
| 8 | gb | 18 | 80.074457 | 0.588783 |
| 9 | us | 84 | 96.826158 | 0.419161 |
+----------+--------+--------------+------------+----------+

Best way to compare 2 dfs, get the name of different col & before + after vals?

What is the best way to compare 2 dataframes w/ the same column names, row by row, and, if a cell is different, get the Before & After values and which cell is different in that dataframe?
I know this question has been asked a lot, but none of the applications fit my use case. Speed is important. There is a package called datacompy but it is not good if I have to compare 5000 dataframes in a loop (i'm only comparing 2 at a time, but around 10,000 total, and 5000 times).
I don't want to join the dataframes on a column. I want to compare them row by row. Row 1 with row 1. Etc. If a column in row 1 is different, I only need to know the column name, the before, and the after. Perhaps if it is numeric I could also add a column w/ the abs val. of the dif.
The problem is, there is sometimes an edge case where rows are out of order (only by 1 entry), and don’t want these to come up as false positives.
Example:
These dataframes would be created when I pass in race # (there are 5,000 race numbers)
df1
+-----+-------+--+------+--+----------+----------+-------------+--+
| Id | Speed | | Name | | Distance | | Location | |
+-----+-------+--+------+--+----------+----------+-------------+--+
| 181 | 10.3 | | Joe | | 2 | | New York | |
| 192 | 9.1 | | Rob | | 1 | | Chicago | |
| 910 | 1.0 | | Fred | | 5 | | Los Angeles | |
| 97 | 1.8 | | Bob | | 8 | | New York | |
| 88 | 1.2 | | Ken | | 7 | | Miami | |
| 99 | 1.1 | | Mark | | 6 | | Austin | |
+-----+-------+--+------+--+----------+----------+-------------+--+
df2:
+-----+-------+--+------+--+----------+----------+-------------+--+
| Id | Speed | | Name | | Distance | | | Location |
+-----+-------+--+------+--+----------+----------+-------------+--+
| 181 | 10.3 | | Joe | | 2 | | New York | |
| 192 | 9.4 | | Rob | | 1 | | Chicago | |
| 910 | 1.0 | | Fred | | 5 | | Los Angeles | |
| 97 | 1.5 | | Bob | | 8 | | New York | |
| 99 | 1.1 | | Mark | | 6 | | Austin | |
| 88 | 1.2 | | Ken | | 7 | | Miami | |
+-----+-------+--+------+--+----------+----------+-------------+--+
diff:
+-------+----------+--------+-------+
| Race# | Diff_col | Before | After |
+-------+----------+--------+-------+
| 123 | Speed | 9.1 | 9.4 |
| 123 | Speed | 1.8 | 1.5 |
An example of a false positive is with the last 2 rows, Ken + Mark.
I could summarize the differences in one line per race, but if the dataframe has 3000 records and there are 1,000 differences (unlikely, but possible) then I will have tons of columns. I figured this way was easier, as I could export to Excel and then sort by race # to see all the differences, or by diff_col to see which columns are different.
def DiffCol2(df1, df2, race_num):
    """Compare two identically-labelled DataFrames cell by cell.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with the same shape and labels. ``df1`` supplies the
        'Before' value and ``df2`` the 'After' value of every record.
    race_num
        Identifier copied into the 'Race #' field of every record.

    Returns
    -------
    tuple
        ``(diffDf, alldiffDf, is_diff)`` where ``alldiffDf`` lists every
        differing cell, ``diffDf`` lists only those that are not explained
        by a row shifted by one position, and ``is_diff`` is True when
        ``diffDf`` is non-empty. Each record is a dict with the keys
        'Race #', 'Column Name', 'Before' and 'After'. Records are ordered
        by column position, then row position.

    Raises
    ------
    ValueError
        Raised by pandas when the two frames are not identically labelled.
    """
    # Coordinates of every differing cell; only these are visited, instead
    # of re-scanning all rows once per differing column.
    row_coords, col_coords = np.nonzero((df1 != df2).to_numpy())
    n_rows = len(df2)
    diffDf = []
    alldiffDf = []
    for c, r in sorted(zip(col_coords.tolist(), row_coords.tolist())):
        col_df1 = df1.columns[c]
        col_df2 = df2.columns[c]
        # Report both names if the column labels ever disagree.
        col_name = col_df1 if col_df1 == col_df2 else (col_df1, col_df2)
        before = df1.iat[r, c]
        record = {'Race #': race_num, 'Column Name': col_name,
                  'Before': before, 'After': df2.iat[r, c]}
        alldiffDf.append(record)
        # Edge case: rows swapped by one position. If the df1 value shows up
        # in the same column of df2 one row below or above, treat the
        # difference as a false positive. Bounds are checked explicitly, so
        # no exception handling is needed at the first and last row.
        shifted_by_one = any(
            0 <= k < n_rows and before == df2.iat[k, c]
            for k in (r + 1, r - 1)
        )
        if not shifted_by_one:
            diffDf.append(dict(record))
    return diffDf, alldiffDf, bool(diffDf)
[apologies in advance for weirdly formatted tables, i did my best given how annoying pasting tables into s/o is]
The code below works if dataframes have the same number and names of columns and the same number of rows, so comparing only values in the tables
Not sure where you want to get Race# from
# Two identical random frames, then two cells of the copy are changed.
df1 = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df2 = df1.copy(deep=True)
# .loc writes into df2 itself; the chained form df2['B'][5] = ... assigns to
# a temporary object under pandas copy-on-write and leaves df2 unchanged.
df2.loc[5, 'B'] = 100  # Creating difference
df2.loc[6, 'C'] = 100  # Creating difference
# Collect [column, before, after] for every cell whose value differs.
dif = [
    [col, bef, aft]
    for col in df1.columns
    for bef, aft in zip(df1[col], df2[col])
    if bef != aft
]
print(dif)
Results below
Alternative solution without loops
# Stack df1 into long form: one (column label, value) row per cell.
df = df1.melt().set_axis(['Column', 'Before'], axis=1)
# Append the matching df2 values as a third column.
after_values = df2.melt()['value']
df.insert(2, 'After', after_values)
# Keep only the cells whose value changed.
df[df['Before'] != df['After']]

search for string in pandas row

How can I search through an entire row in a pandas dataframe for a phrase and, if it exists, create a new col that says 'Yes' and lists which columns in that row it was found in? I would like to be able to ignore case as well.
You could use Pandas apply function, which allows you to traverse rows or columns and apply your own function to them.
For example, given a dataframe
+--------------------------------------+------------+---+
| deviceid | devicetype | 1 |
+--------------------------------------+------------+---+
| b569dcb7-4498-4cb4-81be-333a7f89e65f | Google | 1 |
| 04d3b752-f7a1-42ae-8e8a-9322cda4fd7f | Android | 2 |
| cf7391c5-a82f-4889-8d9e-0a423f132026 | Android | 3 |
+--------------------------------------+------------+---+
Define a function
def pr(array, value, case=True):
    """Flag whether ``value`` occurs in any string cell of a row.

    Parameters
    ----------
    array : pandas.Series
        One row, as passed by ``DataFrame.apply(..., axis=1)``.
    value : str
        Pattern handed to ``Series.str.contains``.
    case : bool, default True
        Pass ``False`` to ignore letter case when matching.

    Returns
    -------
    pandas.Series
        The row with a "condition" entry appended: ``['Yes', <labels of the
        matching cells>...]`` when something matched, otherwise ``['No']``.
    """
    # na=False makes missing and non-string cells count as "no match".
    hits = array.str.contains(value, case=case, na=False).astype(bool)
    matched_labels = array[hits].index.tolist()
    verdict = 'Yes' if matched_labels else 'No'
    # Series.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([array, pd.Series({"condition": [verdict] + matched_labels})])
Use it
# Call pr once per row (axis=1); args supplies the search value 'Google'.
df.apply(pr, axis=1, args=('Google',))
+---+--------------------------------------+------------+---+-------------------+
| | deviceid | devicetype | 1 | condition |
+---+--------------------------------------+------------+---+-------------------+
| 0 | b569dcb7-4498-4cb4-81be-333a7f89e65f | Google | 1 | [Yes, devicetype] |
| 1 | 04d3b752-f7a1-42ae-8e8a-9322cda4fd7f | Android | 2 | [No] |
| 2 | cf7391c5-a82f-4889-8d9e-0a423f132026 | Android | 3 | [No] |
+---+--------------------------------------+------------+---+-------------------+

Categories