I have a huge dataset with contents like the sample below:
+------+------------------------------------------------------------------+----------------------------------+--+
| HHID | VAL_CD64 | VAL_CD32 | |
+------+------------------------------------------------------------------+----------------------------------+--+
| 203 | 8c5bfd9b6755ffcdb85dc52a701120e0876640b69b2df0a314dc9e7c2f8f58a5 | 373aeda34c0b4ab91a02ecf55af58e15 | |
| 203 | 0511dc19cb09f8f4ba3d140754dafb1471dacdbb6747cdb5a2bc38e278d229c8 | 6f3606577eadacef1b956307558a1efd | |
| 203 | a18adc1bcae1b570a610b13565b82e5647f05fef8a4680bd6ccdd717cdd34af7 | 332321ab150879e930869c15b1d10c83 | |
| 720 | f6c581becbac4ec1291dc4b9ce566334b1cb2c85e234e489e7fd5e1393bd8751 | 2c4f97a04f02db5a36a85f48dab39b5b | |
| 720 | abad845107a699f5f99575f8ed43e0440d87a8fc7229c1a1db67793561f0f1c3 | 2111293e946703652070968b224875c9 | |
| 348 | 25c7cf022e6651394fa5876814a05b8e593d8c7f29846117b8718c3dd951e496 | 5c80a555fcda02d028fc60afa29c4a40 | |
| 348 | 67d9c0a4bb98900809bcfab1f50bef72b30886a7b48ff0e9eccf951ef06542f9 | 6c10cd11b805fa57d2ca36df91654576 | |
| 348 | 05f1e412e7765c4b54a9acfd70741af545564f6fdfe48b073bfd3114640f5e37 | 6040b29107adf1a41c4f5964e0ff6dcb | |
| 403 | 3e8da3d63c51434bcd368d6829c7cee490170afc32b5137be8e93e7d02315636 | 71a91c4768bd314f3c9dc74e9c7937e8 | |
+------+------------------------------------------------------------------+----------------------------------+--+
I'm processing the file to produce output in the format below:
+------+------------------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------------------+----------------------------------+----------------------------------+----------------------------------+--+
| HHID | VAL1_CD64 | VAL2_CD64 | VAL3_CD64 | VAL1_CD32 | VAL2_CD32 | VAL3_CD32 | |
+------+------------------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------------------+----------------------------------+----------------------------------+----------------------------------+--+
| 203 | 8c5bfd9b6755ffcdb85dc52a701120e0876640b69b2df0a314dc9e7c2f8f58a5 | 0511dc19cb09f8f4ba3d140754dafb1471dacdbb6747cdb5a2bc38e278d229c8 | a18adc1bcae1b570a610b13565b82e5647f05fef8a4680bd6ccdd717cdd34af7 | 373aeda34c0b4ab91a02ecf55af58e15 | 6f3606577eadacef1b956307558a1efd | 332321ab150879e930869c15b1d10c83 | |
| 720 | f6c581becbac4ec1291dc4b9ce566334b1cb2c85e234e489e7fd5e1393bd8751 | abad845107a699f5f99575f8ed43e0440d87a8fc7229c1a1db67793561f0f1c3 | | 2c4f97a04f02db5a36a85f48dab39b5b | 2111293e946703652070968b224875c9 | | |
| 348 | 25c7cf022e6651394fa5876814a05b8e593d8c7f29846117b8718c3dd951e496 | 67d9c0a4bb98900809bcfab1f50bef72b30886a7b48ff0e9eccf951ef06542f9 | 05f1e412e7765c4b54a9acfd70741af545564f6fdfe48b073bfd3114640f5e37 | 5c80a555fcda02d028fc60afa29c4a40 | 6c10cd11b805fa57d2ca36df91654576 | 6040b29107adf1a41c4f5964e0ff6dcb | |
| 403 | 3e8da3d63c51434bcd368d6829c7cee490170afc32b5137be8e93e7d02315636 | | | 71a91c4768bd314f3c9dc74e9c7937e8 | | | |
+------+------------------------------------------------------------------+------------------------------------------------------------------+------------------------------------------------------------------+----------------------------------+----------------------------------+----------------------------------+--+
My current code is:
import pandas as pd
import numpy as np
import os
import shutil
import glob
import time
start=time.time()
print('\nFile Processing Started\n')
path=r'C:\Users\xyz\Sample Data'
input_file=r'C:\Users\xyz\Sample Data\test'
output_file=r'C:\Users\xyz\Sample Data\test_MOD'
chunk=pd.read_csv(input_file+'.psv',sep='|',chunksize=10000,dtype={"HH_ID":"string","VAL_CD64":"string","VAL_CD32":"string"})
chunk_list=[]
for c_no in chunk:
    chunk_list.append(c_no)
file_no=1
rec_cnt=0
for i in chunk_list:
    start2=time.time()
    rec_cnt=rec_cnt+len(i)
    rec_cnt2=0
    rec_cnt2=len(i)
    df=pd.DataFrame(i)
    df_ = df.groupby('HH_ID').agg({'VAL_CD64': list, 'VAL_CD32': list})
    data = []
    for col in df_.columns:
        d = pd.DataFrame(df_[col].values.tolist(), index=df_.index)
        d.columns = [f'{col}_{i}' for i in map(str, range(1, len(d.columns)+1))]
        data.append(d)
    res = pd.concat(data, axis=1)
    # res.columns=['MAID1_SHA256', 'MAID2_SHA256', 'MAID3_SHA256', 'MAID1_MD5','MAID2_MD5', 'MAID3_MD5']
    res.to_csv(output_file+str(file_no)+'.psv',index=True,sep='|')
    with open(output_file+str(file_no)+'.psv','r') as istr:
        with open(input_file+str(file_no)+'.psv','w') as ostr:
            for line in istr:
                line=line.strip('\n')+'|'
                print(line,file=ostr)
    os.remove(output_file+str(file_no)+'.psv')
    file_no+=1
    end2=time.time()
    duration2=end2-start2
    print("\nProcessed "+ str(rec_cnt2)+ " records in "+ str(round((duration2),2))+ " seconds. \nTotal Processed Records: "+str(rec_cnt))
os.remove(input_file+'.psv')
allFiles = glob.glob(path + "/*.psv")
allFiles.sort()
with open(os.path.join(path,'someoutputfile.csv'), 'wb') as outfile:
    for i, fname in enumerate(allFiles):
        with open(fname, 'rb') as infile:
            if i != 0:
                infile.readline()
            shutil.copyfileobj(infile, outfile)
test=os.listdir(path)
for item in test:
    if item.endswith(".psv"):
        os.remove(os.path.join(path,item))
final_file_name=input_file+'.psv'
os.rename(os.path.join(path,'someoutputfile.csv'),final_file_name)
end=time.time()
duration=end-start
print("\n"+ str(rec_cnt)+ " records added in "+ str(round((duration),2))+ " seconds. \n")
However, this code takes a long time to process a 400-million-record file (approximately 18-19 hours running on Unix), and the whole script gets killed if I try to process a 700-million-record file. From my Google searching, I believe it is being killed due to the high memory usage of the groupby.
Is there any way I can reduce the memory footprint of this program, so that a 700-million-record file can be processed with it?
I'm not sure how to do it with pandas, but you can do this without ever keeping more than a few rows in memory.
First, make sure the dataset is sorted by the column you want to group by. If not, sort them using an external merge sort algorithm.
Then, just follow this simple algorithm
read the first HHID, and start a new list of VAL_CD64 and VAL_CD32
while there are more lines
    read the next line
    if the HHID is the same as the previous, add VAL_CD64 and VAL_CD32 to the current lists
    else
        write out the previous HHID and cumulated values
        start collecting a new list for the new HHID
write out the last HHID and cumulated values
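For illustration, here is a minimal Python sketch of that streaming approach, assuming the input is a pipe-separated file that has already been sorted by HHID (the file names below are placeholders):

import csv

def stream_group(in_path, out_path):
    # assumes the .psv file is sorted by HHID and has a header row: HHID|VAL_CD64|VAL_CD32
    with open(in_path, newline='') as fin, open(out_path, 'w', newline='') as fout:
        reader = csv.reader(fin, delimiter='|')
        writer = csv.writer(fout, delimiter='|')
        next(reader)                                   # skip the header
        prev_id, cd64, cd32 = None, [], []

        def flush():
            # one output row per HHID: the id, then the accumulated values
            writer.writerow([prev_id] + cd64 + cd32)

        for row in reader:
            hhid, v64, v32 = row[0], row[1], row[2]    # ignore any trailing empty field
            if prev_id is not None and hhid != prev_id:
                flush()                                # group finished, write it out
                cd64, cd32 = [], []
            prev_id = hhid
            cd64.append(v64)
            cd32.append(v32)
        if prev_id is not None:
            flush()                                    # write the last group

stream_group('test_sorted.psv', 'test_MOD.psv')

Padding each group out to a fixed number of VAL1..VALn columns (as in the desired output) is left out for brevity; if the maximum group size is known, pad cd64 and cd32 to that length before writing. Memory stays bounded by the size of a single group rather than the whole file.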
Related
I have this massive dataset and I need to subset the data by using criteria. This is for illustration:
| Group | Name    | Value |
|-------|---------|-------|
| A     | Bill    | 256   |
| A     | Jack    | 268   |
| A     | Melissa | 489   |
| B     | Amanda  | 787   |
| B     | Eric    | 485   |
| C     | Matt    | 1236  |
| C     | Lisa    | 1485  |
| D     | Ben     | 785   |
| D     | Andrew  | 985   |
| D     | Cathy   | 1025  |
| D     | Suzanne | 1256  |
| D     | Jim     | 1520  |
I know how to handle this problem manually, such as:
import pandas as pd
df=pd.read_csv('Test.csv')
A=df[df.Group =="A "].to_numpy()
B=df[df.Group =="B "].to_numpy()
C=df[df.Group =="C "].to_numpy()
D=df[df.Group =="D "].to_numpy()
But considering the size of the data, it will take a lot of time if I handle it this way.
With that in mind, I would like to know whether it is possible to build an iteration with an IF statement that looks at the values in the "Group" column (table above): check whether the first value is the same as the one below it and, if so, group them and create a new array/dataframe.
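For illustration, a minimal sketch of the usual single-pass pandas pattern for this kind of split, assuming the Test.csv and Group column from the snippet above:

import pandas as pd

df = pd.read_csv('Test.csv')
df['Group'] = df['Group'].str.strip()          # normalise stray spaces such as "A "

# one pass over the data: a dict mapping each group label to its own array
arrays = {name: sub.to_numpy() for name, sub in df.groupby('Group')}

A = arrays['A']                                # same result as df[df.Group == "A"].to_numpy()

groupby scans the frame once, so the cost no longer grows with the number of distinct groups the way repeated boolean filters do.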
I have a data frame with more than 1,000,000 rows and 15 columns.
I have to create new columns and assign them values based on the string values in the other columns, matching either with a regex or an exact character match.
For example, say there is a column called "File path". I have to create a feature column whose value is derived from a (full or partial) folder path: match the folder path against the file path and update the feature column accordingly.
I thought about iterating with a for loop, but that takes a lot of time, and with pandas I think iterating would take even longer if the number of components to loop over grows in the future.
Is there an efficient way to do this kind of operation in pandas?
Please help me with this.
Example:
I have a df as:
| ID | File |
| -------- | -------------- |
| 1 | SWE_Toot |
| 2 | SWE_Thun |
| 3 | IDH_Toet |
| 4 | SDF_Then |
| 5 | SWE_Toot |
| 6 | SWE_Thun |
| 7 | SEH_Toot |
| 8 | SFD_Thun |
I will get the components in other tables, such as:
| ID | File |
| -------- | -------------- |
| Software | */SWE_Toot/*.h |
| |*/IDH_Toet/*.c |
| |*/SFD_Toto/*.c |
and a second one as:
| ID | File |
| -------- | -------------- |
| Wire | */SDF_Then/*.h |
| |*/SFD_Thun/*.c |
| |*/SFD_Toto/*.c |
etc. In total I will receive around 1,000,000 files and 278 components.
I want the result to be:
| ID | File |Component|
| -------- | -------------- |---------|
| 1 | SWE_Toot |Software |
| 2 | SWE_Thun |Other |
| 3 | IDH_Toet |Software |
| 4 | SDF_Then |Wire |
| 5 | SWE_Toto |Various |
| 6 | SWE_Thun |Other |
| 7 | SEH_Toto |Various |
| 8 | SFD_Thun |Wire |
Other - filled in last, once all the fields and regexes have been checked and the file does not belong to any component.
Various - it may belong to more than one component (or we can give the list of components it belongs to).
I was able to read the component tables and build the regexes, but to create the Component column I would have to write for loops over all 278 components and loop over the file table once per component.
Is there an easier way to do this with pandas? The data will be very large.
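For illustration, a hedged sketch of one common vectorized pattern for this kind of labelling (the component dictionary below is a made-up stand-in for the real component tables): collapse each component's patterns into a single regex and run Series.str.contains once per component, instead of looping over rows.

import re
import pandas as pd

df = pd.DataFrame({'ID': [1, 2, 3, 4],
                   'File': ['SWE_Toot', 'SWE_Thun', 'IDH_Toet', 'SDF_Then']})

# hypothetical component -> glob-style patterns, as read from the component tables
components = {
    'Software': ['*/SWE_Toot/*.h', '*/IDH_Toet/*.c'],
    'Wire':     ['*/SDF_Then/*.h', '*/SFD_Thun/*.c'],
}

def patterns_to_regex(patterns):
    # keep only the folder token (e.g. SWE_Toot) and OR the tokens together
    tokens = [re.escape(p.split('/')[1]) for p in patterns]
    return '|'.join(tokens)

# one boolean column per component, each computed with a single vectorized str.contains
matches = pd.DataFrame({name: df['File'].str.contains(patterns_to_regex(pats))
                        for name, pats in components.items()})

hits = matches.sum(axis=1)
df['Component'] = 'Other'                                   # no component matched
df.loc[hits == 1, 'Component'] = matches.idxmax(axis=1)[hits == 1]
df.loc[hits > 1, 'Component'] = 'Various'                   # matched more than one component

That is 278 vectorized scans of the File column rather than a Python loop over a million rows; rows that match exactly one component get its name, rows that match several get 'Various', and the rest stay 'Other'.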
I have one column (object dtype) that contains multiple values separated by ( | ).
I would like to extract only the customer order number, which starts with ( 44 ). Sometimes the order number is at the beginning, sometimes in the middle, sometimes at the end,
and sometimes it is duplicated.
44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 |
87186978110202440 | 44019119315 | 87186978110202440 | 44019119315
87186978110326832 | 44019453624 | 87186978110326832 | 44019453624
44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|
44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 |
Desired result:
44019541285
44019119315
44019453624
44019406029
44019480564
My code:
import pandas as pd
from io import StringIO
data = '''
Order_Numbers
44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 | 0652569032
87186978110202440 | 44019119315 | 87186978110202440 | 44019119315
87186978110326832 | 44019453624 | 87186978110326832 | 44019453624
44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|
44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 | 630552498
'''
df = pd.read_csv(StringIO(data.replace(' ','')))
df
'''
Order_Numbers
0 44019541285_P_002|0317209757|87186978110350851...
1 87186978110202440|44019119315|8718697811020244...
2 87186978110326832|44019453624|8718697811032683...
3 44019406029|0317196878|87186978110313085|38718...
4 44019480564|0317202711|87186978110335810|38718...
'''
Final code:
(
    df.Order_Numbers.str.split('|', expand=True)
      .astype(str)
      .where(lambda x: x.applymap(lambda y: y[:2] == '44'))
      .bfill(axis=1)
      [0]
      .str.split('_').str.get(0)
)
0 44019541285
1 44019119315
2 44019453624
3 44019406029
4 44019480564
Name: 0, dtype: object
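In short: split the string on '|' into separate columns, blank out every cell that does not start with '44', back-fill along each row so the first surviving order number lands in column 0, then drop any '_P_002'-style suffix by splitting on '_'.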
import pandas as pd

df = pd.DataFrame({
    'order_number':[
        '44019541285_P_002 | 0317209757 | 87186978110350851 | 387186978103840544 | 0652569032',
        '87186978110202440 | 44019119315 | 87186978110202440 | 44019119315',
        '87186978110326832 | 44019453624 | 87186978110326832 | 44019453624',
        '44019406029 | 0317196878 | 87186978110313085 | 387186978120481881|',
        '44019480564 | 0317202711 | 87186978110335810 | 387186978103844160 | 630552498'
    ]
})

def extract_customer_order(order_number):
    order_number = order_number.replace(' ','')   # remove all space to make it easy to process e.g. '44019541285_P_002 | 0317209757 ' -> '44019541285_P_002|0317209757'
    order_number_list = order_number.split('|')   # split the string at every | to multiple string in list '44019541285_P_002|0317209757' -> ['44019541285_P_002', '0317209757']
    result = []
    for order in order_number_list:
        if order.startswith('44'):                # select only order number starting with '44'
            if order not in result:               # to prevent duplicate order number
                result += [order]
    # if you want the result as string separated by '|', uncomment line below
    # result = '|'.join(result)
    return result

df['customer_order'] = df['order_number'].apply(extract_customer_order)
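For the sample DataFrame above, df['customer_order'] comes back as a list per row, roughly:

0    ['44019541285_P_002']
1    ['44019119315']
2    ['44019453624']
3    ['44019406029']
4    ['44019480564']
Name: customer_order, dtype: object

Note that, unlike the first snippet, this keeps the '_P_002' suffix; split on '_' inside the function if you only want the bare number.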
I am trying to aggregate data in a PySpark dataframe based on a particular criterion: I want to align the accounts based on the switchOUT amount versus the switchIN amount, so that accounts with money switching out of them become from_accounts and the other accounts become to_accounts.
The data I am getting in the dataframe to begin with:
+--------+------+-----------+----------+----------+-----------+
| person | acct | close_amt | open_amt | switchIN | switchOUT |
+--------+------+-----------+----------+----------+-----------+
| A | 1 | 125 | 50 | 75 | 0 |
+--------+------+-----------+----------+----------+-----------+
| A | 2 | 100 | 75 | 25 | 0 |
+--------+------+-----------+----------+----------+-----------+
| A | 3 | 200 | 300 | 0 | 100 |
+--------+------+-----------+----------+----------+-----------+
I want to transform it into this table:
+--------+-----------+---------+----------+-----------+
| person | from_acct | to_acct | switchIN | switchOUT |
+--------+-----------+---------+----------+-----------+
| A      | 3         | 1       | 75       | 100       |
+--------+-----------+---------+----------+-----------+
| A      | 3         | 2       | 25       | 100       |
+--------+-----------+---------+----------+-----------+
Also, how can I do this so that it works for any number of rows (not just 3 accounts)?
So far I have used this code:
import operator
from pyspark.sql import functions as F

# define udfs
def sorter(l):
    res = sorted(l, key=operator.itemgetter(1))
    return [item[0] for item in res]

def list_to_string(l):
    res = 'from_fund_' + str(l[0]) + '_to_fund_' + str(l[1])
    return res

def listfirstAcc(l):
    res = str(l[0])
    return res

def listSecAcc(l):
    res = str(l[1])
    return res

sort_udf = F.udf(sorter)
list_str = F.udf(list_to_string)
extractFirstFund = F.udf(listfirstAcc)
extractSecondFund = F.udf(listSecAcc)

# Add additional columns
df = df.withColumn("move", sort_udf("list_col").alias("sorted_list"))
df = df.withColumn("move_string", list_str("move"))
df = df.withColumn("From_Acct", extractFirstFund("move"))
df = df.withColumn("To_Acct", extractSecondFund("move"))
Current outcome I am getting:
+--------+-----------+---------+----------+-----------+
| person | from_acct | to_acct | switchIN | switchOUT |
+--------+-----------+---------+----------+-----------+
| A      | 3         | 1,2     | 75       | 100       |
+--------+-----------+---------+----------+-----------+
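One hedged way to get the desired shape without collecting lists into UDFs is a per-person join of the switch-out accounts onto the switch-in accounts; a sketch assuming an active SparkSession named spark and the sample data above:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [('A', 1, 125, 50, 75, 0),
     ('A', 2, 100, 75, 25, 0),
     ('A', 3, 200, 300, 0, 100)],
    ['person', 'acct', 'close_amt', 'open_amt', 'switchIN', 'switchOUT'])

# accounts money moved out of ...
out_accts = (df.filter(F.col('switchOUT') > 0)
               .select('person', F.col('acct').alias('from_acct'), 'switchOUT'))
# ... and accounts money moved into
in_accts = (df.filter(F.col('switchIN') > 0)
              .select('person', F.col('acct').alias('to_acct'), 'switchIN'))

result = (out_accts.join(in_accts, on='person')
                   .select('person', 'from_acct', 'to_acct', 'switchIN', 'switchOUT'))
result.show()

This works for any number of accounts per person; note that if a person has several switch-out accounts it produces every out/in pairing, so an extra rule would be needed if the amounts have to be allocated between specific account pairs.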
I want to calculate APRU for several countries.
country_list = ['us','gb','ca','id']

count = {}
for i in country_list:
    count[i] = df_day_country[df_day_country.isin([i])]
    count[i+'_reverse'] = count[i].iloc[::-1]
    for j in range(1,len(count[i+'_reverse'])):
        count[i+'_reverse']['count'].iloc[j] = count[i+'_reverse']['count'][j-1:j+1].sum()
    for k in range(1,len(count[i])):
        count[i][revenue_sum].iloc[k] = count[i][revenue_sum][k-1:k+1].sum()
    count[i]['APRU'] = count[i][revenue_sum] / count[i]['count'][0]/100
After that, I will create 4 dataframes (df_us, df_gb, df_ca, df_id) that show each country's APRU.
But the dataset is large, and the running time becomes extremely slow as the country list grows. Is there a way to decrease the running time?
Consider using numba
Your code thus becomes
from numba import njit

country_list = ['us','gb','ca','id']

@njit
def count(country_list):
    count = {}
    for i in country_list:
        count[i] = df_day_country[df_day_country.isin([i])]
        count[i+'_reverse'] = count[i].iloc[::-1]
        for j in range(1,len(count[i+'_reverse'])):
            count[i+'_reverse']['count'].iloc[j] = count[i+'_reverse']['count'][j-1:j+1].sum()
        for k in range(1,len(count[i])):
            count[i][revenue_sum].iloc[k] = count[i][revenue_sum][k-1:k+1].sum()
        count[i]['APRU'] = count[i][revenue_sum] / count[i]['count'][0]/100
    return count
Numba makes Python loops a lot faster and is in the process of being integrated into heavier-duty Python libraries like SciPy. Definitely give this a look.
IIUC, from your code and variable names, it looks like you are trying to compute an average:
import numpy as np
import pandas as pd

# toy data set:
country_list = ['us','gb']
np.random.seed(1)
datalen = 10
df_day_country = pd.DataFrame({'country': np.random.choice(country_list, datalen),
                               'count': np.random.randint(0,100, datalen),
                               'revenue_sum': np.random.uniform(0,100,datalen)})

df_day_country['APRU'] = (df_day_country.groupby('country', group_keys=False)
                                        .apply(lambda x: x['revenue_sum']/x['count'].sum()))
Output:
+---+---------+-------+-------------+----------+
|   | country | count | revenue_sum | APRU     |
+---+---------+-------+-------------+----------+
| 0 | gb      | 16    | 20.445225   | 0.150333 |
| 1 | gb      | 1     | 87.811744   | 0.645675 |
| 2 | us      | 76    | 2.738759    | 0.011856 |
| 3 | us      | 71    | 67.046751   | 0.290246 |
| 4 | gb      | 6     | 41.730480   | 0.306842 |
| 5 | gb      | 25    | 55.868983   | 0.410801 |
| 6 | gb      | 50    | 14.038694   | 0.103226 |
| 7 | gb      | 20    | 19.810149   | 0.145663 |
| 8 | gb      | 18    | 80.074457   | 0.588783 |
| 9 | us      | 84    | 96.826158   | 0.419161 |
+---+---------+-------+-------------+----------+