What is the fastest way to manipulate large CSV files in Python?

I have been working on Python code that reads a CSV file with 800-odd rows and around 17000 columns.
I would like to check each entry in the CSV file, see whether it is bigger or smaller than a value, and if it is, assign a default value. I used pandas and worked with dataframes, apply, and lambda functions. It takes me 172 minutes to finish going through all entries in the CSV file. Is that normal? Is there a faster way to do this? I am using Python 2.7. I don't know if it helps, but I am running it on a Windows 10 machine with 32 GB RAM. Thanks in advance for the help.
The code is attached below.
def do_something(some_dataframe):
    col = get_req_colm(some_dataframe)
    modified_dataframe = pd.DataFrame()
    for k in col:
        temp_data = some_dataframe.apply(lambda x: check_for_range(x[k]), axis=1).tolist()
        dictionary = {}
        dictionary[str(k)] = temp_data
        temp_frame = pd.DataFrame(dictionary)
        modified_dataframe = pd.concat([modified_dataframe, temp_frame], axis=1)
    return modified_dataframe

def check_for_range(var):
    var = int(var)
    try:
        if var == 0:
            return 0
        if var == 1 or var == 4:
            return 1
        if var == 2 or var == 3 or var == 5 or var == 6:
            return 2
    except:
        print('error')

def get_req_colm(df):
    col = list(df)
    try:
        col.remove('index/Sample count')
        col.remove('index / Sample')
        col.remove('index')
        col.remove('count')
    except:
        pass
    return col

df_after_doing_something = do_something(some_dataframe)
df_after_doing_something.to_csv(output_folder + '\\df_after_doing_something.csv', index=False)

Using pandas for CSV data is a good choice, but your code is not efficient. It will be faster if you try the code given below.
def do_something(some_dataframe):
    col = get_req_colm(some_dataframe)
    # pull the required columns out as one NumPy array
    np_array = some_dataframe[col].to_numpy()
    # vectorized masks replace the row-wise apply: 0 stays 0, {1, 4} -> 1, {2, 3, 5, 6} -> 2
    modified_array = np.zeros_like(np_array)
    modified_array[np.isin(np_array, [1, 4])] = 1
    modified_array[np.isin(np_array, [2, 3, 5, 6])] = 2
    modified_dataframe = pd.DataFrame(modified_array, columns=col)
    return modified_dataframe
def get_req_colm(df):
    col = list(df)
    try:
        col.remove('index/Sample count')
        col.remove('index / Sample')
        col.remove('index')
        col.remove('count')
    except:
        pass
    return col
This should work much faster, and don't forget to import NumPy:
import numpy as np
If this isn't clear, work through a NumPy tutorial first; the link given below will also help:
Replacing elements in a numpy array when there are multiple conditions
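For the multiple-condition mapping itself, np.select is another option. Below is a minimal sketch under the same assumptions (cell values are the integers 0 to 6); map_values is an illustrative helper name, not from the original code:

import numpy as np
import pandas as pd

def map_values(df, cols):
    # illustrative helper: 0 -> 0, {1, 4} -> 1, {2, 3, 5, 6} -> 2, applied to all cells at once
    values = df[cols].to_numpy()
    conditions = [np.isin(values, [1, 4]), np.isin(values, [2, 3, 5, 6])]
    choices = [1, 2]
    return pd.DataFrame(np.select(conditions, choices, default=0), columns=cols)

Either way, the work happens in one vectorized pass over the 800 x 17000 block instead of one Python-level call per cell, which is the main cost of the apply/lambda approach.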

Vectorization with python

My current code is extremely slow with the nested for-loop setup. I would like to speed up the process; my assumption is that the solution is vectorization with Pandas or NumPy, but I do not know how to transfer my current code into that form.
I have created an example code below.
import pandas as pd
import numpy as np
balance = 10000
raw_data = [[1,2,4,1,3],[2,3,7,2,4],[3,4,5,3,4],[4,4,9,1,5],[5,5,6,4,5]]
raw_df = pd.DataFrame(raw_data, columns=['D','O','H','L','C'])
history_data = [[1,1,5,np.nan,4],[0,1,3,np.nan,4],[1,0,4,2,3],[1,0,1,6,0],[0,1,7,np.nan,8]]
history_df = pd.DataFrame(history_data, columns=['TY','ST','OP','CL','SL'])
for n in raw_df.index:
    for p in history_df.index:
        if history_df['ST'][p] == 1 and history_df['TY'][p] == 1 and history_df['SL'][p] >= raw_df['L'][n]:
            history_df['CL'][p] = raw_df['L'][n]
            history_df['ST'][p] = 0
            balance = balance + 20
    if raw_df['C'][n] > 4:
        history_df = history_df.append({'TY':0,'ST':1,'OP':5,'CL':np.nan,'SL':9,}, ignore_index=True)
Check out this example, see if it helps:
import numpy as np
import pandas as pd

# Boolean mask that performs the check for the index-aligned rows of history_df and raw_df at once
mask = (history_df['ST'] == 1) & (history_df['TY'] == 1) & (history_df['SL'] >= raw_df['L'])
history_df.loc[mask, 'CL'] = raw_df.loc[mask, 'L']
history_df.loc[mask, 'ST'] = 0
# Calculate the balance change
balance += 20 * mask.sum()
# Append one row to history_df for every row of raw_df where C > 4
n_new = (raw_df['C'] > 4).sum()
new_rows = pd.DataFrame({'TY': [0] * n_new, 'ST': [1] * n_new, 'OP': [5] * n_new,
                         'CL': [np.nan] * n_new, 'SL': [9] * n_new})
history_df = pd.concat([history_df, new_rows], ignore_index=True)
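On the sample frames from the question, this sketch ends up in the same state as the original loop: the first history row gets CL set to 1.0 and ST set to 0, balance ends at 10020, and two rows are appended for the raw rows where C > 4. One assumption to be aware of: the mask only compares index-aligned rows of history_df and raw_df, whereas the original nested loop compares every history row against every raw row.

print(history_df)  # 7 rows: row 0 updated, two new rows appended
print(balance)     # 10020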

How to create dummy variables from predefined bins in python

I want to create a dataset with dummy variables from the original data based on predefined bins. I have tried using loops and splits, but it's not efficient. I'd appreciate your help.
## original data
import pandas as pd

data_dict = {"Age": [29,35,42,11,43], "Salary": [4380,3280,8790,1200,5420],
             "Payments": [23190,1780,3400,12900,7822]}
df = pd.DataFrame(data_dict)
df
Predefined bins:
card_dict = {"Dummy Variable": ["Age:(-inf,24)","Age:(24,35)","Age:(35,49)","Age:(49,60)","Age:(60,inf)",
                                "Payments:(-inf,7654)","Payments:(7654,9088)","Payments:(9088,12055)","Payments:(12055,inf)",
                                "Salary:(-inf,2300)","Salary:(2300,3800)","Salary:(3800,5160)",
                                "Salary:(5160,7200)","Salary:(7200,inf)"]}
card = pd.DataFrame(card_dict)
card
My code is as follows:
# for numerical variables
def prepare_numerical_data(data, scard):
    """
    function to create dummy variables from numerical columns
    """
    # numerical columns
    num_df = df.select_dtypes(exclude='object')
    num_cols = num_df.columns.values
    variable_names = list(set([val.split(':')[0] for val in scard['Dummy Variable']]))  # to have the same columns used to create the scorecard
    num_variables = [x for x in variable_names if x in num_cols]  # select numerical variables only
    for i in num_variables:
        for j in scard['Dummy Variable']:
            if j.split(":")[0] in num_variables:
                for val in data[i].unique():
                    if (val > (float(j.split(':')[1].split(',')[0][1:]))) & (val <= (float(j.split(':')[1].split(',')[1][:-1]))):
                        data.loc[data[i] == val, j] = 1
                    else:
                        data.loc[data[i] == val, j] = 0
    return data
Here are the results:
result_df = prepare_numerical_data(df,card)
result_df
The results are not OK for the Salary and Payments columns. The function didn't create correct dummies for those two columns as it did for Age. How can I correct that?
This worked for me. Initially my code was not looping through every column in the dataframe.
def create_dummies(data, card):
    # specify numerical and categorical columns
    num_df = data.select_dtypes(exclude='object')
    cat_df = data.select_dtypes(exclude=['float','int'])
    num_cols = num_df.columns.values
    cat_cols = cat_df.columns.values
    # create dummies for numerical columns
    for j in num_df.columns:
        all_value = num_df[j].values
        for variable_v in all_value:
            for i in card["Dummy Variable"].values:
                if i.split(":")[0] in num_cols:
                    var1 = i.split(":")
                    val1 = float(var1[1].strip("()").strip("[]").split(",")[0])
                    val2 = float(var1[1].strip("()").strip("[]").split(",")[1])
                    variable = var1[0]
                    if variable.lower() == j.lower():
                        if variable_v >= val1 and variable_v < val2:
                            num_df.loc[num_df[j] == variable_v, i] = 1
                        else:
                            num_df.loc[num_df[j] == variable_v, i] = 0
    return num_df
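As an alternative to parsing the intervals by hand, pandas can do the binning directly. Below is a minimal sketch using pd.cut and pd.get_dummies, assuming the bin edges are read from the same "Dummy Variable" labels; bins_from_card and make_dummies are illustrative names, and the dummy column names will differ slightly from the scorecard labels:

import pandas as pd

def bins_from_card(card, column):
    # collect the bin edges for one column from labels like "Age:(24,35)"
    edges = set()
    for label in card["Dummy Variable"]:
        name, interval = label.split(":")
        if name == column:
            lo, hi = interval.strip("()").split(",")
            edges.update([float(lo), float(hi)])
    return sorted(edges)

def make_dummies(df, card, columns):
    out = df.copy()
    for col in columns:
        # pd.cut uses right-inclusive (lo, hi] intervals by default
        binned = pd.cut(df[col], bins=bins_from_card(card, col))
        out = out.join(pd.get_dummies(binned, prefix=col))
    return out

result = make_dummies(df, card, ["Age", "Salary", "Payments"])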

Python Code Optimization (For-Loop & If-else) Advice for quicker computation time

Need to reduce the computation time for the following Python code, which contains multiple if/else statements. The code runs on Databricks, so I'm open to PySpark solutions as well.
Currently this code takes more than 1 hour to run, so any help would be appreciated.
unique_list_code: list of unique codes from the concat_df['C_Code'] column, used to filter the rows of the dataframe containing each code.
concat_df: pandas DataFrame with 4 million records
unique_list_code = list(concat_df['C_Code'].unique())
MC_list = []
SN_list = []
AN_list = []
Nothing_list = []
for i in range(0, len(unique_list_code)):
    print(unique_list_code[i])
    code_filtered_df = concat_df[concat_df['C_Code'] == unique_list_code[i]]
    # SN_Filter:
    SN_filter = code_filtered_df[(code_filtered_df['D_Type'] == 'SN') & (code_filtered_df['Comm_P'] == 'P-mail')]
    if len(SN_filter) > 0:
        print("Found SN")
        SN_list.append(unique_list_code[i])
        clean_up(SN_filter)
    else:
        # AN_Filter
        AN_filter = code_filtered_df[(code_filtered_df['D_Type'] == 'AN') & (code_filtered_df['Comm_P'] == 'P-mail')]
        if len(AN_filter) > 0:
            print("Found AN")
            AN_list.append(unique_list_code[i])
            clean_up(AN_filter)
        else:
            # MC_Check
            MF_filter = code_filtered_df[code_filtered_df['MC_Flag'] == 'Y']
            MF_DNS_filter = MF_filter[~(((MF_filter['D_Type'] == 'AN') | (MF_filter['D_Type'] == 'SN')) & (MF_filter['Comm_P'] == 'DNS'))]
            if len(MF_DNS_filter) > 0:
                print("Found MC")
                MC_list.append(unique_list_code[i])
                clean_up(MF_DNS_filter)
            else:
                print("Nothing Found")
                Nothing_list.append(unique_list_code[i])
Update:
I changed it to a PySpark DF with the code below; still no luck.
from pyspark.sql.functions import col
from pyspark.sql.functions import when

MC_list = []
SN_list = []
AN_list = []
Nothing_list = []
for i in range(0, len(unique_list_code)):
    code_filtered_df = df.filter(col("C_code") == unique_list_code[i])
    SN_filter = code_filtered_df.filter((col('D_Type') == 'SN') & (col('Comm_P') == 'P-mail'))
    if SN_filter.count() > 0:
        SN_list.append(unique_list_code[i])
    else:
        AN_filter = code_filtered_df.filter((col('D_Type') == 'AN') & (col('Comm_P') == 'P-mail'))
        if AN_filter.count() > 0:
            AN_list.append(unique_list_code[i])
        else:
            MF_filter = code_filtered_df.filter(col('MC_Flag') == 'Y')
            MF_DNS_filter = MF_filter[~(((col('D_Type') == 'AN') | (col('D_Type') == 'SN')) & (col('Comm_P') == 'DNS'))]
            if MF_DNS_filter.count() > 0:
                print("Found MC")
                MC_list.append(unique_list_code[i])
            else:
                print("Nothing Found")
                Nothing_list.append(unique_list_code[i])
The reason it is taking so long is that you are working with a pandas DF.
If you want to benefit from distributed computation and increase your performance, you need to work with a Spark DataFrame in this case:
Spark_DF = spark.createDataFrame(Pandas_DF)
You will need to rewrite your code in PySpark to work with a Spark DF.
As mentioned above, you need to rewrite this code in PySpark. PySpark lets you distribute the data across the worker nodes of a cluster.
from pyspark.sql.functions import col

filter_condition = (((col('D_Type') == 'SN') & (col('Comm_P') == 'P-mail')) |
                    ((col('D_Type') == 'AN') & (col('Comm_P') == 'P-mail')) |
                    (...))
result_df = concat_df.where(filter_condition)
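Rather than one filter-and-count job per code, the classification can also be expressed as a single grouped aggregation, which avoids the Python loop entirely. A minimal sketch, assuming concat_df is already a Spark DataFrame with the columns used above and keeping the SN > AN > MC precedence of the original code (the clean_up calls from the pandas version are not reproduced here):

from pyspark.sql import functions as F

flags = (concat_df.groupBy('C_Code')
         .agg(
             # does this code have any SN / P-mail row?
             F.max(((F.col('D_Type') == 'SN') & (F.col('Comm_P') == 'P-mail')).cast('int')).alias('has_sn'),
             # does this code have any AN / P-mail row?
             F.max(((F.col('D_Type') == 'AN') & (F.col('Comm_P') == 'P-mail')).cast('int')).alias('has_an'),
             # does this code have any MC_Flag = 'Y' row that is not an AN/SN row over DNS?
             F.max(((F.col('MC_Flag') == 'Y') &
                    ~(F.col('D_Type').isin('AN', 'SN') & (F.col('Comm_P') == 'DNS'))).cast('int')).alias('has_mc'),
         ))

SN_list = [r['C_Code'] for r in flags.filter('has_sn = 1').collect()]
AN_list = [r['C_Code'] for r in flags.filter('has_sn = 0 and has_an = 1').collect()]
MC_list = [r['C_Code'] for r in flags.filter('has_sn = 0 and has_an = 0 and has_mc = 1').collect()]
Nothing_list = [r['C_Code'] for r in flags.filter('has_sn = 0 and has_an = 0 and has_mc = 0').collect()]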

Modifying a dataframe column and getting unexpected results

I have a dataframe like the example shown at the end of this question.
There are actually 120000 rows in this data and 20000 users; the example shows just one user. For every user I need to make sure the prediction contains three "1"s and three "0"s.
I wrote the following function to do that:
def check_prediction_quality(df):
    df_n = df.copy()
    unique = df_n['userID'].unique()
    for i in range(len(unique)):
        ex_df = df[df['userID'] == unique[i]]
        v = ex_df['prediction'].tolist()
        v_bool = [i == 0 for i in v]
        if sum(v_bool) != 3:
            if sum(v_bool) > 3:
                res = [i for i,val in enumerate(v_bool) if val]
                diff = sum(v_bool) - 3
                for i in range(diff):
                    idx = np.random.choice(res,1)[0]
                    v[idx] = float(1)
                    res.remove(idx)
            elif sum(v_bool) < 3:
                res = [i for i,val in enumerate(v_bool) if not val]
                diff = 3 - sum(v_bool)
                for i in range(diff):
                    idx = np.random.choice(res,1)[0]
                    v[idx] = float(0)
                    res.remove(idx)
        for j in range(len(v)):
            df_n.loc[(0+i*6)+j:(6+i*6)+j,'prediction'] = v[j]
    return df_n
However, when I check whether the numbers of "0"s and "1"s are the same, it turns out they are not. I am not sure what I did wrong.
sum([i == 0 for i in df['prediction']])
This should be six with the example below, but when I run it on my 120000-row dataframe, I do not get 60000 of each.
data = {'userID': [199810,199810,199810,199810,199810,199810,199812,199812,199812,199812,199812,199812],
        'trackID': [1,2,3,4,5,6,7,8,9,10,11,12],
        'prediction': [0,0,0,0,1,1,1,1,1,1,0,0]
        }
df = pd.DataFrame(data = data)
df
Much appreciated!
When working with pandas DataFrames you should reassign the post-processed DataFrame to the old one.
df = pd.DataFrame(np.array(...))
# reassignment:
df.loc[:,3:5] = df.loc[:,3:5]*10  # this multiplies columns 3 to 5 by 10
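Applied to the function in the question, that reassignment can be done per user with a boolean mask instead of the positional slice (0+i*6)+j, which is fragile once index positions shift. A minimal sketch, with the adjustment of v elided because it is the same logic as in the question:

import pandas as pd

def check_prediction_quality(df):
    df_n = df.copy()
    for uid in df_n['userID'].unique():
        mask = df_n['userID'] == uid
        v = df_n.loc[mask, 'prediction'].tolist()
        # ... adjust v here so it contains exactly three 0s and three 1s ...
        df_n.loc[mask, 'prediction'] = v  # write the adjusted values back for this user only
    return df_n

df_fixed = check_prediction_quality(df)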
Actually, never mind. I found out I don't have to modify the "0"s and "1"s.

Seems that I have a wrong index somewhere, but cannot spot it

I am new to Python and having trouble with a function. It should delete the rows of an (N,10) matrix (imported from a file) where -1 appears. This is the code:
import pandas as pd
import numpy as np

def load(name, f):
    file = pd.read_csv(name, header=None)
    totalMatrix = np.array(file)
    if f == 'forward':
        for i in range(len(totalMatrix)):
            for j in range(10):
                if totalMatrix[i,j] == -1:
                    if i > 0:
                        totalMatrix[i,j] = totalMatrix[i-1,j]
                    else:
                        print("Warning")
                        f = 'drop'
    elif f == 'drop':
        for i in range(len(totalMatrix)):  # or np.size(totalMatrix[:, 0])
            for j in range(10):
                if totalMatrix[i,j] == -1:
                    totalMatrix = np.delete(totalMatrix, (i), axis=0)
    t = totalMatrix[:, 0:6]
    d = totalMatrix[:, 6:11]
    return t, d
But I keep running into this error:
line 38, in load
if totalMatrix[i,j] == -1 :
IndexError: index 2 is out of bounds for axis 0 with size 2
I have looked in several places on the internet but could not find an answer, nor could I find the error myself. Can anybody see what is wrong and tell me?
It doesn't work because the matrix is getting smaller while you keep iterating based on the old size; i.e. if totalMatrix has 3 rows in the beginning and you delete one, the last iteration will try to access a nonexistent row.
While iterating over the matrix, gather the indices you want to delete; afterwards, delete them all at once.
toDelete = []
for i in range(len(totalMatrix)):  # or np.size(totalMatrix[:, 0])
    for j in range(10):
        if totalMatrix[i,j] == -1:
            toDelete.append(i)
            break  # this row is already marked, no need to check its remaining columns
totalMatrix = np.delete(totalMatrix, toDelete, axis=0)
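Equivalently, NumPy can build the row mask without any explicit loop; a minimal sketch of the same drop step:

# keep only the rows whose first 10 columns contain no -1
totalMatrix = totalMatrix[~(totalMatrix[:, :10] == -1).any(axis=1)]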
