dask: Increasing speed of multiple file loading into single dataframe - python

I merge several thousand reasonably sized (~1 million rows) dataframes on a fairly regular basis.
While I can get this to work with pandas' read_csv, it is a poor solution due to the extremely large overhead.
I need a faster solution, and dask apparently has this multiple-csv functionality baked into its read_csv/read_table functions.
However, I haven't noticed much improvement in speed with these solutions.
Is there a way to increase the speed of the following type of process?
import io
import re
import numpy as np
import dask.bag as dbag
import dask.dataframe as ddf

def filter_data(fp, ix_col='index_here', val_col='some_value'):
    dask_frame = ddf.read_table(fp)
    # Filter to only one column and the index (like a series)
    series = dask_frame[[ix_col, val_col]].set_index(ix_col)
    # Rename it to be the filename / file_id
    file_id = re.match(r"file_(.+)\.txt", fp)[1]
    series.columns = [file_id]
    return series

def get_dataframe(file_paths):
    # Make a collection
    dasks_bag = dbag.from_sequence(file_paths)
    # Open the files as dask frames and filter each to a series-like frame
    filtered_dfs = dasks_bag.map(filter_data)
    # Compute a pandas dataframe for each item in the list
    filtered_dfs = filtered_dfs.compute()
    # Concatenate them together
    df = ddf.concat(filtered_dfs, axis=1)
    # Compute on the concatenated frame so it becomes a pandas dataframe
    return df.dropna(how="all").compute()

# Just write some random files here
paths = ['file_120202021.txt', 'file_123.txt', 'file_12330.txt']
for fp in paths:
    with open(fp, 'w') as f:
        f.write('index_here\tsome_value\tother_cols\n')
        for row in range(0, 1000):
            for val, other_col in np.random.rand(1, 2):
                f.write(str(row) + '\t' + str(val) + '\t' + str(other_col) + '\n')

# Make a dataframe with dask
get_dataframe(paths)
Edit:
I have a small script here that shows the failure of dask: the time required for dask on my machine is 1.87 seconds, while the time required for pandas is 0.29 seconds.
Clearly I am doing something wrong, as dask was specifically made for more rapid computation on dataframes.
import io
import re
import numpy as np
import pandas as pd
import dask.bag as dbag
import dask.dataframe as ddf
import time

def get_dask_dataframe(file_paths, ix_col='index_here', val_col='some_value'):
    # Make a collection
    dasks_bag = dbag.from_sequence(file_paths)
    # Read and filter to the data of interest
    dask_frames = ddf.read_table(file_paths, include_path_column=True)[[ix_col, val_col, 'path']]
    # Make a pandas dataframe
    df = dask_frames.compute()
    # Pivot, since read_table puts the path in one column
    df = df.pivot_table(values=val_col, index=ix_col, columns='path')
    return df.dropna(how="all")

def get_pandas_dataframe(file_paths, ix_col='index_here', val_col='some_value'):
    # Make a collection
    l = []
    for f in file_paths:
        series = pd.read_csv(f, sep='\t')[[ix_col, val_col]].set_index(ix_col)
        # Rename it to be the filename / file_id
        file_id = re.match(r"file_(.+)\.txt", f)[1]
        series.columns = [file_id]
        l += [series]
    # Concatenate them together
    df = pd.concat(l, axis=1)
    return df.dropna(how="all")

# Just write a whole bunch of random files
paths = ['file_' + str(i) + '.txt' for i in range(0, 100)]
for fp in paths:
    with open(fp, 'w') as f:
        f.write('index_here\tsome_value\tother_cols\n')
        for row in range(0, 1000):
            for val, other_col in np.random.rand(1, 2):
                f.write(str(row) + '\t' + str(val) + '\t' + str(other_col) + '\n')

t0 = time.time()
# Make a dataframe with dask
df1 = get_dask_dataframe(paths)
t1 = time.time()
print(t1-t0)

t0 = time.time()
# Make a dataframe with pandas
df2 = get_pandas_dataframe(paths)
t1 = time.time()
print(t1-t0)
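For context on the benchmark above: with many small files, the graph-building and scheduler overhead of a bag/dataframe pipeline can easily dominate the actual I/O. One middle ground, shown here only as a minimal sketch under the same file_<id>.txt layout as above, is to keep the plain pandas per-file reader and parallelize it with dask.delayed; filter_one and get_dataframe_delayed are illustrative names, not part of the original code.
import re
import dask
import pandas as pd

def filter_one(fp, ix_col='index_here', val_col='some_value'):
    # Plain pandas read of a single file, reduced to one indexed column
    series = pd.read_csv(fp, sep='\t', usecols=[ix_col, val_col]).set_index(ix_col)
    series.columns = [re.match(r"file_(.+)\.txt", fp)[1]]
    return series

def get_dataframe_delayed(file_paths):
    # One delayed pandas task per file, executed in parallel by dask
    delayed_parts = [dask.delayed(filter_one)(fp) for fp in file_paths]
    parts = dask.compute(*delayed_parts)
    # The final concat is plain pandas, since each part is already small
    return pd.concat(parts, axis=1).dropna(how="all")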

Related

How to read list of parquets with partially overlapping set of columns in dask?

Consider this code:
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

df1 = pd.DataFrame({'A': [1, 2], 'B': [11, 12]})
df1.to_parquet("df1.parquet")
df2 = pd.DataFrame({'A': [3, 4], 'C': [13, 14]})
df2.to_parquet("df2.parquet")

all_files = ["df1.parquet", "df2.parquet"]
full_df = dd.read_parquet(all_files)
# dask.compute(full_df)  # KeyError: "['B'] not in index"

def normalize(df):
    # Add any missing columns so every partition ends up with the same schema
    df_cols = set(df.columns)
    for c in ['A', 'B', 'C']:
        if c not in df_cols:
            df[c] = np.nan
    df = df[sorted(df.columns)]
    return df

normal_df = full_df.map_partitions(normalize)
dask.compute(normal_df)  # Still gives KeyError
I was hoping that after normalizing with map_partitions I wouldn't get a KeyError, but read_parquet probably fails before the map_partitions step is reached.
I could have created the DataFrame from a list of delayed objects, each of which would read one file and normalize the columns, but I want to avoid using delayed objects for this reason.
The other option, suggested by SultanOrazbayev, is to use dask dataframes like this:
def normal_ddf(path):
    df = dd.read_parquet(path)
    return normalize(df)  # normalize should work with both pandas and dask

full_df = dd.concat([normal_ddf(path) for path in all_files])
The problem with this is that when all_files contains a large number of files (10K), it takes a long time to create the dataframe, since all those dd.read_parquet calls happen sequentially. Although dd.read_parquet doesn't need to load the whole file, it still needs to read some headers to get the column info, and doing that sequentially on 10k files adds up.
So, what is the proper/efficient way to read a bunch of parquet files that don't all have the same set of columns?
dd.concat should take care of your normalization.
Consider this example:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import string

N = 100_000
all_files = []
for col in string.ascii_uppercase[1:]:
    # Every file shares column "A" but adds one unique extra column
    df = pd.DataFrame({
        "A": np.random.normal(size=N),
        col: (np.random.normal(size=N) ** 2) * 50,
    })
    fname = f"df_{col}.parquet"
    all_files.append(fname)
    df.to_parquet(fname)

full_df = dd.concat([dd.read_parquet(path) for path in all_files]).compute()
And I get this on my task stream dashboard:
Another option, which was not mentioned in the comments by @Michael Delgado, is to load each parquet file into a separate dask dataframe and then stitch them together. Here's the rough pseudocode:
def normal_ddf(path):
    df = dd.read_parquet(path)
    return normalize(df)  # normalize should work with both pandas and dask

full_df = dd.concat([normal_ddf(path) for path in all_files])
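If the sequential metadata reads over 10k files are the bottleneck, one possible workaround, offered only as a minimal sketch (it assumes the normal_ddf helper above; build_full_df is an illustrative name, and whether the header reads actually overlap depends on the parquet engine and filesystem), is to build the per-file dask dataframes from a thread pool so the metadata fetches happen concurrently while the row data stays lazy:
from concurrent.futures import ThreadPoolExecutor
import dask.dataframe as dd

def build_full_df(all_files, max_workers=32):
    # Read the per-file parquet metadata concurrently; the row data stays lazy
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        parts = list(pool.map(normal_ddf, all_files))
    return dd.concat(parts)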

How to use loops to adjust path names

I'm working on a program which analyses a lot of csv files.
Currently I'm declaring every item manually, but as you can see in my code I really just increment a number in the paths and in the variable names.
I guess I can simplify this with a loop; I just don't know how to do that with the path names.
My code:
import pandas as pd
import numpy as np
### declaration ###
df_primes1 = pd.DataFrame()
df_primes1 = np.array(df_primes1)
df_search1 = pd.DataFrame()
df_primes2 = pd.DataFrame()
df_primes2 = np.array(df_primes2)
df_search2 = pd.DataFrame()
df_primes3 = pd.DataFrame()
df_primes3 = np.array(df_primes3)
df_search3 = pd.DataFrame()

searchterm = '322'

### reads each csv into a dataframe ###
df_primes1 = pd.read_csv('1/1_Primes_32.csv', delimiter=';', header=None, names='1')
df_primes2 = pd.read_csv('1/2_Primes_32.csv', delimiter=';', header=None, names='2')
df_primes3 = pd.read_csv('1/3_Primes_32.csv', delimiter=';', header=None, names='3')

### sorts prime numbers ###
#df_sorted = df_primes1.sort_values(by='n')
#print(df_sorted)

### searches for numbers that start with "searchterm" ###
df_search1 = df_primes1[df_primes1['1'].astype(str).str.startswith(searchterm)]['1']
df_search2 = df_primes2[df_primes2['2'].astype(str).str.startswith(searchterm)]['2']
df_search3 = df_primes3[df_primes3['3'].astype(str).str.startswith(searchterm)]['3']

print(df_search1)
print(df_search2)
print(df_search3)
The program works; I just want to know how I can simplify it, because there will be 20+ more files like this.
IIUC, we can use pathlib and a dict comprehension:
from pathlib import Path
import pandas as pd

p = 'Path/to/your_csv/'

dfs = {
    f"search_{i}": pd.read_csv(file, delimiter=";",
                               header=None,
                               names=str(i))
    for i, file in enumerate(Path(p).glob("*Prime*.csv"), 1)
}
To break down each item:
p is the target folder that holds your csvs.
i is an enumerator over your files; you will most likely need to add a pre-step of sorting your csvs to get the order you're after.
file is each item returned from the generator object; we turn each value into a dataframe.
You can then pull any dataframe out of the collection, i.e.
dfs['search_1']
which will return a dataframe.
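As a minimal follow-up sketch (assuming the dfs dict built above and the searchterm filter from the question; searches is just an illustrative name), the same startswith search can then be applied across every loaded dataframe in one loop:
searchterm = '322'
searches = {}
for name, df in dfs.items():
    col = df.columns[0]  # each frame has a single named column ('1', '2', ...)
    searches[name] = df[df[col].astype(str).str.startswith(searchterm)][col]

print(searches['search_1'])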

Trouble merging dask dataframes

I have several .pcap files whose data I want to write into one large dask dataframe. Currently the code initializes a dask dataframe using data from the first file. It is then supposed to process the rest of the pcap files and add them to that dask dataframe using merge/concat. However, when I check the number of rows of the merged dask dataframe, it doesn't increase. What is happening?
I am also not sure if I am using the right approach for my use case. I am trying to convert my entire dataset into a giant dask dataframe and write it out to an h5 file. My computer doesn't have enough memory to load the entire dataset, which is why I'm using dask. The idea is to build a dask dataframe that contains the entire dataset so I can do operations on all of it. I'm new to dask and I've read some of the documentation, but I'm still fuzzy about how dask handles loading data from disk instead of memory. I'm also fuzzy about how partitions work in dask; specifically, I'm not sure how chunksize differs from partitions, so I'm having trouble properly partitioning this dataframe. Any tips and advice would be helpful.
As said before, I've read over the main parts of the documentation.
I've tried using dd.merge(dask_df, panda_df) as shown in the documentation. When I initialize the dask dataframe, it starts with 6 rows. When I use merge, the row count decreases to 1.
I've also tried using concat. Again, I have a count of 6 rows after initialization. However, after the concat operations the row count still remains at 6. I would expect the row count to increase.
Here is the initialization function
import os
import sys
import h5py
import pandas as pd
import dask.dataframe as dd
import gc
import pprint
from scapy.all import *

flags = {
    'R': 0,
    'A': 1,
    'S': 2,
    'DF': 3,
    'FA': 4,
    'SA': 5,
    'RA': 6,
    'PA': 7,
    'FPA': 8
}

def initialize(file):
    global flags
    data = {
        'time_delta': [0],
        'ttl': [],
        'len': [],
        'dataofs': [],
        'window': [],
        'seq_delta': [0],
        'ack_delta': [0],
        'flags': []
    }
    scap = sniff(offline=file, filter='tcp and ip')
    for packet in range(0, len(scap)):
        pkt = scap[packet]
        flag = flags[str(pkt['TCP'].flags)]
        data['ttl'].append(pkt['IP'].ttl)
        data['len'].append(pkt['IP'].len)
        data['dataofs'].append(pkt['TCP'].dataofs)
        data['window'].append(pkt['TCP'].window)
        data['flags'].append(flag)
        if packet != 0:
            lst_pkt = scap[packet - 1]
            data['time_delta'].append(pkt.time - lst_pkt.time)
            data['seq_delta'].append(pkt['TCP'].seq - lst_pkt['TCP'].seq)
            data['ack_delta'].append(pkt['TCP'].ack - lst_pkt['TCP'].ack)
    panda = pd.DataFrame(data=data)
    panda['ttl'] = panda['ttl'].astype('float16')
    panda['flags'] = panda['flags'].astype('float16')
    panda['dataofs'] = panda['dataofs'].astype('float16')
    panda['len'] = panda['len'].astype('float16')
    panda['window'] = panda['window'].astype('float32')
    panda['seq_delta'] = panda['seq_delta'].astype('float32')
    panda['ack_delta'] = panda['ack_delta'].astype('float32')
    df = dd.from_pandas(panda, npartitions=6)
    gc.collect()
    return df
Here is the concatenation function
def process(file):
    global flags
    global df
    data = {
        'time_delta': [0],
        'ttl': [],
        'len': [],
        'dataofs': [],
        'window': [],
        'seq_delta': [0],
        'ack_delta': [0],
        'flags': []
    }
    scap = sniff(offline=file, filter='tcp and ip')
    for packet in range(0, len(scap)):
        pkt = scap[packet]
        flag = flags[str(pkt['TCP'].flags)]
        data['ttl'].append(pkt['IP'].ttl)
        data['len'].append(pkt['IP'].len)
        data['dataofs'].append(pkt['TCP'].dataofs)
        data['window'].append(pkt['TCP'].window)
        data['flags'].append(flag)
        if packet != 0:
            lst_pkt = scap[packet - 1]
            data['time_delta'].append(pkt.time - lst_pkt.time)
            data['seq_delta'].append(pkt['TCP'].seq - lst_pkt['TCP'].seq)
            data['ack_delta'].append(pkt['TCP'].ack - lst_pkt['TCP'].ack)
    panda = pd.DataFrame(data=data)
    panda['ttl'] = panda['ttl'].astype('float16')
    panda['flags'] = panda['flags'].astype('float16')
    panda['dataofs'] = panda['dataofs'].astype('float16')
    panda['len'] = panda['len'].astype('float16')
    panda['window'] = panda['window'].astype('float32')
    panda['seq_delta'] = panda['seq_delta'].astype('float32')
    panda['ack_delta'] = panda['ack_delta'].astype('float32')
    # merge version: dd.merge(df, panda)
    dd.concat([df, dd.from_pandas(panda, npartitions=6)])
    gc.collect()
And here is the main program
directory = 'dev/streams/'
files = os.listdir(directory)
df = initialize(directory + files[0])
files.remove(files[0])
for file in files:
    process(directory + file)
print(len(df))
using merge:
print(len(df)) = 1
using concat:
print(len(df)) = 6
expected:
print(len(df)) > 10,000
Try explicitly returning df as the result of your dask concat:
df = dd.concat([df, dd.from_pandas(panda, npartitions=6)])
And don't duplicate the exact same blocks of code; encapsulate them in another function:
def process_panda(file_wpath, flags):
    data = {
    [...]
    panda['ack_delta'] = panda['ack_delta'].astype('float32')
    return panda
Then you just have to test if the file to process is the first, so your main code becomes:
import os
import sys
import h5py
import pandas as pd
import dask.dataframe as dd
import gc
import pprint
from scapy.all import *

flags = {
    'R': 0,
    'A': 1,
    'S': 2,
    'DF': 3,
    'FA': 4,
    'SA': 5,
    'RA': 6,
    'PA': 7,
    'FPA': 8
}

directory = 'dev/streams/'
files = os.listdir(directory)

for file in files:
    file_wpath = os.path.join(directory, file)
    panda = process_panda(file_wpath, flags)
    if file == files[0]:
        df = dd.from_pandas(panda, npartitions=6)
    else:
        df = dd.concat([df, dd.from_pandas(panda, npartitions=6)])
    gc.collect()

print(len(df))
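A further option, offered only as a sketch and not as part of the answer above (it assumes the process_panda helper and flags dict defined earlier, and that every per-file frame ends up with the same columns and dtypes), is to skip the repeated concat entirely and build the dask dataframe lazily from delayed objects, which also keeps only one file's worth of pandas data in memory at a time:
import os
import dask
import dask.dataframe as dd

# One delayed pandas frame per pcap file; nothing is parsed until the
# resulting dask dataframe is computed or written out.
delayed_frames = [
    dask.delayed(process_panda)(os.path.join(directory, file), flags)
    for file in files
]
# Without meta=, dask may compute the first file up front to infer the schema.
df = dd.from_delayed(delayed_frames)

# For example, stream the result to HDF5 (requires pytables) without ever
# holding the whole dataset in memory at once.
df.to_hdf('dataset.h5', '/data-*')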

Using Python to reduce the two lines of data in a file to create a new file

I want to use Python to reduce each pair of adjacent rows of data in a file into a new file. I use pandas and numpy for processing, but the processing time with pandas is very long (it can even take a few hours), while numpy takes two or three minutes; there are more than 1 million records in total.
Here is part of the data:
33,Jogging,49105962326000,-0.6946377,12.680544,0.50395286;
33,Jogging,49106062271000,5.012288,11.264028,0.95342433;
33,Jogging,49106112167000,4.903325,10.882658,-0.08172209;
My pandas code is as below:
import pandas as pd
import numpy as np
import time

time1 = time.time()
file = open('WISDM_ar_v1.1_raw.txt', 'r')
dataset = file.readlines()
list1 = []
for i in range(len(dataset) - 1):
    dataset[i] = dataset[i].rstrip('\n')
    dataset[i] = dataset[i].rstrip(';')
    dataset[i] = dataset[i].split(",")
    if len(dataset[i]) == 6:
        # list1 holds the cleaned rows
        list1.append(dataset[i])
array1 = np.array(list1)
# newline: separator between rows; delimiter: separator between columns
np.savetxt("aa.txt", array1, fmt="%s", newline='\r\n', delimiter=",")

column_names = ['user-id', 'activity', 'timestamp', 'x-axis', 'y-axis', 'z-axis']
dataset1 = pd.read_csv('aa.txt', names=column_names, header=None)
df = pd.DataFrame(dataset1)
df1 = pd.DataFrame(columns=column_names)
for i in range(0, len(dataset1) - 1):
    data = dataset1.loc[[i]]
    if dataset1.loc[i + 1, 'activity'] == dataset1.loc[i, 'activity']:
        data.loc[i, 'user-id'] = dataset1.loc[i, 'user-id']
        data.loc[i, 'x-axis'] = dataset1.loc[i + 1, 'x-axis'] - dataset1.loc[i, 'x-axis']
        data.loc[i, 'y-axis'] = dataset1.loc[i + 1, 'y-axis'] - dataset1.loc[i, 'y-axis']
        data.loc[i, 'z-axis'] = dataset1.loc[i + 1, 'z-axis'] - dataset1.loc[i, 'z-axis']
        df1 = df1.append(data, ignore_index=True)
df1.to_csv('new_data.txt', mode='a', sep=',', header=False, index=False)
I want to know why this is the case. Is there any mistake in the pandas code I wrote? Thank you very much!
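The slow part is the row-by-row loop with df1.append, which builds a new dataframe on every iteration. As a minimal vectorized sketch (assuming the goal is exactly the row-to-next-row difference computed in the loop above; nxt, same_activity, and out are illustrative names, not from the original code), the same result can be produced with shift:
import pandas as pd

column_names = ['user-id', 'activity', 'timestamp', 'x-axis', 'y-axis', 'z-axis']
df = pd.read_csv('aa.txt', names=column_names, header=None)

# Compare each row with the next one via shift(-1) instead of a Python loop
nxt = df.shift(-1)
same_activity = nxt['activity'] == df['activity']

out = df.loc[same_activity].copy()
for axis in ['x-axis', 'y-axis', 'z-axis']:
    out[axis] = nxt.loc[same_activity, axis] - df.loc[same_activity, axis]

out.to_csv('new_data.txt', sep=',', header=False, index=False)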

Python Fast data import column-wise

I'm using the following code to import 2 columns (trigger and amplitude) out of 3 from 500 *.txt files:
from glob import glob
import numpy as np
import pandas

dataFileList = glob('*.txt')
nbDataSamplesFiles = len(dataFileList)
amplitudes = []
colnames = ['time', 'trigger', 'amplitude']
for dataFileName in dataFileList:
    # Method 4
    data = pandas.read_csv(dataFileName, delim_whitespace=True,
                           skipinitialspace=True, names=colnames)  # about 4.5 s for 500 files
    trigger1 = data['trigger'].tolist()
    amplitude1 = data.amplitude.tolist()  # another way
    amplitudes.append(amplitude1)  # list of lists
amplitudes = np.asarray(amplitudes)  # matrix nbFiles x nbSamples
It takes about 3.5 seconds to do the job.
I need it to be much faster. Is there a way to do this with the same or another Python module, and how can I achieve it?
UPDATE 1: Using dask
import dask.dataframe as dd

amplitudes = []
for dataFileName in dataFileList:
    df = dd.read_csv(urlpath=dataFileName, delim_whitespace=True,
                     skipinitialspace=True, names=colnames)
    trigger1 = df.trigger.values
    amplitude1 = df.amplitude.values
    amplitudes.append(amplitude1)  # list of arrays
I want to check the content of amplitude1:
ipdb> amplitude1[111:121]
*** ValueError: ('Arrays chunk sizes are unknown: %s', (nan,))
Any idea?
Dask might be a good option to try for handling large collections/directories of CSVs - go through the Dask Docs - Specific Usecase.
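For what it's worth, the ValueError above appears because .values on a dask dataframe is a lazy dask array whose chunk sizes are unknown until computed, so it cannot be sliced directly. Below is only a minimal sketch under stated assumptions (all files share the same number of samples, as the nbFiles x nbSamples matrix implies, and the '*.txt' glob matches only data files): let dask read every file in one call, then compute before slicing or stacking.
import numpy as np
import dask.dataframe as dd

colnames = ['time', 'trigger', 'amplitude']

# One lazy read over all files; include_path_column tags each row with its source file
ddf = dd.read_csv('*.txt', delim_whitespace=True, skipinitialspace=True,
                  names=colnames, include_path_column=True)

# Compute first to get concrete pandas/numpy objects that can be sliced normally
pdf = ddf[['amplitude', 'path']].compute()
amplitudes = np.vstack([g['amplitude'].to_numpy()
                        for _, g in pdf.groupby('path', sort=True)])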
