Trouble merging dask dataframes - python

I have several .pcap files whose data I want write to one large dask data frame. Currently, initializes a dask data frame using data from the first file. It then is supposed to process the rest of the pcap files and add to that dask data frame using merge/concat. However, when I check the number of the rows of the merged dask dataframe it doesn't increase. What is happening?
I also am not sure if I am using the right approach for my use case. I am trying to convert my entire dataset into a giant dask dataframe and write it out to h5 file. My computer doesn't have enough memory to load the entire dataset so that's why I'm using dask. The idea is to load the dask dataframe that contains the entire dataset so I could do operations on the entire dataset. I'm new to dask and I've read over the some of the documentation but I'm still fuzzy about how dasks handles loading data from disk instead of memory. I'm also fuzzy about how partitions work in dask. Specifically, I'm also not sure how chunksize differs from partitions so I'm having trouble properly partitioning this dataframe. Any tips and advice would be helpful.
As said before, I've read over the main parts of the documentation.
I've tried using the dd.merge(dask_df, panda_df) as shown in the documentation. When I initialize the dask dataframe, it starts with 6 rows. When I use merge the row count decreases to 1
I've also tried using concat. Again, I have a count of 6 rows during initialization. However, after the concat operations the row count still remains at 6. I would expect the row count to increase.
Here is the initialization function
import os
import sys
import h5py
import pandas as pd
import dask.dataframe as dd
import gc
import pprint
from scapy.all import *
flags = {
'R': 0,
'A': 1,
'S': 2,
'DF':3,
'FA':4,
'SA':5,
'RA':6,
'PA':7,
'FPA':8
}
def initialize(file):
global flags
data = {
'time_delta': [0],
'ttl':[],
'len':[],
'dataofs':[],
'window':[],
'seq_delta':[0],
'ack_delta':[0],
'flags':[]
}
scap = sniff(offline=file,filter='tcp and ip')
for packet in range(0,len(scap)):
pkt = scap[packet]
flag = flags[str(pkt['TCP'].flags)]
data['ttl'].append(pkt['IP'].ttl)
data['len'].append(pkt['IP'].len)
data['dataofs'].append(pkt['TCP'].dataofs)
data['window'].append(pkt['TCP'].window)
data['flags'].append(flag)
if packet != 0:
lst_pkt = scap[packet-1]
data['time_delta'].append(pkt.time - lst_pkt.time)
data['seq_delta'].append(pkt['TCP'].seq - lst_pkt['TCP'].seq)
data['ack_delta'].append(pkt['TCP'].ack - lst_pkt['TCP'].ack)
panda = pd.DataFrame(data=data)
panda['ttl']=panda['ttl'].astype('float16')
panda['flags']=panda['flags'].astype('float16')
panda['dataofs']=panda['dataofs'].astype('float16')
panda['len']=panda['len'].astype('float16')
panda['window']=panda['window'].astype('float32')
panda['seq_delta']=panda['seq_delta'].astype('float32')
panda['ack_delta']=panda['ack_delta'].astype('float32')
df =dd.from_pandas(panda,npartitions=6)
gc.collect()
return df
Here is the concatenation function
def process(file):
global flags
global df
data = {
'time_delta': [0],
'ttl':[],
'len':[],
'dataofs':[],
'window':[],
'seq_delta':[0],
'ack_delta':[0],
'flags':[]
}
scap = sniff(offline=file,filter='tcp and ip')
for packet in range(0,len(scap)):
pkt = scap[packet]
flag = flags[str(pkt['TCP'].flags)]
data['ttl'].append(pkt['IP'].ttl)
data['len'].append(pkt['IP'].len)
data['dataofs'].append(pkt['TCP'].dataofs)
data['window'].append(pkt['TCP'].window)
data['flags'].append(flag)
if packet != 0:
lst_pkt = scap[packet-1]
data['time_delta'].append(pkt.time - lst_pkt.time)
data['seq_delta'].append(pkt['TCP'].seq - lst_pkt['TCP'].seq)
data['ack_delta'].append(pkt['TCP'].ack - lst_pkt['TCP'].ack)
panda = pd.DataFrame(data=data)
panda['ttl']=panda['ttl'].astype('float16')
panda['flags']=panda['flags'].astype('float16')
panda['dataofs']=panda['dataofs'].astype('float16')
panda['len']=panda['len'].astype('float16')
panda['window']=panda['window'].astype('float32')
panda['seq_delta']=panda['seq_delta'].astype('float32')
panda['ack_delta']=panda['ack_delta'].astype('float32')
#merge version dd.merge(df, panda)
dd.concat([df,dd.from_pandas(panda,npartitions=6)])
gc.collect()
And here is the main program
directory = 'dev/streams/'
files = os.listdir(directory)
df = initialize(directory+files[0])
files.remove(files[0])
for file in files:
process(directory+file)
print(len(df))
using merge:
print(len(df)) = 1
using concat:
print(len(df))=6
expected:
print(len(df)) > 10,000

Try explicitly returning df as the result of your dask concat:
df = dd.concat([df, dd.from_pandas(panda,npartitions=6)])
And don't duplicate the exact same blocks of code but encaspulate them in another function:
def process_panda(file_wpath, flags):
data = {
[...]
panda['ack_delta']=panda['ack_delta'].astype('float32')
return panda
Then you just have to test if the file to process is the first, so your main code becomes:
import os
import sys
import h5py
import pandas as pd
import dask.dataframe as dd
import gc
import pprint
from scapy.all import *
flags = {
'R': 0,
'A': 1,
'S': 2,
'DF':3,
'FA':4,
'SA':5,
'RA':6,
'PA':7,
'FPA':8
}
directory = 'dev/streams/'
files = os.listdir(directory)
for file in files:
file_wpath = os.path.join(directory, file)
panda = process_panda(file_wpath, flags)
if file == files[0]:
df = dd.from_pandas(panda, npartitions=6)
else:
df = dd.concat([df, dd.from_pandas(panda, npartitions=6)])
gc.collect()
print(len(df))

Related

How to read list of parquets with partially overlapping set of columns in dask?

Consider this code:
import dask.dataframe as dd
import numpy as np
df1=pd.DataFrame({'A': [1, 2], 'B': [11, 12]})
df1.to_parquet("df1.parquet")
df2=pd.DataFrame({'A': [3, 4], 'C': [13, 14]})
df2.to_parquet("df2.parquet")
all_files = ["df1.parquet", "df2.parquet"]
full_df = dd.read_parquet(all_files)
# dask.compute(full_df) # KeyError: "['B'] not in index"
def normalize(df):
df_cols = set(df.columns)
for c in ['A', 'B', 'C']:
if c not in df_cols:
df[c] = np.nan
df = df[sorted(df.columns)]
return df
normal_df = full_df.map_partitions(normalize)
dask.compute(normal_df) # Still gives keyError
I was hoping that after the normalization using map_partitions, I wouldn't get keyError, but the read_parquet probably fails before reaching the map_partitions step.
I could have created the DataFrame from a list of delayed objects which would each read one file and normalize the columns, but I want to avoid using delayed objects for this reason
The other option is suggested by SultanOrazbayev is to use dask dataframe like this:
def normal_ddf(path):
df = dd.read_parquet(path)
return normalize(df) # normalize f should work with both pandas and dask
full_df = dd.concat([normal_ddf(path) for path in all_files])
Problem with this is that, when all_files contains large number of files (10K) this takes a long time to create the dataframe since all those dd.read_parquet happens sequentially. Although dd.read_parquet doesn't need to load the whole file, it still needs to read some headers to get column info. Doing it sequentially on 10k files adds up.
So, what is the proper/efficient way to read a bunch of parquet files all of which don't have the same set of columns?
dd.concat should take care of your normalization.
Consider this example:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import string
N = 100_000
all_files = []
for col in string.ascii_uppercase[1:]:
df = pd.DataFrame({
"A": np.random.normal(size=N),
col: (np.random.normal(size=N) ** 2) * 50,
})
fname = f"df_{col}.parquet"
all_files.append(fname)
df.to_parquet(fname)
full_df = dd.concat([dd.read_parquet(path) for path in all_files]).compute()
And I get this on my task stream dashboard:
Another option that was not mentioned in the comments by #Michael Delgado is to load each parquet into a separate dask dataframe and then stitch them together. Here's the rough pseudocode:
def normal_ddf(path):
df = dd.read_parquet(path)
return normalize(df) # normalize f should work with both pandas and dask
full_df = dd.concat([normal_ddf(path) for path in all_files])

dask: Increasing speed of mutliple file loading into single dataframe

I am merging several thousands of reasonably sized (~1 million rows) dataframes together on a fairly regular basis.
While I can get pandas to work with read_csv, it is a terrible solution due to the extremely large overhead.
I need a faster solution to this and dask apparently has this multiple csv functionality baked into their read_csv/read_table functions.
However, I haven't noticed much improvement in speed with these solutions.
Is there a way to increase the speed of the following type of process? :
import io
import re
import numpy as np
import dask.bag as dbag
import dask.dataframe as ddf
def filter_data(fp, ix_col = 'index_here', val_col = 'some_value'):
dask_frame = ddf.read_table(fp)
# filter to only one column and index (like a series)
series = dask_frame[[ix_col, val_col]].set_index(ix_col)
# Rename it to be the filename / file_id
file_id = re.match("file_(.+)\.txt", fp)[1]
series.columns = [file_id]
return series
def get_dataframe(file_paths):
# Make a collection
dasks_bag = dbag.from_sequence(file_paths)
# Open the files as dask frame and filter each to series-like frames
filtered_dfs = dasks_bag.map(filter_data)
# Compute pandas dataframe on each within the list
filtered_dfs = filtered_dfs.compute()
# concatenate them together
df = ddf.concat(filtered_dfs, axis = 1)
# Compute on concatenated again, so it becomes pandas dataframe
return df.dropna(how = "all").compute()
# Just write some random files here
paths = ['file_120202021.txt', 'file_123.txt', 'file_12330.txt']
for fp in paths:
with open(fp, 'w') as f:
f.write('index_here\tsome_value\tother_cols\n')
for row in range(0,1000):
for val, other_col in np.random.rand(1, 2):
f.write(str(row)+'\t'+str(val)+'\t'+str(other_col)+'\n')
# Make a dataframe with dask
get_dataframe(paths)
Edit:
I have a small script here that shows the failure of dask:
The time required for dask on my machine is 1.87 seconds
while the time required for pandas is 0.29 seconds
Clearly, I am doing this wrong, as dask was specifically made for more rapid computation on dataframes.
import io
import re
import numpy as np
import pandas as pd
import dask.bag as dbag
import dask.dataframe as ddf
import time
def get_dask_dataframe(file_paths, ix_col = 'index_here', val_col = 'some_value'):
# Make a collection
dasks_bag = dbag.from_sequence(file_paths)
# read and filter to data of interest
dask_frames = ddf.read_table(file_paths, include_path_column = True)[[ix_col, val_col, 'path']]
# Make pandas dataframe
df = dask_frames.compute()
# Pivot since read_table puts path in one column
df = df.pivot_table(values = val_col, index = ix_col, columns = 'path')
return df.dropna(how = "all")
def get_pandas_dataframe(file_paths, ix_col = 'index_here', val_col = 'some_value'):
# Make a collection
l = []
for f in file_paths:
series = pd.read_csv(f, sep = '\t')[[ix_col, val_col]].set_index(ix_col)
# Rename it to be the filename / file_id
file_id = re.match("file_(.+)\.txt", f)[1]
series.columns = [file_id]
l += [series]
# concatenate them together
df = pd.concat(l, axis = 1)
return df.dropna(how = "all")
# Just write a whole bunch of random files
paths = ['file_'+str(i)+'.txt' for i in range(0, 100)]
for fp in paths:
with open(fp, 'w') as f:
f.write('index_here\tsome_value\tother_cols\n')
for row in range(0,1000):
for val, other_col in np.random.rand(1, 2):
f.write(str(row)+'\t'+str(val)+'\t'+str(other_col)+'\n')
t0 = time.time()
# Make a dataframe with dask
df1 = get_dask_dataframe(paths)
t1 = time.time()
print(t1-t0)
t0 = time.time()
# Make a dataframe with dask
df2 = get_pandas_dataframe(paths)
t1 = time.time()
print(t1-t0)

How to create dataframes from chunks

I have huge scv file(630 mln rows),and my computer can t read it in 1 dataframe(out of memory)(After it i wanna to teach model for each dataframe).I did 630 chunks, and wanna create dataframe from each chunk(It s will 630 dataframes). Cant find or undestand no one solution of this situation.Can someone support me pls. Mb i think wrong in general and someone can say smtng new opinion on this situation. Code:
import os
import pandas as pd
lol=0
def load_csv():
path="D:\\mml\\"
csv_path = os.path.join(path,"eartquaqe_train.csv")
return pd.read_csv(csv_path,sep=',',chunksize=1000000)
dannie = load_csv()
for chunk in dannie:
lol=lol+1
print(lol)
630
Use the pandas.read_csv() method and specify either the chunksize parameter or create an iterator over all you csv rows using skiprows like:
import pandas as pd
path = 'D:\...'
a = list(range(0,6300))
for line in range(0,6300-630,630):
df = pd.read_csv(path,skiprows=a[0:line]+a[line+630:])
print(df)
OR
import pandas as pd
path = 'D:\...'
df = pd.read_csv(path,chunksize=6300)
for chunk in df:
print(chunk)
Use -
for chunk in dannie:
chunk.to_csv('{}.csv'.format(lol))
lol+=1
Read here for more info

Read several CSV files on pandas do operations and append to csv file multiprocessing

my following code reads several csv files in one folder, filters according the value in a column, and then appends the resulting dataframe to a csv file. Given that there are about 410 files 130 MB each, this code currently takes about 30 min. I was wondering if there is a quick way to make it faster my using a multiprocessing library. Could you offer me some tips on how to get it started? thank you
import pandas as pd
import glob
path =r'C:\Users\\Documents\\'
allfiles = glob.glob(path + "*.csv")
with open('test.csv','w') as f:
for i,file in enumerate(allfiles):
df = pd.read_csv(file,index_col=None, header=0)
df.sort_values(['A','B','C'], ascending = True, inplace = True)
df['D'] = df.groupby(['A','B'])['C'].fillna(method = 'ffill')
df[(df['D'] == 1) | (df['D'] == 0)].to_csv(f, header = False)
print i
print "Done"

Import multiple CSV files into pandas and concatenate into one DataFrame

I would like to read several CSV files from a directory into pandas and concatenate them into one big DataFrame. I have not been able to figure it out though. Here is what I have so far:
import glob
import pandas as pd
# Get data file names
path = r'C:\DRO\DCL_rawdata_files'
filenames = glob.glob(path + "/*.csv")
dfs = []
for filename in filenames:
dfs.append(pd.read_csv(filename))
# Concatenate all data into one DataFrame
big_frame = pd.concat(dfs, ignore_index=True)
I guess I need some help within the for loop?
See pandas: IO tools for all of the available .read_ methods.
Try the following code if all of the CSV files have the same columns.
I have added header=0, so that after reading the CSV file's first row, it can be assigned as the column names.
import pandas as pd
import glob
import os
path = r'C:\DRO\DCL_rawdata_files' # use your path
all_files = glob.glob(os.path.join(path , "/*.csv"))
li = []
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True)
Or, with attribution to a comment from Sid.
all_files = glob.glob(os.path.join(path, "*.csv"))
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
It's often necessary to identify each sample of data, which can be accomplished by adding a new column to the dataframe.
pathlib from the standard library will be used for this example. It treats paths as objects with methods, instead of strings to be sliced.
Imports and Setup
from pathlib import Path
import pandas as pd
import numpy as np
path = r'C:\DRO\DCL_rawdata_files' # or unix / linux / mac path
# Get the files from the path provided in the OP
files = Path(path).glob('*.csv') # .rglob to get subdirectories
Option 1:
Add a new column with the file name
dfs = list()
for f in files:
data = pd.read_csv(f)
# .stem is method for pathlib objects to get the filename w/o the extension
data['file'] = f.stem
dfs.append(data)
df = pd.concat(dfs, ignore_index=True)
Option 2:
Add a new column with a generic name using enumerate
dfs = list()
for i, f in enumerate(files):
data = pd.read_csv(f)
data['file'] = f'File {i}'
dfs.append(data)
df = pd.concat(dfs, ignore_index=True)
Option 3:
Create the dataframes with a list comprehension, and then use np.repeat to add a new column.
[f'S{i}' for i in range(len(dfs))] creates a list of strings to name each dataframe.
[len(df) for df in dfs] creates a list of lengths
Attribution for this option goes to this plotting answer.
# Read the files into dataframes
dfs = [pd.read_csv(f) for f in files]
# Combine the list of dataframes
df = pd.concat(dfs, ignore_index=True)
# Add a new column
df['Source'] = np.repeat([f'S{i}' for i in range(len(dfs))], [len(df) for df in dfs])
Option 4:
One liners using .assign to create the new column, with attribution to a comment from C8H10N4O2
df = pd.concat((pd.read_csv(f).assign(filename=f.stem) for f in files), ignore_index=True)
or
df = pd.concat((pd.read_csv(f).assign(Source=f'S{i}') for i, f in enumerate(files)), ignore_index=True)
An alternative to darindaCoder's answer:
path = r'C:\DRO\DCL_rawdata_files' # use your path
all_files = glob.glob(os.path.join(path, "*.csv")) # advisable to use os.path.join as this makes concatenation OS independent
df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
# doesn't create a list, nor does it append to one
import glob
import os
import pandas as pd
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('', "my_files*.csv"))))
Almost all of the answers here are either unnecessarily complex (glob pattern matching) or rely on additional third-party libraries. You can do this in two lines using everything Pandas and Python (all versions) already have built in.
For a few files - one-liner
df = pd.concat(map(pd.read_csv, ['d1.csv', 'd2.csv','d3.csv']))
For many files
import os
filepaths = [f for f in os.listdir(".") if f.endswith('.csv')]
df = pd.concat(map(pd.read_csv, filepaths))
For No Headers
If you have specific things you want to change with pd.read_csv (i.e., no headers) you can make a separate function and call that with your map:
def f(i):
return pd.read_csv(i, header=None)
df = pd.concat(map(f, filepaths))
This pandas line, which sets the df, utilizes three things:
Python's map (function, iterable) sends to the function (the
pd.read_csv()) the iterable (our list) which is every CSV element
in filepaths).
Panda's read_csv() function reads in each CSV file as normal.
Panda's concat() brings all these under one df variable.
Easy and Fast
Import two or more CSV files without having to make a list of names.
import glob
import pandas as pd
df = pd.concat(map(pd.read_csv, glob.glob('data/*.csv')))
The Dask library can read a dataframe from multiple files:
>>> import dask.dataframe as dd
>>> df = dd.read_csv('data*.csv')
(Source: https://examples.dask.org/dataframes/01-data-access.html#Read-CSV-files)
The Dask dataframes implement a subset of the Pandas dataframe API. If all the data fits into memory, you can call df.compute() to convert the dataframe into a Pandas dataframe.
I googled my way into Gaurav Singh's answer.
However, as of late, I am finding it faster to do any manipulation using NumPy and then assigning it once to a dataframe rather than manipulating the dataframe itself on an iterative basis and it seems to work in this solution too.
I do sincerely want anyone hitting this page to consider this approach, but I don't want to attach this huge piece of code as a comment and making it less readable.
You can leverage NumPy to really speed up the dataframe concatenation.
import os
import glob
import pandas as pd
import numpy as np
path = "my_dir_full_path"
allFiles = glob.glob(os.path.join(path,"*.csv"))
np_array_list = []
for file_ in allFiles:
df = pd.read_csv(file_,index_col=None, header=0)
np_array_list.append(df.as_matrix())
comb_np_array = np.vstack(np_array_list)
big_frame = pd.DataFrame(comb_np_array)
big_frame.columns = ["col1", "col2"....]
Timing statistics:
total files :192
avg lines per file :8492
--approach 1 without NumPy -- 8.248656988143921 seconds ---
total records old :1630571
--approach 2 with NumPy -- 2.289292573928833 seconds ---
A one-liner using map, but if you'd like to specify additional arguments, you could do:
import pandas as pd
import glob
import functools
df = pd.concat(map(functools.partial(pd.read_csv, sep='|', compression=None),
glob.glob("data/*.csv")))
Note: map by itself does not let you supply additional arguments.
If you want to search recursively (Python 3.5 or above), you can do the following:
from glob import iglob
import pandas as pd
path = r'C:\user\your\path\**\*.csv'
all_rec = iglob(path, recursive=True)
dataframes = (pd.read_csv(f) for f in all_rec)
big_dataframe = pd.concat(dataframes, ignore_index=True)
Note that the three last lines can be expressed in one single line:
df = pd.concat((pd.read_csv(f) for f in iglob(path, recursive=True)), ignore_index=True)
You can find the documentation of ** here. Also, I used iglobinstead of glob, as it returns an iterator instead of a list.
EDIT: Multiplatform recursive function:
You can wrap the above into a multiplatform function (Linux, Windows, Mac), so you can do:
df = read_df_rec('C:\user\your\path', *.csv)
Here is the function:
from glob import iglob
from os.path import join
import pandas as pd
def read_df_rec(path, fn_regex=r'*.csv'):
return pd.concat((pd.read_csv(f) for f in iglob(
join(path, '**', fn_regex), recursive=True)), ignore_index=True)
Inspired from MrFun's answer:
import glob
import pandas as pd
list_of_csv_files = glob.glob(directory_path + '/*.csv')
list_of_csv_files.sort()
df = pd.concat(map(pd.read_csv, list_of_csv_files), ignore_index=True)
Notes:
By default, the list of files generated through glob.glob is not sorted. On the other hand, in many scenarios, it's required to be sorted e.g. one may want to analyze number of sensor-frame-drops v/s timestamp.
In pd.concat command, if ignore_index=True is not specified then it reserves the original indices from each dataframes (i.e. each individual CSV file in the list) and the main dataframe looks like
timestamp id valid_frame
0
1
2
.
.
.
0
1
2
.
.
.
With ignore_index=True, it looks like:
timestamp id valid_frame
0
1
2
.
.
.
108
109
.
.
.
IMO, this is helpful when one may want to manually create a histogram of number of frame drops v/s one minutes (or any other duration) bins and want to base the calculation on very first timestamp e.g.
begin_timestamp = df['timestamp'][0]
Without, ignore_index=True, df['timestamp'][0] generates the series containing very first timestamp from all the individual dataframes, it does not give just a value.
Another one-liner with list comprehension which allows to use arguments with read_csv.
df = pd.concat([pd.read_csv(f'dir/{f}') for f in os.listdir('dir') if f.endswith('.csv')])
Alternative using the pathlib library (often preferred over os.path).
This method avoids iterative use of pandas concat()/apped().
From the pandas documentation:
It is worth noting that concat() (and therefore append()) makes a full copy of the data, and that constantly reusing this function can create a significant performance hit. If you need to use the operation over several datasets, use a list comprehension.
import pandas as pd
from pathlib import Path
dir = Path("../relevant_directory")
df = (pd.read_csv(f) for f in dir.glob("*.csv"))
df = pd.concat(df)
If multiple CSV files are zipped, you may use zipfile to read all and concatenate as below:
import zipfile
import pandas as pd
ziptrain = zipfile.ZipFile('yourpath/yourfile.zip')
train = []
train = [ pd.read_csv(ziptrain.open(f)) for f in ziptrain.namelist() ]
df = pd.concat(train)
Based on Sid's good answer.
To identify issues of missing or unaligned columns
Before concatenating, you can load CSV files into an intermediate dictionary which gives access to each data set based on the file name (in the form dict_of_df['filename.csv']). Such a dictionary can help you identify issues with heterogeneous data formats, when column names are not aligned for example.
Import modules and locate file paths:
import os
import glob
import pandas
from collections import OrderedDict
path =r'C:\DRO\DCL_rawdata_files'
filenames = glob.glob(path + "/*.csv")
Note: OrderedDict is not necessary, but it'll keep the order of files which might be useful for analysis.
Load CSV files into a dictionary. Then concatenate:
dict_of_df = OrderedDict((f, pandas.read_csv(f)) for f in filenames)
pandas.concat(dict_of_df, sort=True)
Keys are file names f and values are the data frame content of CSV files.
Instead of using f as a dictionary key, you can also use os.path.basename(f) or other os.path methods to reduce the size of the key in the dictionary to only the smaller part that is relevant.
import os
os.system("awk '(NR == 1) || (FNR > 1)' file*.csv > merged.csv")
Where NR and FNR represent the number of the line being processed.
FNR is the current line within each file.
NR == 1 includes the first line of the first file (the header), while FNR > 1 skips the first line of each subsequent file.
In case of an unnamed column issue, use this code for merging multiple CSV files along the x-axis.
import glob
import os
import pandas as pd
merged_df = pd.concat([pd.read_csv(csv_file, index_col=0, header=0) for csv_file in glob.glob(
os.path.join("data/", "*.csv"))], axis=0, ignore_index=True)
merged_df.to_csv("merged.csv")
You can do it this way also:
import pandas as pd
import os
new_df = pd.DataFrame()
for r, d, f in os.walk(csv_folder_path):
for file in f:
complete_file_path = csv_folder_path+file
read_file = pd.read_csv(complete_file_path)
new_df = new_df.append(read_file, ignore_index=True)
new_df.shape
Consider using convtools library, which provides lots of data processing primitives and generates simple ad hoc code under the hood.
It is not supposed to be faster than pandas/polars, but sometimes it can be.
e.g. you could concat csv files into one for further reuse - here's the code:
import glob
from convtools import conversion as c
from convtools.contrib.tables import Table
import pandas as pd
def test_pandas():
df = pd.concat(
(
pd.read_csv(filename, index_col=None, header=0)
for filename in glob.glob("tmp/*.csv")
),
axis=0,
ignore_index=True,
)
df.to_csv("out.csv", index=False)
# took 20.9 s
def test_convtools():
table = None
for filename in glob.glob("tmp/*.csv"):
table_ = Table.from_csv(filename, header=False)
if table is None:
table = table_
else:
table = table.chain(table_)
table.into_csv("out_convtools.csv", include_header=False)
# took 15.8 s
Of course if you just want to obtain a dataframe without writing a concatenated file, it will take 4.63 s and 10.9 s correspondingly (pandas is faster here because it doesn't need to zip columns for writing it back).
import pandas as pd
import glob
path = r'C:\DRO\DCL_rawdata_files' # use your path
file_path_list = glob.glob(path + "/*.csv")
file_iter = iter(file_path_list)
list_df_csv = []
list_df_csv.append(pd.read_csv(next(file_iter)))
for file in file_iter:
lsit_df_csv.append(pd.read_csv(file, header=0))
df = pd.concat(lsit_df_csv, ignore_index=True)
This is how you can do it using Colaboratory on Google Drive:
import pandas as pd
import glob
path = r'/content/drive/My Drive/data/actual/comments_only' # Use your path
all_files = glob.glob(path + "/*.csv")
li = []
for filename in all_files:
df = pd.read_csv(filename, index_col=None, header=0)
li.append(df)
frame = pd.concat(li, axis=0, ignore_index=True,sort=True)
frame.to_csv('/content/drive/onefile.csv')

Categories