Speed up my data reading in python?

My current code looks like this:
import pandas as pd
import csv
import matplotlib.pyplot as plt
def data_reader(filename, rowname):
    with open(filename, newline='') as fp:
        yield from (row[1:] for row in csv.reader(fp, skipinitialspace=True)
                    if row[0] == rowname)
File = 'data.csv'
ASA = pd.DataFrame.from_records(data_reader(File, 'ASA'))
GDS = pd.DataFrame.from_records(data_reader(File, 'GDS'))
SCD = pd.DataFrame.from_records(data_reader(File, 'SCD'))
ASF = pd.DataFrame.from_records(data_reader(File, 'ASF'))
ADC = pd.DataFrame.from_records(data_reader(File, 'ADC'))
DFS = pd.DataFrame.from_records(data_reader(File, 'DFS'))
DCS = pd.DataFrame.from_records(data_reader(File, 'DCS'))
DFDS = pd.DataFrame.from_records(data_reader(File, 'DFDS'))
It is reading data that looks like this:
legend, useless data, useless data, DCS, useless data, sped, air, xds, sas, dac
legend, useless data, useless data, GDS, useless data, sped, air
Legend, useless data, useless data, ASA, useless data, sped, air, gnd
ASA, 231, 123, 12
GDS, 12, 1
DCS, 13, 12, 123, 12, 4
ASA, 123, 132, 12
and so on for couple of millions....
I am trying to write an IF statement that looks something like this:
pd.DataFrame.from_records(data_reader(
if rowname = 'ASA'
ASA.append(row)
elif rowname = 'GDS'
GDS.append(row)
and so on. Would this be faster? Currently it takes about 1 minute to run my code and plot one graph, and I am sure it will take much longer when I have about 10-15 plots to do. I have tried different ways of writing the if/elif statement, but I am not having any luck doing so.

Reading from disk is the bottleneck here, so we should try to avoid reading the file more than once. If you have enough memory to parse the entire CSV into a dict of lists, then you could use
import csv
import collections
import pandas as pd
def data_reader(filename):
    dfs = collections.defaultdict(list)
    columns = dict()
    with open(filename, newline='') as fp:
        for row in csv.reader(fp, skipinitialspace=True):
            key = row[0].upper()
            if key == 'LEGEND':
                name = row[3]
                columns[name] = row
            else:
                dfs[key].append(row[1:])
    for key in dfs:
        num_cols = max(len(row) for row in dfs[key])
        dfs[key] = pd.DataFrame(dfs[key], columns=columns[key][-num_cols:])
    return dfs
filename = 'data.csv'
dfs = data_reader(filename)
for key in dfs:
print(dfs[key])
The loop
for row in csv.reader(fp, skipinitialspace=True):
    key = row[0].upper()
    ...
    dfs[key].append(row[1:])
loads the CSV into a dict, dfs. The dict keys are strings like 'ASA',
'GDS' and 'DCS'. The dict values are lists of lists.
The other loop
for key in dfs:
    ...
    dfs[key] = pd.DataFrame(dfs[key], columns=columns[key][-num_cols:])
converts the lists of lists to DataFrames.
The if-statement:
if key == 'LEGEND':
    name = row[3]
    columns[name] = row
else:
    dfs[key].append(row[1:])
records the row in the columns dict if the row begins with LEGEND (with or without capitalization), or otherwise records the row in the dfs dict.
Later in the for-loop:
for key in dfs:
    num_cols = max(len(row) for row in dfs[key])
    dfs[key] = pd.DataFrame(dfs[key], columns=columns[key][-num_cols:])
The keys are strings such as 'ASA'. For each key, the number of columns is
obtained by finding the maximum length of the rows in dfs[key].
columns[key] returns the corresponding legend row for key.
columns[key][-num_cols:] returns the last num_cols values from that row.
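For example, with the ASA legend row from the sample data, the slice keeps just the trailing column names (a small illustration, not part of the original answer):
legend_row = ['Legend', 'useless data', 'useless data', 'ASA', 'useless data', 'sped', 'air', 'gnd']
num_cols = 3                      # ASA data rows have 3 values after the key
print(legend_row[-num_cols:])     # ['sped', 'air', 'gnd']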
The result returned by data_reader is a dict of DataFrames:
In [211]: dfs['ASA']
Out[211]:
sped air gnd
0 231 123 12
1 123 132 12
In [212]: dfs['GDS']
Out[212]:
sped air
0 12 1
In [213]: dfs['DCS']
Out[213]:
sped air xds sas dac
0 13 12 123 12 4
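Since the original code imports matplotlib and the end goal is plotting, a minimal sketch of using one of these DataFrames might look like the following (the astype(float) conversion and the choice of dfs['ASA'] are illustrative assumptions, since the CSV values are read as strings):
import matplotlib.pyplot as plt
asa = dfs['ASA'].astype(float)   # values were read as strings, so convert before plotting
asa.plot()                       # one line per column: sped, air, gnd
plt.show()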

You should be able to do something like this:
df = pd.read_csv('data.csv', header=None)
ASA = df[df[0] == "ASA"]
# etc ...
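A related one-pass variation (just a sketch; it assumes a known maximum number of columns so that the ragged rows are padded with NaN, and names=range(10) below is an illustrative guess) is to split the frame by its first column with groupby:
import pandas as pd
# pad short rows with NaN by naming a fixed maximum number of columns (10 is a guess)
df = pd.read_csv('data.csv', header=None, names=range(10), skipinitialspace=True)
# one DataFrame per value in the first column, e.g. dfs['ASA'];
# note the legend rows would show up as their own 'legend'/'Legend' groups
dfs = {key: grp.drop(columns=0) for key, grp in df.groupby(0)}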

Related

Reading in tab-delimited txt file in chunks?

I have a txt file, and here is a snippet of the first few lines:
C A10231 A1 171|171 HER
C B23098 A1 171|171 HEF
C A03295 A2 171|171 HAF
I want to create a running list of every time the third column reads something other than "A1", and also keep track of how many times "A1" appears. Is there a way to import this file into a pandas df without causing a memory error?
If not, how can I process the txt file using the following rules:
Keep a running count of every time the third column reads "A1"
If the third column is not "A1", append the value to a list.
Find the amount of rows in the txt file
I essentially want to create three outputs. One output is the count of A1, the other is a list of everything that isn't A1, non_A1 = ['A2','B3','B4','V6'...], and the last is the total number of rows.
All you need to do is process each line as you read it; no need to store anything more than your accumulated results and the current line in memory at any given time, and certainly no need to build a full dataframe from the contents of the file.
row_count = 0
a1_count = 0
non_a1 = []
with open("file.tsv") as f:
for line in f:
row = line.strip().split('\t')
row_count += 1
if row[2] == 'A1':
a1_count += 1
else:
non_a1.append(row[2])
Since you tagged your question with Pandas, you can use:
import pandas as pd

count_A1 = 0
non_A1 = set()
num_rows = 0
for chunk in pd.read_csv('/home/damien/data.txt', sep='\t', usecols=[2], header=None, chunksize=1):
    count_A1 += chunk[2].eq('A1').sum()
    non_A1 |= set(chunk.loc[chunk[2].ne('A1'), 2].unique().tolist())
    num_rows += chunk.shape[0]
Output:
>>> count_A1
2
>>> list(non_A1)
['A2']
>>> num_rows
3
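As a side note on the design: chunksize=1 makes one pass through the loop per row, which will be slow for millions of rows. A larger chunk does the same work in far fewer iterations (the 100000 below is just an illustrative value):
for chunk in pd.read_csv('/home/damien/data.txt', sep='\t', usecols=[2], header=None, chunksize=100000):
    count_A1 += chunk[2].eq('A1').sum()
    non_A1 |= set(chunk.loc[chunk[2].ne('A1'), 2].unique().tolist())
    num_rows += chunk.shape[0]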
Using pandas for this trivial task is overkill
a1_count = 0
line_count = 0
others = []
with open('foo.tsv') as tsv:
    for line in tsv:
        if (ax := line.split()[2]) == 'A1':
            a1_count += 1
        else:
            others.append(ax)
        line_count += 1
In a similar vein to @Corralien's answer, but using the categorical datatype, which gives memory savings for large amounts of data that fall into a limited number of categories:
import pandas as pd
# Create some test data
fname = "reading_tsv_in_chunks.tsv"
with open("reading_tsv_in_chunks.tsv", "w") as fid:
    for i in range(1000):
        fid.write("C\tA10231\tA1\t171|171\tHER\nC\tB23098\tA1\t171|171\tHEF\nC\tA03295\tA2\t171|171\tHAF\nC\tA02225\tA3\t171|171\tHAX\nC\tA012325\tA4\t171|171\tHAY\n")
# Read as categorical
df = pd.read_csv(fname, sep="\t", header=None, names=["category",], usecols=[2,], dtype="category")
print(f"Contents of df:\n{df.describe()}\n")
print(f"Memory usage of with categorical dtype:\n{df.memory_usage()}\n\n")
# Read as non-categorical
df2 = pd.read_csv(fname, sep="\t", header=None, names=["category",], usecols=[2,])
print(f"Contents of df2:\n{df2.describe()}\n")
print(f"Memory usage of WITHOUT categorical dtype:\n{df2.memory_usage()}\n\n")
# Process as necessary e.g.
a1_count = sum([ len(values) for category, values in df.groupby("category")["category"] if category=="A1"])
non_a1_count = sum([ len(values) for category, values in df.groupby("category")["category"] if category!="A1"])
print(f"A1 count: {a1_count}\n")
print(f"Non-A1 count: {non_a1_count}")

Reading csv file formatted as dictionary into pandas

I have a csv file containing sensor data where one row is of the following format
1616580317.0733, {'Roll': 0.563820598084682, 'Pitch': 0.29817540218781163, 'Yaw': 60.18415650363684, 'gyroX': 0.006687641609460116, 'gyroY': -0.012394784949719908, 'gyroZ': -0.0027120113372802734, 'accX': -0.12778355181217196, 'accY': 0.24647256731987, 'accZ': 9.763526916503906}
Where the first column is a timestamp and the remainder is a dictionary like object containing various measured quantities.
I want to read this into a pandas array with the columns
["Timestamp","Roll","Pitch","Yaw","gyroX","gyroY","gyroZ","accX","accY","accZ"]. What would be an efficient way of doing this? The file is 600MB so it's not a trivial number of lines which need to be parsed.
I'm not sure where you are getting the seconds column from.
The code below parses each row into a timestamp and dict. Then adds the timestamp to the dictionary that will eventually become a row in the dataframe.
import json
import pandas as pd
def read_file(filename):
    chunk_size = 20000
    entries = []
    counter = 0
    df = pd.DataFrame()
    with open(filename, "r") as fh:
        for line in fh:
            timestamp, data_dict = line.split(",", 1)
            data_dict = json.loads(data_dict.replace("'", '"'))
            data_dict["timestamp"] = float(timestamp)
            entries.append(data_dict)
            counter += 1
            if counter == chunk_size:
                df = df.append(entries, ignore_index=True)
                entries = []
                counter = 0
    if counter != 0:
        df = df.append(entries, ignore_index=True)
    return df
read_file("sample.txt")
I think you should convert your csv file to json format and then look at this site on how to transform the dictionary into a pandas dataframe: https://www.delftstack.com/fr/howto/python-pandas/how-to-convert-python-dictionary-to-pandas-dataframe/
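If you go that route, a minimal sketch of the parsing step (assuming every line is a timestamp followed by a Python-style dict literal, as in the sample row) could use ast.literal_eval instead of a json round-trip:
import ast
import pandas as pd

records = []
with open("sample.txt") as fh:   # "sample.txt" as in the answer above
    for line in fh:
        timestamp, rest = line.split(",", 1)
        record = ast.literal_eval(rest.strip())  # parse the {'Roll': ..., 'Pitch': ...} part
        record["Timestamp"] = float(timestamp)
        records.append(record)
df = pd.DataFrame(records)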

How can we load a .txt file with non aligned data into data frame in Python

The data file (df) in .txt format is shown below, where a few of the fields are missing for some records. The missing fields should be kept blank in the respective columns.
For example - data file in txt format is
1,name=Messi,car=ford,Price=234,Bike=Harley
2,name=Cavani,car=mazda,price=58,Bike=Ducatti
3,name=Dembele,car=toyota,Bike=Yamaha
4,name=kevin,car=Ford,price=989
5,name=Aguero,Bike=Ducatti
6,name=nadal,car=Ferrari,Bike=Harley
I want the file to be loaded into Python in the format below.
Required output with respective column names: (output image omitted)
I want the column names to be Number, CARNAME, PRICE, BIKENAME, and the respective data to be populated in a DataFrame under those column names. The empty values should be kept blank under the respective column fields.
I am unable to post the image of the output or type the output here due to formatting issues. As I am new to Stack Overflow, I don't have enough reputation to post the image.
Please note that my dataset has millions of records.
There is only a slim chance that an efficient library dedicated to processing such a non-standard and non-uniform file format exists. Therefore I will just parse this file manually, line by line, into a list of dicts, in which the missing keys (columns) are taken care of by the DataFrame() constructor.
Code:
import pandas as pd

path_to_file = "/mnt/ramdisk/in.txt"
ls_dic = []
with open(path_to_file) as f:
    for line in f:
        ls = line.split(",")
        dic = {}
        dic["Number"] = ls[0]
        for k_v in ls[1:]:
            k, v = k_v.split("=")
            dic[k.capitalize()] = v.strip()
        ls_dic.append(dic)
df = pd.DataFrame(ls_dic)
Result:
print(df)
Number Name Car Price Bike
0 1 Messi ford 234 Harley
1 2 Cavani mazda 58 Ducatti
2 3 Dembele toyota NaN Yamaha
3 4 kevin Ford 989 NaN
4 5 Aguero NaN NaN Ducatti
5 6 nadal Ferrari NaN Harley
You could write the data to an intermediate CSV. Add some file modification time checks and you get the conversion only when your data text file changes.
import io
import csv
import pandas as pd
from pathlib import Path
header = ["Number", "CARNAME", "PRICE", "BIKENAME"]
key_to_index = {"car": 1, "price": 2, "bike": 3}

def build_car_info_csv(in_fileobj, out_fileobj):
    reader = csv.reader(in_fileobj)
    writer = csv.writer(out_fileobj)
    writer.writerow(header)  # write the header so pandas picks up the column names
    for row in reader:
        outrow = [''] * len(header)
        outrow[0] = row.pop(0)
        for cell in row:
            key, val = cell.split("=")
            try:
                # lower-case the key so "Price"/"price" and "Bike"/"bike" both match
                outrow[key_to_index[key.lower()]] = val
            except KeyError:
                # ignore unwanted keys
                pass
        writer.writerow(outrow)

def read_car_info_df(filename):
    filename = Path(filename)
    csv_filename = filename.with_suffix(".csv")
    mtime = filename.stat().st_mtime
    csv_mtime = csv_filename.stat().st_mtime if csv_filename.is_file() else 0
    if mtime > csv_mtime:
        with filename.open(newline="") as infile, \
                csv_filename.open("w", newline="") as outfile:
            build_car_info_csv(infile, outfile)
    return pd.read_csv(csv_filename)
TEST
open("mytest.txt", "w").write("""1,name=Messi,car=ford,Price=234,Bike=Harley
2,name=Cavani,car=mazda,price=58,Bike=Ducatti
3,name=Dembele,car=toyota,Bike=Yamaha
4,name=kevin,car=Ford,price=989
5,name=Aguero,Bike=Ducatti
6,name=nadal,car=Ferrari,Bike=Harley""")
df = read_car_info_df("mytest.txt")
print(df)

Comparing 2 lists and return mismatches

I'm struggling with 2 csv files which I have imported.
the csv files look like this:
csv1
planet,diameter,discovered,color
sceptri,33.41685587,28-11-1611 05:15, black
...
csv2
planet,diameter,discovered,color
sceptri,33.41685587,28-11-1611 05:15, blue
...
In both csv files there are the same planets, but in a different order and sometimes with different values (a mismatch).
The data for each planet (diameter, discovered and color) has been entered independently. I want to cross-check the two sheets and find all the fields that are mismatched, and then generate a new file that contains one line per error with a description of the error.
for example:
sceptri: mismatch (black/blue)
here is my code so far
with open('planets1.csv') as csvfile:
    a = csv.reader(csvfile, delimiter=',')
    data_a = list(a)
    for row in a:
        print(row)

with open('planets2.csv') as csvfile:
    b = csv.reader(csvfile, delimiter=',')
    data_b = list(b)
    for row in b:
        print(row)

print(data_a)
print(data_b)
c = [data_a]
d = [data_b]
thank you in advance for your help!
Assuming the planet names are correct in both files, here is my proposal:
# Working with lists of lists, which could be obtained from reading the csv files:
csv1 = [["sceptri", 33.41685587, "28-11-1611 05:15", "black"],
        ["foo", 35.41685587, "29-11-1611 05:15", "black"],
        ["bar", 38.7, "29-11-1611 05:15", "black"]]
csv2 = [["foo", 35.41685587, "29-11-1611 05:15", "black"],
        ["bar", 38.17, "29-11-1611 05:15", "black"],
        ["sceptri", 33.41685587, "28-11-1611 05:15", "blue"]]
# A list to contain the errors:
new_file = []
# A dict to check if a planet has already been processed:
a_dict = {}
# Let's read all planet data:
for planet in csv1 + csv2:
    # Check if the planet is already a key in a_dict:
    if planet[0] in a_dict:
        # Yes, sir, need to check discrepancies.
        if a_dict[planet[0]] != planet[1:]:
            # We have some differences in some values.
            # Put both sets of values in python sets to get the differences:
            error = set(planet[1:]) ^ set(a_dict[planet[0]])
            # Append [planet_name, diff_param1, diff_param2] to new_file:
            new_file.append([planet[0]] + list(error))
    else:
        # The planet name becomes a dict key, the other params are its value:
        a_dict[planet[0]] = planet[1:]
print(new_file)
# [['bar', 38.17, 38.7], ['sceptri', 'black', 'blue']]
The list new_file may be saved as a new file; see Writing a list to file.
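A minimal sketch of that write-out step, roughly in the one-line-per-error format the question asks for (the exact formatting here is an illustrative choice):
with open('mismatches.txt', 'w') as out:
    for planet, *diff in new_file:
        out.write(f"{planet}: mismatch ({'/'.join(str(v) for v in diff)})\n")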
I'd suggest using Pandas for a task like this.
Firstly, you'll need to read the csv contents into dataframe objects. This can be done as follows:
import pandas as pd
# make a dataframe from each csv file
df1 = pd.read_csv('planets1.csv')
df2 = pd.read_csv('planets2.csv')
You may want to declare names for each column if your CSV file doesn't have them.
colnames = ['col1', 'col2', ..., 'coln']
df1 = pd.read_csv('planets1.csv', names=colnames, index_col=0)
df2 = pd.read_csv('planets2.csv', names=colnames, index_col=0)
# use index_col=0 if csv already has an index column
For the sake of reproducible code, I will define dataframe objects without a csv below:
import pandas as pd
# example column names
colnames = ['A','B','C']
# example dataframes
df1 = pd.DataFrame([[0,3,6], [4,5,6], [3,2,5]], columns=colnames)
df2 = pd.DataFrame([[1,3,1], [4,3,6], [3,6,5]], columns=colnames)
Note that df1 looks like this:
A B C
---------------
0 0 3 6
1 4 5 6
2 3 2 5
And df2 looks like this:
A B C
---------------
0 1 3 1
1 4 3 6
2 3 6 5
The following code compares the dataframes, concatenates the comparison into a new dataframe, and then saves the result to a CSV:
# define the condition you want to check for (i.e., mismatches)
mask = (df1 != df2)
# df1[mask], df2[mask] will replace matched values with NaN (Not a Number), and leave mismatches
# dropna(how='all') will remove rows filled entirely with NaNs
errors_1 = df1[mask].dropna(how='all')
errors_2 = df2[mask].dropna(how='all')
# add labels to column names
errors_1.columns += '_1' # for planets 1
errors_2.columns += '_2' # for planets 2
# you can now combine horizontally into one big dataframe
errors = pd.concat([errors_1,errors_2],axis=1)
# if you want, reorder the columns of `errors` so compared columns are next to each other
errors = errors.reindex(sorted(errors.columns), axis=1)
# if you don't like the clutter of NaN values, you can replace them with fillna()
errors = errors.fillna('_')
# save to a csv
errors.to_csv('mismatches.csv')
The final result looks something like this:
A_1 A_2 B_1 B_2 C_1 C_2
-----------------------------
0 0 1 _ _ 6 1
1 _ _ 5 3 _ _
2 _ _ 2 6 _ _
Hope this helps.
This kind of problem can be solved by sorting the rows from the csv files, and then comparing the corresponding rows to see if there are differences.
This approach uses a functional style to perform the comparisons and will compare any number of csv files.
It assumes that the csvs contain the same number of records, and that the columns are in the same order.
import contextlib
import csv

def compare_files(readers):
    colnames = [next(reader) for reader in readers][0]
    sorted_readers = [sorted(r) for r in readers]
    for gen in [compare_rows(colnames, rows) for rows in zip(*sorted_readers)]:
        yield from gen

def compare_rows(colnames, rows):
    col_iter = zip(*rows)
    # Be sure we're comparing the same planets.
    planets = set(next(col_iter))
    assert len(planets) == 1, planets
    planet = planets.pop()
    # Skip the planet column name, since that column was already consumed from col_iter above.
    for (colname, *vals) in zip(colnames[1:], col_iter):
        if len(set(*vals)) > 1:
            yield f"{planet} mismatch {colname} ({'/'.join(*vals)})"

def main(outfile, *infiles):
    with contextlib.ExitStack() as stack:
        csvs = [stack.enter_context(open(fname)) for fname in infiles]
        readers = [csv.reader(f) for f in csvs]
        with open(outfile, 'w') as out:
            for result in compare_files(readers):
                out.write(result + '\n')

if __name__ == "__main__":
    main('mismatches.txt', 'planets1.csv', 'planets2.csv')

Combine columns from several CSV files into a single file

I have a bunch of CSV files (only two in the example below). Each CSV file has 6 columns. I want to go into each CSV file, copy the first two columns and add them as new columns to an existing CSV file.
Thus far I have:
import csv

f = open('combined.csv')
data = [item for item in csv.reader(f)]
f.close()

for x in range(1,3): #example has 2 csv files, this will be automated
    n = 0
    while n < 2:
        f = open(str(x) + ".csv")
        new_column = [item[n] for item in csv.reader(f)]
        f.close()
        #print d
        new_data = []
        for i, item in enumerate(data):
            try:
                item.append(new_column[i])
                print i
            except IndexError, e:
                item.append("")
            new_data.append(item)
        f = open('combined.csv', 'w')
        csv.writer(f).writerows(new_data)
        f.close()
        n = n + 1
This works, it is not pretty, but it works.
However, I have three minor annoyances:
I open each CSV file twice (once for each column), that is hardly elegant
When I print the combined.csv file, it prints an empty row following each row?
I have to provide a combined.csv file that has at least as many rows in it as the largest file I may have. Since I do not really know what that number may be, that kinda sucks
As always, any help is much appreciated!!
As requested:
1.csv looks like (mock data)
1,a
2,b
3,c
4,d
2.csv looks like
5,e
6,f
7,g
8,h
9,i
the combined.csv file should look like
1,a,5,e
2,b,6,f
3,c,7,g
4,d,8,h
,,9,i
import csv
import itertools as IT

filenames = ['1.csv', '2.csv']
handles = [open(filename, 'rb') for filename in filenames]
readers = [csv.reader(f, delimiter=',') for f in handles]

with open('combined.csv', 'wb') as h:
    writer = csv.writer(h, delimiter=',', lineterminator='\n', )
    for rows in IT.izip_longest(*readers, fillvalue=['']*2):
        combined_row = []
        for row in rows:
            row = row[:2]  # select the columns you want
            if len(row) == 2:
                combined_row.extend(row)
            else:
                combined_row.extend(['']*2)  # this extends two empty columns
        writer.writerow(combined_row)

for f in handles:
    f.close()
The line for rows in IT.izip_longest(*readers, fillvalue=['']*2):
can be understood with an example:
In [1]: import itertools as IT
In [2]: readers = [(1,2,3), ('a','b','c','d'), (10,20,30,40)]
In [3]: list(IT.izip_longest(readers[0], readers[1], readers[2]))
Out[3]: [(1, 'a', 10), (2, 'b', 20), (3, 'c', 30), (None, 'd', 40)]
As you can see, IT.izip_longest behaves very much like zip, except that it does not stop until the longest iterable is consumed. It fills in missing items with None by default.
Now what happens if there were more than 3 items in readers?
We would want to write
list(IT.izip_longest(readers[0], readers[1], readers[2], ...))
but that's laborious and if we did not know len(readers) in advance, we wouldn't even be able to replace the ellipsis (...) with something explicit.
Python has a solution for this: the star (aka argument unpacking) syntax:
In [4]: list(IT.izip_longest(*readers))
Out[4]: [(1, 'a', 10), (2, 'b', 20), (3, 'c', 30), (None, 'd', 40)]
Notice the result Out[4] is identical to the result Out[3].
The *readers tells Python to unpack the items in readers and send them along as individual arguments to IT.izip_longest.
This is how Python allows us to send an arbitrary number of arguments to a function.
These days it seems almost obligatory for someone to give a pandas-based solution to any data processing problem in Python. So here's mine:
import pandas as pd
to_merge = ['{}.csv'.format(i) for i in range(4)]
dfs = []
for filename in to_merge:
    # read the csv, making sure the first two columns are str
    df = pd.read_csv(filename, header=None, converters={0: str, 1: str})
    # throw away all but the first two columns
    df = df.ix[:,:1]
    # change the column names so they won't collide during concatenation
    df.columns = [filename + str(cname) for cname in df.columns]
    dfs.append(df)
# concatenate them horizontally
merged = pd.concat(dfs,axis=1)
# write it out
merged.to_csv("merged.csv", header=None, index=None)
which for the files
~/coding/pand/merge$ cat 0.csv
0,a,6,5,3,7
~/coding/pand/merge$ cat 1.csv
1,b,7,6,7,0
2,c,0,1,8,7
3,d,6,8,4,5
4,e,8,4,2,4
~/coding/pand/merge$ cat 2.csv
5,f,6,2,9,1
6,g,0,3,2,7
7,h,6,5,1,9
~/coding/pand/merge$ cat 3.csv
8,i,9,1,7,1
9,j,0,9,3,9
gives
In [21]: !cat merged.csv
0,a,1,b,5,f,8,i
,,2,c,6,g,9,j
,,3,d,7,h,,
,,4,e,,,,
In [22]: pd.read_csv("merged.csv", header=None)
Out[22]:
0 1 2 3 4 5 6 7
0 0 a 1 b 5 f 8 i
1 NaN NaN 2 c 6 g 9 j
2 NaN NaN 3 d 7 h NaN NaN
3 NaN NaN 4 e NaN NaN NaN NaN
which I think is the right alignment.
Here is a program I wrote to solve your problem. It makes a class that holds the information about each CSV file to read, including which columns you want from it. Then there is simply a list of CSV files to read, and a line is read from each.
Since you said it needs to keep returning rows until all the input files are read, it returns dummy values for input files that have reached the end. It keeps reading rows until all input files are done.
Also, this program only needs to hold one row at a time in memory. So it could process even large CSV files without needing much memory.
Originally I had a dummy value of -1 for missing data. Now I see you added an example and you just want no value. I've changed the program from using -1 to using an empty string when there is no data.
One of the design goals was to make this extendable. Right now you need the first two columns, but what if you later need columns 0, 3, and 7 from one of the files? So each file has a list with the columns to take.
I didn't actually write the code to rename the output file to the original filename but that is easy to add.
Ideally this whole thing would be wrapped up into a class, where you can iterate a class instance and get back one row put together using columns from all input files. I didn't take the extra time to do that, but if you will be using this over the long term you might want to do that. Also, I never bothered to close any of the input files, since I figure the program will end after we write the output file and everything will close then; but ideally we should close all files after we use them!
import csv
fname_in = "combined.csv"
fname_out = "combined.tmp"
lst_other_fnames = [str(x) + ".csv" for x in range(1, 3)]
no_data = ''
def _no_data_list(columns):
    return [no_data for _ in columns]

class DataCsvFile(object):
    def __init__(self, fname, columns=None):
        self.fname = fname
        self.f = open(fname)
        self.reader = csv.reader(self.f)
        self.columns = columns
        self.done = False

    def next_columns(self):
        if self.done:
            return _no_data_list(self.columns)
        try:
            item = next(self.reader)
        except StopIteration:
            self.done = True
            return _no_data_list(self.columns)
        return [item[i] for i in self.columns]

# want all columns from original file
data_csv_files = [DataCsvFile(fname_in, range(5))]
# build list of filenames and columns: want first two columns from each
data_csv_files.extend(DataCsvFile(fname, range(2)) for fname in lst_other_fnames)

with open(fname_out, "w") as out_f:
    writer = csv.writer(out_f)
    while True:
        values = []
        for df in data_csv_files:
            columns = df.next_columns()
            values.extend(columns)
        if not all(df.done for df in data_csv_files):
            writer.writerow(values)
        else:
            break
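As the answer notes, the input files opened by DataCsvFile are never explicitly closed; a small follow-up (using the objects defined above) could close them once the output is written:
# close the input files now that the output file has been written
for df in data_csv_files:
    df.f.close()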
Here's an example (I'm using string io instead of files for simplicity, but that's not essential):
a = u"""
1,a
2,b
3,c
4,d
"""
b = u"""
5,e
6,f
7,g
8,h
9,i
"""
c = u"""
11,x
12,y
13,z
"""
import io, csv, itertools
data = []
expand = lambda it, size: it + [[''] * len(it[0])] * size
for f in [a, b, c]:
    with io.StringIO(f.strip()) as fp:
        d = list(csv.reader(fp))
        t = len(d) - len(data)
        data = d if not data else [
            x + y for x, y in itertools.izip_longest(
                expand(data, t), expand(d, -t))]

for r in data:
    print ','.join(r)
# 1,a,5,e,11,x
# 2,b,6,f,12,y
# 3,c,7,g,13,z
# 4,d,8,h,,
# ,,9,i,,
with real files (named 1.csv, 2.csv etc) the main loop will look like this:
for n in range(...):
    with open(str(n) + '.csv') as fp:
        d = list(csv.reader(fp))
        t = len(d) - len(data)
        data = d if not data else [
            x + y for x, y in itertools.izip_longest(
                expand(data, t), expand(d, -t))]
