How do I merge 2 or more CSV files with time-overlapping data? For example,
data1 is
Time u v w
0.24001821 0 0.009301949 0
0.6400364 0 0.009311552 0
0.84005458 0 0.0093211568 0
0.94034343 0 0.0094739951 0
data2 is
Time u v w
0.74041502 0 0.0095119512 0
0.84043291 0 0.0095214359 0
0.94045075 0 0.0095309047 0
1.2404686 0 0.0095403752 0
What I want is:
Time u v w
0.24001821 0 0.009301949 0
0.6400364 0 0.009311552 0
0.74041502 0 0.0095119512 0
0.84043291 0 0.0095214359 0
0.94045075 0 0.0095309047 0
1.2404686 0 0.0095403752 0
So the last few rows of data from the 1st CSV file are deleted, and the 2nd CSV file is merged in so that the time sequence is increasing.
How can that be done? Thanks.
Python has an excellent built-in library function to help with this: heapq.merge().
Assuming your data is space-delimited, you could use it as follows:
from heapq import merge
import csv

filenames = ['data1.csv', 'data2.csv']

merge_list = []
for filename in filenames:
    f_input = open(filename)  # left open on purpose: merge() reads lazily
    csv_input = csv.reader(f_input, delimiter=' ', skipinitialspace=True)
    header = next(csv_input)  # skip (and remember) the header row
    merge_list.append(csv_input)

with open('output.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output, delimiter=' ')
    csv_output.writerow(header)
    csv_output.writerows(merge(*merge_list, key=lambda x: float(x[0])))
This would produce the following CSV output:
Time u v w
0.24001821 0 0.009301949 0
0.6400364 0 0.009311552 0
0.74041502 0 0.0095119512 0
0.84005458 0 0.0093211568 0
0.84043291 0 0.0095214359 0
0.94034343 0 0.0094739951 0
0.94045075 0 0.0095309047 0
1.2404686 0 0.0095403752 0
This will work for any number of input CSV files.
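Note that heapq.merge interleaves every row from every input, so the overlapping rows from data1.csv (0.84005458 and 0.94034343) survive in the output above. If you want the truncating behaviour asked for in the question instead, here is a minimal sketch, assuming space-delimited files that are each individually sorted by time:
import csv

with open('data2.csv') as f2:
    rows2 = list(csv.reader(f2, delimiter=' ', skipinitialspace=True))
header, rows2 = rows2[0], rows2[1:]
start2 = float(rows2[0][0])          # first timestamp in data2

with open('data1.csv') as f1:
    rows1 = list(csv.reader(f1, delimiter=' ', skipinitialspace=True))[1:]
kept = [row for row in rows1 if float(row[0]) < start2]  # drop the overlap

with open('output.csv', 'w', newline='') as f_output:
    writer = csv.writer(f_output, delimiter=' ')
    writer.writerow(header)
    writer.writerows(kept + rows2)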
If both files are already individually ordered by time, a simple merge loop is enough:
# CSV cells here are separated by a comma; change if required
delimiter = ','

# open files and read lines
f1 = open('data1.csv', 'r')
f1_lines = f1.readlines()
f1.close()
f2 = open('data2.csv', 'r')
f2_lines = f2.readlines()
f2.close()

# extract header
output_lines = [f1_lines[0]]

# start scanning from line 2 of both files (line 1 is the header)
f1_index = 1
f2_index = 1
while True:
    # all of data1 is processed, append remaining lines from data2
    if f1_index >= len(f1_lines):
        output_lines += f2_lines[f2_index:]
        break
    # all of data2 is processed, append remaining lines from data1
    if f2_index >= len(f2_lines):
        output_lines += f1_lines[f1_index:]
        break
    f1_line_time = float(f1_lines[f1_index].split(delimiter)[0])  # time cell of data1
    f2_line_time = float(f2_lines[f2_index].split(delimiter)[0])  # time cell of data2
    if f1_line_time < f2_line_time:
        output_lines.append(f1_lines[f1_index])
        f1_index += 1
    elif f1_line_time == f2_line_time:
        # if they are equal in time, pick one
        output_lines.append(f1_lines[f1_index])
        f1_index += 1
        f2_index += 1
    else:
        output_lines.append(f2_lines[f2_index])
        f2_index += 1

f_output = open('out.csv', 'w')
f_output.write(''.join(output_lines))
f_output.close()
Another option:
import csv

delimiter = " "

with open("data1.csv", "r") as fin1, \
        open("data2.csv", "r") as fin2, \
        open("data.csv", "w") as fout:
    reader1 = csv.reader(fin1, delimiter=delimiter)
    reader2 = csv.reader(fin2, delimiter=delimiter)
    writer = csv.writer(fout, delimiter=delimiter)
    next(reader2)                   # skip the header of data2
    first_row = next(reader2)       # first data row of data2
    start2 = float(first_row[0])
    writer.writerow(next(reader1))  # copy the header from data1
    for row in reader1:
        if start2 <= float(row[0]):
            break
        writer.writerow(row)
    writer.writerow(first_row)
    writer.writerows(reader2)
The assumption is that the files are already individually ordered by time:
First take the first data row of data2.csv and convert its first entry into a float start2.
With that in mind write all rows from data1.csv with a time less than start2 into the new file data.csv, and break out of the loop once the condition isn't met anymore.
Then write the already extracted first data row from data2.csv to data.csv, and afterwards write the rest of data2.csv to data.csv.
Result for
data1.csv
Time u v w
0.24001821 0 0.009301949 0
0.6400364 0 0.009311552 0
0.84005458 0 0.0093211568 0
0.94034343 0 0.0094739951 0
data2.csv
Time u v w
0.74041502 0 0.0095119512 0
0.84043291 0 0.0095214359 0
0.94045075 0 0.0095309047 0
1.2404686 0 0.0095403752 0
is
Time u v w
0.24001821 0 0.009301949 0
0.6400364 0 0.009311552 0
0.74041502 0 0.0095119512 0
0.84043291 0 0.0095214359 0
0.94045075 0 0.0095309047 0
1.2404686 0 0.0095403752 0
A more general solution (multiple files) could look like:
import csv

delimiter = " "
files = ["data1.csv", "data2.csv", "data3.csv"]

stops = []
for file in files[1:]:
    with open(file, "r") as fin:
        reader = csv.reader(fin, delimiter=delimiter)
        header = next(reader)
        stops.append(float(next(reader)[0]))  # first timestamp of each later file
stops.append(float("inf"))                    # the last file is copied completely

with open("data.csv", "w") as fout:
    writer = csv.writer(fout, delimiter=delimiter)
    writer.writerow(header)
    for stop, file in zip(stops, files):
        with open(file, "r") as fin:
            next(fin)  # skip header
            reader = csv.reader(fin, delimiter=delimiter)
            for row in reader:
                if stop <= float(row[0]):
                    break
                writer.writerow(row)
This would work for overlaps looking like
1. file: |------|
2. file: |--------|
3. file: |------|
but not
1. file: |--------|
2. file: |-------|
3. file: |--------------|
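If your files can overlap like the second diagram, one fallback is a full merge with de-duplication of equal timestamps. A rough sketch using heapq.merge, assuming Python 3.5+ (for the key argument), individually sorted files, and one shared header:
import csv
from heapq import merge

filenames = ["data1.csv", "data2.csv", "data3.csv"]

readers, handles = [], []
for name in filenames:
    f = open(name)
    handles.append(f)     # kept open: merge() reads lazily from each reader
    reader = csv.reader(f, delimiter=" ")
    header = next(reader) # skip the header of each file
    readers.append(reader)

with open("data.csv", "w", newline="") as fout:
    writer = csv.writer(fout, delimiter=" ")
    writer.writerow(header)
    last_time = None
    for row in merge(*readers, key=lambda r: float(r[0])):
        t = float(row[0])
        if t != last_time:  # keep only the first row per timestamp
            writer.writerow(row)
        last_time = t

for f in handles:
    f.close()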
My Python code:
import operator

with open('index.txt') as f:
    lines = f.read().splitlines()
print type(lines)
print len(lines)

l2 = lines[1::3]
print len(l2)
print l2[0]

list1 = [0, 2]
my_items = operator.itemgetter(*list1)
new_list = [my_items(x) for x in l2]

with open('newindex1.txt', 'w') as thefile:
    for item in l2:
        thefile.write("%s\n" % item)
A couple of lines from index.txt:
0 0 0
0 1 0
0 2 0
1 0 0
1 1 0
1 2 0
2 0 0
2 1 0
2 2 0
3 0 0
A couple of lines from newindex1.txt:
0 1 0
1 1 0
2 1 0
3 1 0
4 1 0
5 1 0
6 1 0
7 1 0
8 1 0
9 1 0
I wanted to read the file as a list, then choose every third row, and finally select the first and third columns from that list. It seems that I do not understand how operator works.
If I try Back2Basics' solution:
import numpy as np
myarray = np.fromfile('index.txt', dtype=int, sep=' ')
anotherarray = myarray[::3][0,2]
I got
File "a12.py", line 4, in <module>
anotherarray = myarray[::3][0,2]
IndexError: too many indices
You don't need to read all the data into memory at all; you can use itertools.islice to parse the rows you want and the csv lib to read and write the data:
from operator import itemgetter
from itertools import islice
import csv

with open("in.txt") as f, open('newindex1.txt', 'w') as out:
    r = csv.reader(f, delimiter=" ")
    wr = csv.writer(out, delimiter=" ")
    # each islice call consumes three rows and yields the second of them,
    # matching the lines[1::3] selection from the question
    for row in iter(lambda: list(islice(r, 1, 3, 3)), []):
        wr.writerow(map(itemgetter(0, 2), row)[0])
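As an aside, itemgetter(0, 2) just builds a callable that returns a tuple of the items at those positions of any sequence, which is also why applying it to an unsplit line grabs characters rather than columns:
from operator import itemgetter

pick = itemgetter(0, 2)       # grabs items 0 and 2
print(pick(['0', '1', '0']))  # ('0', '0')
print(pick('0 1 0'))          # ('0', '1') -- characters, since a string is a sequence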
I'd highly suggest using numpy for this, since it is all numerical data that fits nicely into memory. The code looks like this:
import numpy as np

# np.fromfile returns a flat 1-D array, so reshape it into rows of three
# columns before applying the 2-D slice (this is what caused the IndexError)
myarray = np.fromfile('index.txt', dtype=int, sep=' ').reshape(-1, 3)
anotherarray = myarray[::3, ::2]
and then, to write the file:
anotherarray.tofile('newfile.txt', sep=" ")
The way the array slicing [::3, ::2] reads is: "take every 3rd row starting from row 0, and take every other column starting from column 0".
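A quick illustration of that slicing on a toy array:
import numpy as np

a = np.arange(12).reshape(4, 3)  # rows 0..3, columns 0..2
print(a[::3, ::2])
# [[ 0  2]
#  [ 9 11]]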
I think you need something like this:
lines = []
with open('index.txt', 'r') as fi:
    lines = fi.read().splitlines()
lines = [line.split() for line in lines]

with open('answer.txt', 'w') as fo:
    for row in range(len(lines)):
        if row % 3 == 1:  # every third line, starting from the second
            fo.write('%s %s\n' % (lines[row][0], lines[row][2]))
I have a big file of word/tag pairs saved like this:
This/DT gene/NN called/VBN gametocide/NN
Now I want to put these pairs into a DataFrame with their counts like this:
        DT  NN  ...
This     1   0
gene     0   1
...
I tried doing this with a dict that counts the pairs and then putting it into the DataFrame:
from collections import defaultdict
import pandas as pd

file = open("data.txt", "r")
train = file.read()
words = train.split()

data = defaultdict(int)
for i in words:
    data[i] += 1

matrixB = pd.DataFrame()
for elem, count in data.items():
    word, tag = elem.split('/')
    matrixB.loc[tag, word] = count
But this takes a really long time (file has like 300000 of these). Is there a faster way to do this?
What was wrong with the answers from your other question?
from collections import Counter
import pandas as pd

with open('data.txt') as f:
    train = f.read()

c = Counter(tuple(x.split('/')) for x in train.split())
s = pd.Series(c)
df = s.unstack().fillna(0)
print(df)
yields
DT NN VBN
This 1 0 0
called 0 0 1
gametocide 0 1 0
gene 0 1 0
I thought this question was remarkably similar... Why did you post twice?
from collections import Counter
import pandas as pd

text = "This/DT gene/NN called/VBN gametocide/NN"
>>> pd.Series(Counter(tuple(pair.split('/')) for pair in text.split())).unstack().fillna(0)
DT NN VBN
This 1 0 0
called 0 0 1
gametocide 0 1 0
gene 0 1 0
I have a CSV file which contains 65000 lines (size approximately 28 MB). Each line begins with a certain path, e.g. "c:\abc\bcd\def\123\456". Now let's say the path "c:\abc\bcd\" is common to all the lines and the rest of the content is different. I have to remove the common part (in this case "c:\abc\bcd\") from all the lines using a Python script. For example, the content of the CSV file is as follows:
C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.frag 0 0 0
C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.vert 0 0 0
C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.link-link-0.frag 16 24 3
C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.link-link-0.vert 87 116 69
C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.link-link-0.vert.bin 75 95 61
C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.link-link-0 0 0
C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.link-link-6 0 0 0
For the above example I need the output as below:
FILE0.frag 0 0 0
FILE0.vert 0 0 0
FILE0.link-link-0.frag 17 25 2
FILE0.link-link-0.vert 85 111 68
FILE0.link-link-0.vert.bin 77 97 60
FILE0.link-link-0 0 0
FILE0.link 0 0 0
Can any of you please help me out with this?
^\S+/
You can simply apply this regex over each line and replace the match with an empty string. See the demo:
https://regex101.com/r/cK4iV0/17
import re
p = re.compile(ur'^\S+/', re.MULTILINE)
test_str = u"C:/Abc/Def/Test/temp/test/GLNext/FILE0.frag 0 0 0\nC:/Abc/Def/Test/temp/test/GLNext/FILE0.vert 0 0 0\nC:/Abc/Def/Test/temp/test/GLNext/FILE0.link-link-0.frag 16 24 3\nC:/Abc/Def/Test/temp/test/GLNext/FILE0.link-link-0.vert 87 116 69\nC:/Abc/Def/Test/temp/test/GLNext/FILE0.link-link-0.vert.bin 75 95 61\nC:/Abc/Def/Test/temp/test/GLNext/FILE0.link-link-0 0 0\nC:/Abc/Def/Test/temp/test/GLNext/FILE0.link-link-6 0 0 0 "
subst = u" "
result = re.sub(p, subst, test_str)
What about something like this:
import csv

with open("file.csv", 'rb') as f:
    sl = []
    csvread = csv.reader(f, delimiter=' ')
    for line in csvread:
        # csv.reader yields lists, so strip the prefix from the first field
        line[0] = line[0].replace("C:/Abc/bcd/Def/Test/temp/test/GLNext/", "")
        sl.append(line)
To write the list sl out to a new file, use:
with open('filenew.csv', 'wb') as f:
    csvwrite = csv.writer(f, delimiter=' ')
    for line in sl:
        csvwrite.writerow(line)
You can automatically detect the common prefix without the need to hardcode it. You don't really need regex for this; os.path.commonprefix can be used instead:
import csv
import os

with open('data.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    paths = []  # stores all paths
    rows = []   # stores all lines
    for row in reader:
        paths.append(row[0].split("/"))  # split path by "/"
        rows.append(row)

commonprefix = os.path.commonprefix(paths)  # finds prefix common to all paths
for row in rows:
    row[0] = row[0].replace('/'.join(commonprefix) + '/', "")  # remove prefix
rows is now a list of lists, which you can write to a file:
with open('data2.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile)
    for row in rows:
        writer.writerow(row)
The following Python script will read your file in (assuming it looks like your example) and will create a version removing the common folders:
import os.path, csv

finput = open("d:\\input.csv", "r")
csv_input = csv.reader(finput, delimiter=" ", skipinitialspace=True)
csv_output = csv.writer(open("d:\\output.csv", "wb"), delimiter=" ")

# Create a set of unique folder names
set_folders = set()
for input_row in csv_input:
    set_folders.add(os.path.split(input_row[0])[0])

# Determine the common prefix
base_folder = os.path.split(os.path.commonprefix(set_folders))[0]
nprefix = len(base_folder) + 1

# Go back to the start of the input CSV
finput.seek(0)
for input_row in csv_input:
    csv_output.writerow([input_row[0][nprefix:]] + input_row[1:])
Using the following as input:
C:/Abc/Def/Test/temp/test/GLNext/FILE0.frag 0 0 0
C:/Abc/Def/Test/temp/test/GLNext/FILE0.vert 0 0 0
C:/Abc/Def/Test/temp/test/GLNext/FILE0.link-link-0.frag 16 24 3
C:/Abc/Def/Test/temp/test/GLNext2/FILE0.link-link-0.vert 87 116 69
C:/Abc/Def/Test/temp/test/GLNext5/FILE0.link-link-0.vert.bin 75 95 61
C:/Abc/Def/Test/temp/test/GLNext7/FILE0.link-link-0 0 0
C:/Abc/Def/Test/temp/test/GLNext/FILE0.link-link-6 0 0 0
The output is as follows:
GLNext/FILE0.frag 0 0 0
GLNext/FILE0.vert 0 0 0
GLNext/FILE0.link-link-0.frag 16 24 3
GLNext2/FILE0.link-link-0.vert 87 116 69
GLNext5/FILE0.link-link-0.vert.bin 75 95 61
GLNext7/FILE0.link-link-0 0 0
GLNext/FILE0.link-link-6 0 0 0
With one space between each column, although this could easily be changed.
So I tried something like this:
import os
import fileinput

for dirName, subdirList, fileList in os.walk(Directory):
    for fname in fileList:
        if fname.endswith('.csv'):
            for line in fileinput.input(os.path.join(dirName, fname), inplace=1):
                location = line.find(r'GLNext')
                if location > 0:
                    location += len('GLNext')
                    print line.replace(line[:location], ".")
                else:
                    print line
You can use the pandas library for this, and leverage its excellent handling of big CSV files (even in the hundreds of MB).
Code:
import pandas as pd
csv_file = 'test_csv.csv'
df = pd.read_csv(csv_file, header=None)
print df
print "-------------------------------------------"
path = "C:/Abc/bcd/Def/Test/temp/test/GLNext/"
df[0] = df[0].replace({path:""}, regex=True)
print df
# df.to_csv("truncated.csv") # Export to new file.
Result:
0 1 2 3
0 C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.frag 0 0 0
1 C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.vert 0 0 0
2 C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.lin... 16 24 3
3 C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.lin... 87 116 69
4 C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.lin... 75 95 61
5 C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.lin... 0 0 NaN
6 C:/Abc/bcd/Def/Test/temp/test/GLNext/FILE0.lin... 0 0 0
-------------------------------------------
0 1 2 3
0 FILE0.frag 0 0 0
1 FILE0.vert 0 0 0
2 FILE0.link-link-0.frag 16 24 3
3 FILE0.link-link-0.vert 87 116 69
4 FILE0.link-link-0.vert.bin 75 95 61
5 FILE0.link-link-0 0 0 NaN
6 FILE0.link-link-6 0 0 0
I have a CSV file with 0s and 1s and need to determine the sum total of the entire file. The file looks like this when opened in Excel:
0 1 1 1 0 0 0 1 0 1
1 0 1 0 0 1 1 0 0 0
0 0 1 0 0 0 0 1 0 1
0 1 1 1 1 1 1 0 1 1
0 0 1 0 1 0 1 1 0 1
0 0 0 0 0 0 0 0 1 0
0 0 1 0 0 1 1 0 1 1
0 0 1 1 0 0 1 1 0 1
1 0 1 0 1 0 1 1 1 0
0 1 0 0 1 0 0 0 1 1
Using this script I can sum the values of each row and they print out in a single column:
import csv
import numpy as np

path = r'E:\myPy\one_zero.csv'
infile = open(path, 'r')
with infile as file_in:
    fin = csv.reader(file_in, delimiter=',')
    for line in fin:
        print line.count('1')
I need to be able to sum up the resulting column, but my experience with this is mild. Looking for suggestions. Thanks.
If you have more than just 1s and 0s, map to int and sum all rows:
with open(r'E:\myPy\one_zero.csv') as f:
    r = csv.reader(f, delimiter=',')
    count = sum(sum(map(int, row)) for row in r)
Or just count the 1's:
with open(r'E:\myPy\one_zero.csv') as f:
    r = csv.reader(f, delimiter=',')
    count = sum(row.count("1") for row in r)
Just use with open(r'E:\myPy\one_zero.csv') as f: directly; you don't need to, and should not, open the file first and then pass the handle to with.
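In other words, a minimal sketch of that idiom applied to the counting above:
import csv

with open(r'E:\myPy\one_zero.csv') as file_in:  # opened and closed by the with block
    fin = csv.reader(file_in, delimiter=',')
    total = sum(line.count('1') for line in fin)
print(total)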
path = r'E:\myPy\one_zero.csv'
infile = open(path, 'r')
answer = 0
with infile as file_in:
    fin = csv.reader(file_in, delimiter=',')
    for line in fin:
        a = line.count('1')  # count the string '1'; csv.reader yields strings
        answer += a
print answer
Example:
answer = 0
lines = [[1, 0, 0, 1], [1, 1, 1, 1], [0, 0, 0, 1]]
for line in lines:
    a = line.count(1)
    answer += a
print answer
7
Watch the difference between:
line.count('1')
vs
line.count(1)
csv.reader yields rows of strings, so counting the string '1' is what works there; counting the numeric 1 only gives non-zero results on lists of ints, as in the example above.
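A quick demonstration on a row the way csv.reader yields it:
row = ['0', '1', '1']  # csv.reader returns each row as a list of strings
print(row.count('1'))  # 2
print(row.count(1))    # 0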
Why use the CSV module at all? You have a file full of 0s, 1s, commas and newlines. Just open the file, read() it and count the 1s:
>>> with open(filename, 'r') as fin: print fin.read().count('1')
That should get you what you want, no?
I know this is straightforward but I am not quite understanding how to make my for loop work.
My first file is a long list of two columns of data:
ROW VALUE
0 165
1 115
2 32
3 14
4 9
5 0
6 89
7 26
. .
406369 129
406370 103
My second file is a list of important row numbers:
1
43
192
so on
All I want to do is go to the row number of interest in file 1, and then walk down, row by row, until the value column hits zero. The output will then simply be a list of the important row numbers, each followed by the count of the lines until the first file reaches zero. For instance, the output for important row number "1" from file #2 should be 3, because there are three lines and then the value reaches 0 in file #1. I appreciate any help! I have some script I have started and can post it in an edit if that is helpful. THANK YOU!
EDIT:
Some script I have started:
positive_starts = []
for line in important_rows_file:
    line = line.strip().split()
    positive_starts.append(int(line[2]))

countsfile = []
for line in file:
    line = line.strip().split()
    countsfile.append([line[0]] + [line[1]])

count = 0
i = 0
for i in range(0, len(countsfile)):
    for start in positive_starts:
        if int(countsfile[start + i][1]) > 0:
            count = count + 1
        else:
            count = count
.... not sure what is next
Here are two ways to do it.
The first way builds a dictionary in memory for all row numbers. This would be a good way to go if (a) you are going to re-use the same data over and over (you can store it and read it back in), or (b) most of the rows in the second file need to be processed. The second way just does a one-off lookup for a given row number.
Given this as the input file:
ROW VALUE
0 165
1 115
2 32
3 14
4 9
5 0
6 89
7 26
8 13
9 0
Method 1.
ref_dict = {}
with open("so_cnt_file.txt") as infile:
    next(infile)
    cur_start_row = 0
    cur_rows = []
    for line in infile:
        row, col = [int(val) for val in line.strip().split(" ") if val]
        if col == 0:
            for cur_row in cur_rows:
                ref_dict[cur_row] = row - cur_row - 1
            cur_start_row = row
            cur_rows = []
            continue
        cur_rows.append(row)
print ref_dict
OUTPUT
{0: 4, 1: 3, 2: 2, 3: 1, 4: 0, 6: 2, 7: 1, 8: 0}
Method 2
def get_count_for_row(row=1):
    with open("so_cnt_file.txt") as infile:
        for i in range(0, row + 2):
            next(infile)
        cnt = 0
        for line in infile:
            row, col = [int(val) for val in line.strip().split(" ") if val]
            if col == 0:
                return cnt
            cnt += 1

print get_count_for_row(1)
print get_count_for_row(6)
OUTPUT
3
2
Here is a solution that takes all of the rows of interest in a single call.
def get_count_for_rows(*rows):
    rows = sorted(rows)
    counts = []
    with open("so_cnt_file.txt") as infile:
        cur_row = 0
        for i in range(cur_row, 2):
            next(infile)
        while rows:
            inrow = rows.pop(0)
            for i in range(cur_row, inrow):
                next(infile)
            cnt = 0
            for line in infile:
                row, col = [int(val) for val in line.strip().split(" ") if val]
                if col == 0:
                    counts.append((inrow, cnt))
                    break
                cnt += 1
            cur_row = row
    return counts

print get_count_for_rows(1, 6)
OUTPUT
[(1, 3), (6, 2)]