compare the first word from 2 text files - python

I have these 2 csv files. The first word is the "key" and I need to compare the old file t1 with the new one t2. If the key match, I need to display the content from new file.
cat /tmp/t1.txt
a, b, c
d, e, f
g, m, p
cat /tmp/t2.txt
d, x, y
g, h, i
But the way this loop is written, it shows the entry from old file for the key 'g' while it works correctly for the key 'd'.
with open("/tmp/t1.txt", "r") as f:
with open("/tmp/t2.txt", "r") as n:
for nline in n:
for fline in f:
if nline.split()[0] == fline.split()[0]:
print("nline", nline)
else:
print("fline", fline)
The result is:
fline a, b, c
nline d, x, y
fline g, m, p
The last line should look like this:
nline g, h, i

Basically you have to reopen the file once you read through it:
with open("/tmp/t2.txt", "r") as n:
for nline in n:
with open("/tmp/t1.txt", "r") as f: # <<-- moved this line here
for fline in f:
# some comparisons here
Another good approach would be to read both files once, and then compare the data:
with open("/tmp/t1.txt", "r") as fin :
data1 = fin.readlines()
with open("/tmp/t2.txt", "r") as fin :
data2 = fin.readlines()
for nline in data1 :
for fline in data2 :
# put your logic here
ok, answering the question from the comments:
with open("/tmp/t1.txt", "r") as fin :
data1 = [ (i[0], i) for i in fin.readlines() if len(i) > 3 ]
with open("/tmp/t2.txt", "r") as fin :
data2 = { i[0] : i for i in fin.readlines() if len(i) > 3 }
for key,value in data1 :
print data2[key] if key in data2 else value

Using pandas
Caveat: keys in t2 are unique
Generates a dataframe with rows updated where the t1 key matches the t2 key
import pandas as pd
# read files
t1 = pd.read_csv('t1.txt', header=None)
t2 = pd.read_csv('t2.txt', header=None)
# t2 to a dict
t2_d = t2.set_index(0, drop=False).to_dict(orient='index')
# look for t1 in t2_d
t1.apply(lambda x: t2_d[x[0]] if x[0] in t2_d else x, axis=1)
0 1 2
0 a b c
1 d x y
2 g h i
Optionally
This option adds a column, in_t2 to indicate whether t1 is in t2
Create updated and join it to t1 to easily verify updated values
# read files
t1 = pd.read_csv('t1.txt', header=None)
t2 = pd.read_csv('t2.txt', header=None)
# add Boolean column
t1['in_t2'] = t1[0].isin(t2[0])
# t2 to a dict
t2_d = t2.set_index(0, drop=False).to_dict(orient='index')
# look for t1 in t2_d
updated = t1.iloc[:, :-1].apply(lambda x: t2_d[x[0]] if x[0] in t2_d else x, axis=1)
# join t1 and updated
pd.concat([t1, updated], axis=1)
0 1 2 in_t2 0 1 2
0 a b c False a b c
1 d e f True d x y
2 g m p True g h i

Related

Python List Wrangling

I take a list that looks like this
[[x, a, d], [y], [z,f]]
It wrangles the list by this
#tag to print is the raw JSON
for tagElement in tagToPrint[x]:
if tagElement not in allConstrainedTagList:
if v == 0:
topicTagConcat = [tagElement]
topicTagToPrint[x] = tagElement
v = v + 1
elif v >= 1:
topicTagConcat = [topicTagConcat, tagElement]
topicTagToPrint[x] = topicTagConcat
v = v + 1
which then uses this
with open("output.csv", 'wb') as outcsv:
#configure writer to write standard csv file
writer = csv.writer(outcsv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
writer.writerow(['Topic Tag', ..... ])
while x < y:
writer.writerow(rowPrinter[x])
x = x + 1
to print this out
ID ColA ColB ColC
1 X A D
2 Y
3 Z F
How can I analyze A & B & C together as one column? X, Y, Z, A, B, C are
the same column. It is going in tableau.

splitting dictionary and writing it to different csv file in python

I want to split the python dictionary and write it to different files based on NO_OF_LINES_PER_FILE and size of dictionary
Input
NO_OF_LINES_PER_FILE
so if NO_OF_LINES_PER_FILE = 2 and size of dictionary is 10 the i want the dictionary to be splitted into 5 files(each file will have 2 rows)
Script
import csv
NO_OF_LINES_PER_FILE = 2
s = {"2222":["1","2","3"],"3456":["2","3","4"],"5634":["4","5"],"23543":["456","3"],"29587":["4","5"],"244":["23","34"],"455":["3","4"],"244221":["5"],"23232345":["2323","43"]}
def again(c,h,NO_OF_LINES_PER_FILE1):
f3 = open('num_'+str(h)+'.csv', 'at')
if c == 1:
ceh = 2
else:
ceh = c
print ceh
v = 0
for w in s:
v = v + 1
if v < ceh:
pass
elif v > NO_OF_LINES_PER_FILE1:
print "yes"
NO_OF_LINES_PER_FILE1 = NO_OF_LINES_PER_FILE1 + 1
h = NO_OF_LINES_PER_FILE1 + 1
again(c,h,NO_OF_LINES_PER_FILE1)
else:
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
writer.writerow(s[w])
c = c + 1
def split():
f3 = open('has_'+str(NO_OF_LINES_PER_FILE)+'.csv', 'at')
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
c = 0
for w in s:
if c >= NO_OF_LINES_PER_FILE:
NO_OF_LINES_PER_FILE1 = NO_OF_LINES_PER_FILE + 1
h = NO_OF_LINES_PER_FILE
again(c,h,NO_OF_LINES_PER_FILE1)
break
else:
#print NO_OF_LINES_PER_FILE
writer = csv.writer(f3,delimiter = ',', lineterminator='\n',quoting=csv.QUOTE_ALL)
writer.writerow(s[w])
c = c + 1
split()
But this script is not working and creates many files
In the above script NO_OF_LINES_PER_FILE = 2 and size of dictionary s is 9
so i want 5 files first four file will contain 2 rows each and fifth file will contain 1 row
How can i solve this problem?
my method is flat the dict first, then split flat dict to sub list with length you want
import csv
flatDict = [ i for i in s.items()]
splitFlatDict = [flatDict[i:i+NO_OF_LINES_PER_FILE] for i in xrange(0,len(flatDict),NO_OF_LINES_PER_FILE)]
for i,rows in enumerate(splitFlatDict):
with open(str(i) + '.csv','wb') as f:
writer = csv.writer(f)
writer.writerows(rows)

Python - looping multiple lists with enumerate for same index

list1 = Csvfile1._getRow(' field1')
list2 = Csvfile2._getRow(' field1')
_list1 = Csvfile1._getRow(' field2')
_list2 = Csvfile2._getRow(' field2')
for i,(a,b) in enumerate(zip(list2, list1)):
value = False
if field == ' field1':
for j,(c,d) in enumerate(zip(_list2, _list1)):
if i == j:
if a != b and c != d:
value = True
else:
value = False
break
if value == True:
continue
if a != b
# do something
Below is the sample :
values in both the csv files are compared. when the value for field1
is not equal in both csv files, the condition if a != b: should be executed.
When the value for field1 is not equal in both csv files, and at the same time if the values for field2 is also not equal -> then the condition if a != b: should not be executed.
With huge data this seems to be not working. Or is there a better way to achieve this ?
Csvfile1
field1 | field2
222 | 4 -> enter if a != b: condition loop
435 | 5 -> do not enter if a != b: condition loop
Csvfile2
field1 | field2
223 | 4
436 | 6
If I got right what you want to do, try something like this:
$ cat t1.txt
field1|field2
222|4
435|5
$ cat t2.txt
field1|field2
223|4
436|6
$ python
import csv
with open("t1.txt", "rb") as csvfile:
with open("t2.txt", "rb") as csvfile2:
reader = csv.reader(csvfile, delimiter='|')
reader2 = csv.reader(csvfile2, delimiter='|')
for row1, row2 in zip(reader, reader2):
for elem1, elem2 in zip(row1, row2):
if elem1 != elem2:
print "different: {} and {}".format(elem1, elem2)
different: 222 and 223
different: 435 and 436
different: 5 and 6
#first field(ff) second field(sf) first file(ff) second file(sf)
field1csv1 = Csvfile1._getRow(' field1')
field1csv2 = Csvfile2._getRow(' field1')
field2csv1 = Csvfile1._getRow(' field2')
field2csv2 = Csvfile2._getRow(' field2')
Every time you have huge lists of data you should think about using a generator instead of a list comprehension. itertools.izip is a generator version of zip.
Plugging it in should give you a considerable improvement, as no temporary lists will be generated:
from itertools import izip
for i, (a, b) in enumerate(izip(list2, list1)):
value = False
if field == ' field1':
for j, (c, d) in enumerate(izip(_list2, _list1)):
if i == j:
if a != b and c != d:
value = True
else:
value = False
break
if value == True:
continue
if a != b
# do something
This is an example of how to refactor your code to get rid of the iteration in python and drop the iteration to the C level:
#orig
for i, (a, b) in enumerate(zip(list2, list1)):
value = False
if field == ' field1':
for j, (c, d) in enumerate(zip(_list2, _list1)):
if i == j:
if a != b and c != d:
value = True
else:
value = False
break
With generators:
from itertools import izip
mygen = izip(izip(list2,list1),izip(_list2,_list1))
#[((a, b), (c, d)), ((x, y), (_x, _y)), ...]
values = [tuple1[0]!=tuple1[1] and tuple1[2]!=tuple2[1] for tuple1, tuple2 in mygen]
Also you could use "equality" generators:
field1 = izip(field1csv1, field1csv2)
field2 = izip(field2csv1, field2csv2)
field1equal = (f[0] == f[1] for f in field1)
field2equal = (f[0] == f[1] for f in field2)
I got this far and then gave up. I have no idea what you're doing.

how to merge file lines having the same first word in python?

I have written a program to merge lines in a file containing the same first word
in python.However I am unable to get the desired output.
Can anyone please suggest me the mistake in my program?
Note:- (line1,line 2) and (line4,line5,line6) are merging since they
have the same first element
#input
"file.txt"
line1: a b c
line2: a b1 c1
line3: d e f
line4: i j k
line5: i s t
line6: i m n
#output
a b c a b1 c1
d e f
i j k i s t i m n
#my code
for i in range(0,len(a)):
j=i
try:
while True:
if a[j][0] == a[j+1][0]:
L.append(a[j])
L.append(a[j+1])
j=j+2
else:
print a[i]
print L
break
except:
pass`
Try this (give it the file as an argument).
Produces a dictionary with the lines you expect.
import sys
if "__main__" == __name__:
new_lines = dict()
# start reading file
with open(sys.argv[1]) as a:
# iterate file by lines - removing newlines
for a_line in a.read().splitlines():
# candidate is first word in every sentence
candidate = a_line.split()[0] # split on whitespace by default
# dictionary keys are previous candidates
if candidate in new_lines.keys():
# word already included
old_line = new_lines[candidate]
new_lines[candidate] = "%s %s" % (old_line, a_line)
else:
# word not included
new_lines[candidate] = a_line
# now we have our dictionary. print it (or do what you want with it)
for key in new_lines.keys():
print "%s -> %s" % (key, new_lines[key])
output:
a -> a b c a b1 c1
i -> i j k i s t i m n
d -> d e f

Building a dictionary using two files

I am very very new to python and I have been playing around to write a script using two files. File 1 contains a number of ID numbers such as:
1000012
1000015
1000046
1000047
1000050
1000072
1000076
100008
1000102
100013
The other file has few lines of single ID numbers followed by lines made of one ID number followed by other ID numbers which have a + or - at the end:
951450
8951670
8951800
8951863
8951889
9040311
9255087 147+ 206041- 8852164- 4458078- 1424812- 3631438- 8603144+ 4908786- 4780663+ 4643406+ 3061176- 7523696- 5876052- 163881- 6234800- 395660-
9255088 149+ 7735585+ 6359867+ 620034- 4522360- 2810885- 3705265+ 5966368- 7021344+ 9165926- 2477382+ 4015358- 2497281+ 9166415+ 6837601-
9255089 217+ 6544241+ 5181434+ 4625589+ 7433598+ 7295233+ 3938917+ 4109401+ 2135539+ 4960823+ 1838531+ 1959852+ 5698864+ 1925066+ 8212560+ 3056544+ 82N 1751642+ 4772695+ 2396528+ 2673866+ 2963754+ 5087444+ 977167+ 2892617- 7412278- 6920479- 2539680- 4315259- 8899799- 733101- 5281901- 7055760+ 8508290+ 8559218+ 7985985+ 6391093+ 2483783+ 8939632+ 3373919- 924346+ 1618865- 8670617+ 515619+ 5371996+ 2152211+ 6337329+ 284813+ 8512064+ 3469059+ 3405322+ 1415471- 1536881- 8034033+ 4592921+ 4226887- 6578783-
I want to build a dictionary using these two files. My script has to search inside File 2 for the ID numbers in File 1 and append those lines as values in which the key is the number in File 1. Therefore there may be more than one value for each key. I only want to search the lines in File 2 that have more than one number (if len(x) > 1).
the output would be something like: 1000047: 9292540 1000047+ 9126889+ 3490727- 8991434+ 4296324+ 9193432- 3766395+ 9193431+ 8949379- (I need to print each ID number in File1 as the key and as its value, the chunk of lines that contain that ID number as a whole)
Here is my -very wrong- script:
#!/usr/bin/python
f = open('file1')
z = open('file2')
d = dict() # d is an empty dictionary
for l in f:
p = l.rstrip()
d[p] = list() # sets the keys in the dictionary as p (IDs with newline characters stripped)
y = z.readlines() # retrieves a string from the path file
s = "".join(y) # makes a string from y
x = str.split(s) #splits the path file at white spaces
if len(x) > 1: # only the lines that include contigs IDs that were used to make another contig
for lines in y:
k = lines.rstrip()
w = tuple(x) # convert list x into a tuple called w
for i in w:
if i[:-1] in d:
d[p].append(k)
print d
Try:
#!/usr/bin/python
f = open('file1')
z = open('file2')
d = dict() # d is an empty dictionary
for l in f:
p = l.rstrip()
d[p] = list() # Change #1
f.close()
# Now we have a dictinary with the keys from file1 and empty lists as values
for line in z:
items = item.split() # items will be a list from 1 line
if len(items) > 1: # more than initial item in the list
k = items[0] # First is the key line
for i in items[1:]: # rest of items
if d.haskey(i[:-1]): # is it in the dict
d[i].append(k) # Add the k value
z.close()
print d
N.B. This is untested code but shouldn't be too far off.
Is this what you are looking for ?? (I have not tested it ...)
#!/usr/bin/python
f = open('file1')
z = open('file2')
d = dict() # d is an empty dictionary
for l in f.readlines():
for l2 in z.readlines():
if l.rstrip() in l2.rstrip():
d[l] = l2
z.seek(0, 0)
f.close()
z.close()
Here is a simpler version the same code, if you don't want to deal with the file pointer
f = open("file1")
z = open("file2")
d = dict() # d is an empty dictionary
file1_lines = f.readlines()
file2_lines = z.readlines()
for l in file1_lines:
for l2 in file2_lines:
if l.rstrip() in l2.rstrip():
d[l] = l2
print d
f.close()
z.close()

Categories