Delete and save duplicate in another file - python

In test.txt:
1 a
2 b
3 c
4 a
5 d
6 c
I want to remove every line whose letter appears more than once and save the remaining lines in test2.txt:
2 b
5 d
I tried to start with the code below.
file1 = open('../test.txt').read().split('\n')
#file2 = open('../test2.txt', "w")
word = set()
for line in file1:
if line:
sline = line.split('\t')
if sline[1] not in word:
print sline[0], sline[1]
word.add(sline[1])
#file2.close()
The results from the codes showed:
1 a
2 b
3 c
5 d
Any suggestion?

You can use collections.OrderedDict here:
>>> from collections import OrderedDict
with open('abc') as f:
dic = OrderedDict()
for line in f:
v,k = line.split()
dic.setdefault(k,[]).append(v)
Now dic looks like:
OrderedDict([('a', ['1', '4']), ('b', ['2']), ('c', ['3', '6']), ('d', ['5'])])
Now we only need those keys whose list contains exactly one item.
for k,v in dic.iteritems():
if len(v) == 1:
print v[0],k
...
2 b
5 d

What your code does is make sure that each letter (the second item on a line) gets printed only once, which is not what you say you want.
You must split your code into two halves: one that reads the file and gathers the letter counts, and one that prints only the lines whose letter has count == 1.
Converting your original code (I just made it a little simpler):
from collections import OrderedDict

# First pass: map each letter to [occurrence count, line number of its first
# occurrence]. An OrderedDict keeps the letters in file order on every Python
# version, so the output order is deterministic.
words = OrderedDict()
with open('../test.txt') as file1:
    for line in file1:
        # Drop the line terminator so "c\n" and a final unterminated "c" are
        # treated as the same letter, and skip blank lines instead of crashing
        # on the tuple unpacking below.
        line = line.rstrip('\r\n')
        if not line:
            continue
        line_num, letter = line.split('\t')
        if letter in words:
            words[letter][0] += 1
        else:
            words[letter] = [1, line_num]

# Second pass: print only the lines whose letter occurred exactly once.
for letter, (count, line_num) in words.items():
    if count == 1:
        print('%s %s' % (line_num, letter))

I tried to keep it as similar to your style as possible:
from collections import Counter

# Read every non-empty line and split it into [number, word].
with open('../test.txt') as file1:
    rows = [line.split(' ') for line in file1.read().split('\n') if line]

# Count how often each word occurs across the whole file.
occurrences = Counter(sline[1] for sline in rows)

# Keep only the lines whose word is unique. Building a new list avoids
# removing items from a list while iterating over it, which fails as soon as
# a word occurs three or more times.
test = [" ".join([sline[0], sline[1]])
        for sline in rows
        if occurrences[sline[1]] == 1]

with open("../test2.txt", 'w') as file2:
    for item in test:
        file2.write("%s\n" % item)

How about some Pandas
import pandas as pd

# NOTE(review): assumes the file is comma-separated with a header row that
# names the letter column "a" -- confirm against the real input file.
a = pd.read_csv("test_remove_dupl.txt", sep=",")
# `subset` replaces the removed `cols` keyword; keep=False drops *every* row
# whose value is duplicated instead of keeping the first occurrence.
b = a.drop_duplicates(subset="a", keep=False)

Related

How to assign the first column in a text file to a list variable?

I have txt file which have "strings" like this
5.0125,511.2,5.12.3,4.51212,45.412,54111.5142 \n
4.23,1.2,2.6,2.3,1.2,1.554 \n
How to assign each column a separate list of floats please? I have been spending few hours on that, but I am lost.
Expected results
list 1 = [5.0125, 4.23]
list 2 = [511.2, 1.2 ]
Update: adding my trial :
for line in f:
lis = [float(line.split()[0]) for line in f]
print("lis is ", list)
tmp = line.strip().split(",")
values = [float(v) for v in tmp]
points4d = np.array(values).reshape(-1,11) #11 is number of elements in the line
print("points4d", points4d)
for i in points4d:
points3d_first_cluster = points4d[:, :3] # HARD CODED PART
points3d_second_cluster = points4d[:, 3:6]
points3d_third_cluster = points4d[:, 6:9]
#print("x_values_first_cluster",x_values_first_cluster)
print("points3d first cluster ",points3d_first_cluster)
print("points3d second cluster", points3d_second_cluster)
print("points for third cluster", points3d_third_cluster)
lists = []
with open ("text.txt", "r") as file:
for lines in file.readlines():
lista = [s for s in lines.split(',')]
lista.pop(-1)
lists.append(lista)
final_list = []
for x in range (len(lists[0])):
i = x+1
print("list {}".format(i))
globals()['list%s' % i] = [lists[0][x],lists[1][x]]
print(globals()['list%s' % i])
print(list1)
Output :
list 1
['5.0125', '4.23']
list 2
['511.2', '1.2']
list 3
['5.12.3', '2.6']
list 4
['4.51212', '2.3']
list 5
['45.412', '1.2']
['5.0125', '4.23'] # Output of print(list1)
this should work :
lists = []
with open ("text.txt", "r") as file:
for lines in file.readlines():
lista = [s for s in lines.split(',')]
lista.pop(-1)
lists.append(lista)
final_list = []
for x in range (len(lists[0])):
list_new = [lists[0][x],lists[1][x]]
final_list.append(list_new)
print(list_new)

Find frequency of words line by line in txt file Python (how to format properly)

I'm trying to make a simple program that can find the frequency of occurrences in a text file line by line. I have it outputting everything correctly except for when more than one word is on a line in the text file. (More information below)
The text file looks like this:
Hello
Hi
Hello
Good Day
Hi
Good Day
Good Night
I want the output to be: (Doesn't have to be in the same order)
Hello: 2
Hi: 2
Good Day: 2
Good Night: 2
What it's currently outputting:
Day: 2
Good: 3
Hello: 2
Hi: 2
Night: 1
My code:
file = open("test.txt", "r")
text = file.read() #reads file (I've tried .realine() & .readlines()
word_list = text.split(None)
word_freq = {} # Declares empty dictionary
for word in word_list:
word_freq[word] = word_freq.get(word, 0) + 1
keys = sorted(word_freq.keys())
for word in keys:
final=word.capitalize()
print(final + ': ' + str(word_freq[word])) # Line that prints the output
You want to preserve the lines. Don't split. Don't capitalize. Don't sort
Use a Counter
from collections import Counter

# Tally each whole line (trailing whitespace removed) in a single pass.
with open('test.txt') as f:
    c = Counter(line.rstrip() for line in f)

# Report every distinct line together with its number of occurrences.
for k, v in c.items():
    print(f'{k}: {v}')
Instead of splitting the text by None, split it by each line break so you get each line into a list.
# Count whole lines rather than whitespace-separated words.
with open("test.txt", "r") as file:
    # Iterating the file yields one entry per line; rstrip() removes the line
    # terminator, so no spurious empty entry appears for the final newline.
    word_list = [line.rstrip() for line in file]

word_freq = {}  # maps each distinct line to its number of occurrences
for word in word_list:
    if word:  # ignore blank lines
        word_freq[word] = word_freq.get(word, 0) + 1

# Print the lines verbatim: str.capitalize() would lowercase everything after
# the first character and turn "Good Day" into "Good day".
for word in sorted(word_freq):
    print(word + ': ' + str(word_freq[word]))
You can make it yourself very easy by using a Counter object. If you want to count the occurrences of full lines you can simply do:
from collections import Counter

with open('file.txt') as f:
    # Strip the line terminator so an unterminated last line is counted
    # together with identical earlier lines.
    c = Counter(line.rstrip('\n') for line in f)
print(c)
Edit
Since you asked for a way without modules:
# Count the occurrences of each full line without importing any module.
counter_dict = {}
with open('file.txt') as f:
    for line in f:
        # Strip the line terminator so an unterminated last line is counted
        # together with identical earlier lines.
        line = line.rstrip('\n')
        counter_dict[line] = counter_dict.get(line, 0) + 1
print(counter_dict)
Thank you all for the answers, most of the code produces the desired output just in different ways. The code I ended up using with no modules was this:
# Count whole lines rather than whitespace-separated words.
with open("test.txt", "r") as file:
    # Iterating the file yields one entry per line; rstrip() removes the line
    # terminator, so no spurious empty entry appears for the final newline.
    word_list = [line.rstrip() for line in file]

word_freq = {}  # maps each distinct line to its number of occurrences
for word in word_list:
    if word:  # ignore blank lines
        word_freq[word] = word_freq.get(word, 0) + 1

# Print the lines verbatim: str.capitalize() would lowercase everything after
# the first character and turn "Good Day" into "Good day".
for word in sorted(word_freq):
    print(word + ': ' + str(word_freq[word]))
The code I ended up using with modules was this:
from collections import Counter
c = Counter()
with open('live.txt') as f:
for line in f:
c[line.rstrip()] += 1
for k, v in c.items():
print('{}: {}'.format(k, v))

How to extract common lines from text files based on their associated values?

I have 3 text files as:
List1.txt:
032_M5, 5
035_M9, 5
036_M4, 3
038_M2, 6
041_M1, 6
List2.txt:
032_M5, 6
035_M9, 6
036_M4, 5
038_M2, 5
041_M1, 6
List3.txt:
032_M5, 6
035_M9, 6
036_M4, 4
038_M2, 5
041_M1, 6
where the 1st part (i.e string) of lines in all 3 text files are the same, but the 2nd part (i.e number) changes.
I want to get three output files from this:
Output1.txt --> All lines where numbers corresponds to a string are all different.
Example:
036_M4 3, 5, 4
Output2.txt --> All lines where numbers corresponds to a string are the same.
Example:
041_M1, 6
Output3.txt --> All lines where atleast two numbers corresponds to a string are the same (which includes results of Output2.txt also).
Example:
032_M5, 6
035_M9, 6
038_M2, 5
041_M1, 6
Then I need the count of lines with number 1, number 2, number 3, number 4, number 5, and number 6 from Output3.txt.
Here is what I tried. It is giving me the wrong output.
from collections import defaultdict
data = defaultdict(list)
for fileName in ["List1.txt","List2.txt", "List3.txt"]:
with open(fileName,'r') as file1:
for line in file1:
col1,value = line.split(",")
data[col1].append(int(value))
with open("Output3.txt","w") as output:
for (col1),values in data.items():
if len(values) < 3: continue
result = max(x for x in values)
output.write(f"{col1}, {result}\n")
Here is an approach that does not utilize any python modules and it entirely depends on native built-in python functions:
with open("List1.txt", "r") as list1, open("List2.txt", "r") as list2, open("List3.txt", "r") as list3:
    # Forming association between keywords and numbers. Blank lines are
    # skipped so a stray empty line does not raise IndexError.
    rows1 = [line.split(',') for line in list1 if line.strip()]
    totalKeys = [row[0] for row in rows1]
    numbers1 = [row[1].strip() for row in rows1]
    numbers2 = [line.split(',')[1].strip() for line in list2 if line.strip()]
    numbers3 = [line.split(',')[1].strip() for line in list3 if line.strip()]

# Each key maps to the tuple of its three numbers (kept as strings).
totalValues = list(zip(numbers1, numbers2, numbers3))
totalDict = dict(zip(totalKeys, totalValues))

# Outputs
output1 = []
output2 = []
output3 = []
for key, values in totalDict.items():
    distinct = len(set(values))
    # Output1: all three numbers differ.
    if distinct == 3:
        output1.append([key, values])
    # Output2: all three numbers are identical.
    if distinct == 1:
        output2.append([key, values[0]])
    # Output3: at least two numbers agree; report the most frequent one.
    if distinct <= 2:
        output3.append([key, max(values, key=values.count)])

# Output1
print('Output1:')
for key, values in output1:
    print(key + ' ' + ", ".join(values))
print()

# Output2
print('Output2:')
for key, number in output2:
    # `number` is a single string; joining its characters with spaces would
    # turn a multi-digit value such as "12" into "1 2".
    print(key + ' ' + number)
print()

# Output3
print('Output3:')
for key, number in output3:
    print(key + ' ' + number)
The result of the above will be:
Output1:
036_M4 3, 5, 4
Output2:
041_M1 6
Output3:
032_M5 6
035_M9 6
038_M2 5
041_M1 6
max gives the biggest number in the list, not the most commonly occurring. For that, use statistics.mode
from collections import defaultdict
from statistics import mode

# Collect, for every key, the numbers found in the three input files
# (in file order).
data = defaultdict(list)
for fileName in ["List1.txt", "List2.txt", "List3.txt"]:
    with open(fileName, 'r') as file1:
        for line in file1:
            if not line.strip():
                continue  # tolerate blank lines instead of failing to unpack
            col1, value = line.split(",")
            data[col1].append(int(value))

# Output1: keys whose three numbers are all different.
with open("Output1.txt", "w") as output:
    for col1, values in data.items():
        if len(values) < 3:
            continue
        if values[0] != values[1] != values[2] and values[0] != values[2]:
            output.write(f"{col1}, {values[0]}, {values[1]}, {values[2]}\n")

# Output2: keys whose three numbers are all the same.
with open("Output2.txt", "w") as output:
    for col1, values in data.items():
        if len(values) < 3:
            continue
        if values[0] == values[1] == values[2]:
            output.write(f"{col1}, {values[0]}\n")

# Output3: keys where at least two numbers agree (includes Output2's keys).
with open("Output3.txt", "w") as output:
    for col1, values in data.items():
        if len(values) < 3:
            continue
        # At most two distinct values means at least two of them are equal;
        # the original ">= 2" test selected the opposite set of keys.
        if len(set(values)) <= 2:
            output.write(f"{col1}, {mode(values)}\n")

Dictionaries overwriting in Python

This program is to take the grammar rules found in Binary.txt and store them into a dictionary, where the rules are:
N = N D
N = D
D = 0
D = 1
but the current code returns D: D = 1, N:N = D, whereas I want N: N D, N: D, D:0, D:1
import sys
import string
#default length of 3
stringLength = 3
#get last argument of command line(file)
filename1 = sys.argv[-1]
#get a length from user
try:
stringLength = int(input('Length? '))
filename = input('Filename: ')
except ValueError:
print("Not a number")
#checks
print(stringLength)
print(filename)
def str2dict(filename="Binary.txt"):
result = {}
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
result[line[0]] = line
print (result)
return result
print (str2dict("Binary.txt"))
Firstly, your data structure of choice is wrong. Dictionary in python is a simple key-to-value mapping. What you'd like is a map from a key to multiple values. For that you'll need:
from collections import defaultdict
result = defaultdict(list)
Next, where are you splitting on '=' ? You'll need to do that in order to get the proper key/value you are looking for? You'll need
key, value = line.split('=', 1) #Returns an array, and gets unpacked into 2 variables
Putting the above two together, you'd go about in the following way:
result = defaultdict(list)
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
key, value = line.split('=', 1)
result[key.strip()].append(value.strip())
return result
Dictionaries, by definition, cannot have duplicate keys. Therefore there can only ever be a single 'D' key. You could, however, store a list of values at that key if you'd like. Ex:
from collections import defaultdict
# rest of your code...
result = defaultdict(list) # Use defaultdict so that an insert to an empty key creates a new list automatically
with open(filename, "r") as grammar:
#read file
lines = grammar.readlines()
count = 0
#loop through
for line in lines:
print(line)
result[line[0]].append(line)
print (result)
return result
This will result in something like:
{"N" : ["N = N D\n", "N = D\n"], "D" : ["D = 0\n", "D = 1\n"]}

How to parse data in a variable length delimited file?

I have a text file which does not conform to standards. So I know the (end,start) positions of each column value.
Sample text file :
# # # #
Techy Inn Val NJ
Found the position of # using this code :
1 f = open('sample.txt', 'r')
2 i = 0
3 positions = []
4 for line in f:
5 if line.find('#') > 0:
6 print line
7 for each in line:
8 i += 1
9 if each == '#':
10 positions.append(i)
1 7 11 15 => Positions
So far, so good! Now, how do I fetch the values from each row based on the positions I fetched? I am trying to construct an efficient loop but any pointers are greatly appreciated guys! Thanks (:
Here's a way to read fixed width fields using regexp
>>> import re
>>> s="Techy Inn Val NJ"
>>> var1,var2,var3,var4 = re.match("(.{5}) (.{3}) (.{3}) (.{2})",s).groups()
>>> var1
'Techy'
>>> var2
'Inn'
>>> var3
'Val'
>>> var4
'NJ'
>>>
Off the top of my head:
f = open(.......)
header = f.next() # get first line
posns = [i for i, c in enumerate(header + "#") if c = '#']
for line in f:
fields = [line[posns[k]:posns[k+1]] for k in xrange(len(posns) - 1)]
Update with tested, fixed code:
import sys
f = open(sys.argv[1])
header = f.next() # get first line
print repr(header)
posns = [i for i, c in enumerate(header) if c == '#'] + [-1]
print posns
for line in f:
posns[-1] = len(line)
fields = [line[posns[k]:posns[k+1]].rstrip() for k in xrange(len(posns) - 1)]
print fields
Input file:
# # #
Foo BarBaz
123456789abcd
Debug output:
'# # #\n'
[0, 7, 10, -1]
['Foo', 'Bar', 'Baz']
['1234567', '89a', 'bcd']
Robustification notes:
This solution caters for any old rubbish (or nothing) after the last # in the header line; it doesn't need the header line to be padded out with spaces or anything else.
The OP needs to consider whether it's an error if the first character of the header is not #.
Each field has trailing whitespace stripped; this automatically removes a trailing newline from the rightmost field (and doesn't run amok if the last line is not terminated by a newline).
Final(?) update: Leapfrogging @gnibbler's suggestion to use slice(): set up the slices once before looping.
import sys
f = open(sys.argv[1])
header = f.next() # get first line
print repr(header)
posns = [i for i, c in enumerate(header) if c == '#']
print posns
slices = [slice(lo, hi) for lo, hi in zip(posns, posns[1:] + [None])]
print slices
for line in f:
fields = [line[sl].rstrip() for sl in slices]
print fields
Adapted from John Machin's answer
>>> header = "# # # #"
>>> row = "Techy Inn Val NJ"
>>> posns = [i for i, c in enumerate(header) if c == '#']
>>> [row[slice(*x)] for x in zip(posns, posns[1:]+[None])]
['Techy ', 'Inn ', 'Val ', 'NJ']
You can also write the last line like this
>>> [row[i:j] for i,j in zip(posns, posns[1:]+[None])]
For the other example you give in the comments, you just need to have the correct header
>>> header = "# # # #"
>>> row = "Techiyi Iniin Viial NiiJ"
>>> posns = [i for i, c in enumerate(header) if c == '#']
>>> [row[slice(*x)] for x in zip(posns, posns[1:]+[None])]
['Techiyi ', 'Iniin ', 'Viial ', 'NiiJ']
>>>
OK, to be a little different and to provide the generalized solution asked for in the comments, I use the header line itself instead of slices, together with a generator function. Additionally, the first columns can be treated as a comment by leaving the header blank there, and multi-character field names can be used instead of only '#'.
The downside is that one-character fields cannot have header names; they can only be marked with '#' in the header line (which, as in the previous solutions, is always considered the beginning of a field, even directly after letters in the header).
sample="""
HOTEL CAT ST DEP ##
Test line Techy Inn Val NJ FT FT
"""
data=sample.splitlines()[1:]
def fields(header, line):
    """Yield the fixed-width fields of *line* as laid out by *header*.

    A field starts at every ``#`` in *header* and at every non-space
    character that directly follows a space. If *header* begins with a
    space, the text before the first field start is treated as a comment
    column and is not yielded. Fields are returned unstripped; the last
    field runs to the end of *line*.
    """
    comment_column = header[:1] == ' '
    start = 0
    prev = ''
    for index, char in enumerate(header):
        starts_field = char == '#' or (prev == ' ' and char != ' ')
        # index 0 can never close a previous field; skipping it avoids
        # yielding a spurious empty string when the header begins with '#'.
        if starts_field and index:
            if start or not comment_column:
                yield line[start:index]
            start = index
        prev = char
    yield line[start:]
header,dataline = data
print list(fields(header,dataline))
Output
['Techy Inn ', 'Val ', 'NJ ', 'FT ', 'F', 'T']
One practical use of this is to use in parsing fixed field length data without knowing the lengths by just putting copy of dataline with all fields and no comment present and spaces replaced with something else like '_' and single character field values replaced by #.
Header from sample line:
' Techy_Inn Val NJ FT ##'
def parse(your_file):
    """Yield the fixed-width fields of every data row in *your_file*.

    The first line is a header in which each ``#`` marks the column where a
    field starts. Every following line is cut at those columns; the last
    field extends to the end of the line.

    Args:
        your_file: an iterable of text lines, such as an open file.

    Yields:
        One list of field strings per data line. Padding inside a field is
        preserved; only the line terminator is removed.
    """
    rows = iter(your_file)
    # The next() builtin works on both Python 2 and 3. The default keeps an
    # empty input from leaking StopIteration out of this generator.
    first_line = next(rows, '').rstrip()
    starts = [e for e, c in enumerate(first_line) if c == '#']
    # Pair each start with the next one; None leaves the last slice open.
    slices = [slice(lo, hi) for lo, hi in zip(starts, starts[1:] + [None])]
    for line in rows:
        # Drop the terminator so it does not end up inside the last field.
        line = line.rstrip('\r\n')
        yield [line[s] for s in slices]
f = open('sample.txt', 'r')
pos = [m.span() for m in re.finditer('#\s*', f.next())]
pos[-1] = (pos[-1][0], None)
for line in f:
print [line[i:j].strip() for i, j in pos]
f.close()
How about this?
with open('somefile', 'r') as source:
    # The header marks the first column of every field with '#'.
    line = next(source)
    starts = [i for i, c in enumerate(line) if c == '#']
    # Pair each start with the next start; None leaves the last field open.
    # Taking the '#' columns directly also accounts for the width of the
    # '#' characters, which summing the gap lengths between them did not.
    positions = list(zip(starts, starts[1:] + [None]))
    for line in source:
        line = line.rstrip('\r\n')
        # Slice with start:end -- line[start, end] indexes with a tuple and
        # raises TypeError.
        fields = [line[start:end] for start, end in positions]
Is this what you're looking for?

Categories