Completely deleting duplicate words in a text file - python

I have some words in a text file like:
joynal
abedin
rahim
mohammad
joynal
abedin
mohammad
kudds
I want to delete the duplicated names: every name that appears more than once should be removed entirely from the text file (not reduced to a single copy).
The output should be like:
rahim
kudds
I have tried some coding but it's only giving me the duplicate values as one like 1.joynal and 2.abedin.
Edited: This is the code I tried:
content = open('file.txt' , 'r').readlines()
content_set = set(content)
cleandata = open('data.txt' , 'w')
for line in content_set:
cleandata.write(line)

Use a Counter:
from collections import Counter
with open(fn) as f:
cntr=Counter(w.strip() for w in f)
Then just print the words with a count of 1:
>>> print('\n'.join(w for w,cnt in cntr.items() if cnt==1))
rahim
kudds
Or do it the 'old fashion way' with a dict as a counter:
cntr={}
with open(fn) as f:
for line in f:
k=line.strip()
cntr[k]=cntr.get(k, 0)+1
>>> print('\n'.join(w for w,cnt in cntr.items() if cnt==1))
# same
If you want to output to a new file:
with open(new_file, 'w') as f_out:
f_out.write('\n'.join(w for w,cnt in cntr.items() if cnt==1))

You can just build a list that appends a name the first time it is seen and removes it again when it occurs a second time.
with open("file1.txt", "r") as f, open("output_file.txt", "w") as g:
    # Keep only the words that occur exactly once, in first-seen order.
    output_list = []
    # Every word met so far. Without this, a plain append/remove toggle
    # would re-add a word on its 3rd, 5th, ... occurrence.
    seen = set()
    for line in f:
        word = line.strip()
        if word not in seen:
            seen.add(word)
            output_list.append(word)
        elif word in output_list:
            # Second occurrence: drop it for good.
            output_list.remove(word)
    g.write("\n".join(output_list))
print(output_list)
['rahim', 'kudds']
#in the text it is for each row one name like this:
rahim
kudds
The solution with counter is still the more elegant way imo

For completeness, if you don't care about order:
with open(fn) as f:
words = set(x.strip() for x in f)
with open(new_fn, "w") as f:
f.write("\n".join(words))
Where fn is the file you want to read from, and new_fn the file you want to write to.
In general for uniqueness think set---remembering that order is not guaranteed.

# Read the whole file; the context manager closes it even on error.
with open("yourFile.txt") as infile:
    text = infile.read()  # returns content of the file
wordList = text.split()  # creates list of every word
# dict.fromkeys keeps the first occurrence of each word, in original order
wordList = list(dict.fromkeys(wordList))  # removes duplicate elements
# join() builds the space-separated string in one pass and avoids shadowing
# the builtin `str` with a local accumulator (no trailing space is written).
with open("yourFile.txt", "w") as outfile:
    outfile.write(" ".join(wordList))  # writes the new string in the file

Related

Not all duplicates are deleted from a text file in Python

I am new to Python. I am trying to delete duplicates from my text file by doing the following:
line_seen = set()
f = open('a.txt', 'r')
w = open('out.txt', 'w')
for i in f:
if i not in line_seen:
w.write(i)
line_seen.add(i)
f.close()
w.close()
In the initial file I had
hello
world
python
world
hello
And in output file I got
hello
world
python
hello
So it did not remove the last duplicate. Can anyone help me to understand why it happened and how could I fix it?
The first line probably contains 'hello\n' - the last line contains only 'hello' - they are not the same.
Use
line_seen = set()
with open('a.txt', 'r') as f, open('out.txt', 'w') as w:
for i in f:
i = i.strip() # remove the \n from line
if i not in line_seen:
w.write(i + "\n")
line_seen.add(i)
The main problem is the line-break character ("\n"), which appears at the end of every line except the last. You can use a combination of the set, map and join functions, as follows:
# Strip every line, collapse duplicates with a set (order is not preserved),
# and join the survivors with newlines. The with-block closes both files so
# the output is guaranteed to be flushed to disk.
with open('a.txt', 'r') as f, open('out.txt', 'w') as w:
    w.write("\n".join(set(map(str.strip, f))))
out.txt
python
world
hello
If you want to stick to your previous approach you can use:
# Copy a.txt to out.txt, keeping only the first occurrence of each line.
line_seen = set()
with open('a.txt', 'r') as f, open('out.txt', 'w') as w:
    for i in f:
        # Normalise the line so 'hello\n' and a final 'hello' compare equal.
        i = i.strip()
        if i not in line_seen:
            # strip() removed the line terminator, so put one back;
            # otherwise every word ends up on a single output line.
            w.write(i + "\n")
            line_seen.add(i)
Most likely you didn't end the last line with a newline. The line already seen is 'hello\n'; the last one is just 'hello'.
Either fix the input file or strip() each line i after reading it.
# Since we check if the line exists in lines, we can use a list instead of
# a set to preserve order
lines = []
with open('a.txt', 'r') as infile, open('out.txt', 'w') as outfile:
    for line in infile:
        # Strip whitespace *before* the membership test: `lines` only holds
        # stripped text, so a raw 'hello\n' would never be found in it and
        # no duplicate would ever be dropped.
        line = line.strip()
        if line not in lines:
            lines.append(line)
    for line in lines:
        # Add the line terminator back
        outfile.write("{}\n".format(line))

Read text file to list in python

I want to create a text file which contains positive/negative numbers separated by ','.
i want to read this file and put it in data = []. i have written the code below and i think that it works well.
I want to ask if you guys know a better way to do it or if is it well written
thanks all
#!/usr/bin/python
if __name__ == "__main__":
#create new file
fo = open("foo.txt", "w")
fo.write( "111,-222,-333");
fo.close()
#read the file
fo = open("foo.txt", "r")
tmp= []
data = []
count = 0
tmp = fo.read() #read all the file
for i in range(len(tmp)): #len is 11 in this case
if (tmp[i] != ','):
count+=1
else:
data.append(tmp[i-count : i])
count = 0
data.append(tmp[i+1-count : i+1])#append the last -333
print data
fo.close()
You can use split method with a comma as a separator:
# Collect every comma-separated field of foo.txt into one flat list.
data = []
with open('foo.txt') as fin:
    for line in fin:
        # Strip the line terminator first so the last field is '-333',
        # not '-333\n'.
        data.extend(line.strip().split(','))
Instead of looping through, you can just use split:
#!/usr/bin/python
if __name__ == "__main__":
#create new file
fo = open("foo.txt", "w")
fo.write( "111,-222,-333");
fo.close()
#read the file
with open('foo.txt', 'r') as file:
data = [line.split(',') for line in file.readlines()]
print(data)
Note that this gives back a list of lists, with each list being from a separate line. In your example you only have one line. If your files will always only have a single line, you can just take the first element, data[0]
To get the whole file content (positive and negative numbers) into a list, you can replace the newlines with commas and then split on ',':
file_obj = fo.read()#read your content into string
list_numbers = file_obj.replace('\n',',').split(',')#split on ',' and newline
print list_numbers

Append text in every line of txt in Python

I've a text file with many lines. I need to append to every line a text in Python.
Here an example:
Text before:
car
house
blog
Text modified:
car: [word]
house: [word]
blog: [word]
If you just want to append word on each line this works fine
file_name = 'YOUR_FILE_NAME.txt' #Put here your file
with open(file_name,'r') as fnr:
text = fnr.readlines()
text = "".join([line.strip() + ': [word]\n' for line in text])
with open(file_name,'w') as fnw:
fnw.write(text)
But there are many ways to do it
Read the text in a list:
f = open("filename.dat")
lines = f.readlines()
f.close()
append text:
new_lines = [x.strip() + "text_to_append" for x in lines]
# removes newlines from the elements of the list, appends
# the text for each element of the list in a list comprehension
Edit:
For completeness, a more Pythonic solution that writes the text to a new file:
with open('filename.dat') as f:
lines = f.readlines()
new_lines = [''.join([x.strip(), text_to_append, '\n']) for x in lines]
with open('filename_new.dat', 'w') as f:
f.writelines(new_lines)

Can't get this function to work in python

The task is to write the unique_file function which takes an input filename and an output filename as parameters. Your function should read contents from the input file and create a list of unique words --> Basically this means that no word may be written to the output file more than once. The code I used is:
def unique_file(input_filename, output_filename):
file = open(input_filename,"r")
contents = file.read()
word_list = contents.split()
output_file = open(output_filename,'w+')
for word in word_list:
if word not in output_file:
output_file.write(word + '\n')
file.close()
output_file.close()
print('Done')
But this function just copies everything from the input file to the output file. So I get words like 'and' 'I' that occur more than once in the output file.
Please help.
You can't really check if word not in output_file: like that. I would suggest you use a set to get unique words:
def unique_file(input_filename, output_filename):
    """Write every distinct word of the input file to the output file.

    Words are the whitespace-separated tokens of the input. Each distinct
    word is written once, on its own line. The order of the output is
    arbitrary because the words pass through a set.
    """
    with open(input_filename) as file:
        contents = file.read()
    word_set = set(contents.split())
    # "w" is sufficient: the file is only written, never read back.
    with open(output_filename, "w") as output_file:
        output_file.writelines(word + '\n' for word in word_set)
    print("Done")
Note the use of with to handle files - see the last paragraph of the docs.
That's because you cannot ask if a file contains a word like that. You'll have to create a list of words you're adding. EDIT: You should actually make seen a set(). Membership checking is less costly than with the list.
def unique_file(input_filename, output_filename):
    """Write every distinct word of the input file to the output file.

    Words are the whitespace-separated tokens of the input. Each word is
    written once, on its own line, in the order of its first appearance.
    """
    # Context managers close both files even if reading or writing fails.
    with open(input_filename, "r") as file:
        word_list = file.read().split()
    seen = set()  # words already written; set membership tests are O(1)
    with open(output_filename, 'w') as output_file:
        for word in word_list:
            if word not in seen:
                output_file.write(word + '\n')
                seen.add(word)
    print('Done')
If you don't need to worry about the order of the words you can just use the builtin set() which is a container that does not allow duplicates. Something like this should work:
def unique_file(input_filename, output_filename):
    """Write every distinct word of the input file to the output file.

    The task is about unique *words*, so the input is split on whitespace
    rather than compared line by line. Comparing whole lines would treat
    "a b" and "b a" as different entries, and a final line with no
    terminator would never match its earlier copy. Each word is written on
    its own line; the order is arbitrary because a set is used.
    """
    with open(input_filename, "r") as inp, open(output_filename, "w") as out:
        out.writelines(word + "\n" for word in set(inp.read().split()))

Read lines from a text file, reverse and save in a new text file

So far I have this code:
f = open("text.txt", "rb")
s = f.read()
f.close()
f = open("newtext.txt", "wb")
f.write(s[::-1])
f.close()
The text in the original file is:
This is Line 1
This is Line 2
This is Line 3
This is Line 4
And when it reverses it and saves it the new file looks like this:
4 eniL si sihT 3 eniL si sihT 2 eniL si sihT 1 eniL si sihT
When I want it to look like this:
This is line 4
This is line 3
This is line 2
This is line 1
How can I do this?
You can do something like:
# Write the lines of test.txt to output.txt in reverse order.
with open('test.txt') as f, open('output.txt', 'w') as fout:
    lines = f.readlines()
    # If the file does not end with a newline, the last line would be glued
    # to the one written after it once the order is reversed.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'
    fout.writelines(reversed(lines))
read() returns the whole file in a single string. That's why when you reverse it, it reverses the lines themselves too, not just their order. You want to reverse only the order of lines, you need to use readlines() to get a list of them (as a first approximation, it is equivalent to s = f.read().split('\n')):
s = f.readlines()
...
f.writelines(s[::-1])
# or f.writelines(reversed(s))
# Read all lines, reverse their order in place, and write them back out.
with open("text.txt") as f:
    s = f.readlines()
s.reverse()
with open("newtext.txt", "w") as f:
    # Each item already ends with its own newline, so write it verbatim;
    # a print statement would append a second newline after every line.
    f.writelines(s)
The method file.read() returns a string of the whole file, not the lines.
And since s is a string of the whole file, you're reversing the letters, not the lines!
First, you'll have to split it to lines:
s = f.read()
lines = s.split('\n')
Or:
lines = f.readlines()
Then your slicing approach works once it is applied to the list of lines instead of the whole string:
# write() accepts a single string, not a list; writelines() takes the list.
# (If lines came from split('\n') they carry no terminators, so use
# f.write('\n'.join(lines[::-1])) instead.)
f.writelines(lines[::-1])
Hope this helps!
There are a couple of steps here. First we want to get all the lines from the first file, and then we want to write them in reversed order to the new file. The code for doing this is as follows
lines = []
with open('text.txt') as f:
lines = f.readlines()
with open('newtext.txt', 'w') as f:
for line in reversed(lines):
f.write(line)
Firstly, we initialize a variable to hold our lines. Then we read all the lines from the 'text.txt' file.
Secondly, we open our output file. Here we loop through the lines in reversed order, writing them to the output file as we go.
A sample using list so it will be much easier:
I'm sure there are more elegant answers, but this way is easy to understand.
# Reverse the order of the lines of c:\test.txt in place.
with open(r"c:\test.txt") as f:
    s = f.read()
# Iterating over the string itself would yield single characters, so split
# it into lines first and give each one its terminator back.
rowList = [value + "\n" for value in s.splitlines()]
rowList.reverse()
with open(r"c:\test.txt", "w") as f:
    f.writelines(rowList)
You have to work line by line.
# Text mode: read() then returns str, which is what splitlines() expects.
with open("text.txt") as f:
    s = f.read()
# splitlines() does not produce an empty trailing element for the final
# newline, so the reversed output does not start with a blank line.
lines = s.splitlines()
with open("newtext.txt", "w") as f:
    f.writelines(line + '\n' for line in lines[::-1])
Use it like this if your OS uses \n to break lines
# Text mode: read() then returns str, so it can be split on "\n".
with open("text.txt") as f:
    s = f.read()
with open("newtext.txt", "w") as f:
    # join() is a method of the separator string, not of the reversed
    # iterator, so the separator comes first.
    f.write("\n".join(reversed(s.split("\n"))))
The main thing here is "\n".join(reversed(s.split("\n"))).
It does the following:
Split your string by line breaks - \n,
resulting an array
reverses the array
merges the array back with linebreaks \n to a string
Here the states:
string: line1 \n line2 \n line3
array: ["line1", "line2", "line3"]
array: ["line3", "line2", "line1"]
string: line3 \n line2 \n line1
If your input file is too big to fit in memory, here is an efficient way to reverse it:
Split input file into partial files (still in original order).
Read each partial file from last to first, reverse it and append to output file.
Implementation:
import os
from itertools import islice

input_path = "mylog.txt"
output_path = input_path + ".rev"

# Phase 1: split the input into numbered partial files of at most 100000
# lines each, still in original order.
i = 0  # stays 0 for an empty input, so phase 2 simply writes nothing
with open(input_path) as fi:
    for i, sli in enumerate(iter(lambda: list(islice(fi, 100000)), []), 1):
        with open(f"{output_path}.{i:05}", "w") as fo:
            fo.writelines(sli)

# Phase 2: walk the partial files from last to first, reversing each one
# and appending it to the output; each partial file is deleted once read.
with open(output_path, "w") as fo:
    for file_index in range(i, 0, -1):
        path = f"{output_path}.{file_index:05}"
        with open(path) as fi:
            lines = fi.readlines()
        os.remove(path)
        # A final input line without a newline would otherwise be glued to
        # the line written after it.
        if lines and not lines[-1].endswith("\n"):
            lines[-1] += "\n"
        fo.writelines(reversed(lines))

Categories