searching for a string in a file using python faliure - python

I am using this code to search for emails in a particular file and write them into a another file. I have used 'in' operator to make sure that the email are not duplicated.
But this code does not get executed after the for line in f: line.
Can any one point out the mistake i have made here?
tempPath = input("Please Enter the Path of the File\n")
temp_file = open(tempPath, "r")
fileContent = temp_file.read()
temp_file.close()
pattern_normal = re.compile("[-a-zA-Z0-9._]+#[-a-zA-Z0-9_]+.[a-zA-Z0-9_.]+")
pattern_normal_list = pattern_normal.findall(str(fileContent))
with open('emails_file.txt', 'a+') as f:
for item in pattern_normal_list:
for line in f:
if line in item:
print("duplicate")
else:
print("%s" %item)
f.write("%s" %item)
f.write('\n')

New solution:
tempPath = input("Please Enter the Path of the File\n")
temp_file = open(tempPath, "r")
fileContent = temp_file.read()
temp_file.close()
pattern_normal = re.compile("[-a-zA-Z0-9._]+#[-a-zA-Z0-9_]+.[a-zA-Z0-9_.]+")
addresses = list(set(pattern_normal.findall(str(fileContent))))
with open('new_emails.txt', 'a+') as f:
f.write('\n'.join(addresses))
I think your logic was wrong, this works:
addresses = ['test#wham.com', 'heffa#wham.com']
with open('emails_file.txt', 'a+') as f:
fdata = f.read()
for mail in addresses:
if not mail in fdata:
f.write(mail + '\n')
Without reading to much into your code,
it looks like youre looping line by line, checking if the address you've also looping through exists in the line, if it doesn't you append your e-mail to it? But in 99% of a 100 lines the address will not be in the line, hence you'll get an unwanted addition.
Output of my code snippet:
[Torxed#faparch ~]$ cat emails_file.txt
test#wham.com
Torxed#whoever.com
[Torxed#faparch ~]$ python test.py
[Torxed#faparch ~]$ cat emails_file.txt
test#wham.com
Torxed#whoever.com
heffa#wham.com
[Torxed#faparch ~]$

for line in f:
Shouldn't you first call f.readlines()?
lines = f.readlines()
for line in lines:
Check this.

Related

Compare each line and remove the repeated/same line having the same numbers in python [duplicate]

I have a file with one column. How to delete repeated lines in a file?
On Unix/Linux, use the uniq command, as per David Locke's answer, or sort, as per William Pursell's comment.
If you need a Python script:
lines_seen = set() # holds lines already seen
outfile = open(outfilename, "w")
for line in open(infilename, "r"):
if line not in lines_seen: # not a duplicate
outfile.write(line)
lines_seen.add(line)
outfile.close()
Update: The sort/uniq combination will remove duplicates but return a file with the lines sorted, which may or may not be what you want. The Python script above won't reorder lines, but just drop duplicates. Of course, to get the script above to sort as well, just leave out the outfile.write(line) and instead, immediately after the loop, do outfile.writelines(sorted(lines_seen)).
If you're on *nix, try running the following command:
sort <file name> | uniq
uniqlines = set(open('/tmp/foo').readlines())
this will give you the list of unique lines.
writing that back to some file would be as easy as:
bar = open('/tmp/bar', 'w').writelines(uniqlines)
bar.close()
You can do:
import os
os.system("awk '!x[$0]++' /path/to/file > /path/to/rem-dups")
Here You are using bash into python :)
You have also other way:
with open('/tmp/result.txt') as result:
uniqlines = set(result.readlines())
with open('/tmp/rmdup.txt', 'w') as rmdup:
rmdup.writelines(set(uniqlines))
get all your lines in the list and make a set of lines and you are done.
for example,
>>> x = ["line1","line2","line3","line2","line1"]
>>> list(set(x))
['line3', 'line2', 'line1']
>>>
If you need to preserve the ordering of lines - as set is unordered collection - try this:
y = []
for l in x:
if l not in y:
y.append(l)
and write the content back to the file.
Its a rehash of whats already been said here - here what I use.
import optparse
def removeDups(inputfile, outputfile):
lines=open(inputfile, 'r').readlines()
lines_set = set(lines)
out=open(outputfile, 'w')
for line in lines_set:
out.write(line)
def main():
parser = optparse.OptionParser('usage %prog ' +\
'-i <inputfile> -o <outputfile>')
parser.add_option('-i', dest='inputfile', type='string',
help='specify your input file')
parser.add_option('-o', dest='outputfile', type='string',
help='specify your output file')
(options, args) = parser.parse_args()
inputfile = options.inputfile
outputfile = options.outputfile
if (inputfile == None) or (outputfile == None):
print parser.usage
exit(1)
else:
removeDups(inputfile, outputfile)
if __name__ == '__main__':
main()
Python One liners :
python -c "import sys; lines = sys.stdin.readlines(); print ''.join(sorted(set(lines)))" < InputFile > OutputFile
adding to #David Locke's answer, with *nix systems you can run
sort -u messy_file.txt > clean_file.txt
which will create clean_file.txt removing duplicates in alphabetical order.
Look at script I created to remove duplicate emails from text files. Hope this helps!
# function to remove duplicate emails
def remove_duplicate():
# opens emails.txt in r mode as one long string and assigns to var
emails = open('emails.txt', 'r').read()
# .split() removes excess whitespaces from str, return str as list
emails = emails.split()
# empty list to store non-duplicate e-mails
clean_list = []
# for loop to append non-duplicate emails to clean list
for email in emails:
if email not in clean_list:
clean_list.append(email)
return clean_list
# close emails.txt file
emails.close()
# assigns no_duplicate_emails.txt to variable below
no_duplicate_emails = open('no_duplicate_emails.txt', 'w')
# function to convert clean_list 'list' elements in to strings
for email in remove_duplicate():
# .strip() method to remove commas
email = email.strip(',')
no_duplicate_emails.write(f"E-mail: {email}\n")
# close no_duplicate_emails.txt file
no_duplicate_emails.close()
If anyone is looking for a solution that uses a hashing and is a little more flashy, this is what I currently use:
def remove_duplicate_lines(input_path, output_path):
if os.path.isfile(output_path):
raise OSError('File at {} (output file location) exists.'.format(output_path))
with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file:
seen_lines = set()
def add_line(line):
seen_lines.add(line)
return line
output_file.writelines((add_line(line) for line in input_file
if line not in seen_lines))
edit it within the same file
lines_seen = set() # holds lines already seen
with open("file.txt", "r+") as f:
d = f.readlines()
f.seek(0)
for i in d:
if i not in lines_seen:
f.write(i)
lines_seen.add(i)
f.truncate()
Readable and Concise
with open('sample.txt') as fl:
content = fl.read().split('\n')
content = set([line for line in content if line != ''])
content = '\n'.join(content)
with open('sample.txt', 'w') as fl:
fl.writelines(content)
Here is my solution
if __name__ == '__main__':
f = open('temp.txt','w+')
flag = False
with open('file.txt') as fp:
for line in fp:
for temp in f:
if temp == line:
flag = True
print('Found Match')
break
if flag == False:
f.write(line)
elif flag == True:
flag = False
f.seek(0)
f.close()
cat <filename> | grep '^[a-zA-Z]+$' | sort -u > outfile.txt
To filter and remove duplicate values from the file.
Here is my solution
d = input("your file:") #write your file name here
file1 = open(d, mode="r")
file2 = open('file2.txt', mode='w')
file2 = open('file2.txt', mode='a')
file1row = file1.readline()
while file1row != "" :
file2 = open('file2.txt', mode='a')
file2read = open('file2.txt', mode='r')
file2r = file2read.read().strip()
if file1row not in file2r:
file2.write(file1row)
file1row = file1.readline()
file2read.close()
file2.close

Update Txt file in python

I have a text file with names and results. If the name already exists, only the result should be updated. I tried with this code and many others, but without success.
The content of the text file looks like this:
Ann, 200
Buddy, 10
Mark, 180
Luis, 100
PS: I started 2 weeks ago, so don't judge my bad code.
from os import rename
def updatescore(username, score):
file = open("mynewscores.txt", "r")
new_file = open("mynewscores2.txt", "w")
for line in file:
if username in line:
splitted = line.split(",")
splitted[1] = score
joined = "".join(splitted)
new_file.write(joined)
new_file.write(line)
file.close()
new_file.close()
maks = updatescore("Buddy", "200")
print(maks)
I would suggest reading the csv in as a dictionary and just update the one value.
import csv
d = {}
with open('test.txt', newline='') as f:
reader = csv.reader(f)
for row in reader:
key,value = row
d[key] = value
d['Buddy'] = 200
with open('test2.txt','w', newline='') as f:
writer = csv.writer(f)
for key, value in d.items():
writer.writerow([key,value])
So what needed to be different mostly is that when in your for loop you said to put line in the new text file, but it's never said to Not do that when wanting to replace a score, all that was needed was an else statement below the if statement:
from os import rename
def updatescore(username, score):
file = open("mynewscores.txt", "r")
new_file = open("mynewscores2.txt", "w")
for line in file:
if username in line:
splitted = line.split(",")
splitted[1] = score
print (splitted)
joined = ", ".join(splitted)
print(joined)
new_file.write(joined+'\n')
else:
new_file.write(line)
file.close()
new_file.close()
maks = updatescore("Buddy", "200")
print(maks)
You can try this, add the username if it doesn't exist, else update it.
def updatescore(username, score):
with open("mynewscores.txt", "r+") as file:
line = file.readline()
while line:
if username in line:
file.seek(file.tell() - len(line))
file.write(f"{username}, {score}")
return
line = file.readline()
file.write(f"\n{username}, {score}")
maks = updatescore("Buddy", "300")
maks = updatescore("Mario", "50")
You have new_file.write(joined) inside the if block, which is good, but you also have new_file.write(line) outside the if block.
Outside the if block, it's putting both the original and fixed lines into the file, and since you're using write() instead of writelines() both versions get put on the same line: there's no \n newline character.
You also want to add the comma: joined = ','.join(splitted) since you took the commas out when you used line.split(',')
I got the result you seem to be expecting when I put in both these fixes.
Next time you should include what you are expecting for output and what you're giving as input. It might be helpful if you also include what Error or result you actually got.
Welcome to Python BTW
Removed issues from your code:
def updatescore(username, score):
file = open("mynewscores.txt", "r")
new_file = open("mynewscores2.txt", "w")
for line in file.readlines():
splitted = line.split(",")
if username == splitted[0].strip():
splitted[1] = str(score)
joined = ",".join(splitted)
new_file.write(joined)
else:
new_file.write(line)
file.close()
new_file.close()
I believe this is the simplest/most straightforward way of doing things.
Code:
import csv
def update_score(name: str, score: int) -> None:
with open('../resources/name_data.csv', newline='') as file_obj:
reader = csv.reader(file_obj)
data_dict = dict(curr_row for curr_row in reader)
data_dict[name] = score
with open('../out/name_data_out.csv', 'w', newline='') as file_obj:
writer = csv.writer(file_obj)
writer.writerows(data_dict.items())
update_score('Buddy', 200)
Input file:
Ann,200
Buddy,10
Mark,180
Luis,100
Output file:
Ann,200
Buddy,200
Mark,180
Luis,100

Search for first occurrence of a string and insert string on line above with python

I usually work in bash so i'm very new to this frightening world of python.
I am attempting to search a file for a string then insert text above the "First occurrence" of that string with empty line between.
The file to be edited would look like this:
Name:
Billy
Thorton
Billy
Thorton
I am trying to insert "Bob" above "Thorton" with the empty lines between like this:
Name:
Billy
Bob
Thorton
Billy
Thorton
This is the Python i have so far.
contents = "Bob"
f = open("file", "w")
contents = "".join(contents)
f.write(contents)
f.close()
This does not search for the string and it replaces the whole file.
A working example in bash would be:
sed -i '0,/Thorton/s//Bob\n\n&/' file
A common way to do so in Python would be to open the file, iterate over it line by line and prepare the results, then write the results to the file.
res = ""
with open("test.txt", "r") as f:
data = f.readlines() # Read the file line by line
found = False
for line in data:
if "Thorton" in line and not found:
res += "Bob\n\n" # Insert Bob if needed
found = True
res += line # Insert the line we just read
with open("test.txt", "w") as f:
f.write(res) # Write the answer in the same file
You could use str.split() to get each item into a list then use list.index() to get the position of "Thorton" to insert from then str.join() to get them back into writable form:
with open('filename.txt', 'r') as infile:
data = infile.read().split()
data.insert(data.index('Thorton'), 'Bob')
with open('filename.txt', 'w') as outfile:
outfile.write('\n\n'.join(data))
you could do
searchedName = "Thorton"
addedName= "Bob"
f = open("file", "w")
content = f.readlines()
index = content.index(searchedName + '\n')
contents = content.insert(index , addedName + '\n')
contents = "".join(contents)
f.write(contents)
f.close()

How to search and replace text in a file?

How do I search and replace text in a file using Python 3?
Here is my code:
import os
import sys
import fileinput
print ("Text to search for:")
textToSearch = input( "> " )
print ("Text to replace it with:")
textToReplace = input( "> " )
print ("File to perform Search-Replace on:")
fileToSearch = input( "> " )
#fileToSearch = 'D:\dummy1.txt'
tempFile = open( fileToSearch, 'r+' )
for line in fileinput.input( fileToSearch ):
if textToSearch in line :
print('Match Found')
else:
print('Match Not Found!!')
tempFile.write( line.replace( textToSearch, textToReplace ) )
tempFile.close()
input( '\n\n Press Enter to exit...' )
Input file:
hi this is abcd hi this is abcd
This is dummy text file.
This is how search and replace works abcd
When I search and replace 'ram' by 'abcd' in above input file, it works as a charm. But when I do it vice-versa i.e. replacing 'abcd' by 'ram', some junk characters are left at the end.
Replacing 'abcd' by 'ram'
hi this is ram hi this is ram
This is dummy text file.
This is how search and replace works rambcd
As pointed out by michaelb958, you cannot replace in place with data of a different length because this will put the rest of the sections out of place. I disagree with the other posters suggesting you read from one file and write to another. Instead, I would read the file into memory, fix the data up, and then write it out to the same file in a separate step.
# Read in the file
with open('file.txt', 'r') as file :
filedata = file.read()
# Replace the target string
filedata = filedata.replace('abcd', 'ram')
# Write the file out again
with open('file.txt', 'w') as file:
file.write(filedata)
Unless you've got a massive file to work with which is too big to load into memory in one go, or you are concerned about potential data loss if the process is interrupted during the second step in which you write data to the file.
fileinput already supports inplace editing. It redirects stdout to the file in this case:
#!/usr/bin/env python3
import fileinput
with fileinput.FileInput(filename, inplace=True, backup='.bak') as file:
for line in file:
print(line.replace(text_to_search, replacement_text), end='')
As Jack Aidley had posted and J.F. Sebastian pointed out, this code will not work:
# Read in the file
filedata = None
with file = open('file.txt', 'r') :
filedata = file.read()
# Replace the target string
filedata.replace('ram', 'abcd')
# Write the file out again
with file = open('file.txt', 'w') :
file.write(filedata)`
But this code WILL work (I've tested it):
f = open(filein,'r')
filedata = f.read()
f.close()
newdata = filedata.replace("old data","new data")
f = open(fileout,'w')
f.write(newdata)
f.close()
Using this method, filein and fileout can be the same file, because Python 3.3 will overwrite the file upon opening for write.
You can do the replacement like this
f1 = open('file1.txt', 'r')
f2 = open('file2.txt', 'w')
for line in f1:
f2.write(line.replace('old_text', 'new_text'))
f1.close()
f2.close()
You can also use pathlib.
from pathlib2 import Path
path = Path(file_to_search)
text = path.read_text()
text = text.replace(text_to_search, replacement_text)
path.write_text(text)
(pip install python-util)
from pyutil import filereplace
filereplace("somefile.txt","abcd","ram")
Will replace all occurences of "abcd" with "ram".
The function also supports regex by specifying regex=True
from pyutil import filereplace
filereplace("somefile.txt","\\w+","ram",regex=True)
Disclaimer: I'm the author (https://github.com/MisterL2/python-util)
Open the file in read mode. Read the file in string format. Replace the text as intended. Close the file. Again open the file in write mode. Finally, write the replaced text to the same file.
try:
with open("file_name", "r+") as text_file:
texts = text_file.read()
texts = texts.replace("to_replace", "replace_string")
with open(file_name, "w") as text_file:
text_file.write(texts)
except FileNotFoundError as f:
print("Could not find the file you are trying to read.")
Late answer, but this is what I use to find and replace inside a text file:
with open("test.txt") as r:
text = r.read().replace("THIS", "THAT")
with open("test.txt", "w") as w:
w.write(text)
DEMO
With a single with block, you can search and replace your text:
with open('file.txt','r+') as f:
filedata = f.read()
filedata = filedata.replace('abc','xyz')
f.truncate(0)
f.write(filedata)
Your problem stems from reading from and writing to the same file. Rather than opening fileToSearch for writing, open an actual temporary file and then after you're done and have closed tempFile, use os.rename to move the new file over fileToSearch.
My variant, one word at a time on the entire file.
I read it into memory.
def replace_word(infile,old_word,new_word):
if not os.path.isfile(infile):
print ("Error on replace_word, not a regular file: "+infile)
sys.exit(1)
f1=open(infile,'r').read()
f2=open(infile,'w')
m=f1.replace(old_word,new_word)
f2.write(m)
Using re.subn it is possible to have more control on the substitution process, such as word splitted over two lines, case-(in)sensitive match. Further, it returns the amount of matches which can be used to avoid waste of resources if the string is not found.
import re
file = # path to file
# they can be also raw string and regex
textToSearch = r'Ha.*O' # here an example with a regex
textToReplace = 'hallo'
# read and replace
with open(file, 'r') as fd:
# sample case-insensitive find-and-replace
text, counter = re.subn(textToSearch, textToReplace, fd.read(), re.I)
# check if there is at least a match
if counter > 0:
# edit the file
with open(file, 'w') as fd:
fd.write(text)
# summary result
print(f'{counter} occurence of "{textToSearch}" were replaced with "{textToReplace}".')
Some regex:
add the re.I flag, short form of re.IGNORECASE, for a case-insensitive match
for multi-line replacement re.subn(r'\n*'.join(textToSearch), textToReplace, fd.read()), depending on the data also '\n{,1}'. Notice that for this case textToSearch must be a pure string, not a regex!
Besides the answers already mentioned, here is an explanation of why you have some random characters at the end:
You are opening the file in r+ mode, not w mode. The key difference is that w mode clears the contents of the file as soon as you open it, whereas r+ doesn't.
This means that if your file content is "123456789" and you write "www" to it, you get "www456789". It overwrites the characters with the new input, but leaves any remaining input untouched.
You can clear a section of the file contents by using truncate(<startPosition>), but you are probably best off saving the updated file content to a string first, then doing truncate(0) and writing it all at once.
Or you can use my library :D
I got the same issue. The problem is that when you load a .txt in a variable you use it like an array of string while it's an array of character.
swapString = []
with open(filepath) as f:
s = f.read()
for each in s:
swapString.append(str(each).replace('this','that'))
s = swapString
print(s)
I tried this and used readlines instead of read
with open('dummy.txt','r') as file:
list = file.readlines()
print(f'before removal {list}')
for i in list[:]:
list.remove(i)
print(f'After removal {list}')
with open('dummy.txt','w+') as f:
for i in list:
f.write(i)
you can use sed or awk or grep in python (with some restrictions). Here is a very simple example. It changes banana to bananatoothpaste in the file. You can edit and use it. ( I tested it worked...note: if you are testing under windows you should install "sed" command and set the path first)
import os
file="a.txt"
oldtext="Banana"
newtext=" BananaToothpaste"
os.system('sed -i "s/{}/{}/g" {}'.format(oldtext,newtext,file))
#print(f'sed -i "s/{oldtext}/{newtext}/g" {file}')
print('This command was applied: sed -i "s/{}/{}/g" {}'.format(oldtext,newtext,file))
if you want to see results on the file directly apply: "type" for windows/ "cat" for linux:
####FOR WINDOWS:
os.popen("type " + file).read()
####FOR LINUX:
os.popen("cat " + file).read()
I have done this:
#!/usr/bin/env python3
import fileinput
import os
Dir = input ("Source directory: ")
os.chdir(Dir)
Filelist = os.listdir()
print('File list: ',Filelist)
NomeFile = input ("Insert file name: ")
CarOr = input ("Text to search: ")
CarNew = input ("New text: ")
with fileinput.FileInput(NomeFile, inplace=True, backup='.bak') as file:
for line in file:
print(line.replace(CarOr, CarNew), end='')
file.close ()
I modified Jayram Singh's post slightly in order to replace every instance of a '!' character to a number which I wanted to increment with each instance. Thought it might be helpful to someone who wanted to modify a character that occurred more than once per line and wanted to iterate. Hope that helps someone. PS- I'm very new at coding so apologies if my post is inappropriate in any way, but this worked for me.
f1 = open('file1.txt', 'r')
f2 = open('file2.txt', 'w')
n = 1
# if word=='!'replace w/ [n] & increment n; else append same word to
# file2
for line in f1:
for word in line:
if word == '!':
f2.write(word.replace('!', f'[{n}]'))
n += 1
else:
f2.write(word)
f1.close()
f2.close()
def word_replace(filename,old,new):
c=0
with open(filename,'r+',encoding ='utf-8') as f:
a=f.read()
b=a.split()
for i in range(0,len(b)):
if b[i]==old:
c=c+1
old=old.center(len(old)+2)
new=new.center(len(new)+2)
d=a.replace(old,new,c)
f.truncate(0)
f.seek(0)
f.write(d)
print('All words have been replaced!!!')
I have worked this out as an exercise of a course: open file, find and replace string and write to a new file.
class Letter:
def __init__(self):
with open("./Input/Names/invited_names.txt", "r") as file:
# read the list of names
list_names = [line.rstrip() for line in file]
with open("./Input/Letters/starting_letter.docx", "r") as f:
# read letter
file_source = f.read()
for name in list_names:
with open(f"./Output/ReadyToSend/LetterTo{name}.docx", "w") as f:
# replace [name] with name of the list in the file
replace_string = file_source.replace('[name]', name)
# write to a new file
f.write(replace_string)
brief = Letter()
Like so:
def find_and_replace(file, word, replacement):
with open(file, 'r+') as f:
text = f.read()
f.write(text.replace(word, replacement))
def findReplace(find, replace):
import os
src = os.path.join(os.getcwd(), os.pardir)
for path, dirs, files in os.walk(os.path.abspath(src)):
for name in files:
if name.endswith('.py'):
filepath = os.path.join(path, name)
with open(filepath) as f:
s = f.read()
s = s.replace(find, replace)
with open(filepath, "w") as f:
f.write(s)

How might I remove duplicate lines from a file?

I have a file with one column. How to delete repeated lines in a file?
On Unix/Linux, use the uniq command, as per David Locke's answer, or sort, as per William Pursell's comment.
If you need a Python script:
lines_seen = set() # holds lines already seen
outfile = open(outfilename, "w")
for line in open(infilename, "r"):
if line not in lines_seen: # not a duplicate
outfile.write(line)
lines_seen.add(line)
outfile.close()
Update: The sort/uniq combination will remove duplicates but return a file with the lines sorted, which may or may not be what you want. The Python script above won't reorder lines, but just drop duplicates. Of course, to get the script above to sort as well, just leave out the outfile.write(line) and instead, immediately after the loop, do outfile.writelines(sorted(lines_seen)).
If you're on *nix, try running the following command:
sort <file name> | uniq
uniqlines = set(open('/tmp/foo').readlines())
this will give you the list of unique lines.
writing that back to some file would be as easy as:
bar = open('/tmp/bar', 'w').writelines(uniqlines)
bar.close()
You can do:
import os
os.system("awk '!x[$0]++' /path/to/file > /path/to/rem-dups")
Here You are using bash into python :)
You have also other way:
with open('/tmp/result.txt') as result:
uniqlines = set(result.readlines())
with open('/tmp/rmdup.txt', 'w') as rmdup:
rmdup.writelines(set(uniqlines))
get all your lines in the list and make a set of lines and you are done.
for example,
>>> x = ["line1","line2","line3","line2","line1"]
>>> list(set(x))
['line3', 'line2', 'line1']
>>>
If you need to preserve the ordering of lines - as set is unordered collection - try this:
y = []
for l in x:
if l not in y:
y.append(l)
and write the content back to the file.
Its a rehash of whats already been said here - here what I use.
import optparse
def removeDups(inputfile, outputfile):
lines=open(inputfile, 'r').readlines()
lines_set = set(lines)
out=open(outputfile, 'w')
for line in lines_set:
out.write(line)
def main():
parser = optparse.OptionParser('usage %prog ' +\
'-i <inputfile> -o <outputfile>')
parser.add_option('-i', dest='inputfile', type='string',
help='specify your input file')
parser.add_option('-o', dest='outputfile', type='string',
help='specify your output file')
(options, args) = parser.parse_args()
inputfile = options.inputfile
outputfile = options.outputfile
if (inputfile == None) or (outputfile == None):
print parser.usage
exit(1)
else:
removeDups(inputfile, outputfile)
if __name__ == '__main__':
main()
Python One liners :
python -c "import sys; lines = sys.stdin.readlines(); print ''.join(sorted(set(lines)))" < InputFile > OutputFile
adding to #David Locke's answer, with *nix systems you can run
sort -u messy_file.txt > clean_file.txt
which will create clean_file.txt removing duplicates in alphabetical order.
Look at script I created to remove duplicate emails from text files. Hope this helps!
# function to remove duplicate emails
def remove_duplicate():
# opens emails.txt in r mode as one long string and assigns to var
emails = open('emails.txt', 'r').read()
# .split() removes excess whitespaces from str, return str as list
emails = emails.split()
# empty list to store non-duplicate e-mails
clean_list = []
# for loop to append non-duplicate emails to clean list
for email in emails:
if email not in clean_list:
clean_list.append(email)
return clean_list
# close emails.txt file
emails.close()
# assigns no_duplicate_emails.txt to variable below
no_duplicate_emails = open('no_duplicate_emails.txt', 'w')
# function to convert clean_list 'list' elements in to strings
for email in remove_duplicate():
# .strip() method to remove commas
email = email.strip(',')
no_duplicate_emails.write(f"E-mail: {email}\n")
# close no_duplicate_emails.txt file
no_duplicate_emails.close()
If anyone is looking for a solution that uses a hashing and is a little more flashy, this is what I currently use:
def remove_duplicate_lines(input_path, output_path):
if os.path.isfile(output_path):
raise OSError('File at {} (output file location) exists.'.format(output_path))
with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file:
seen_lines = set()
def add_line(line):
seen_lines.add(line)
return line
output_file.writelines((add_line(line) for line in input_file
if line not in seen_lines))
edit it within the same file
lines_seen = set() # holds lines already seen
with open("file.txt", "r+") as f:
d = f.readlines()
f.seek(0)
for i in d:
if i not in lines_seen:
f.write(i)
lines_seen.add(i)
f.truncate()
Readable and Concise
with open('sample.txt') as fl:
content = fl.read().split('\n')
content = set([line for line in content if line != ''])
content = '\n'.join(content)
with open('sample.txt', 'w') as fl:
fl.writelines(content)
Here is my solution
if __name__ == '__main__':
f = open('temp.txt','w+')
flag = False
with open('file.txt') as fp:
for line in fp:
for temp in f:
if temp == line:
flag = True
print('Found Match')
break
if flag == False:
f.write(line)
elif flag == True:
flag = False
f.seek(0)
f.close()
cat <filename> | grep '^[a-zA-Z]+$' | sort -u > outfile.txt
To filter and remove duplicate values from the file.
Here is my solution
d = input("your file:") #write your file name here
file1 = open(d, mode="r")
file2 = open('file2.txt', mode='w')
file2 = open('file2.txt', mode='a')
file1row = file1.readline()
while file1row != "" :
file2 = open('file2.txt', mode='a')
file2read = open('file2.txt', mode='r')
file2r = file2read.read().strip()
if file1row not in file2r:
file2.write(file1row)
file1row = file1.readline()
file2read.close()
file2.close

Categories