def read_files(file_allmail, file_blacklist):
    with open("all_email_large.txt", 'r') as f:
        allmail = f.read().splitlines()
    with open("bkacklist_large.txt", 'r') as f1:
        blacklist = f1.read().splitlines()
    return allmail, blacklist
Instead of hard-coding the documents, I would like to be able to pass them as command-line parameters. In other words, I want to be able to select which documents Python should read into lists.
You probably want argparse: https://docs.python.org/3/howto/argparse.html
Changed slightly, but from the docs:
import argparse

def read_files(file_allmail, file_blacklist):
    with open(file_allmail, 'r') as f:
        allmail = f.read().splitlines()
    with open(file_blacklist, 'r') as f1:
        blacklist = f1.read().splitlines()
    return allmail, blacklist

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("file_allmail", help="doc to open")
    parser.add_argument("file_blacklist", help="doc to open")
    args = parser.parse_args()
    allmail, blacklist = read_files(args.file_allmail, args.file_blacklist)
    print(allmail)
    print(blacklist)
You call it with:
python your_python.py all_email_large.txt bkacklist_large.txt
(The arguments are positional, so no --flags are needed.)
If you alter your function as shown, then you can simply call read_files('filename1.txt', 'filename2.txt') at any point for any text files.
def read_files(file_allmail, file_blacklist):
    with open(file_allmail, 'r') as f:
        allmail = f.read().splitlines()
    with open(file_blacklist, 'r') as f1:
        blacklist = f1.read().splitlines()
    return allmail, blacklist
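If you don't need argparse's help messages, a minimal sketch using sys.argv directly would also work (assuming the same read_files function as above):

import sys

if __name__ == '__main__':
    # sys.argv[0] is the script name; the two file names follow it
    allmail, blacklist = read_files(sys.argv[1], sys.argv[2])
    print(allmail)
    print(blacklist)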
Related question:
I have a file with one column. How can I delete repeated lines in the file?
On Unix/Linux, use the uniq command, as per David Locke's answer, or sort, as per William Pursell's comment.
If you need a Python script:
lines_seen = set()  # holds lines already seen
outfile = open(outfilename, "w")
for line in open(infilename, "r"):
    if line not in lines_seen:  # not a duplicate
        outfile.write(line)
        lines_seen.add(line)
outfile.close()
Update: The sort/uniq combination will remove duplicates but return a file with the lines sorted, which may or may not be what you want. The Python script above won't reorder lines, but just drop duplicates. Of course, to get the script above to sort as well, just leave out the outfile.write(line) and instead, immediately after the loop, do outfile.writelines(sorted(lines_seen)).
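For instance, the sorted variant of the same script might look roughly like this (infilename and outfilename are the same placeholders as above):

lines_seen = set()  # holds lines already seen
for line in open(infilename, "r"):
    lines_seen.add(line)  # collect every distinct line; input order is discarded
outfile = open(outfilename, "w")
outfile.writelines(sorted(lines_seen))  # write the unique lines back in sorted order
outfile.close()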
If you're on *nix, try running the following command:
sort <file name> | uniq
uniqlines = set(open('/tmp/foo').readlines())
This will give you the set of unique lines.
Writing that back to some file would be as easy as:
bar = open('/tmp/bar', 'w')
bar.writelines(uniqlines)
bar.close()
You can do:
import os
os.system("awk '!x[$0]++' /path/to/file > /path/to/rem-dups")
Here you are calling a shell command from Python :)
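If you'd rather not shell out through os.system, a sketch of the same awk call using the subprocess module (the paths are just example placeholders) could be:

import subprocess

with open('/path/to/rem-dups', 'w') as out:
    # awk '!x[$0]++' prints each line only the first time it is seen
    subprocess.run(['awk', '!x[$0]++', '/path/to/file'], stdout=out, check=True)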
There is also another way:
with open('/tmp/result.txt') as result:
    uniqlines = set(result.readlines())
with open('/tmp/rmdup.txt', 'w') as rmdup:
    rmdup.writelines(uniqlines)
Get all your lines into a list, make a set of those lines, and you are done.
for example,
>>> x = ["line1","line2","line3","line2","line1"]
>>> list(set(x))
['line3', 'line2', 'line1']
>>>
If you need to preserve the ordering of lines (a set is an unordered collection), try this:
y = []
for l in x:
    if l not in y:
        y.append(l)
and write the content back to the file.
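On Python 3.7+, where dicts preserve insertion order, a possible shortcut for the same order-preserving dedupe is dict.fromkeys (x is the list from the example above):

# keeps the first occurrence of each line, in the original order
y = list(dict.fromkeys(x))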
It's a rehash of what's already been said here; here is what I use.
import optparse

def removeDups(inputfile, outputfile):
    lines = open(inputfile, 'r').readlines()
    lines_set = set(lines)
    out = open(outputfile, 'w')
    for line in lines_set:
        out.write(line)

def main():
    parser = optparse.OptionParser('usage %prog ' +
                                   '-i <inputfile> -o <outputfile>')
    parser.add_option('-i', dest='inputfile', type='string',
                      help='specify your input file')
    parser.add_option('-o', dest='outputfile', type='string',
                      help='specify your output file')
    (options, args) = parser.parse_args()
    inputfile = options.inputfile
    outputfile = options.outputfile
    if (inputfile is None) or (outputfile is None):
        print(parser.usage)
        exit(1)
    else:
        removeDups(inputfile, outputfile)

if __name__ == '__main__':
    main()
Python one-liner:
python -c "import sys; print(''.join(sorted(set(sys.stdin.readlines()))))" < InputFile > OutputFile
Adding to @David Locke's answer: on *nix systems you can run
sort -u messy_file.txt > clean_file.txt
which will create clean_file.txt with the duplicates removed and the lines sorted alphabetically.
Look at the script I created to remove duplicate emails from text files. Hope this helps!
# function to remove duplicate emails
def remove_duplicate():
    # opens emails.txt and reads it in as one long string
    with open('emails.txt', 'r') as f:
        emails = f.read()
    # .split() breaks the string on whitespace and returns a list
    emails = emails.split()
    # empty list to store non-duplicate e-mails
    clean_list = []
    # append each email to clean_list only the first time it is seen
    for email in emails:
        if email not in clean_list:
            clean_list.append(email)
    return clean_list

# write the de-duplicated emails to no_duplicate_emails.txt
no_duplicate_emails = open('no_duplicate_emails.txt', 'w')
for email in remove_duplicate():
    # .strip(',') removes stray commas around the address
    email = email.strip(',')
    no_duplicate_emails.write(f"E-mail: {email}\n")
# close no_duplicate_emails.txt file
no_duplicate_emails.close()
If anyone is looking for a solution that uses hashing and is a little more flashy, this is what I currently use:
import os

def remove_duplicate_lines(input_path, output_path):
    if os.path.isfile(output_path):
        raise OSError('File at {} (output file location) exists.'.format(output_path))
    with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file:
        seen_lines = set()

        def add_line(line):
            seen_lines.add(line)
            return line

        output_file.writelines(add_line(line) for line in input_file
                               if line not in seen_lines)
Edit it within the same file:
lines_seen = set()  # holds lines already seen
with open("file.txt", "r+") as f:
    d = f.readlines()
    f.seek(0)
    for i in d:
        if i not in lines_seen:
            f.write(i)
            lines_seen.add(i)
    f.truncate()
Readable and Concise
with open('sample.txt') as fl:
    content = fl.read().split('\n')
content = set([line for line in content if line != ''])
content = '\n'.join(content)
with open('sample.txt', 'w') as fl:
    fl.writelines(content)
Here is my solution
if __name__ == '__main__':
    f = open('temp.txt', 'w+')
    flag = False
    with open('file.txt') as fp:
        for line in fp:
            for temp in f:
                if temp == line:
                    flag = True
                    print('Found Match')
                    break
            if flag == False:
                f.write(line)
            elif flag == True:
                flag = False
            f.seek(0)
    f.close()
grep -E '^[a-zA-Z]+$' <filename> | sort -u > outfile.txt
This filters out non-alphabetic lines and removes duplicate values from the file. (Note that grep needs -E, or egrep, for the + quantifier to work.)
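A rough Python equivalent of that pipeline, for the sake of illustration (the file names are placeholders), might be:

import re

with open('infile.txt') as f:
    # keep only purely alphabetic lines, dropping duplicates via the set
    lines = {line for line in f if re.fullmatch('[a-zA-Z]+', line.strip())}
with open('outfile.txt', 'w') as out:
    out.writelines(sorted(lines))  # sorted output, like sort -u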
Here is my solution
d = input("your file:")  # write your file name here
file1 = open(d, mode="r")
file2 = open('file2.txt', mode='w')  # truncate the output file first
file2.close()
file1row = file1.readline()
while file1row != "":
    file2 = open('file2.txt', mode='a')
    file2read = open('file2.txt', mode='r')
    file2r = file2read.read().strip()
    if file1row not in file2r:
        file2.write(file1row)
    file1row = file1.readline()
    file2read.close()
    file2.close()
file1.close()
Implement the function isWhiteLine(), which takes a string and returns True if the string contains only whitespace and tab characters. The program should read a file given as a command-line argument and print only the non-blank lines to standard output.
import sys

def isWhiteLine(x):
    return x.isspace()

file_name = sys.argv[1]
f = open(file_name, "r")
for i in f:
    if not isWhiteLine(i):
        print(str(i).strip())
f.close()
The code below should support what you are looking for.
import sys

def print_non_empty_lines(file_name):
    with open(file_name, 'r') as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip()
        if line:
            print(line)

if __name__ == "__main__":
    if len(sys.argv) > 1:
        print_non_empty_lines(sys.argv[1])
I'd like to figure out how I should use a class to read input from a file so that I can use that data in other classes. If I read a file's contents into a list, should I pass that list to any other class that needs the information?
Right now I have:
import sys

class FileReader:
    """Reads a file"""
    def __init__(self):
        input = ''
        try:
            with open(sys.argv[1], 'r') as inFile:
                input = inFile.readline()
                print(input)
        except IndexError:
            print("Error - Please specify an input file.")
            sys.exit(2)

def main():
    x = FileReader()

if __name__ == "__main__":
    main()
I thought about making some kind of list to hold strings from the file, but I'm not sure whether that should be global or not.
If all you're trying to do is read the file line by line, something like the following would work just fine (exception handling omitted).
>>> path = '/path/to/file.txt'
>>> with open(path, 'r') as f:
... lines = [l for l in f]
You can then pass around lines as necessary.
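As a minimal sketch of that pass-it-along idea (LineProcessor is a made-up name for illustration), the reading code stays small and the consuming class simply takes the list:

class LineProcessor:
    """Consumes a list of lines produced elsewhere."""
    def __init__(self, lines):
        self.lines = lines

    def count_non_empty(self):
        return sum(1 for line in self.lines if line.strip())

with open('/path/to/file.txt', 'r') as f:
    lines = [l for l in f]
processor = LineProcessor(lines)  # the list is passed in, not stored globally
print(processor.count_non_empty())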
I have a file called 'users.txt' with the following structure: username:info_about_the_user.
Something like this:
users.txt:
mark:stuffabouthim
anthony:stuffabouthim
peter:stuffabouthim
peterpeter:stuffabouthim
peterpeterpeter:stuffabouthim
peterpeterpeterpeter:stuffabouthim
The following part of the script needs to change a line (change the info about a user), but I'm having problems when one username is contained in another (peter - peterpeter) and I don't know how to fix it.
def test():
    fn = 'users.txt'
    f = open(fn)
    output = []
    changeuser = 'peterpeter'
    userinfo = 'HeIsTall'
    for line in f:
        if not changeuser+":" in line:
            output.append(line)
    f.close()
    f = open(fn, 'w')
    f.writelines(output)
    f.close()
    f = open("users.txt", "a")
    f.write(changeuser + ":" + userinfo + "\n")
    f = open("users.txt", "a")
test()
This is the input I have:
users.txt:
mark:stuffabouthim
anthony:stuffabouthim
peter:stuffabouthim
peterpeter:HesAwesome
peterpeterpeter:stuffabouthim
peterpeterpeterpeter:stuffabouthim
I want to change the info about peterpeter and get the following output:
users.txt:
mark:stuffabouthim
anthony:stuffabouthim
peter:stuffabouthim
peterpeter:HeIsTall
peterpeterpeter:stuffabouthim
peterpeterpeterpeter:stuffabouthim
But this is the output I'm actually getting. All the lines after peterpeter are being deleted, among other things:
mark:stuffabouthim
anthony:stuffabouthim
peter:stuffabouthim
peterpeter:HeIsTall
Can anyone help me fix the code above to get the desired output? Thanks.
You can have it the easy way with the fileinput module:
import fileinput

def test():
    fn = 'users.txt'
    changeuser = 'peterpeter'
    newinfo = 'HeIsTall'
    for line in fileinput.input(fn, inplace=1):
        user, oldinfo = line.split(':')
        print('%s:%s' % (user, newinfo if user == changeuser else oldinfo.replace('\n', '')))

if __name__ == "__main__":
    test()
Try this:
def test():
    fn = 'users.txt'
    f = open(fn)
    output = []
    changeuser = 'peterpeter'
    userinfo = 'HeIsTall'
    for line in f:
        if line.strip().split(':')[0] != changeuser:
            output.append(line)
        else:
            output.append(changeuser + ":" + userinfo + "\n")
    f.close()
    f = open(fn, 'w')
    f.writelines(output)
    f.close()

test()
output:
mark:stuffabouthim
anthony:stuffabouthim
peter:stuffabouthim
peterpeter:HeIsTall
peterpeterpeter:stuffabouthim
peterpeterpeterpeter:stuffabouthim
You have a logical error in the if-clause, which DELETES every peter* line; the only peter remaining is the one you append to the file afterwards.
for line in f:
    if not changeuser+":" in line:  # THAT MEANS ALL PETERS ARE IGNORED!
        output.append(line)
It's generally easier to understand a positive clause than a negation:
for line in f:
    if line.startswith(changeuser + ":"):  # startswith avoids matching 'peterpeter' inside 'peterpeterpeter:'
        output.append('%s:%s\n' % (changeuser, userinfo))
    else:
        output.append(line)
Good code is easy to read. Try to write code like you would write a report! That leads automatically to splitting your code into smaller pieces like functions, e.g.:
lines = read_all_lines_from_file(filename)
change_user_info(lines, user, userinfo)
save_lines_to_file(lines, filename)
Your code gets split into smaller pieces and if an error occurs you can pin it down to a few lines of code instead of having to work over several pages. ;-)
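A sketch of what those three pieces could look like (these bodies are one possible implementation, not the only one):

def read_all_lines_from_file(filename):
    with open(filename) as f:
        return f.readlines()

def change_user_info(lines, user, userinfo):
    # exact match on the username field avoids the peter/peterpeter problem
    for i, line in enumerate(lines):
        if line.split(':')[0] == user:
            lines[i] = '%s:%s\n' % (user, userinfo)

def save_lines_to_file(lines, filename):
    with open(filename, 'w') as f:
        f.writelines(lines)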