Replace sequences in Python

I want to make a replacement script.
It should replace str1 with str2.
My file has an XML-based structure.
For example, I have:
...word1'#13#10'word2'#13#10'word3... = ...word1'#13#10'word3...
That is, I want to remove part of the string.
I use this in the script:
Lines[i] = Lines[i].replace(key, DataBase[key])
I've already checked that "key" and "DataBase[key]" are correctly defined. If I print them to the console with print(), they look exactly as they should.
But when the script executes, it doesn't change sequences like this one, with '#13#10'. Pairs of keys without any special symbols work fine.
What can I do, and why doesn't it work?
Full script:
import configparser
#import time

config = configparser.ConfigParser()  # init configparser
config.optionxform = str              # keep option names case-sensitive
config.read("SocratToCortesExpress.cfg")  # config file
print("Config file - read")
filePath = config.get("PATH", "old_file")  # config file holds file names and word pairs
DataStrings = config.items("DATA")    # read pairs
DataBase = dict()                     # initialize dictionary
print("Dictionary - initialized")
for Dstr in DataStrings:              # old and new words for a replacement
    SocratName = Dstr[0]
    CortesName = Dstr[1]
    DataBase[SocratName] = CortesName
print("Dictionary - filled")
with open(filePath, "r", encoding='utf-8-sig') as ResultFile:  # input file
    Lines = ResultFile.readlines()
print("Old file - loaded")
f1 = open('logkeys.txt', 'w')
for key in DataBase.keys():
    try:
        f1.write('\n' + key + '\n' + DataBase[key] + '\n')
    except Exception as e:  # errors
        f2 = open('log.txt', 'w')
        f2.write('An exceptional thing happened - %s' % e)
        f2.close()
f1.close()
for i in range(len(Lines)):  # brute force - all over the input file
    #Lines[i] = Lines[i].replace('\ufeff', '')  # some weird symbol (the BOM)
    for key in DataBase.keys():
        try:
            Lines[i] = Lines[i].replace(key, DataBase[key])  # replacing
        except Exception as e:  # errors
            f2 = open('log.txt', 'w')
            f2.write('An exceptional thing happened - %s' % e)
            f2.close()
print("Sequences - replaced")
outFileName = config.get("PATH", "new_file")  # define output file
print("Exit file - initialized")
with open(outFileName, "a", encoding='utf-8-sig') as outFile:  # save; note "a" appends, so reruns accumulate output
    for line in Lines:
        outFile.write(line)
print("OK")

Have you tried this?
>>> s = "word1'#13#10'word2'#13#10'word3"
>>> s.replace("'word2'#13#10'", '')
"word1'#13#10word3"


Read a line from a file in python

I have a file named mcelog.conf, and I am reading this file in my code. The contents of the file are:
no-syslog = yes # (or no to disable)
logfile = /tmp/logfile
The program will read mcelog.conf and check the no-syslog tag; if no-syslog = yes, the program has to look for the logfile tag and read its value. Can anyone let me know how I can get the value /tmp/logfile?
import re

with open('/etc/mcelog/mcelog.conf', 'r+') as fp:
    for line in fp:
        if re.search("no-syslog =", line) and re.search("= no", line):
            memoryErrors = readLogFile("/var/log/messages")
            mcelogPathFound = True
            break
        elif re.search("no-syslog =", line) and re.search("= yes", line):
            continue
        elif re.search("logfile =", line):
            memoryErrors = readLogFile(line)  # Here I want to pass the value "/tmp/logfile", but currently "logfile = /tmp/logfile" is getting passed
            mcelogPathFound = True
            break
You can just split the line to get the value you want:
line.split(' = ')[1]
However, you might want to look at the documentation for the configparser module.
Change the code to:
import re

with open('/etc/mcelog/mcelog.conf', 'r+') as fp:
    for line in fp:
        if re.search("no-syslog =", line) and re.search("= no", line):
            memoryErrors = readLogFile("/var/log/messages")
            mcelogPathFound = True
            break
        elif re.search("no-syslog =", line) and re.search("= yes", line):
            continue
        elif re.search("logfile =", line):
            memoryErrors = readLogFile(line.split("=")[1].strip())  # passes just "/tmp/logfile"
            mcelogPathFound = True
            break
This is because you want to pass only part of the line rather than the whole thing, so the line is split on the "=" sign and then stripped to remove any surrounding whitespace.
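As a standalone illustration of that split-and-strip step (the sample line is made up):

line = "logfile = /tmp/logfile\n"
path = line.split("=", 1)[1].strip()
print(path)  # /tmp/logfile

Passing maxsplit=1 keeps the value intact even if it happens to contain another "=".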
I liked the suggestion of the configparser module, so here is an example of that (Python 3).
For the given input, it will output reading /tmp/logfile
import configparser, itertools

config = configparser.ConfigParser(inline_comment_prefixes=('#',))  # strip "# ..." comments from values
filename = "/tmp/mcelog.conf"

def readLogFile(filename):
    if filename:
        print("reading", filename)
    else:
        raise ValueError("unable to read file")

section = 'global'
with open(filename) as fp:
    # mcelog.conf has no section header, so prepend a dummy [global] section
    config.read_file(itertools.chain(['[{}]'.format(section)], fp), source=filename)

no_syslog = config[section]['no-syslog']
if no_syslog == 'yes':
    logfile = config[section]['logfile']
elif no_syslog == 'no':
    logfile = "/var/log/messages"

if logfile:
    mcelogPathFound = True
    memoryErrors = readLogFile(logfile)

Python Writing to txt error

I'm trying to write different things to a text file in a while loop, but it only writes once. I want to write something to unmigrated.txt.
import urllib.request
import json

Txtfile = input("Name of the TXT file: ")
fw = open(Txtfile + ".txt", "r")
red = fw.read()
blue = red.split("\n")
i = 0
while i < len(blue):
    try:
        url = "https://api.mojang.com/users/profiles/minecraft/" + blue[i]
        rawdata = urllib.request.urlopen(url)
        newrawdata = rawdata.read()
        jsondata = json.loads(newrawdata.decode('utf-8'))
        results = jsondata['id']
        url_uuid = "https://sessionserver.mojang.com/session/minecraft/profile/" + results
        rawdata_uuid = urllib.request.urlopen(url_uuid)
        newrawdata_uuid = rawdata_uuid.read()
        jsondata_uuid = json.loads(newrawdata_uuid.decode('utf-8'))
        try:
            results = jsondata_uuid['legacy']
            print(blue[i] + " is " + "Unmigrated")
            wf = open("unmigrated.txt", "w")
            wring = wf.write(blue[i] + " is " + "Unmigrated\n")
        except:
            print(blue[i] + " is " + "Migrated")
    except:
        print(blue[i] + " is " + "Not-Premium")
    i += 1
You keep re-opening the file with "w" inside the loop, so you only see the last data that was written to the file; either open the file once outside the loop, or open with "a" to append. Opening once is the simplest approach. You can also use range instead of your while, or better again, just iterate over the list:
with open("unmigrated.txt", "w") as f:  # with closes your file automatically
    for ele in blue:
        .....
Also, wring = wf.write(blue[i] + " is " + "Unmigrated\n") sets wring to None, which is what write returns, so it's probably of no real use.
Lastly, a bare except is usually never a good idea; catch the specific exceptions you expect, and log or at least print when you get an error.
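A minimal, self-contained sketch of that fix (the names list stands in for blue):

# Open the output once, before the loop, so earlier writes survive.
names = ["alice", "bob"]
with open("unmigrated.txt", "w") as out:
    for name in names:
        out.write(name + " is Unmigrated\n")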
Using the requests library, I would break up your code doing something like:
import requests

def get_json(url):
    try:
        rawdata = requests.get(url)
        return rawdata.json()
    except requests.exceptions.RequestException as e:
        print(e)
    except ValueError as e:
        print(e)
    return {}

txt_file = input("Name of the TXT file: ")

with open(txt_file + ".txt") as fw, open("unmigrated.txt", "w") as f:  # with closes your files automatically
    for line in map(str.rstrip, fw):  # remove newlines
        url = "https://api.mojang.com/users/profiles/minecraft/{}".format(line)
        results = get_json(url).get("id")
        if not results:
            continue
        url_uuid = "https://sessionserver.mojang.com/session/minecraft/profile/{}".format(results)
        results = get_json(url_uuid).get('legacy')
        print("{} is Unmigrated".format(line))
        f.write("{} is Unmigrated\n".format(line))
I am not sure where 'legacy' fits into the code; that logic I will leave to you. You can also iterate directly over the file object, so you can forget about splitting the lines into blue.
Try:
with open("filename", "w") as f:
    f.write("your content")
But that will overwrite all contents of the file.
Instead, if you want to append to the file, use:
with open("filename", "a") as f:
If you choose to not use the with syntax, remember to close the file.
Read more here:
https://docs.python.org/2/library/functions.html#open

Search, count and add - Python

properties = ["color", "font-size", "font-family", "width", "height"]

inPath = "style.css"
outPath = "output.txt"

# Open a file for reading
file = open(inPath, 'rU')
if file:
    # read from the file
    filecontents = file.read()
    file.close()
else:
    print "Error Opening File."

# Open a file for writing
file = open(outPath, 'wb')
if file:
    for i in properties:
        search = i
        index = filecontents.find(search)
        file.write(str(index), "\n")
    file.close()
else:
    print "Error Opening File."
It seems to work, but:
It only finds each keyword once.
It's not writing to the output file ("function takes exactly 1 argument").
I don't actually want it to print the index, but the number of times the keyword appears.
Many thanks
First, you want .count(search), not .find(search), if what you're looking for is # of occurrences.
Second, .write() only takes a single parameter - if you want to write a newline, you need to concatenate it first, or call .write() twice.
Third, doing for i in properties: search = i is redundant; just use the name you want in your for loop.
for search in properties:
    cnt = filecontents.count(search)
    file.write(str(cnt) + "\n")
from itertools import imap

properties = ("color", "font-size", "font-family", "width", "height")

inPath = "style.css"
outPath = "output.txt"

try:
    # Open a file for reading
    filecontents = file(inPath).read()
except Exception as exc:
    print exc
else:
    # Open a file for writing
    with open(outPath, 'wb') as out_file:
        #for property in properties:
        #    out_string = "%s %s\n"
        #    out_file.write( out_string % (
        #        property, filecontents.count(property)))
        out_file.write('\n'.join(
            imap(str, imap(filecontents.count, properties))))
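For reference (not part of the original answer), the same idea in Python 3, where map is already lazy and imap is gone:

properties = ("color", "font-size", "font-family", "width", "height")

with open("style.css") as in_file:
    filecontents = in_file.read()

with open("output.txt", "w") as out_file:
    out_file.write('\n'.join(str(filecontents.count(p)) for p in properties))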

How might I remove duplicate lines from a file?

I have a file with one column. How to delete repeated lines in a file?
On Unix/Linux, use the uniq command, as per David Locke's answer, or sort, as per William Pursell's comment.
If you need a Python script:
lines_seen = set()  # holds lines already seen
outfile = open(outfilename, "w")
for line in open(infilename, "r"):
    if line not in lines_seen:  # not a duplicate
        outfile.write(line)
        lines_seen.add(line)
outfile.close()
Update: The sort/uniq combination will remove duplicates but return a file with the lines sorted, which may or may not be what you want. The Python script above won't reorder lines, but just drop duplicates. Of course, to get the script above to sort as well, just leave out the outfile.write(line) and instead, immediately after the loop, do outfile.writelines(sorted(lines_seen)).
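A minimal sketch of that sorted variant (reusing the placeholder names infilename and outfilename from above):

lines_seen = set()
for line in open(infilename, "r"):
    lines_seen.add(line)  # collect unique lines only
with open(outfilename, "w") as outfile:
    outfile.writelines(sorted(lines_seen))  # write them out sorted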
If you're on *nix, try running the following command:
sort <file name> | uniq
uniqlines = set(open('/tmp/foo').readlines())
This will give you the set of unique lines.
Writing that back to some file would be as easy as:
with open('/tmp/bar', 'w') as bar:
    bar.writelines(uniqlines)
You can do:
import os
os.system("awk '!x[$0]++' /path/to/file > /path/to/rem-dups")
Here you are using the shell from Python :)
You also have another way:
with open('/tmp/result.txt') as result:
    uniqlines = set(result.readlines())
with open('/tmp/rmdup.txt', 'w') as rmdup:
    rmdup.writelines(uniqlines)
Get all your lines into a list, make a set of the lines, and you are done.
For example:
>>> x = ["line1","line2","line3","line2","line1"]
>>> list(set(x))
['line3', 'line2', 'line1']
>>>
If you need to preserve the ordering of lines (as a set is an unordered collection), try this:
y = []
for l in x:
    if l not in y:
        y.append(l)
and write the content back to the file.
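As an aside (not from the original answer): on Python 3.7+, where dict preserves insertion order, dict.fromkeys is a common order-preserving shortcut for the same dedup:

x = ["line1", "line2", "line3", "line2", "line1"]
y = list(dict.fromkeys(x))  # ['line1', 'line2', 'line3'], first occurrences kept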
It's a rehash of what's already been said here; here's what I use.
import optparse

def removeDups(inputfile, outputfile):
    lines = open(inputfile, 'r').readlines()
    lines_set = set(lines)
    out = open(outputfile, 'w')
    for line in lines_set:
        out.write(line)

def main():
    parser = optparse.OptionParser('usage %prog ' +\
        '-i <inputfile> -o <outputfile>')
    parser.add_option('-i', dest='inputfile', type='string',
        help='specify your input file')
    parser.add_option('-o', dest='outputfile', type='string',
        help='specify your output file')
    (options, args) = parser.parse_args()
    inputfile = options.inputfile
    outputfile = options.outputfile
    if (inputfile == None) or (outputfile == None):
        print parser.usage
        exit(1)
    else:
        removeDups(inputfile, outputfile)

if __name__ == '__main__':
    main()
Python one-liner (Python 2 syntax, due to the print statement):
python -c "import sys; lines = sys.stdin.readlines(); print ''.join(sorted(set(lines)))" < InputFile > OutputFile
Adding to @David Locke's answer, on *nix systems you can run
sort -u messy_file.txt > clean_file.txt
which will create clean_file.txt, removing duplicates, in alphabetical order.
Look at the script I created to remove duplicate emails from text files. Hope this helps!
# function to remove duplicate emails
def remove_duplicate():
    # read emails.txt as one long string
    with open('emails.txt', 'r') as f:
        emails = f.read()
    # .split() splits on whitespace and returns a list
    emails = emails.split()
    # empty list to store non-duplicate e-mails
    clean_list = []
    # append each e-mail the first time it is seen
    for email in emails:
        if email not in clean_list:
            clean_list.append(email)
    return clean_list

# write the de-duplicated e-mails to no_duplicate_emails.txt
with open('no_duplicate_emails.txt', 'w') as no_duplicate_emails:
    for email in remove_duplicate():
        # .strip(',') removes leading/trailing commas
        email = email.strip(',')
        no_duplicate_emails.write(f"E-mail: {email}\n")
If anyone is looking for a solution that uses hashing and is a little more flashy, this is what I currently use:
import os

def remove_duplicate_lines(input_path, output_path):
    if os.path.isfile(output_path):
        raise OSError('File at {} (output file location) exists.'.format(output_path))
    with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file:
        seen_lines = set()

        def add_line(line):
            seen_lines.add(line)
            return line

        # the "if" filter runs before add_line, so each line is written once
        output_file.writelines(add_line(line) for line in input_file
                               if line not in seen_lines)
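A quick usage sketch (the file names are made up):

remove_duplicate_lines('input.txt', 'deduped.txt')  # raises OSError if deduped.txt already exists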
Edit it within the same file:
lines_seen = set()  # holds lines already seen
with open("file.txt", "r+") as f:
    d = f.readlines()
    f.seek(0)
    for i in d:
        if i not in lines_seen:
            f.write(i)
            lines_seen.add(i)
    f.truncate()
Readable and concise:
with open('sample.txt') as fl:
    content = fl.read().split('\n')

content = set([line for line in content if line != ''])
content = '\n'.join(content)

with open('sample.txt', 'w') as fl:
    fl.write(content)
Here is my solution:
if __name__ == '__main__':
    f = open('temp.txt', 'w+')
    flag = False
    with open('file.txt') as fp:
        for line in fp:
            for temp in f:
                if temp == line:
                    flag = True
                    print('Found Match')
                    break
            if flag == False:
                f.write(line)
            elif flag == True:
                flag = False
            f.seek(0)
    f.close()
cat <filename> | grep -E '^[a-zA-Z]+$' | sort -u > outfile.txt
to filter and remove duplicate values from the file (grep needs -E for the + quantifier to work).
Here is my solution:
d = input("your file:")  # write your file name here
file1 = open(d, mode="r")
file2 = open('file2.txt', mode='w')  # truncate the output once
file1row = file1.readline()
while file1row != "":
    file2.flush()  # make earlier writes visible to the reader below
    file2read = open('file2.txt', mode='r')
    file2r = file2read.read().strip()
    if file1row not in file2r:  # note: substring test, not an exact line match
        file2.write(file1row)
    file1row = file1.readline()
    file2read.close()
file2.close()
file1.close()
