Nested loops iterating on a single file - python

I want to delete some specific lines in a file.
The part I want to delete is enclosed between two lines (that will be deleted too), named STARTING_LINE and CLOSING_LINE. If there is no closing line before the end of the file, then the operation should stop.
Example:
...blabla...
[Start] <-- # STARTING_LINE
This is the body that I want to delete
[End] <-- # CLOSING_LINE
...blabla...
I came out with three different ways to achieve the same thing (plus one provided by tdelaney's answer below), but I am wondering which one is the best. Please note that I am not looking for a subjective opinion: I would like to know if there are some real reasons why I should choose one method over another.
1. A lot of if conditions (just one for loop):
def delete_lines(filename):
with open(filename, 'r+') as my_file:
text = ''
found_start = False
found_end = False
for line in my_file:
if not found_start and line.strip() == STARTING_LINE.strip():
found_start = True
elif found_start and not found_end:
if line.strip() == CLOSING_LINE.strip():
found_end = True
continue
else:
print(line)
text += line
# Go to the top and write the new text
my_file.seek(0)
my_file.truncate()
my_file.write(text)
2. Nested for loops on the open file:
def delete_lines(filename):
with open(filename, 'r+') as my_file:
text = ''
for line in my_file:
if line.strip() == STARTING_LINE.strip():
# Skip lines until we reach the end of the function
# Note: the next `for` loop iterates on the following lines, not
# on the entire my_file (i.e. it is not starting from the first
# line). This will allow us to avoid manually handling the
# StopIteration exception.
found_end = False
for function_line in my_file:
if function_line.strip() == CLOSING_LINE.strip():
print("stop")
found_end = True
break
if not found_end:
print("There is no closing line. Stopping")
return False
else:
text += line
# Go to the top and write the new text
my_file.seek(0)
my_file.truncate()
my_file.write(text)
3. while True and next() (with StopIteration exception)
def delete_lines(filename):
with open(filename, 'r+') as my_file:
text = ''
for line in my_file:
if line.strip() == STARTING_LINE.strip():
# Skip lines until we reach the end of the function
while True:
try:
line = next(my_file)
if line.strip() == CLOSING_LINE.strip():
print("stop")
break
except StopIteration as ex:
print("There is no closing line.")
else:
text += line
# Go to the top and write the new text
my_file.seek(0)
my_file.truncate()
my_file.write(text)
4. itertools (from tdelaney's answer):
def delete_lines_iter(filename):
with open(filename, 'r+') as wrfile:
with open(filename, 'r') as rdfile:
# write everything before startline
wrfile.writelines(itertools.takewhile(lambda l: l.strip() != STARTING_LINE.strip(), rdfile))
# drop everything before stopline.. and the stopline itself
try:
next(itertools.dropwhile(lambda l: l.strip() != CLOSING_LINE.strip(), rdfile))
except StopIteration:
pass
# include everything after
wrfile.writelines(rdfile)
wrfile.truncate()
It seems that these four implementations achieve the same result. So...
Question: which one should I use? Which one is the most Pythonic? Which one is the most efficient?
Is there a better solution instead?
Edit: I tried to evaluate the methods on a big file using timeit. In order to have the same file on each iteration, I removed the writing parts of each code; this means that the evaluation mostly regards the reading (and file opening) task.
t_if = timeit.Timer("delete_lines_if('test.txt')", "from __main__ import delete_lines_if")
t_for = timeit.Timer("delete_lines_for('test.txt')", "from __main__ import delete_lines_for")
t_while = timeit.Timer("delete_lines_while('test.txt')", "from __main__ import delete_lines_while")
t_iter = timeit.Timer("delete_lines_iter('test.txt')", "from __main__ import delete_lines_iter")
print(t_if.repeat(3, 4000))
print(t_for.repeat(3, 4000))
print(t_while.repeat(3, 4000))
print(t_iter.repeat(3, 4000))
Result:
# Using IF statements:
[13.85873354100022, 13.858520206999856, 13.851908310999988]
# Using nested FOR:
[13.22578497800032, 13.178281234999758, 13.155530822999935]
# Using while:
[13.254994718000034, 13.193942980999964, 13.20395484699975]
# Using itertools:
[10.547019549000197, 10.506679693000024, 10.512742852999963]

You can make it fancy with itertools. I'd be interested in how timing compares.
import itertools
def delete_lines(filename):
with open(filename, 'r+') as wrfile:
with open(filename, 'r') as rdfile:
# write everything before startline
wrfile.writelines(itertools.takewhile(lambda l: l.strip() != STARTING_LINE.strip(), rdfile))
# drop everything before stopline.. and the stopline itself
next(itertools.dropwhile(lambda l: l.strip() != CLOSING_LINE.strip(), rdfile))
# include everything after
wrfile.writelines(rdfile)
wrfile.truncate()

Related

So I made a file editing program in python... and one of the functions isn't working right

As the title says, I made a file editing program with python.
Here is the code that I'm have a problem with:
#fileEditing.py
def fileError(file):
raise OSError("file {} does not exist".format(file))
class AccessFile():
def fileExists(self, file):
import os
return bool(os.path.exists(file))
def filecreate(self, file):
if not self.fileExists(file):
with open(file, "w") as f:
f.close()
else: raise OSError("file {} already exists".format(file))
def filedelete(self, file):
import os
if self.fileExists(file):
os.remove(file)
else: fileError(file)
def fileread(self, file):
#check if file exists
if self.fileExists(file):
#detect length of file
with open(file, "r") as f:
line = " "
x = 0
while line != "":
line = f.readline()
x += 1
#piece lines together in a list
filelines = []
with open(file, "r") as f:
for i in range(x - 1):
filelines.append(str(f.readline()))
#return a tuple
return tuple(filelines)
else: fileError(file)
def filewrite(self, file, line, text):
''' BUG: apparently this either overwrites the line its writing or appends
to the line its writing... make up your mind!'''
if self.fileExists(file):
#get file contents
filelines = list(self.fileread(file))
#see if line parameter is out of range or not
try:
filelines[line] = text
except IndexError:
for i in range(line - len(filelines)):
filelines.append("")
filelines.append(str(text) + "\n")
#apply changes
with open(file, "w") as f:
f.write("") #delete contents
with open(file, "w") as f:
for l in filelines:
f.write(l)
else: fileError(file)
def fileoverwrite(self, file, data):
#if there is no file to delete, it will make a new one
try:
self.filedelete(file)
except:
pass
self.filecreate(file)
x = 0
for line in data:
print(line)
self.filewrite(file, x, line)
x += 1
accessfile = AccessFile()
The bug is in the filewrite(self, file, line, text) function. When called, it either writes a new line (which is what I want it to do), appends to the line its supposed to replace, or just doesn't write any lines at all.
Say I want to write a python file with this program:
#pytesting.py
from fileEditing import *
file = "/Users/ashton/Desktop/Atom/Python/FileEditing/FileManager.py"
data = [
"from fileEditing import *",
"",
"class FileEditing():",
" def __init__(options, immutable_files):",
" self.options, self.immutable_files = options, immutable_files",
" ",
" def prompt():",
" ",
"",
"while True:",
" pass"
]
accessfile.fileoverwrite(file, data)
When I run it, it makes a file with accessfile.fileoverwrite(file, data), like its supposed to.
But thats where things get whacky.
(FileManager.py below)
from fileEditing import *
class FileEditing():
def __init__(options, immutable_files): self.options, self.immutable_files = options, immutable_files
def prompt():
while True:
If you know how to fix the filewrite(self, file, line, text), please let me know.
(I use python 2.7 but python 3 is fine)
So this is definitely a Python 3.x solution but you said that it is fine, don't know if it will work in Python 2.x but it is so simple it should:
def file_overwrite(self, file, data):
with open(file, 'w') as file:
file.write('\n'.join(data))
And you seemingly also need to fix that data list because it is missing a few commas. Also the fact that this is all in a class is a bit weird, you do nothing with the instance, they all might as well be separate functions or #classmethods or #staticmethods. Also several things could be improved with your other functions. For example you shouldn't open the file twice and count its lines to read it. Just do file.readlines() at it will return a list of all lines:
def fileread(self, file):
if self.fileExists(file):
with open(file) as file:
return file.readlines()
else:
fileError(file)
Then also import os once at the start of the file, you don't need to import it in every function where you use os, also:
with open(file, "w") as f:
f.close()
f.close() is completely pointless because the context manger closes the file anyways and also there is mode "x" which is specifically made for file creation and will raise an error if the file already exists: https://www.w3schools.com/python/python_file_handling.asp

how to skip blocks of text when writing file in python

Is it possible to use python to skip blocks of text when writing a file from another file?
For example lets say the input file is:
This is the file I would like to write this line
I would like to skip this line
and this one...
and this one...
and this one...
but I want to write this one
and this one...
How can I write a script that allows me to skip certain lines that differ in content and size which resumes writing the lines to another file once it recognizes a certain line?
My code reads through the lines, doesn't write duplicate lines and performs some operation on the line by using dictionaries and regex.
def is_wanted(line):
#
# You have to define this!
#
# return True to keep the line, or False to discard it
def copy_some_lines(infname, outfname, wanted_fn=is_wanted):
with open(infname) as inf, open(outfname, "w") as outf:
outf.writelines(line for line in inf if wanted_fn(line))
copy_some_lines("file_a.txt", "some_of_a.txt")
In order to extend this to multi-line blocks, you can implement a finite state machine like
which would turn into something like
class BlockState:
GOOD_BLOCK = True
BAD_BLOCK = False
def __init__(self):
self.state = self.GOOD_BLOCK
def is_bad(self, line):
# *** Implement this! ***
# return True if line is bad
def is_good(self, line):
# *** Implement this! ***
# return True if line is good
def __call__(self, line):
if self.state == self.GOOD_BLOCK:
if self.is_bad(line):
self.state = self.BAD_BLOCK
else:
if self.is_good(line):
self.state = self.GOOD_BLOCK
return self.state
then
copy_some_lines("file_a.txt", "some_of_a.txt", BlockState())
Pseudo-code:
# Open input and output files, and declare the unwanted function
for line in file1:
if unwanted(line):
continue
file2.write(line)
# Close files etc...
You can read the file line by line, and have control on each line you read:
with open(<your_file>, 'r') as lines:
for line in lines:
# skip this line
# but not this one
Note that if you want to read all lines despite the content and only then manipulate it, you can:
with open(<your_file>) as fil:
lines = fil.readlines()
This should work:
SIZE_TO_SKIP = ?
CONTENT_TO_SKIP = "skip it"
with open("my/input/file") as input_file:
with open("my/output/file",'w') as output_file:
for line in input_file:
if len(line)!=SIZE_TO_SKIP and line!=CONTENT_TO_SKIP:
output_file.write(line)

Python Function return value for 1 iteration only

I do have following code where I am its doing following thing.
Parsing Whole file and checking for patter in each line. If the pattern exists, it should return that pattern to main function and print the value.
Issue: The function is only returning 1st pattern and do not check for same pattern into multiple lines.
code:
import re
import sys
import os
def find_pattern(file):
with open(file) as fp:
for line in fp:
if "abc" in line:
return line
else:
continue
def check(file):
return_list = []
data=find_pattern(file)
if data != None:
return_list.append(data)
if not data:
return "working"
else:
return return_list
if __name__== '__main__':
file = sys.argv[1]
print check(file)
If the file has multiple line containing abc, it will print only 1st line and skip other lines. I want to print all lines that contains abc.
Sample file
sdfdsffdabcafsfse
asasssadfsdadsadsaddsadadabc
asfasfdsafdfabc
output with above code:
sdfdsffdabcafsfse
You are prematurely returning from the function on this line:
return line
Which means you exit the function and the loop ceases to iterate assoon as the first instance is found.
Consider something like this instead, where you capture and return all matches:
def find_pattern(file):
out = []
with open(file) as fp:
for line in fp:
if "abc" in line:
out.append(line)
else:
continue
return out
Alternatively, you can manage this in a single, simple list comprehension:
def find_pattern(file):
with open(file) as fp:
return [line for line in fp if "abc" in line]

Why my function is returning empty string in python?

What I am doing is, removing all parts of speech except nouns from a text.
I have written a function for that. It may not be the best or optimized code to do that because I have just started coding in python. I am sure the bug must be very basic but I am just not able to figure it out.
In my function two inputs go as parameters. One is the location of text on hard drive and other is the location of file where we want the output.
Following is the code.
def extract_nouns(i_location, o_location):
import nltk
with open(i_location, "r") as myfile:
data = myfile.read().replace('\n', '')
tokens = nltk.word_tokenize(data)
tagged = nltk.pos_tag(tokens)
length = len(tagged)
a = list()
for i in range(0,length):
print(i)
log = (tagged[i][1][0] == 'N')
if log == False:
a.append(tagged[i][0])
fin = open(i_location, 'r')
fout = open(o_location, "w+")
for line in fin:
for word in a:
line = line.replace(word, "")
fout.write(line)
with open(o_location, "r") as myfile_new:
data_out = myfile_new.read().replace('\n', '')
return data_out
When I call this function it is working just fine. I am getting the output on hard disk as I had intended but it does not return the output on the interface or should I say, it is returning a blank string instead of the actual output string.
This is how I am calling it.
t = extract_nouns("input.txt","output.txt")
If you want to try it, take following as the content of input file
"At eight o'clock on
Thursday film morning word line test
best beautiful Ram Aaron design"
This is the output I am getting in the output file (output.txt) when I call the function but the function returns blank string on the interface instead. It does not even print the output.
"
Thursday film morning word line test
Ram Aar design"
You need to close the file first:
for line in fin:
for word in a:
line = line.replace(word, "")
fout.write(line)
fout.close()
Using with is usually the best way to open files as it automatically closes them and file.seek() to go back to the start of the file to read :
def extract_nouns(i_location, o_location):
import nltk
with open(i_location, "r") as myfile:
data = myfile.read().replace('\n', '')
tokens = nltk.word_tokenize(data)
tagged = nltk.pos_tag(tokens)
length = len(tagged)
a = []
for i in range(0,length):
print(i)
log = (tagged[i][1][0] == 'N')
if not log:
a.append(tagged[i][0])
with open(i_location, 'r') as fin, open(o_location, "w+") as fout:
for line in fin:
for word in a:
line = line.replace(word, "")
fout.write(line)
fout.seek(0) # go back to start of file
data_out = fout.read().replace('\n' , '')
return data_out
The last statement in the function should be the return.
Because there is the print data_out, you return the return value of print which is none.
E.g:
In []: def test():
..: print 'Hello!'
..:
In []: res = test()
Hello!
In []: res is None
Out[]: True

Search for a string with in a module in a python file using Python

#!/usr/bin/env python`
import sys`
import binascii`
import string
sample = "foo.apples"
data_file = open("file1.py","r")
dat_file = open("file2.txt", "w")
for line in data_file:
if sample in line:
dat_file.writelines(line)
dat_file.close()`
When I do this I am able to find the string foo.apples. The problem is foo.apples is present in various lines in the python file. I want those lines which are inside a particular function. I need the lines within this def function.
Example:
def start():
foo.apples(a,b)
foo.apples(c,d) ... so on.
The following program finds defs and will append the sample string to the output file if the indentation remains within the def.
import re
sample = 'foo.apples'
data_file = open("file1.py", "r")
out_file = open("file2.txt", "w")
within_def = False
def_indent = 0
for line in data_file:
def_match = re.match(r'(\s*)def\s+start\s*\(', line) # EDIT: fixed regex
if def_match and not within_def:
within_def = True
def_indent = len(def_match.group(1))
elif within_def and re.match(r'\s{%s}\S' % def_indent, line):
within_def = False
if within_def and sample in line:
out_file.writelines(line)
out_file.close()
data_file.close()
Tested working on an example file1.py.
One, slightly off the beaten path approach to this would be to use the getsource method of the inspect module. Consider the following (theoretical) test1.py file:
class foo(object):
apples = 'granny_smith'
#classmethod
def new_apples(cls):
cls.apples = 'macintosh'
def start():
"""This is a pretty meaningless python function.
Attempts to run it will definitely result in an exception being thrown"""
print foo.apples
foo.apples = 3
[x for x in range(10)]
import bar as foo
Now you want to know about the start code:
import inspect
import test1 #assume it is somewhere that can be imported
print inspect.getsource(test1.start)
Ok, now we have only the source of that function. We can now parse through that:
for line in inspect.getsource(test1.start).splitlines():
if 'foo.apples' in line:
print line
There are some advantages here -- python does all the work of parsing out the function blocks when it imports the file. The downside though is that the file actually needs to be imported. Depending on where your files are coming from, this could introduce a HUGE security hole in your program -- You'll be running (potentially) "untrusted" code.
Here's a very non pythonic way, untested, but it should work.
sample = "foo.apples"
infile = open("file1.py", "r")
outfile = open("file2.txt", "w")
in_function = False
for line in infile.readlines():
if in_function:
if line[0] in(" ", "\t"):
if sample in line:
outfile.write(line)
else:
in_function = False
elif line.strip() == "def start():":
in_function = True
infile.close()
outfile.close()
I would suggest doing a function of this, which takes sample, the input file, and the function which we're supposed to search from as it's parameters. It would then return a list or tuple of all the lines that had the text in them.
def findFromFile(file, word, function):
in_function = False
matches = []
infile = open(file, "r")
for line in infile.readlines():
if in_function:
if line[0] in(" ", "\t"):
if word in line:
matches.append(line)
else:
in_function = False
elif line.strip() == "def %s():"%function:
in_function = True
infile.close()
return matches

Categories